In [1]:
import pandas as pd
from pyspark.sql import SparkSession

#### Start a Spark Session

In [2]:
spark = SparkSession.builder.appName('flight EDA').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/16 19:18:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

#### Read the dataset

1. Converting Excel to CSV for easy reading using pandas

In [4]:
pd.read_excel('/Applications/Repos/Repo/expedition/Spark/flight_price.xlsx').to_csv('flight_price.csv',index=False)

In [5]:
df_spark = spark.read.csv('flight_price.csv',header=True,inferSchema=True)

In [6]:
df_spark.describe()

DataFrame[summary: string, Airline: string, Date_of_Journey: string, Source: string, Destination: string, Route: string, Arrival_Time: string, Duration: string, Total_Stops: string, Additional_Info: string, Price: string]

2. Split Date of Journey to Date, Month and Year

In [7]:
from pyspark.sql.functions import split, col 

In [8]:
df_spark = df_spark.withColumn('Date',split(col('Date_of_Journey'),'/').getItem(0).cast('int')) \
                   .withColumn('Month',split(col('Date_Of_Journey'),'/').getItem(1).cast('int')) \
                   .withColumn('Year',split(col('Date_of_Journey'),'/').getItem(2).cast('int'))

In [9]:
df_spark.show(10)

+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+----+-----+----+
|          Airline|Date_of_Journey|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|
+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+----+-----+----+
|           IndiGo|     24/03/2019|Banglore|  New Delhi|           BLR → DEL|2025-07-16 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|
|        Air India|      1/05/2019| Kolkata|   Banglore|CCU → IXR → BBI →...|2025-07-16 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|
|      Jet Airways|      9/06/2019|   Delhi|     Cochin|DEL → LKO → BOM →...|2025-07-16 09:25:00|04:25 10 Jun|     19h|    2 

3. Dropping unnessary columns

In [10]:
df_spark = df_spark.drop('Date_of_Journey')

In [11]:
df_spark = df_spark.drop('Route')

4. Spliting Departure time and Arrivial time

In [12]:
df_spark = df_spark.withColumn('Dep_time',split(col('Dep_Time'),' ').getItem(1))

In [13]:
df_spark.show()

+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+
|          Airline|  Source|Destination|Dep_time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|
+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+
|           IndiGo|Banglore|  New Delhi|22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|
|        Air India| Kolkata|   Banglore|05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|
|      Jet Airways|   Delhi|     Cochin|09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|   9|    6|2019|
|           IndiGo| Kolkata|   Banglore|18:05:00|       23:30|  5h 25m|     1 stop|             No info| 6218|  12|    5|2019|
|           IndiGo|Banglore|  New Delhi|16:50:00|       21:35|  4h 45m|     1 stop|             No info|13302| 

In [14]:
df_spark = df_spark.withColumn('Dep_hour',split(col('Dep_time'),':').getItem(0))\
                  .withColumn('Dep_Min',split(col('Dep_time'),':').getItem(1))

In [15]:
df_spark = df_spark.withColumn('Arrival_time',split(col('Arrival_Time'),' ').getItem(0))

In [16]:
df_spark.show()

+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+
|          Airline|  Source|Destination|Dep_time|Arrival_time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|Dep_hour|Dep_Min|
+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+
|           IndiGo|Banglore|  New Delhi|22:20:00|       01:10|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|      22|     20|
|        Air India| Kolkata|   Banglore|05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|      05|     50|
|      Jet Airways|   Delhi|     Cochin|09:25:00|       04:25|     19h|    2 stops|             No info|13882|   9|    6|2019|      09|     25|
|           IndiGo| Kolkata|   Banglore|18:05:00|       23:30|  5h 25m|     1 stop|             No info| 6218|  12|    5|2019|      18| 

In [17]:
df_spark = df_spark.withColumn('Arrival_Hour',split(col('Arrival_time'),':').getItem(0))\
        .withColumn('Arrival_Min',split(col('Arrival_time'),':').getItem(1))

In [18]:
df_spark = df_spark.drop('Dep_time','Arrival_time')

In [19]:
df_spark = df_spark.withColumnRenamed('Dep_hour','Dep_Hour')

5. Checking for NULL values

In [20]:
# for a single specific column

df_spark.filter(col("Total_Stops").isNull()).show()

+---------+------+-----------+--------+-----------+---------------+-----+----+-----+----+--------+-------+------------+-----------+
|  Airline|Source|Destination|Duration|Total_Stops|Additional_Info|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|
+---------+------+-----------+--------+-----------+---------------+-----+----+-----+----+--------+-------+------------+-----------+
|Air India| Delhi|     Cochin| 23h 40m|       NULL|        No info| 7480|   6|    5|2019|      09|     45|          09|         25|
+---------+------+-----------+--------+-----------+---------------+-----+----+-----+----+--------+-------+------------+-----------+



In [21]:
# to check in the whole DataFrame

from pyspark.sql.functions import col, when, sum

columns_to_check = ['Airline',
 'Source',
 'Destination',
 'Duration',
 'Total_Stops',
 'Additional_Info',
 'Price',
 'Date',
 'Month',
 'Year',
 'Dep_Hour',
 'Dep_Min',
 'Arrival_Hour',
 'Arrival_Min']

# Apply list comprehension correctly

null_counts = df_spark.select([sum(when(col(c).isNull(),1).otherwise(0)) for c in columns_to_check])

In [22]:
null_counts.show()

+--------------------------------------------------+-------------------------------------------------+------------------------------------------------------+---------------------------------------------------+------------------------------------------------------+----------------------------------------------------------+------------------------------------------------+-----------------------------------------------+------------------------------------------------+-----------------------------------------------+---------------------------------------------------+--------------------------------------------------+-------------------------------------------------------+------------------------------------------------------+
|sum(CASE WHEN (Airline IS NULL) THEN 1 ELSE 0 END)|sum(CASE WHEN (Source IS NULL) THEN 1 ELSE 0 END)|sum(CASE WHEN (Destination IS NULL) THEN 1 ELSE 0 END)|sum(CASE WHEN (Duration IS NULL) THEN 1 ELSE 0 END)|sum(CASE WHEN (Total_Stops IS NULL) THEN 1 ELSE 0 END)|sum(

6. Dropping the Null rows

In [23]:
df_spark = df_spark.dropna(how='any')

Check if dropped

In [24]:
df_spark.filter(col('Total_Stops').isNull()).show()

+-------+------+-----------+--------+-----------+---------------+-----+----+-----+----+--------+-------+------------+-----------+
|Airline|Source|Destination|Duration|Total_Stops|Additional_Info|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|
+-------+------+-----------+--------+-----------+---------------+-----+----+-----+----+--------+-------+------------+-----------+
+-------+------+-----------+--------+-----------+---------------+-----+----+-----+----+--------+-------+------------+-----------+



In [25]:
unique = df_spark.select('Total_Stops').distinct()
unique.show()

+-----------+
|Total_Stops|
+-----------+
|    4 stops|
|   non-stop|
|    2 stops|
|     1 stop|
|    3 stops|
+-----------+



In [26]:
df_spark = df_spark.replace(['non-stop','1 stop','2 stops','3 stops','4 stops'],['0','1','2','3','4'],'Total_Stops')

In [27]:
df_spark.show()

+-----------------+--------+-----------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+------------+-----------+
|          Airline|  Source|Destination|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|
+-----------------+--------+-----------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+------------+-----------+
|           IndiGo|Banglore|  New Delhi|  2h 50m|          0|             No info| 3897|  24|    3|2019|      22|     20|          01|         10|
|        Air India| Kolkata|   Banglore|  7h 25m|          2|             No info| 7662|   1|    5|2019|      05|     50|          13|         15|
|      Jet Airways|   Delhi|     Cochin|     19h|          2|             No info|13882|   9|    6|2019|      09|     25|          04|         25|
|           IndiGo| Kolkata|   Banglore|  5h 25m|          1|             No info| 6218|  12|    5|2019|      18|     

In [28]:
df_spark.dtypes

[('Airline', 'string'),
 ('Source', 'string'),
 ('Destination', 'string'),
 ('Duration', 'string'),
 ('Total_Stops', 'string'),
 ('Additional_Info', 'string'),
 ('Price', 'int'),
 ('Date', 'int'),
 ('Month', 'int'),
 ('Year', 'int'),
 ('Dep_Hour', 'string'),
 ('Dep_Min', 'string'),
 ('Arrival_Hour', 'string'),
 ('Arrival_Min', 'string')]

In [29]:
df_spark = df_spark.withColumn('Dep_Hour',col('Dep_Hour').cast('int'))

In [30]:
df_spark = df_spark.withColumn('Dep_Min',col('Dep_Min').cast('int')) 
df_spark = df_spark.withColumn('Arrival_Hour',col('Arrival_Hour').cast('int')) 
df_spark = df_spark.withColumn('Arrival_Min',col('Arrival_Min').cast('int'))
df_spark = df_spark.withColumn('Total_Stops',col('Total_Stops').cast('int'))

In [31]:
df_spark = df_spark.drop('Additional_Info')

In [32]:
df_spark.show()

+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+
|          Airline|  Source|Destination|Duration|Total_Stops|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|
+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+
|           IndiGo|Banglore|  New Delhi|  2h 50m|          0| 3897|  24|    3|2019|      22|     20|           1|         10|
|        Air India| Kolkata|   Banglore|  7h 25m|          2| 7662|   1|    5|2019|       5|     50|          13|         15|
|      Jet Airways|   Delhi|     Cochin|     19h|          2|13882|   9|    6|2019|       9|     25|           4|         25|
|           IndiGo| Kolkata|   Banglore|  5h 25m|          1| 6218|  12|    5|2019|      18|      5|          23|         30|
|           IndiGo|Banglore|  New Delhi|  4h 45m|          1|13302|   1|    3|2019|      16|     50|          21|     

In [33]:
df_spark = df_spark.withColumn('Duraiton_Hour',split(col('Duration'),' ').getItem(0))

In [34]:
df_spark.show()

+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+
|          Airline|  Source|Destination|Duration|Total_Stops|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|Duraiton_Hour|
+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+
|           IndiGo|Banglore|  New Delhi|  2h 50m|          0| 3897|  24|    3|2019|      22|     20|           1|         10|           2h|
|        Air India| Kolkata|   Banglore|  7h 25m|          2| 7662|   1|    5|2019|       5|     50|          13|         15|           7h|
|      Jet Airways|   Delhi|     Cochin|     19h|          2|13882|   9|    6|2019|       9|     25|           4|         25|          19h|
|           IndiGo| Kolkata|   Banglore|  5h 25m|          1| 6218|  12|    5|2019|      18|      5|          23|         30|           5h|
|           IndiGo|B

In [35]:
from pyspark.sql.functions import regexp_extract, col

# Extract digits before 'h' and 'm'
df_spark = df_spark.withColumn("Duration_Hour", regexp_extract(col("Duration"), r'(\d+)h', 1))
df_spark = df_spark.withColumn("Duration_Min", regexp_extract(col("Duration"), r'(\d+)m', 1))

In [36]:
df_spark.show()

+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+-------------+------------+
|          Airline|  Source|Destination|Duration|Total_Stops|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|Duraiton_Hour|Duration_Hour|Duration_Min|
+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+-------------+------------+
|           IndiGo|Banglore|  New Delhi|  2h 50m|          0| 3897|  24|    3|2019|      22|     20|           1|         10|           2h|            2|          50|
|        Air India| Kolkata|   Banglore|  7h 25m|          2| 7662|   1|    5|2019|       5|     50|          13|         15|           7h|            7|          25|
|      Jet Airways|   Delhi|     Cochin|     19h|          2|13882|   9|    6|2019|       9|     25|           4|         25|          19h|           19|            

In [37]:
df_spark = df_spark.drop('Duraiton_Hour')

In [38]:
df_spark.show()

+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+------------+
|          Airline|  Source|Destination|Duration|Total_Stops|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|Duration_Hour|Duration_Min|
+-----------------+--------+-----------+--------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+------------+
|           IndiGo|Banglore|  New Delhi|  2h 50m|          0| 3897|  24|    3|2019|      22|     20|           1|         10|            2|          50|
|        Air India| Kolkata|   Banglore|  7h 25m|          2| 7662|   1|    5|2019|       5|     50|          13|         15|            7|          25|
|      Jet Airways|   Delhi|     Cochin|     19h|          2|13882|   9|    6|2019|       9|     25|           4|         25|           19|            |
|           IndiGo| Kolkata|   Banglore|  5h 25m|          1| 6218|  12|    5|2019

In [39]:
df_spark = df_spark.replace([''],['0'],'Duration_Min')

In [40]:
df_spark = df_spark.withColumn('Duration_Min',col('Duration_Min').cast('int'))

In [41]:
df_spark = df_spark.withColumn('Duration_Hour',col('Duration_Hour').cast('int'))

In [42]:
df_spark.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Total_Stops: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Date: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Dep_Hour: integer (nullable = true)
 |-- Dep_Min: integer (nullable = true)
 |-- Arrival_Hour: integer (nullable = true)
 |-- Arrival_Min: integer (nullable = true)
 |-- Duration_Hour: integer (nullable = true)
 |-- Duration_Min: integer (nullable = true)



In [43]:
df_spark = df_spark.drop('Duration')


In [44]:
df_spark.columns

['Airline',
 'Source',
 'Destination',
 'Total_Stops',
 'Price',
 'Date',
 'Month',
 'Year',
 'Dep_Hour',
 'Dep_Min',
 'Arrival_Hour',
 'Arrival_Min',
 'Duration_Hour',
 'Duration_Min']

1. Which airline offers the cheapest average price?

In [45]:
from pyspark.sql.functions import avg, col , round

df_spark.groupBy('Airline').agg(round(avg('Price'),2).alias('Avg_Price'))\
.orderBy(col('Avg_price').asc()).show()

+--------------------+---------+
|             Airline|Avg_Price|
+--------------------+---------+
|              Trujet|   4140.0|
|            SpiceJet|  4338.28|
|            Air Asia|  5590.26|
|              IndiGo|  5673.68|
|               GoAir|  5861.06|
|             Vistara|  7796.35|
|Vistara Premium e...|  8962.33|
|           Air India|  9612.43|
|   Multiple carriers| 10902.68|
|Multiple carriers...| 11418.85|
|         Jet Airways| 11643.92|
|Jet Airways Business| 58358.67|
+--------------------+---------+



2. Which airline has the longest average duration?

3. Which routes (Source → Destination) are the most popular?

In [46]:
from pyspark.sql.functions import count

df_spark.groupBy(['Source','Destination']).agg(count('*').alias('Flight_count')).orderBy('Flight_Count',ascending=False).show()

+--------+-----------+------------+
|  Source|Destination|Flight_count|
+--------+-----------+------------+
|   Delhi|     Cochin|        4536|
| Kolkata|   Banglore|        2871|
|Banglore|      Delhi|        1265|
|Banglore|  New Delhi|         932|
|  Mumbai|  Hyderabad|         697|
| Chennai|    Kolkata|         381|
+--------+-----------+------------+



4. Which routes are the most expensive on average?

In [47]:
df_spark.groupBy(['Source','Destination']).agg(round(avg('Price'),2).alias('Avg_price')).orderBy('Avg_price',ascending=False).show()

+--------+-----------+---------+
|  Source|Destination|Avg_price|
+--------+-----------+---------+
|Banglore|  New Delhi| 11917.72|
|   Delhi|     Cochin| 10540.11|
| Kolkata|   Banglore|  9158.39|
|Banglore|      Delhi|  5143.92|
|  Mumbai|  Hyderabad|  5059.71|
| Chennai|    Kolkata|  4789.89|
+--------+-----------+---------+



5. How does the number of stops affect flight prices?

In [48]:
df_spark.groupBy('Total_Stops').agg(round(avg('Price'),2).alias('Avg_Price')).orderBy('Total_Stops',ascending=True).show()

+-----------+---------+
|Total_Stops|Avg_Price|
+-----------+---------+
|          0|   5024.9|
|          1| 10594.12|
|          2| 12715.81|
|          3|  13112.0|
|          4|  17686.0|
+-----------+---------+



6. Which airlines offer the most non-stop flights?

In [49]:
df_spark.filter(df_spark['Total_Stops'] == '0')\
.groupBy('Airline').agg(count('Total_Stops').alias('Non-stop_flights'))\
.orderBy('Non-stop_flights').show()

+--------------------+----------------+
|             Airline|Non-stop_flights|
+--------------------+----------------+
|Vistara Premium e...|               3|
|               GoAir|              92|
|            Air Asia|             181|
|             Vistara|             264|
|           Air India|             417|
|         Jet Airways|             623|
|            SpiceJet|             670|
|              IndiGo|            1241|
+--------------------+----------------+



7. What time of day has the cheapest departures?

In [50]:
from pyspark.sql.functions import col, when, avg

# 1. Build the time‑slot column correctly

df_spark = df_spark.withColumn('Dep_Time_Slot',
                                when((col('Dep_hour') >= 5) & (col('Dep_hour') < 12 ),'Morning')
                                .when((col('Dep_hour') >= 12) & (col('Dep_hour') < 17),'Afternoon')
                                .when((col('Dep_hour') >= 17) & (col('Dep_hour') < 21),'Evening')
                                .otherwise('Night')
)

# Calulate the average price per slot

avg_price_by_slot = df_spark.groupBy('Dep_Time_Slot').agg(round(avg('Price'),2).alias('Avg_price')).orderBy('Avg_price')


avg_price_by_slot.show()


+-------------+---------+
|Dep_Time_Slot|Avg_price|
+-------------+---------+
|        Night|  8179.47|
|      Evening|  9178.83|
|      Morning|  9202.63|
|    Afternoon|  9392.82|
+-------------+---------+



8. Is there a pattern in prices based on the month of travel?

In [51]:
df_spark.groupBy('Month').agg(round(avg('Price'),2).alias('Avg_Price')).orderBy('Avg_Price').show()

+-----+---------+
|Month|Avg_Price|
+-----+---------+
|    4|  5770.85|
|    6|   8828.8|
|    5|  9127.72|
|    3| 10673.21|
+-----+---------+



In [53]:
df_spark.show(2
              )

+---------+--------+-----------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+------------+-------------+-----------------+------------------+
|  Airline|  Source|Destination|Total_Stops|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|Duration_Hour|Duration_Min|Dep_Time_Slot|Arrival_time_slot|Total_duration_min|
+---------+--------+-----------+-----------+-----+----+-----+----+--------+-------+------------+-----------+-------------+------------+-------------+-----------------+------------------+
|   IndiGo|Banglore|  New Delhi|          0| 3897|  24|    3|2019|      22|     20|           1|         10|            2|          50|        Night|            Night|               170|
|Air India| Kolkata|   Banglore|          2| 7662|   1|    5|2019|       5|     50|          13|         15|            7|          25|      Morning|        Afternoon|               445|
+---------+--------+-----------+-----------+-----+----+-----+----

In [60]:
df_spark.dtypes

[('Airline', 'string'),
 ('Source', 'string'),
 ('Destination', 'string'),
 ('Total_Stops', 'int'),
 ('Price', 'int'),
 ('Date', 'int'),
 ('Month', 'int'),
 ('Year', 'int'),
 ('Dep_Hour', 'int'),
 ('Dep_Min', 'int'),
 ('Arrival_Hour', 'int'),
 ('Arrival_Min', 'int'),
 ('Duration_Hour', 'int'),
 ('Duration_Min', 'int'),
 ('Dep_Time_Slot', 'string'),
 ('Arrival_time_slot', 'string'),
 ('Total_duration_min', 'int')]

9. Which time combinations (Dep → Arrival) have the longest durations?

In [64]:
df_spark = df_spark.withColumn('Dep_Time_Slot',
                                when((col('Dep_hour') >= 5) & (col('Dep_hour') < 12 ),'Morning')
                                .when((col('Dep_hour') >= 12) & (col('Dep_hour') < 17),'Afternoon')
                                .when((col('Dep_hour') >= 17) & (col('Dep_hour') < 21),'Evening')
                                .otherwise('Night'))


# calculate the arrival time slot

df_spark = df_spark.withColumn('Arrival_time_slot',
                    when((col('Arrival_hour') >= 5) & (col('Arrival_hour') < 12),'Morning')
                    .when((col('Arrival_hour') >= 12) & (col('Arrival_hour') < 17),'Afternoon')
                    .when((col('Arrival_hour') >= 17) & (col('Arrival_hour') < 21),'Evening')
                    .otherwise('Night')
)

df_spark = df_spark.withColumn('Total_duration_min',col('Duration_Hour') * 60 + col('Duration_Min'))


df_spark.groupBy('Dep_Time_Slot') \
    .agg(avg('Total_duration_min').alias('avg_duration_minutes')) \
    .orderBy('avg_duration_minutes', ascending=False) \
    .show()




25/07/16 20:01:32 ERROR Executor: Exception in task 0.0 in stage 48.0 (TID 39)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 1 in cell [41]

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.withException(UTF8StringUtils.scala:51)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.toIntExact(UTF8StringUtils.scala:34)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils.toIntExact(UTF8StringUtils.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.c

NumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 1 in cell [41]


25/07/17 03:25:08 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 904267 ms exceeds timeout 120000 ms
25/07/17 03:25:08 WARN SparkContext: Killing executors is not supported by current scheduler.
25/07/17 03:25:15 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:81)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:669)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1296)
	at o