In [None]:
import pandas as pd
from pyspark.sql import SparkSession

#### Start a Spark Session

In [None]:
spark = SparkSession.builder.appName('flight EDA').getOrCreate()

In [None]:
spark

#### Read the dataset

1. Converting Excel to CSV for easy reading using pandas

In [8]:
pd.read_excel('/Applications/Repos/Repo/expedition/Spark/flight_price.xlsx').to_csv('flight_price.csv',index=False)

In [9]:
df_spark = spark.read.csv('flight_price.csv',header=True,inferSchema=True)

In [15]:
df_spark.describe()

DataFrame[summary: string, Airline: string, Date_of_Journey: string, Source: string, Destination: string, Route: string, Arrival_Time: string, Duration: string, Total_Stops: string, Additional_Info: string, Price: string]

2. Split Date of Journey to Date, Month and Year

In [16]:
from pyspark.sql.functions import split, col 

In [22]:
df_spark = df_spark.withColumn('Date',split(col('Date_of_Journey'),'/').getItem(0).cast('int')) \
                   .withColumn('Month',split(col('Date_Of_Journey'),'/').getItem(1).cast('int')) \
                   .withColumn('Year',split(col('Date_of_Journey'),'/').getItem(2).cast('int'))

In [25]:
df_spark.show(10)

+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+----+-----+----+
|          Airline|Date_of_Journey|  Source|Destination|               Route|           Dep_Time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|
+-----------------+---------------+--------+-----------+--------------------+-------------------+------------+--------+-----------+--------------------+-----+----+-----+----+
|           IndiGo|     24/03/2019|Banglore|  New Delhi|           BLR → DEL|2025-07-11 22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|
|        Air India|      1/05/2019| Kolkata|   Banglore|CCU → IXR → BBI →...|2025-07-11 05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|
|      Jet Airways|      9/06/2019|   Delhi|     Cochin|DEL → LKO → BOM →...|2025-07-11 09:25:00|04:25 10 Jun|     19h|    2 

3. Dropping unnessary columns

In [None]:
df_spark = df_spark.drop('Date_of_Journey')

In [31]:
df_spark = df_spark.drop('Route')

4. Spliting Departure time and Arrivial time

In [None]:
df_spark = df_spark.withColumn('Dep_time',split(col('Dep_Time'),' ').getItem(1))

+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+
|          Airline|  Source|Destination|Dep_time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|
+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+
|           IndiGo|Banglore|  New Delhi|22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|
|        Air India| Kolkata|   Banglore|05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|
|      Jet Airways|   Delhi|     Cochin|09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|   9|    6|2019|
|           IndiGo| Kolkata|   Banglore|18:05:00|       23:30|  5h 25m|     1 stop|             No info| 6218|  12|    5|2019|
|           IndiGo|Banglore|  New Delhi|16:50:00|       21:35|  4h 45m|     1 stop|             No info|13302| 

In [39]:
df_spark.show()

+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+
|          Airline|  Source|Destination|Dep_time|Arrival_Time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|
+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+
|           IndiGo|Banglore|  New Delhi|22:20:00|01:10 22 Mar|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|
|        Air India| Kolkata|   Banglore|05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|
|      Jet Airways|   Delhi|     Cochin|09:25:00|04:25 10 Jun|     19h|    2 stops|             No info|13882|   9|    6|2019|
|           IndiGo| Kolkata|   Banglore|18:05:00|       23:30|  5h 25m|     1 stop|             No info| 6218|  12|    5|2019|
|           IndiGo|Banglore|  New Delhi|16:50:00|       21:35|  4h 45m|     1 stop|             No info|13302| 

In [None]:
df_spark = df_spark.withColumn('Dep_hour',split(col('Dep_time'),':').getItem(0))\
                  .withColumn('Dep_Min',split(col('Dep_time'),':').getItem(1))

In [47]:
df_spark = df_spark.withColumn('Arrival_time',split(col('Arrival_Time'),' ').getItem(0))

In [49]:
df_spark.show()

+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+
|          Airline|  Source|Destination|Dep_time|Arrival_time|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|Dep_hour|Dep_Min|
+-----------------+--------+-----------+--------+------------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+
|           IndiGo|Banglore|  New Delhi|22:20:00|       01:10|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|      22|     20|
|        Air India| Kolkata|   Banglore|05:50:00|       13:15|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|      05|     50|
|      Jet Airways|   Delhi|     Cochin|09:25:00|       04:25|     19h|    2 stops|             No info|13882|   9|    6|2019|      09|     25|
|           IndiGo| Kolkata|   Banglore|18:05:00|       23:30|  5h 25m|     1 stop|             No info| 6218|  12|    5|2019|      18| 

In [52]:
df_spark = df_spark.withColumn('Arrival_Hour',split(col('Arrival_time'),':').getItem(0))\
        .withColumn('Arrival_Min',split(col('Arrival_time'),':').getItem(1))

In [55]:
df_spark = df_spark.drop('Dep_time','Arrival_time')

In [57]:
df_spark = df_spark.withColumnRenamed('Dep_hour','Dep_Hour')

In [58]:
df_spark.show()

+-----------------+--------+-----------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+------------+-----------+
|          Airline|  Source|Destination|Duration|Total_Stops|     Additional_Info|Price|Date|Month|Year|Dep_Hour|Dep_Min|Arrival_Hour|Arrival_Min|
+-----------------+--------+-----------+--------+-----------+--------------------+-----+----+-----+----+--------+-------+------------+-----------+
|           IndiGo|Banglore|  New Delhi|  2h 50m|   non-stop|             No info| 3897|  24|    3|2019|      22|     20|          01|         10|
|        Air India| Kolkata|   Banglore|  7h 25m|    2 stops|             No info| 7662|   1|    5|2019|      05|     50|          13|         15|
|      Jet Airways|   Delhi|     Cochin|     19h|    2 stops|             No info|13882|   9|    6|2019|      09|     25|          04|         25|
|           IndiGo| Kolkata|   Banglore|  5h 25m|     1 stop|             No info| 6218|  12|    5|2019|      18|     

In [None]:
df_spark.

PySparkAttributeError: [ATTRIBUTE_NOT_SUPPORTED] Attribute `isnull` is not supported.