<a href="https://colab.research.google.com/github/animesh-11/AI_ML/blob/main/Mini_Assignment_2_Animesh_Kumar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Initialize (or reuse) the SparkSession for this notebook.
spark = SparkSession.builder \
    .appName("ReadData") \
    .getOrCreate()


# Load the flight CSV with a header row and inferred column types, then preview
# the first rows and the resulting schema.
try:
    df = spark.read.csv('Flight Dataset.csv', header=True, inferSchema=True)
    df.show(5)
    df.printSchema()
except Exception as e:
    print(f"Error reading data: {e}")
    # Every later cell needs `df`; re-raise so the notebook stops here with the
    # real cause instead of failing further down with an unrelated NameError.
    raise


+--------+---------+---------+--------+--------+---------+---------+
| FL_DATE|DEP_DELAY|ARR_DELAY|AIR_TIME|DISTANCE| DEP_TIME| ARR_TIME|
+--------+---------+---------+--------+--------+---------+---------+
|1/1/2006|        5|       19|     350|    2475| 9.083333|12.483334|
|1/2/2006|      167|      216|     343|    2475|11.783334|15.766666|
|1/3/2006|       -7|       -2|     344|    2475| 8.883333|12.133333|
|1/4/2006|       -5|      -13|     331|    2475| 8.916667|    11.95|
|1/5/2006|       -3|      -17|     321|    2475|     8.95|11.883333|
+--------+---------+---------+--------+--------+---------+---------+
only showing top 5 rows
root
 |-- FL_DATE: string (nullable = true)
 |-- DEP_DELAY: integer (nullable = true)
 |-- ARR_DELAY: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- DEP_TIME: double (nullable = true)
 |-- ARR_TIME: double (nullable = true)



In [6]:

def count_early_arrivals(dataframe):
    """Return how many rows of ``dataframe`` have a negative ``ARR_DELAY``.

    A negative arrival delay means the flight arrived earlier than expected.

    Args:
        dataframe: DataFrame exposing an ``ARR_DELAY`` column.

    Returns:
        The number of rows whose ``ARR_DELAY`` is below zero.
    """
    arrived_early = dataframe['ARR_DELAY'] < 0
    return dataframe.filter(arrived_early).count()


# Count the early arrivals in the flight DataFrame loaded above and report the total.
num_early = count_early_arrivals(df)
print(f"Number of flights that arrived earlier than expected: {num_early}")


Number of flights that arrived earlier than expected: 534655


In [7]:

from pyspark.sql.functions import avg

def get_typical_departure_time_long_flights(dataframe):
    """Return the mean ``DEP_TIME`` of flights with ``DISTANCE`` above 2000.

    Args:
        dataframe: DataFrame exposing ``DISTANCE`` and ``DEP_TIME`` columns.

    Returns:
        The average ``DEP_TIME`` over the matching rows, in whatever unit the
        column uses (the sample data looks like decimal hours -- confirm), or
        ``None`` when the aggregate has no values to average.
    """
    is_long_haul = dataframe['DISTANCE'] > 2000
    # A global aggregate always produces exactly one row; its only cell is the mean.
    mean_row = dataframe.filter(is_long_haul).agg(avg('DEP_TIME')).first()
    return mean_row[0]


# Report the average departure time of flights longer than 2000 miles.
typical_dep_time = get_typical_departure_time_long_flights(df)
# The helper yields None when no flight qualifies; guard before applying the
# numeric format so the cell reports that instead of raising TypeError.
if typical_dep_time is not None:
    print(f"Typical departure time for flights over 2000 miles: {typical_dep_time:.2f}")
else:
    print("No flights over 2000 miles found; typical departure time is undefined.")


Typical departure time for flights over 2000 miles: 13.97


In [8]:

def get_proportion_long_arrival_delays(dataframe):
    """Return the fraction of rows whose ``ARR_DELAY`` exceeds 60.

    Args:
        dataframe: DataFrame exposing an ``ARR_DELAY`` column.

    Returns:
        Rows with ``ARR_DELAY > 60`` divided by the total row count, or ``0.0``
        when the DataFrame has no rows.
    """
    delayed_count = dataframe.filter(dataframe['ARR_DELAY'] > 60).count()
    flight_count = dataframe.count()
    # An empty DataFrame would otherwise divide by zero; report 0.0 instead.
    return delayed_count / flight_count if flight_count > 0 else 0.0


# Compute the share of flights with ARR_DELAY above 60 and print it as a fraction of all rows.
proportion_long_delays = get_proportion_long_arrival_delays(df)
print(f"Proportion of flights with arrival delays longer than 60 minutes: {proportion_long_delays:.4f}")

Proportion of flights with arrival delays longer than 60 minutes: 0.0531


In [9]:

from pyspark.sql.functions import avg

def get_average_airtime_early_departures(dataframe):
    """Return the mean ``AIR_TIME`` of flights whose ``DEP_TIME`` is below 9.0.

    The average is computed in a single aggregation; no separate ``count()``
    pass over the data is needed to detect the empty case.

    Args:
        dataframe: DataFrame exposing ``DEP_TIME`` and ``AIR_TIME`` columns.

    Returns:
        The average ``AIR_TIME`` of the matching rows, or ``0.0`` when there is
        nothing to average (no row matches, or no matching row has a value).
    """
    # Filter for flights that left earlier than 9:00 am
    early_departures = dataframe.filter(dataframe['DEP_TIME'] < 9.0)

    # The aggregate yields None when it has no values to average, which covers
    # the empty case without scanning the data a second time.
    mean_airtime = early_departures.select(avg('AIR_TIME')).collect()[0][0]

    # Always hand back a number so callers can format the result directly.
    return 0.0 if mean_airtime is None else mean_airtime



# Average AIR_TIME over flights whose DEP_TIME is below 9.0, printed with two decimals.
avg_early_airtime = get_average_airtime_early_departures(df)
print(f"Average airtime for flights that left earlier than 9:00 AM: {avg_early_airtime:.2f} minutes")


Average airtime for flights that left earlier than 9:00 AM: 111.36 minutes


In [10]:

from pyspark.sql.functions import max

def get_max_arrival_delay_no_dep_delay(dataframe):
    """Return the largest ``ARR_DELAY`` among flights with ``DEP_DELAY <= 0``.

    Args:
        dataframe: DataFrame exposing ``DEP_DELAY`` and ``ARR_DELAY`` columns.

    Returns:
        The maximum ``ARR_DELAY`` of flights that departed on time or early, or
        ``None`` when no such flight has an arrival delay to compare.
    """
    # Filter for flights that did not experience a delay upon departure (DEP_DELAY <= 0)
    on_time_departures = dataframe.filter(dataframe['DEP_DELAY'] <= 0)

    # `max` here is the Spark aggregate imported from pyspark.sql.functions, not
    # the Python built-in. It already yields None for an empty input, so a
    # separate count() pass would only add a second full job over the data.
    return on_time_departures.select(max('ARR_DELAY')).collect()[0][0]



# Largest ARR_DELAY among flights with DEP_DELAY <= 0.
max_delay = get_max_arrival_delay_no_dep_delay(df)
# The helper returns None when no flight qualifies, so guard before applying
# the numeric format specifier.
if max_delay is not None:
    print(f"Maximum arrival delay for flights that departed on time or early: {max_delay:.2f} minutes")
else:
    print("No flights found that departed on time or early with recorded arrival delays.")


Maximum arrival delay for flights that departed on time or early: 701.00 minutes
