In [20]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connecting to ppa.                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done


In [22]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
# Import the DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [23]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Create a SparkSession
spark = SparkSession.builder.appName("LocalCSVToDataFrame").getOrCreate()

# Define the path to your local CSV file
drive_csv_path = "drive/MyDrive/Data/Combined_Flights_2022.csv"

# Read the local CSV file into a DataFrame
flights_df = spark.read.csv(drive_csv_path, sep=",", header=True)
flightdata1 = flights_df.limit(1000001)

# Show the DataFrame
flightdata1



DataFrame[FlightDate: string, Airline: string, Origin: string, Dest: string, Cancelled: string, Diverted: string, CRSDepTime: string, DepTime: string, DepDelayMinutes: string, DepDelay: string, ArrTime: string, ArrDelayMinutes: string, AirTime: string, CRSElapsedTime: string, ActualElapsedTime: string, Distance: string, Year: string, Quarter: string, Month: string, DayofMonth: string, DayOfWeek: string, Marketing_Airline_Network: string, Operated_or_Branded_Code_Share_Partners: string, DOT_ID_Marketing_Airline: string, IATA_Code_Marketing_Airline: string, Flight_Number_Marketing_Airline: string, Operating_Airline: string, DOT_ID_Operating_Airline: string, IATA_Code_Operating_Airline: string, Tail_Number: string, Flight_Number_Operating_Airline: string, OriginAirportID: string, OriginAirportSeqID: string, OriginCityMarketID: string, OriginCityName: string, OriginState: string, OriginStateFips: string, OriginStateName: string, OriginWac: string, DestAirportID: string, DestAirportSeqID: s

Pre Processing


In [25]:
# Fill NAN values with '0'
flights_df = flightdata1.fillna(0)
flights_df

DataFrame[FlightDate: string, Airline: string, Origin: string, Dest: string, Cancelled: string, Diverted: string, CRSDepTime: string, DepTime: string, DepDelayMinutes: string, DepDelay: string, ArrTime: string, ArrDelayMinutes: string, AirTime: string, CRSElapsedTime: string, ActualElapsedTime: string, Distance: string, Year: string, Quarter: string, Month: string, DayofMonth: string, DayOfWeek: string, Marketing_Airline_Network: string, Operated_or_Branded_Code_Share_Partners: string, DOT_ID_Marketing_Airline: string, IATA_Code_Marketing_Airline: string, Flight_Number_Marketing_Airline: string, Operating_Airline: string, DOT_ID_Operating_Airline: string, IATA_Code_Operating_Airline: string, Tail_Number: string, Flight_Number_Operating_Airline: string, OriginAirportID: string, OriginAirportSeqID: string, OriginCityMarketID: string, OriginCityName: string, OriginState: string, OriginStateFips: string, OriginStateName: string, OriginWac: string, DestAirportID: string, DestAirportSeqID: s

In [26]:
# Dropping non-beneficial columns

columns_to_drop = ['FlightDate', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline', 'Operating_Airline', 'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline', 'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName', 'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName', 'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'ArrDel15', 'ArrivalDelayGroups', 'DivAirportLandings', 'DepTimeBlk', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 'ArrTimeBlk', 'DistanceGroup', 'DepartureDelayGroups', 'Diverted', 'CRSElapsedTime', 'ActualElapsedTime']

df = flights_df.drop(*columns_to_drop)


In [27]:
# Remove the 'flight number' column
flights_df =  flights_df.drop('Flight_Number_Marketing_Airline', 'Flight_Number_Operating_Airline')

# Remove records with missing 'delay' values
#flights_valid_delay = flight_drop_column.filter('delay IS NOT NULL')

# Remove records with missing values
flights_df = flights_df.dropna()
print(flights_df.count())

966404


In [28]:
# Convert miles to kilometers and drop the 'Distance' column
flights_km = flights_df.withColumn('Distance_km', round(flights_df.Distance * 1.60934, 2)).drop('Distance')

# Create 'label' column indicating whether a flight is delayed or not
flights_km = flights_km.withColumn('label', (flights_km.ArrDel15 >= 15).cast('integer'))

# Show the first five records
flights_km.show(5)


+----------+--------------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-----------------+------------------------+---------------------------+-----------+---------------+------------------+------------------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+-------------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------+-----+
|FlightDate|             Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSElapsedTime|A

In [29]:
# Index 'Origin' column
try:
    flights_indexed = StringIndexer(inputCol='Origin', outputCol='Origin_idx').fit(flights_km).transform(flights_km)
except Exception as e:
    print("Error:", e)

# Check the first few records of the DataFrame
flights_indexed.show(5)



+----------+--------------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-----------------+------------------------+---------------------------+-----------+---------------+------------------+------------------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+-------------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------+-----+----------+
|FlightDate|             Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSEl

In [30]:
# Convert relevant columns to the appropriate data types
flights_indexed = flights_indexed.withColumn("DepDelayMinutes", col("DepDelayMinutes").cast("double"))
flights_indexed = flights_indexed.withColumn("AirTime", col("AirTime").cast("double"))
flights_indexed = flights_indexed.withColumn("Distance_km", col("Distance_km").cast("double"))
flights_indexed = flights_indexed.withColumn("DepDel15", col("DepDel15").cast("integer"))
flights_indexed = flights_indexed.withColumn("ArrDel15", col("ArrDel15").cast("integer"))

# Define the columns to include in the feature vector
input_columns = [
    'DepDelayMinutes', 'AirTime', 'Distance_km', 'DepDel15'
]

# Create a VectorAssembler object
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights_indexed)

# Check the resulting column
flights_assembled.select('features', 'ArrDel15').show(5, truncate=False)



+---------------------+--------+
|features             |ArrDel15|
+---------------------+--------+
|[0.0,40.0,341.18,0.0]|0       |
|[0.0,55.0,474.76,0.0]|0       |
|[0.0,47.0,403.94,0.0]|0       |
|[0.0,57.0,605.11,0.0]|0       |
|[0.0,49.0,403.94,0.0]|0       |
+---------------------+--------+
only showing top 5 rows



In [31]:
# Convert relevant columns to the appropriate data types
flights_indexed = flights_indexed.withColumn("DepDelayMinutes", col("DepDelayMinutes").cast("double"))
flights_indexed = flights_indexed.withColumn("AirTime", col("AirTime").cast("double"))
flights_indexed = flights_indexed.withColumn("Distance_km", col("Distance_km").cast("double"))
flights_indexed = flights_indexed.withColumn("DepDel15", col("DepDel15").cast("integer"))
flights_indexed = flights_indexed.withColumn("ArrDel15", col("ArrDel15").cast("integer"))

# # Define the columns to include in the feature vector
# input_columns = [
#     'DepDelayMinutes', 'AirTime', 'Distance_km', 'DepDel15'
# ]

X = flights_assembled.select('DepDelayMinutes', 'AirTime', 'Distance_km', 'DepDel15')
y = flights_assembled.select('label')

# Create a VectorAssembler object
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights_indexed)

# Check the resulting column
flights_assembled.select('features', 'ArrDel15').show(5, truncate=False)


+---------------------+--------+
|features             |ArrDel15|
+---------------------+--------+
|[0.0,40.0,341.18,0.0]|0       |
|[0.0,55.0,474.76,0.0]|0       |
|[0.0,47.0,403.94,0.0]|0       |
|[0.0,57.0,605.11,0.0]|0       |
|[0.0,49.0,403.94,0.0]|0       |
+---------------------+--------+
only showing top 5 rows



Machine Learning Models Markdown

In [32]:
# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights_assembled.randomSplit([0.8, 0.2], seed=42)

# Check that training set has around 80% of records
training_ratio = flights_train.count() / flights_assembled.count()
print(training_ratio)

0.8003319522684095


In [33]:
# Create a Decision Tree classifier object and fit it to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(flights_train)

# Make predictions on the test data
predictions = tree_model.transform(flights_test)

# Select and display relevant columns from the prediction DataFrame
predictions.select('label', 'prediction', 'probability').show(5, False)



+-----+----------+-----------+
|label|prediction|probability|
+-----+----------+-----------+
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
+-----+----------+-----------+
only showing top 5 rows



Prediction for Decision Tree classifier model:

label:This column shows,   where 0 typically represents that a flight was not delayed.

Prediction: This column contains the predicted labels and it appears that the model has predicted 0.0 for all of these rows,
indicating that the model predicts that these flights were not delayed.

probability: This column contains the probability scores associated with each prediction.
The probability score in each row indicates that the probability score is [1.0] for all rows, which suggests that the model is very confident in predicting that
these flights were not delayed

These results indicate that, for the first few rows of test dataset, the model predicted that the flights were not delayed,
and it did so with high confidence (probability of 1.0). However, it's important to evaluate the model's performance on a larger portion of the test dataset
and to furter evaluate metrics to assess its accuracy and reliability.







In [34]:
flights_train.show(5)
flights_test.show(5)


+----------+--------------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-----------------+------------------------+---------------------------+-----------+---------------+------------------+------------------+---------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+-------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------+-----+----------+--------------------+
|FlightDate|             Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|Air

In [35]:
# Create a DT classifier object with appropriate parameters
tree = DecisionTreeClassifier(featuresCol="features", labelCol="label")
tree_model = tree.fit(flights_train)


In [36]:
# Create a DT classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(flights_train)

# Create predictions on test data
prediction = tree_model.transform(flights_test)

# Calculate accuracy
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
accuracy = evaluator.evaluate(prediction)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Display selected columns for the first 5 rows
prediction.select('label', 'prediction', 'probability').show(5, truncate=False)


Accuracy: 100.00%
+-----+----------+-----------+
|label|prediction|probability|
+-----+----------+-----------+
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
|0    |0.0       |[1.0]      |
+-----+----------+-----------+
only showing top 5 rows

