In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

In [2]:

# Yearly flight CSV files to load, one entry per year.
files = [
    "2018.csv",
    "2019.csv",
    "2020.csv",
]



In [3]:
def process_file(file_path):
    """Load one flight CSV and apply the basic cleaning steps.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with the ``'Unnamed: 27'`` column removed (when
        present), ``FL_DATE`` parsed to datetime, and missing values in
        numeric columns replaced by 0.

    Raises
    ------
    KeyError
        If the file has no ``FL_DATE`` column.
    """
    df = pd.read_csv(file_path)

    # Drop the empty trailing column if the file has it; errors='ignore'
    # makes this a no-op for files that do not.
    df = df.drop(columns=['Unnamed: 27'], errors='ignore')

    # Convert FL_DATE to datetime format
    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

    # Fill missing values with 0 in numeric columns only. A blanket
    # df.fillna(0) would also write the integer 0 into text columns
    # (leaving them with mixed str/int values) and would target the
    # datetime column as well.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(0)

    return df

# Clean every yearly file with process_file, keeping one DataFrame per file
dataframes = list(map(process_file, files))

# Stack the yearly frames into one DataFrame with a fresh 0..n-1 index
df_combined = pd.concat(dataframes, ignore_index=True)

# Preview the first rows of the combined data
print(df_combined.head())

# Summarise columns, dtypes and non-null counts to confirm the cleaning
df_combined.info()


     FL_DATE OP_CARRIER  OP_CARRIER_FL_NUM ORIGIN DEST  CRS_DEP_TIME  \
0 2018-01-01         UA               2429    EWR  DEN        1517.0   
1 2018-01-01         UA               2427    LAS  SFO        1115.0   
2 2018-01-01         UA               2426    SNA  DEN        1335.0   
3 2018-01-01         UA               2425    RSW  ORD        1546.0   
4 2018-01-01         UA               2424    ORD  ALB         630.0   

   DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  ...  ACTUAL_ELAPSED_TIME  \
0    1512.0       -5.0      15.0      1527.0  ...                250.0   
1    1107.0       -8.0      11.0      1118.0  ...                 83.0   
2    1330.0       -5.0      15.0      1345.0  ...                126.0   
3    1552.0        6.0      19.0      1611.0  ...                182.0   
4     650.0       20.0      13.0       703.0  ...                106.0   

   AIR_TIME  DISTANCE  CARRIER_DELAY  WEATHER_DELAY  NAS_DELAY SECURITY_DELAY  \
0     225.0    1605.0            0.0     

In [4]:


# Replace each categorical column with integer codes from its own LabelEncoder.
# The fitted encoders are kept in label_encoders (keyed by column name) so the
# codes can be mapped back to the original labels later.
# NOTE: this overwrites the columns of df_combined in place, so running the
# cell a second time would encode the stringified integer codes rather than
# the original labels.
categorical_cols = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CANCELLATION_CODE']
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for name, encoder in label_encoders.items():
    # Cast to str first so every value in the column has a single type
    as_text = df_combined[name].astype(str)
    df_combined[name] = encoder.fit_transform(as_text)

# Preview the encoded data
print(df_combined.head())

# Column summary after encoding
df_combined.info()


     FL_DATE  OP_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST  CRS_DEP_TIME  \
0 2018-01-01          13               2429     119    96        1517.0   
1 2018-01-01          13               2427     192   316        1115.0   
2 2018-01-01          13               2426     330    96        1335.0   
3 2018-01-01          13               2425     302   253        1546.0   
4 2018-01-01          13               2424     253    14         630.0   

   DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  ...  ACTUAL_ELAPSED_TIME  \
0    1512.0       -5.0      15.0      1527.0  ...                250.0   
1    1107.0       -8.0      11.0      1118.0  ...                 83.0   
2    1330.0       -5.0      15.0      1345.0  ...                126.0   
3    1552.0        6.0      19.0      1611.0  ...                182.0   
4     650.0       20.0      13.0       703.0  ...                106.0   

   AIR_TIME  DISTANCE  CARRIER_DELAY  WEATHER_DELAY  NAS_DELAY  \
0     225.0    1605.0            0.0  

In [5]:

# Binary target: IS_DELAY is 1 when ARR_DELAY is strictly greater than 15,
# otherwise 0. Rows whose ARR_DELAY is missing or zero-filled fail the
# comparison and are therefore labelled 0.
arrived_late = df_combined['ARR_DELAY'].gt(15)
df_combined['IS_DELAY'] = arrived_late.astype(int)

# Preview the data with the new column
print(df_combined.head())

# Column summary including IS_DELAY
df_combined.info()

     FL_DATE  OP_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST  CRS_DEP_TIME  \
0 2018-01-01          13               2429     119    96        1517.0   
1 2018-01-01          13               2427     192   316        1115.0   
2 2018-01-01          13               2426     330    96        1335.0   
3 2018-01-01          13               2425     302   253        1546.0   
4 2018-01-01          13               2424     253    14         630.0   

   DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  ...  AIR_TIME  DISTANCE  \
0    1512.0       -5.0      15.0      1527.0  ...     225.0    1605.0   
1    1107.0       -8.0      11.0      1118.0  ...      65.0     414.0   
2    1330.0       -5.0      15.0      1345.0  ...     106.0     846.0   
3    1552.0        6.0      19.0      1611.0  ...     157.0    1120.0   
4     650.0       20.0      13.0       703.0  ...      83.0     723.0   

   CARRIER_DELAY  WEATHER_DELAY  NAS_DELAY  SECURITY_DELAY  \
0            0.0            0.0        0.0      

In [6]:
# Class balance of the target: how many rows carry each IS_DELAY value
is_delay_flags = df_combined['IS_DELAY']
delay_counts = is_delay_flags.value_counts()

# .get(label, 0) reports 0 for a class that never occurs instead of raising
print(f"On-time flights (0): {delay_counts.get(0, 0)}")
print(f"Delayed flights (1): {delay_counts.get(1, 0)}")


On-time flights (0): 11987426
Delayed flights (1): 5195631


In [None]:
# features = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'AIR_TIME', 'DISTANCE']
# target = 'IS_DELAY'


In [None]:
# X = df_combined[features]
# y = df_combined[target]


In [7]:
# Select predictors from an explicit list rather than "every column except the
# target". Dropping only IS_DELAY and ARR_DELAY left two problems in X:
#   * FL_DATE is a datetime64 column (parsed in process_file), which the
#     scikit-learn estimators/scaler and XGBoost used below do not accept as
#     numeric input.
#   * CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY and SECURITY_DELAY stayed in the
#     feature matrix. NOTE(review): these presumably break down the arrival
#     delay itself, which would leak the target into the features; confirm.
# NOTE(review): AIR_TIME is presumably only known once the flight has landed;
# confirm it is available at the intended prediction time.
features = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'AIR_TIME', 'DISTANCE']
target = 'IS_DELAY'

X = df_combined[features]
y = df_combined[target]


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



Decision Tree

In [None]:
# Fit a decision tree with default settings on the training split
# (fit() returns the estimator itself, so construction and fitting are
# chained), then predict labels for the held-out rows.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [None]:
# Evaluate the decision-tree predictions on the test split
accuracy = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)  # per-class precision/recall/F1
dt_confusion = confusion_matrix(y_test, y_pred)    # rows: true class, cols: predicted

print(f"Model Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", dt_report)

# Print confusion matrix
print("Confusion Matrix:\n", dt_confusion)

Model Accuracy: 0.95


In [None]:
# Compare decision-tree accuracy on the training and test splits; a large
# positive gap means the tree fits the training data better than unseen data.
train_accuracy = dt_model.score(X_train, y_train)
test_accuracy = dt_model.score(X_test, y_test)
accuracy_diff = train_accuracy - test_accuracy

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")
print(f"Accuracy Difference: {accuracy_diff:.2f}")


In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Logistic Regression

In [None]:
# Fit logistic regression (default settings) on the scaled training data and
# predict labels for the scaled test data.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_log = log_reg.predict(X_test_scaled)


In [None]:
# Test-set accuracy and per-class metrics for logistic regression
log_report = classification_report(y_test, y_pred_log)

print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.2f}")
print(log_report)

In [None]:
# Train vs. test accuracy for logistic regression, both on the scaled features
train_accuracy_log = log_reg.score(X_train_scaled, y_train)
test_accuracy_log = log_reg.score(X_test_scaled, y_test)
accuracy_diff_log = train_accuracy_log - test_accuracy_log

print(f"🔹 Logistic Regression - Training Accuracy: {train_accuracy_log:.2f}")
print(f"🔹 Logistic Regression - Testing Accuracy: {test_accuracy_log:.2f}")
print(f"🔹 Logistic Regression - Accuracy Difference: {accuracy_diff_log:.2f}\n")


SVM  

In [None]:
# Fit an RBF-kernel support vector classifier on the scaled training data and
# predict labels for the scaled test data.
# NOTE(review): kernel SVC training time grows at least quadratically with the
# number of samples. The class counts printed earlier total about 17.2M rows,
# so the 80% training split is roughly 13.7M rows and this fit is unlikely to
# finish. Consider LinearSVC / SGDClassifier or training on a subsample.
svm_model = SVC(kernel="rbf").fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)


In [None]:
# Test-set accuracy and per-class metrics for the SVM
svm_report = classification_report(y_test, y_pred_svm)

print("SVM Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}")
print(svm_report)

In [None]:
# Train vs. test accuracy for the SVM, both on the scaled features
train_accuracy_svm = svm_model.score(X_train_scaled, y_train)
test_accuracy_svm = svm_model.score(X_test_scaled, y_test)
accuracy_diff_svm = train_accuracy_svm - test_accuracy_svm

print(f"🔹 SVM - Training Accuracy: {train_accuracy_svm:.2f}")
print(f"🔹 SVM - Testing Accuracy: {test_accuracy_svm:.2f}")
print(f"🔹 SVM - Accuracy Difference: {accuracy_diff_svm:.2f}\n")


XGBoost

In [None]:
# Gradient-boosted trees on the unscaled features, with log loss as the
# evaluation metric.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
)
xgb_model.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Test-set accuracy and per-class metrics for XGBoost
xgb_report = classification_report(y_test, y_pred_xgb)

print("XGBoost Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
print(xgb_report)

In [None]:
# Train vs. test accuracy for XGBoost on the unscaled features
train_accuracy_xgb = xgb_model.score(X_train, y_train)
test_accuracy_xgb = xgb_model.score(X_test, y_test)
accuracy_diff_xgb = train_accuracy_xgb - test_accuracy_xgb

print(f"🔹 XGBoost - Training Accuracy: {train_accuracy_xgb:.2f}")
print(f"🔹 XGBoost - Testing Accuracy: {test_accuracy_xgb:.2f}")
print(f"🔹 XGBoost - Accuracy Difference: {accuracy_diff_xgb:.2f}\n")
