In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [6]:

files = ["2018.csv", "2019.csv", "2020.csv"]



In [None]:
def process_file(file_path):
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Drop completely empty column if it exists
    if 'Unnamed: 27' in df.columns:
        df.drop(columns=['Unnamed: 27'], inplace=True)
    
    # Convert FL_DATE to datetime format
    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
    
    # Fill missing numeric values with 0
    df.fillna(0, inplace=True)
    
    return df

# Process each file and store results
dataframes = [process_file(file) for file in files]

# Combine all years into a single DataFrame
df_combined = pd.concat(dataframes, ignore_index=True)

# Display the first few rows
print(df_combined.head())

# Check data info to confirm cleaning
df_combined.info()


In [None]:


# Convert categorical columns to numerical using Label Encoding
categorical_cols = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CANCELLATION_CODE']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df_combined[col] = le.fit_transform(df_combined[col].astype(str))
    label_encoders[col] = le

# Display the first few rows
print(df_combined.head())

# Check data info to confirm cleaning
df_combined.info()


In [None]:

# Create IS_DELAY column (1 if ARR_DELAY > 15, else 0)
df_combined['IS_DELAY'] = (df_combined['ARR_DELAY'] > 15).astype(int)

# Display the first few rows
print(df_combined.head())

# Check data info to confirm cleaning
df_combined.info()

In [12]:
# Count the number of delayed (1) and non-delayed (0) flights
delay_counts = df_combined['IS_DELAY'].value_counts()

# Print the counts
print(f"On-time flights (0): {delay_counts.get(0, 0)}")
print(f"Delayed flights (1): {delay_counts.get(1, 0)}")


On-time flights (0): 11987426
Delayed flights (1): 5195631


In [None]:
# features = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'AIR_TIME', 'DISTANCE']
# target = 'IS_DELAY'


In [None]:
# X = df_combined[features]
# y = df_combined[target]


In [None]:
X = df_combined.drop(columns=["IS_DELAY", "ARR_DELAY"])  # Drop target column and delay minutes
y = df_combined["IS_DELAY"]


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Decision Tree

In [None]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Model Accuracy: 0.95


In [23]:
# Training accuracy
train_accuracy = dt_model.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Testing accuracy
test_accuracy = dt_model.score(X_test, y_test)
print(f"Testing Accuracy: {test_accuracy:.2f}")

# Difference between train and test accuracy
accuracy_diff = train_accuracy - test_accuracy
print(f"Accuracy Difference: {accuracy_diff:.2f}")


Training Accuracy: 1.00
Testing Accuracy: 0.95
Accuracy Difference: 0.05


In [None]:
cv_scores = cross_val_score(dt_model, X, y, cv=5)  # 5-fold cross-validation
print(f"Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")
