In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the dataset
df = pd.read_csv("C:/Users/ankit/Downloads/Airline_Delay_Cause.csv")

# Display basic info and check for missing values
print(df.info())
print(df.isnull().sum())


In [None]:
# Example of outlier removal using IQR method
Q1 = df['arr_delay'].quantile(0.30)
Q3 = df['arr_delay'].quantile(0.70)
IQR = Q3 - Q1
df = df[(df['arr_delay'] >= Q1 - 1.5 * IQR) & (df['arr_delay'] <= Q3 + 1.5 * IQR)]


In [None]:
# Convert categorical variables to numeric using Label Encoding
label_encoder = LabelEncoder()
df['carrier'] = label_encoder.fit_transform(df['carrier'])
df['airport'] = label_encoder.fit_transform(df['airport'])

# Fill missing values (if any) with zero or mean (depending on the feature)
df.fillna(0, inplace=True)

# Feature engineering: Extract additional information if needed (like seasonality)
df['season'] = df['month'].apply(lambda x: 'Winter' if x in [12, 1, 2] 
                                 else 'Spring' if x in [3, 4, 5] 
                                 else 'Summer' if x in [6, 7, 8] 
                                 else 'Fall')
df['season'] = label_encoder.fit_transform(df['season'])


In [5]:
# Define features (X) and target (y)
features = ['carrier', 'arr_flights', 'arr_del15', 
            'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']
X = df[features]
y = df['arr_cancelled']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Logistic Regression
#log_reg = LogisticRegression()
#log_reg.fit(X_train, y_train)
#y_pred_log = log_reg.predict(X_test)

# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Evaluation Metrics
#print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))
#print("Classification Report (Random Forest):")
#print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.5063656647854049
Confusion Matrix (Random Forest):
[[3644  272   63 ...    0    0    0]
 [1069  149   52 ...    0    0    0]
 [ 557  103   39 ...    0    0    0]
 ...
 [   2    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]]


In [7]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train-test split with stratification to ensure label balance
#X_train, X_test, y_train, y_test = train_test_split(
#    X, y, test_size=0.2, random_state=42, stratify=y
#)

# Train a Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, solver='saga', random_state=42)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Evaluate the model with zero_division parameter in classification report
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=1))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Logistic Regression Accuracy: 0.5286783042394015
Classification Report:
              precision    recall  f1-score   support

         0.0       0.56      0.97      0.71      4045
         1.0       0.17      0.07      0.09      1324
         2.0       0.08      0.01      0.01       750
         3.0       1.00      0.00      0.00       422
         4.0       0.00      0.00      0.00       280
         5.0       0.08      0.01      0.01       170
         6.0       1.00      0.00      0.00       142
         7.0       1.00      0.00      0.00        85
         8.0       1.00      0.00      0.00        70
         9.0       1.00      0.00      0.00        41
        10.0       1.00      0.00      0.00        50
        11.0       1.00      0.00      0.00        42
        12.0       1.00      0.00      0.00        29
        13.0       1.00      0.00      0.00        22
        14.0       1.00      0.00      0.00        18
        15.0       1.00      0.00      0.00        19
        1



In [12]:
#*Question 2*

In [13]:
# Define features (X) and target (y)
features = ['month', 'carrier', 'airport', 'arr_flights', 'carrier_ct', 
            'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']
X = df[features]
y = df['arr_del15']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features for Gradient Boosting
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [14]:
# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(n_estimators=30, learning_rate=0.1, random_state=42)
gb_clf.fit(X_train, y_train)
y_pred_gb = gb_clf.predict(X_test)

# Evaluation Metrics
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Confusion Matrix (Gradient Boosting):")
print(confusion_matrix(y_test, y_pred_gb))
print("Classification Report (Gradient Boosting):")
print(classification_report(y_test, y_pred_gb))


KeyboardInterrupt: 

In [None]:
# Confusion Matrix Heatmap for Gradient Boosting
sns.heatmap(confusion_matrix(y_test, y_pred_gb), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Gradient Boosting")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
