# Multi class classification

This jupyter notebook will be used to predict mutli class classification

## Importing libraries

In [83]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

In [73]:
df = pd.read_csv('encoded_train.csv')

## Replace negative delay time with zero

In [74]:
df['delay_time'] = df['delay_time'].clip(lower=0)

In [75]:
df.loc[df['status'] == 'active', 'delay_time'] = df.loc[df['status'] == 'active', 'delay_time'].fillna(
    df.loc[df['status'] == 'active', 'delay_time'].mean()
)

In [76]:
final_df = df[['Temperature (°F)_max',
       'Temperature (°F)_avg', 'Temperature (°F)_min', 'Dew Point (°F)_max',
       'Dew Point (°F)_avg', 'Dew Point (°F)_min', 'Humidity (%)_max',
       'Humidity (%)_avg', 'Humidity (%)_min', 'Wind Speed (mph)_max',
       'Wind Speed (mph)_avg', 'Wind Speed (mph)_min', 'Pressure (in)_max',
       'Pressure (in)_avg', 'Pressure (in)_min', 'delay_time',
       'hour_of_day', 'airline_airblue', 'airline_airsial',
       'airline_british airways', 'airline_emirates', 'airline_flyjinnah',
       'airline_klm', 'airline_oman air', 'airline_other',
       'airline_pakistan international airlines', 'airline_qatar airways',
       'airline_serene air', 'day_Friday', 'day_Monday', 'day_Saturday',
       'day_Sunday', 'day_Thursday', 'day_Tuesday', 'day_Wednesday',
       'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep', 'icao_opis', 'icao_opkc', 'icao_opla',
       'iata_isb', 'iata_khi', 'iata_lhe', 'status_active', 'status_cancelled',
       'status_unknown']]

## Note

As we can see below the cancelled and unknown flights are nan so we will set them as "Long Delay" for our multiclass classification model

In [77]:
final_df['delay_time'].isnull().sum()

np.int64(3664)

## Categorize flights into:

■ No Delay (0 min)

■ Short Delay (<45 min)

■ Moderate Delay (45–175 min)

■ Long Delay (>175 min and NAN values)

In [78]:
def categorize_delay(delay):
    if pd.isna(delay):
        return "Long Delay"
    elif delay == 0:
        return "No Delay"
    elif delay < 45:
        return "Short Delay"
    elif 45 <= delay <= 175:
        return "Moderate Delay"
    else:
        return "Long Delay"

In [79]:
final_df['delay_category'] = final_df['delay_time'].apply(categorize_delay)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['delay_category'] = final_df['delay_time'].apply(categorize_delay)


In [80]:
final_df.drop(columns='delay_time',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns='delay_time',inplace=True)


## Now time for model training and prediction

In [154]:
X = final_df.drop('delay_category', axis=1)
y = final_df['delay_category']

In [155]:
X_standardized = StandardScaler().fit_transform(X)

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [157]:
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

## Evaluation Metrics

In [158]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8984973339796413


In [159]:
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Precision: [1.         0.39622642 0.50092764 0.91683389]
Recall: [1.         0.21428571 0.2869288  0.96491846]
F1-Score: [1.         0.2781457  0.36486486 0.94026182]


In [160]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[ 753    0    0    0]
 [   0   21    0   77]
 [   0    2  270  669]
 [   0   30  269 8224]]


## Use KNN for prediction

## First we will oversampled using SMOTE

In [144]:
X = final_df.drop(columns="delay_category")
y = final_df['delay_category']
oversampler = SMOTE()
X_resampled, y_resampled = oversampler.fit_resample(X, y)
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_category'] = y_resampled

In [145]:
X = df_resampled.drop('delay_category', axis=1)
y = df_resampled['delay_category']

In [146]:
X_standardized = StandardScaler().fit_transform(X)

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.2, random_state=42)

In [148]:
k = 7
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

In [149]:
y_pred = knn.predict(X_test)

In [150]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9412571495272557


In [151]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                precision    recall  f1-score   support

    Long Delay       1.00      0.99      1.00      8580
Moderate Delay       0.97      0.99      0.98      8576
      No Delay       0.89      0.91      0.90      8597
   Short Delay       0.91      0.87      0.89      8515

      accuracy                           0.94     34268
     macro avg       0.94      0.94      0.94     34268
  weighted avg       0.94      0.94      0.94     34268



In [152]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[8533    2   13   32]
 [   0 8517    6   53]
 [   1   95 7811  690]
 [   7  136  978 7394]]


In [153]:
report = classification_report(y_test, y_pred, output_dict=True)
for cls in report:
    if cls not in ["accuracy", "macro avg", "weighted avg"]:
        print(f"Class: {cls}")
        print(f" Precision: {report[cls]['precision']}")
        print(f" Recall: {report[cls]['recall']}")

Class: Long Delay
 Precision: 0.9990633415290949
 Recall: 0.9945221445221445
Class: Moderate Delay
 Precision: 0.9733714285714286
 Recall: 0.9931203358208955
Class: No Delay
 Precision: 0.8868074477747502
 Recall: 0.9085727579388159
Class: Short Delay
 Precision: 0.9051291467743909
 Recall: 0.8683499706400469
