## Binary Classification

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report,roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib
from imblearn.under_sampling import RandomUnderSampler

In [29]:
df = pd.read_csv('../../Phase_1/train.csv')
df_test = pd.read_csv('../final_test.csv')

## Hot encoding airlines (ignoring for now)

In [None]:
common_airlines = set(df_test['airline_name']).intersection(set(df['airline_name']))

# Step 2: Replace airline_name with only the common airlines
df['airline_name'] = df['airline_name'].apply(lambda x: x if x in common_airlines else 'other')
df_test['airline_name'] = df_test['airline_name'].apply(lambda x: x if x in common_airlines else 'other')
one_hot_encoded_train = pd.get_dummies(df['airline_name'], prefix='airline')
one_hot_encoded_test = pd.get_dummies(df_test['airline_name'], prefix='airline')
one_hot_encoded_train = one_hot_encoded_train[sorted(one_hot_encoded_train.columns)]
one_hot_encoded_test = one_hot_encoded_test[sorted(one_hot_encoded_test.columns)]

df_final = pd.concat([df, one_hot_encoded_train], axis=1)
df_test_final = pd.concat([df_test, one_hot_encoded_test], axis=1)
df_final.drop(columns=['airline_name','departure_scheduled_time','departure_actual_time','type','flight_number'
                       ,'arrival_iata_code','arrival_icao_code','arrival_estimated_time'],inplace=True)
df_test_final.drop(columns=['flight_number','airline_name','Unnamed: 0.1','type','arrival_iata_code','arrival_icao_code',
                            'departure_scheduled_time','Unnamed: 0'],inplace=True)

## Hot encoding other columns

In [9]:
df = pd.read_csv('../../Phase_1/train.csv')
df_test = pd.read_csv('../final_test.csv')
# one_hot_months = pd.get_dummies(df['Month'], prefix='month')
# df_final = pd.concat([df, one_hot_months], axis=1)
# one_hot_months = pd.get_dummies(df_test['Month'], prefix='month')
# df_test_final = pd.concat([df_test, one_hot_months], axis=1)

# one_hot_days = pd.get_dummies(df_final['day_of_week'], prefix='day')
# df_final = pd.concat([df_final, one_hot_days], axis=1)
# one_hot_days = pd.get_dummies(df_test_final['day_of_week'], prefix='day')
# df_test_final = pd.concat([df_test_final, one_hot_days], axis=1)

one_hot_icaos = pd.get_dummies(df['departure_icao_code'], prefix='icao')
df_final = pd.concat([df, one_hot_icaos], axis=1)
one_hot_icaos = pd.get_dummies(df_test['Departure ICAO Code'], prefix='icao')
df_test_final = pd.concat([df_test, one_hot_icaos], axis=1)

one_hot_iata = pd.get_dummies(df_final['departure_iata_code'], prefix='iata')
df_final = pd.concat([df_final, one_hot_iata], axis=1)
one_hot_iata = pd.get_dummies(df_test_final['Departure IATA Code'], prefix='iata')
df_test_final = pd.concat([df_test_final, one_hot_iata], axis=1)

one_hot_status = pd.get_dummies(df_final['status'], prefix='status')
df_final = pd.concat([df_final, one_hot_status], axis=1)
one_hot_status = pd.get_dummies(df_test_final['Status'], prefix='status')
df_test_final = pd.concat([df_test_final, one_hot_status], axis=1)

In [10]:
df_final['delay_time']

0        16.0
1         NaN
2         1.0
3         NaN
4        11.0
         ... 
51567     NaN
51568     NaN
51569    15.0
51570     NaN
51571     NaN
Name: delay_time, Length: 51572, dtype: float64

In [14]:
len(df[df['delay_time']<=0])

4601

In [6]:
df['delay_time'].fillna(df['delay_time'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['delay_time'].fillna(df['delay_time'].mean(),inplace=True)


In [7]:
df['delay_time'].mean()

np.float64(10.905195224265892)

In [None]:
df_final['delay_time'] = df_final['delay_time'].clip(lower=0)
df_final.loc[df_final['status'] == 'active', 'delay_time'] = df_final.loc[df_final['status'] == 'active', 'delay_time'].fillna(
    df_final.loc[df_final['status'] == 'active', 'delay_time'].mean()
)
df_final.fillna({'delay_time':-1},inplace=True)
df_final['delay_time_binary'] = df_final['delay_time'].apply(lambda x: 1 if x > 1 or x == -1 else 0)
df_final.drop(columns='delay_time',inplace=True)

In [128]:
df_test_final.columns

Index(['Flight Number', 'Type', 'Status', 'Departure IATA Code',
       'Departure ICAO Code', 'Arrival IATA Code', 'Arrival ICAO Code',
       'Month', 'Day', 'Temperature (°F)_max', 'Temperature (°F)_avg',
       'Temperature (°F)_min', 'Dew Point (°F)_max', 'Dew Point (°F)_avg',
       'Dew Point (°F)_min', 'Humidity (%)_max', 'Humidity (%)_avg',
       'Humidity (%)_min', 'Wind Speed (mph)_max', 'Wind Speed (mph)_avg',
       'Wind Speed (mph)_min', 'Pressure (in)_max', 'Pressure (in)_avg',
       'Pressure (in)_min', 'icao_opis', 'icao_opkc', 'icao_opla', 'iata_isb',
       'iata_khi', 'iata_lhe', 'status_active', 'status_cancelled',
       'status_unknown'],
      dtype='object')

In [129]:
df_final.columns

Index(['flight_number', 'airline_name', 'code_shared_flag', 'type', 'status',
       'departure_iata_code', 'departure_icao_code',
       'departure_scheduled_time', 'departure_actual_time',
       'arrival_iata_code', 'arrival_icao_code', 'arrival_estimated_time',
       'Month', 'Day', 'Temperature (°F)_max', 'Temperature (°F)_avg',
       'Temperature (°F)_min', 'Dew Point (°F)_max', 'Dew Point (°F)_avg',
       'Dew Point (°F)_min', 'Humidity (%)_max', 'Humidity (%)_avg',
       'Humidity (%)_min', 'Wind Speed (mph)_max', 'Wind Speed (mph)_avg',
       'Wind Speed (mph)_min', 'Pressure (in)_max', 'Pressure (in)_avg',
       'Pressure (in)_min', 'day_of_week', 'hour_of_day', 'icao_opis',
       'icao_opkc', 'icao_opla', 'iata_isb', 'iata_khi', 'iata_lhe',
       'status_active', 'status_cancelled', 'status_unknown',
       'delay_time_binary'],
      dtype='object')

In [130]:
df_final.drop(columns=['flight_number','airline_name','code_shared_flag','type','departure_scheduled_time', 'departure_actual_time',
       'arrival_iata_code', 'arrival_icao_code', 'arrival_estimated_time','Day','hour_of_day','Month','day_of_week','departure_icao_code','departure_iata_code','status'],inplace=True)
df_test_final.drop(columns=['Flight Number', 'Type', 'Status', 'Departure IATA Code',
       'Departure ICAO Code', 'Arrival IATA Code', 'Arrival ICAO Code',
       'Month', 'Day'],inplace=True)

In [131]:
df_final.shape,df_test_final.shape

((51572, 25), (12914, 24))

## Feature selection

In [195]:
temperature_features = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min']
dew_point_features = ['Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min']
humidity_features = ['Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min']
wind_speed_features = ['Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min']
precipitation_features = ['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']

In [None]:
all_features = temperature_features + dew_point_features + humidity_features + wind_speed_features + precipitation_features
avg_columns = [col for col in df_final.columns if '_avg' in col]
other_columns = [col for col in df_final.columns if col not in all_features]
selected_columns = avg_columns + other_columns
df_filtered = df_final[selected_columns]
avg_columns = [col for col in df_test_final.columns if '_avg' in col]
other_columns = [col for col in df_test_final.columns if col not in all_features]
selected_columns = avg_columns + other_columns
df_filtered_test = df_test_final[selected_columns]
# df_filtered_test.drop(columns=['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min'],inplace=True)

In [197]:
df_filtered.columns

Index(['Temperature (°F)_avg', 'Dew Point (°F)_avg', 'Humidity (%)_avg',
       'Wind Speed (mph)_avg', 'Pressure (in)_avg', 'month_Apr', 'month_Aug',
       'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul', 'month_Jun',
       'month_Mar', 'month_May', 'month_Nov', 'month_Oct', 'month_Sep',
       'icao_opis', 'icao_opkc', 'icao_opla', 'iata_isb', 'iata_khi',
       'iata_lhe', 'status_active', 'status_cancelled', 'status_unknown',
       'delay_time_binary'],
      dtype='object')

## Scale features

In [132]:
features_to_scale = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min',
       'Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min',
       'Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min',
       'Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min',
       'Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']
features_to_scale_2 = [
    'Temperature (°F)_avg',
    'Dew Point (°F)_avg',
    'Humidity (%)_avg',
    'Wind Speed (mph)_avg',
    'Pressure (in)_avg'
]
features_to_scale_3 = [
    'Temperature (°F)_max',
    'Dew Point (°F)_max',
    'Humidity (%)_max',
    'Wind Speed (mph)_max',
    'Pressure (in)_max'
]
scaler = StandardScaler()
df_scaled = df_final.copy()
df_scaled[features_to_scale] = scaler.fit_transform(df_scaled[features_to_scale])
joblib.dump(scaler,'scaler_binary.pkl')

['scaler_binary.pkl']

### ignore for now

In [None]:
# Define feature groups
temperature_features = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min']
dew_point_features = ['Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min']
humidity_features = ['Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min']
wind_speed_features = ['Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min']
precipitation_features = ['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']

feature_groups = [temperature_features, dew_point_features, humidity_features, wind_speed_features]

df_scaled_train = df_final.copy()

scalers = {}
for group in feature_groups:
    scaler = StandardScaler()
    df_scaled_train[group] = scaler.fit_transform(df_scaled_train[group].values.T).T
    scalers["_".join(group)] = scaler
joblib.dump(scalers, "group_scalers.pkl")

['group_scalers.pkl']

## Ovoersampling

In [None]:
from imblearn.combine import SMOTEENN
X = df_scaled.drop(columns="delay_time_binary")
y = df_scaled['delay_time_binary']

combined_sampler = SMOTEENN(random_state=42)
X_resampled, y_resampled = combined_sampler.fit_resample(X, y)


df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_time_binary'] = y_resampled

print(df_resampled['delay_time_binary'].value_counts())

delay_time_binary
1    36580
0     7351
Name: count, dtype: int64


#### ignore

In [None]:
X = df_final.drop(columns="delay_time_binary")
y = df_final['delay_time_binary']
oversampler = SMOTE(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)
print(len(y_resampled[y_resampled == 0]),int(0.3 * len(y_resampled[y_resampled == 0])))
undersampler = RandomUnderSampler(sampling_strategy={0: int(len(y_resampled[y_resampled == 0])), 1: int(0.3 * len(y_resampled[y_resampled == 0]))}, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_time_binary'] = y_resampled

46355 13906


In [318]:
X = df_scaled.drop(columns="delay_time_binary")
y = df_scaled['delay_time_binary']

oversampler = SMOTE()
X_resampled, y_resampled = oversampler.fit_resample(X, y)
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_time_binary'] = y_resampled

#### thx

In [134]:
df_final['delay_time_binary'].value_counts()

delay_time_binary
1    46355
0     5217
Name: count, dtype: int64

## Undersampling

In [None]:
X = df_scaled.drop(columns="delay_time_binary")
y = df_scaled['delay_time_binary']

undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_time_binary'] = y_resampled

  df_resampled['delay_time_binary'] = y_resampled


In [414]:
df_resampled.shape

(9202, 108)

## continuing... for PCA

In [135]:
X = df_resampled.drop(columns='delay_time_binary')
y = df_resampled['delay_time_binary']

In [136]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(60261, 5)

In [137]:
y.value_counts()

delay_time_binary
0    46355
1    13906
Name: count, dtype: int64

## Model Training (Gradient Boosting)

n_estimators (int, default=100): The number of boosting stages to be run. Increasing this can lead to better performance but might result in overfitting.

learning_rate (float, default=0.1): The contribution of each tree to the final prediction. Smaller values require more n_estimators to achieve the same level of performance.

max_depth (int, default=3): The maximum depth of each individual tree. Limits the complexity of the tree and helps prevent overfitting.

min_samples_split (int or float, default=2): The minimum number of samples required to split an internal node. Controls when a node is split and can affect model complexity.

min_samples_leaf (int or float, default=1): The minimum number of samples required to be at a leaf node. Reduces overfitting by forcing leaves to have more samples.

random_state (int, RandomState instance, or None, default=None): Controls the randomness of the estimator, ensuring reproducibility of results.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f"Test Accuracy: {acc}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"ROC AUC: {roc_auc}")

Test Accuracy: 0.9798351914687349
Confusion Matrix:
[[9354    0]
 [ 208  753]]
ROC AUC: 0.9317965548412905


In [225]:
joblib.dump(model,'gradient_boosting.pkl')

['gradient_boosting.pkl']

## Model training (Logistic Regression)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=10000, random_state=42,C= 100, penalty= 'l1', solver= 'liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [151]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.765867418899859


In [152]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[9231    0]
 [2822    0]]


In [142]:
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.97      0.88      9231
           1       0.66      0.20      0.31      2822

    accuracy                           0.79     12053
   macro avg       0.73      0.58      0.59     12053
weighted avg       0.77      0.79      0.74     12053



## XGBOOST

In [344]:
from xgboost import XGBClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, stratify=y, random_state=42
)
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Class 0", "Class 1"]))

In [346]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[5948 3447]
 [3168 6226]]


## Preparing test data for prediction

#### Ignore

In [143]:
features_to_scale = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min',
       'Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min',
       'Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min',
       'Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min',
       'Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']
features_to_scale_2 = ['Temperature (°F)_avg','Dew Point (°F)_avg','Humidity (%)_avg','Wind Speed (mph)_avg','Pressure (in)_avg']
features_to_scale_3 = [
    'Temperature (°F)_max',
    'Dew Point (°F)_max',
    'Humidity (%)_max',
    'Wind Speed (mph)_max',
    'Pressure (in)_max'
]
scaler = joblib.load('scaler_binary.pkl')
df_scaled_test = df_test_final.copy()
df_scaled_test[features_to_scale] = scaler.transform(df_scaled_test[features_to_scale])

#### use for now

In [None]:
for group in feature_groups:
    df_test_final[group] = StandardScaler().fit_transform(df_test_final[group].values.T).T

In [24]:
df_scaled_test.shape,df_scaled.shape

((12914, 24), (51572, 25))

## pca

In [144]:
# pca = PCA(n_components=0.95)
pcaed_test = pca.transform(df_test_final)

In [145]:
pcaed_test.shape,X_pca.shape

((12914, 5), (60261, 5))

## Time for hyperparameter tuning

C : Regularization Strengths

Solver : Optimization Solvers

Penalty: Regularization Type

In [380]:
X = df_resampled.drop(columns='delay_time_binary')
y = df_resampled['delay_time_binary']
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(61062, 38)

In [None]:
log_reg = LogisticRegression(max_iter=10000, random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2']
}

grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, 
                           cv=5, verbose=1, n_jobs=-1, scoring='accuracy')

grid_search.fit(X_pca, y)

print("Best Hyperparameters found:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

best_log_reg = grid_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
  

Best Hyperparameters found: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Cross-validation Accuracy: 0.750293210008216


In [553]:
joblib.dump(grid_search, 'grid_search_logreg.pkl')

['grid_search_logreg.pkl']

In [554]:
joblib.dump(best_log_reg,'best_logreg.pkl')

['best_logreg.pkl']

## Prediction Time

In [146]:
prediction = model.predict(df_test_final)

In [None]:
id_column = range(1, len(prediction) + 1)

output_df = pd.DataFrame({
    "ID": id_column,
    "Delay": prediction
})

In [148]:
output_df['Delay'].value_counts()

Delay
0    11145
1     1769
Name: count, dtype: int64

In [149]:
output_df["Delay"] = output_df["Delay"].apply(lambda x: 'on-time' if x == 0 else 'delayed')
output_df.to_csv("predictions.csv", index=False)