## Multi Classification

## Importing Libraries

In [113]:
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import joblib
from sklearn.naive_bayes import GaussianNB

## Hot encoding airlines

In [None]:
df = pd.read_csv('../../Phase_1/train.csv')
df_test = pd.read_csv('../../Phase_1/test.csv')
common_airlines = set(df_test['airline_name']).intersection(set(df['airline_name']))
df['airline_name'] = df['airline_name'].apply(lambda x: x if x in common_airlines else 'other')
df_test['airline_name'] = df_test['airline_name'].apply(lambda x: x if x in common_airlines else 'other')

one_hot_encoded_train = pd.get_dummies(df['airline_name'], prefix='airline')
one_hot_encoded_test = pd.get_dummies(df_test['airline_name'], prefix='airline')
one_hot_encoded_train = one_hot_encoded_train[sorted(one_hot_encoded_train.columns)]
one_hot_encoded_test = one_hot_encoded_test[sorted(one_hot_encoded_test.columns)]

df_final = pd.concat([df, one_hot_encoded_train], axis=1)
df_test_final = pd.concat([df_test, one_hot_encoded_test], axis=1)
df_final.drop(columns=['airline_name','departure_scheduled_time','departure_actual_time','type','flight_number'
                       ,'arrival_iata_code','arrival_icao_code','arrival_estimated_time'],inplace=True)
df_test_final.drop(columns=['flight_number','airline_name','Unnamed: 0.1','type','arrival_iata_code','arrival_icao_code',
                            'departure_scheduled_time','Unnamed: 0'],inplace=True)

## Hot encoding other columns

In [158]:
def categorize_delay(delay):
    if pd.isna(delay):
        return "Long Delay"
    elif delay == 0:
        return "No Delay"
    elif delay < 45:
        return "Short Delay"
    elif 45 <= delay <= 175:
        return "Moderate Delay"
    else:
        return "Long Delay"

In [159]:
one_hot_months = pd.get_dummies(df_final['Month'], prefix='month')
df_final = pd.concat([df_final, one_hot_months], axis=1)
one_hot_months = pd.get_dummies(df_test_final['Month'], prefix='month')
df_test_final = pd.concat([df_test_final, one_hot_months], axis=1)

one_hot_days = pd.get_dummies(df_final['day_of_week'], prefix='day')
df_final = pd.concat([df_final, one_hot_days], axis=1)
one_hot_days = pd.get_dummies(df_test_final['day_of_week'], prefix='day')
df_test_final = pd.concat([df_test_final, one_hot_days], axis=1)

one_hot_icaos = pd.get_dummies(df_final['departure_icao_code'], prefix='icao')
df_final = pd.concat([df_final, one_hot_icaos], axis=1)
one_hot_icaos = pd.get_dummies(df_test_final['departure_icao_code'], prefix='icao')
df_test_final = pd.concat([df_test_final, one_hot_icaos], axis=1)

one_hot_iata = pd.get_dummies(df_final['departure_iata_code'], prefix='iata')
df_final = pd.concat([df_final, one_hot_iata], axis=1)
one_hot_iata = pd.get_dummies(df_test_final['departure_iata_code'], prefix='iata')
df_test_final = pd.concat([df_test_final, one_hot_iata], axis=1)

one_hot_status = pd.get_dummies(df_final['status'], prefix='status')
df_final = pd.concat([df_final, one_hot_status], axis=1)
one_hot_status = pd.get_dummies(df_test_final['status'], prefix='status')
df_test_final = pd.concat([df_test_final, one_hot_status], axis=1)

df_final['delay_time'] = df_final['delay_time'].clip(lower=0)
df_final.loc[df_final['status'] == 'active', 'delay_time'] = df_final.loc[df_final['status'] == 'active', 'delay_time'].fillna(
    df_final.loc[df_final['status'] == 'active', 'delay_time'].mean()
)
# df_final.fillna({'delay_time':-1},inplace=True)
df_final['delay_category'] = df_final['delay_time'].apply(categorize_delay)
df_final.drop(columns='delay_time',inplace=True)

In [160]:
df_final.drop(columns=['Day','hour_of_day','Month','day_of_week','departure_icao_code','departure_iata_code','status'],inplace=True)
df_test_final.drop(columns=['Day','hour_of_day','Month','day_of_week','departure_icao_code','departure_iata_code','status'],inplace=True)

In [161]:
df_final['delay_category'].value_counts()

delay_category
Short Delay       42834
No Delay           4601
Long Delay         3664
Moderate Delay      473
Name: count, dtype: int64

In [162]:
df_final['delay_category'].value_counts()

delay_category
Short Delay       42834
No Delay           4601
Long Delay         3664
Moderate Delay      473
Name: count, dtype: int64

## Feature selection

In [163]:
temperature_features = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min']
dew_point_features = ['Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min']
humidity_features = ['Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min']
wind_speed_features = ['Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min']
precipitation_features = ['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']

In [None]:
all_features = temperature_features + dew_point_features + humidity_features + wind_speed_features + precipitation_features
avg_columns = [col for col in df_final.columns if '_max' in col]
other_columns = [col for col in df_final.columns if col not in all_features]
selected_columns = avg_columns + other_columns
df_filtered = df_final[selected_columns]
avg_columns = [col for col in df_test_final.columns if '_max' in col]
other_columns = [col for col in df_test_final.columns if col not in all_features]
selected_columns = avg_columns + other_columns
df_filtered_test = df_test_final[selected_columns]
# df_filtered_test.drop(columns=['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min'],inplace=True)

In [167]:
df_filtered_test.columns

Index(['Temperature (°F)_max', 'Dew Point (°F)_max', 'Humidity (%)_max',
       'Wind Speed (mph)_max', 'Pressure (in)_max', 'code_shared_flag',
       'airline_aero nomad', 'airline_air arabia', 'airline_air canada',
       'airline_air china ltd',
       ...
       'day_Wednesday', 'icao_opis', 'icao_opkc', 'icao_opla', 'iata_isb',
       'iata_khi', 'iata_lhe', 'status_active', 'status_cancelled',
       'status_unknown'],
      dtype='object', length=107)

## Scale features

In [168]:
features_to_scale = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min',
       'Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min',
       'Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min',
       'Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min',
       'Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']
features_to_scale_2 = [
    'Temperature (°F)_avg',
    'Dew Point (°F)_avg',
    'Humidity (%)_avg',
    'Wind Speed (mph)_avg',
    'Pressure (in)_avg'
]
features_to_scale_3 = [
    'Temperature (°F)_max',
    'Dew Point (°F)_max',
    'Humidity (%)_max',
    'Wind Speed (mph)_max',
    'Pressure (in)_max'
]
features_to_scale_4 = [
    'Temperature (°F)_min',
    'Dew Point (°F)_min',
    'Humidity (%)_min',
    'Wind Speed (mph)_min',
    'Pressure (in)_min'
]
scaler = StandardScaler()
df_scaled = df_filtered.copy()
df_scaled[features_to_scale_3] = scaler.fit_transform(df_scaled[features_to_scale_3])
joblib.dump(scaler,'scaler_binary.pkl')

['scaler_binary.pkl']

### ignore for now

In [None]:
temperature_features = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min']
dew_point_features = ['Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min']
humidity_features = ['Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min']
wind_speed_features = ['Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min']
precipitation_features = ['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']

feature_groups = [temperature_features, dew_point_features, humidity_features, wind_speed_features]

df_scaled_train = df_final.copy()

scalers = {}
for group in feature_groups:
    scaler = StandardScaler()
    df_scaled_train[group] = scaler.fit_transform(df_scaled_train[group].values.T).T
    scalers["_".join(group)] = scaler

joblib.dump(scalers, "group_scalers.pkl")

['group_scalers.pkl']

## Ovoersampling

In [None]:
from imblearn.combine import SMOTEENN
X = df_scaled.drop(columns="delay_category")
y = df_scaled['delay_category']

combined_sampler = SMOTEENN(random_state=42)
X_resampled, y_resampled = combined_sampler.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_category'] = y_resampled

print(df_resampled['delay_category'].value_counts())

In [77]:
X = df_scaled.drop(columns="delay_category")
y = df_scaled['delay_category']

oversampler = SMOTE()
X_resampled, y_resampled = oversampler.fit_resample(X, y)
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_category'] = y_resampled

  df_resampled['delay_category'] = y_resampled


In [None]:
X = df_scaled.drop(columns="delay_category")
y = df_scaled['delay_category']
sampling_strategy = {
    "No Delay": int(len(y[y == "No Delay"]) * 2),
    "Short Delay": len(y[y == "Short Delay"]),
    "Moderate Delay": len(y[y == "Moderate Delay"]),
    'Long Delay':len(y[y=='Long Delay'])
}

oversampler = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_category'] = y_resampled

  df_resampled['delay_category'] = y_resampled


#### thx

In [169]:
df_scaled['delay_category'].value_counts()

delay_category
Short Delay       42834
No Delay           4601
Long Delay         3664
Moderate Delay      473
Name: count, dtype: int64

In [79]:
df_resampled['delay_category'].value_counts()

delay_category
Short Delay       42834
No Delay          42834
Long Delay        42834
Moderate Delay    42834
Name: count, dtype: int64

## Undersampling

In [None]:
X = df_scaled.drop(columns="delay_category")
y = df_scaled['delay_category']

undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['delay_category'] = y_resampled

  df_resampled['delay_category'] = y_resampled


In [29]:
df_resampled['delay_category'].value_counts()

delay_category
Long Delay        473
Moderate Delay    473
No Delay          473
Short Delay       473
Name: count, dtype: int64

## continuing... for PCA

In [80]:
df_resampled.columns

Index(['code_shared_flag', 'Temperature (°F)_max', 'Temperature (°F)_avg',
       'Temperature (°F)_min', 'Dew Point (°F)_max', 'Dew Point (°F)_avg',
       'Dew Point (°F)_min', 'Humidity (%)_max', 'Humidity (%)_avg',
       'Humidity (%)_min',
       ...
       'icao_opis', 'icao_opkc', 'icao_opla', 'iata_isb', 'iata_khi',
       'iata_lhe', 'status_active', 'status_cancelled', 'status_unknown',
       'delay_category'],
      dtype='object', length=118)

In [170]:
X = df_scaled.drop(columns='delay_category')
y = df_scaled['delay_category']

In [171]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(51572, 33)

In [172]:
y.value_counts()

delay_category
Short Delay       42834
No Delay           4601
Long Delay         3664
Moderate Delay      473
Name: count, dtype: int64

## Naivve Bayes

In [174]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [175]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8364517692680562
Classification Report:
                 precision    recall  f1-score   support

    Long Delay       1.00      1.00      1.00       753
Moderate Delay       0.02      0.01      0.01        98
      No Delay       0.21      0.24      0.23       941
   Short Delay       0.90      0.90      0.90      8523

      accuracy                           0.84     10315
     macro avg       0.54      0.54      0.54     10315
  weighted avg       0.84      0.84      0.84     10315



## Random forest

In [173]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8490547746000969


In [142]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[ 531    0   13  209]
 [   1    6    4   87]
 [   7    2  137  795]
 [  36   39  364 8084]]


## KNN

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
knn = KNeighborsClassifier(metric='manhattan',n_neighbors= 7, weights= 'distance')
knn.fit(X_train, y_train)

In [177]:
y_pred = knn.predict(X_test)

In [178]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8525448376151236


In [179]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                precision    recall  f1-score   support

    Long Delay       1.00      0.86      0.93       753
Moderate Delay       0.04      0.03      0.04        98
      No Delay       0.25      0.18      0.21       941
   Short Delay       0.89      0.94      0.91      8523

      accuracy                           0.85     10315
     macro avg       0.55      0.50      0.52     10315
  weighted avg       0.83      0.85      0.84     10315



### knn hyperparameter tuning

In [130]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
best_knn = grid_search.best_estimator_
test_score = best_knn.score(X_test, y_test)
print("Test Accuracy with Best Hyperparameters:", test_score)

[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time= 1.9min
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time= 1.8min
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time= 1.9min
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time= 1.9min
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time= 1.8min


Traceback (most recent call last):
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/neighbors/_classification.py", line 259, in predict
    probabilities = self.predict_proba(X)
                    

[CV] END ..metric=manhattan, n_neighbors=11, weights=uniform; total time=   0.3s


Traceback (most recent call last):
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/neighbors/_classification.py", line 259, in predict
    probabilities = self.predict_proba(X)
                    

[CV] END ..metric=manhattan, n_neighbors=11, weights=uniform; total time=   0.3s
[CV] END ..metric=manhattan, n_neighbors=11, weights=uniform; total time=   0.3s


Traceback (most recent call last):
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/neighbors/_classification.py", line 259, in predict
    probabilities = self.predict_proba(X)
                    

[CV] END ..metric=manhattan, n_neighbors=11, weights=uniform; total time=   0.3s


Traceback (most recent call last):
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/neighbors/_classification.py", line 259, in predict
    probabilities = self.predict_proba(X)
                    

[CV] END ..metric=manhattan, n_neighbors=11, weights=uniform; total time=   0.3s
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time= 1.8min
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time= 1.7min
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time= 1.9min
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time= 1.8min
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time= 1.8min
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time= 1.9min
[CV] END ..metric=manhattan, n_neighbors=9, weights=distance; total time= 1.8min
[CV] END .metric=manhattan, n_neighbors=11, weights=distance; total time= 1.8min
[CV] END .metric=manhattan, n_neighbors=11, weights=distance; total time= 1.9min
[CV] END .metric=manhattan, n_neighbors=11, weights=distance; total time= 1.6min
[CV] END .metric=manhattan, n_neighbors=11, weights=distance; total time= 1.7min
[CV] END .metric=manhattan, 

 0.73660029 0.76443064 0.7319458  0.76371922        nan 0.76807226
        nan 0.77184645        nan 0.77267848        nan 0.77132794
        nan 0.77072503]


Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Test Accuracy with Best Hyperparameters: 0.7755269377321179


In [131]:
joblib.dump(grid_search,'grid_search_knn.pkl')

['grid_search_knn.pkl']

## Prediction Time

### Preparing test data for prediction

In [181]:
features_to_scale = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min',
       'Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min',
       'Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min',
       'Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min',
       'Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']
features_to_scale_2 = ['Temperature (°F)_avg','Dew Point (°F)_avg','Humidity (%)_avg','Wind Speed (mph)_avg','Pressure (in)_avg']
features_to_scale_3 = [
    'Temperature (°F)_max',
    'Dew Point (°F)_max',
    'Humidity (%)_max',
    'Wind Speed (mph)_max',
    'Pressure (in)_max'
]
features_to_scale_4 = [
    'Temperature (°F)_min',
    'Dew Point (°F)_min',
    'Humidity (%)_min',
    'Wind Speed (mph)_min',
    'Pressure (in)_min'
]
scaler = joblib.load('scaler_binary.pkl')
df_scaled_test = df_filtered_test.copy()
df_scaled_test[features_to_scale_3] = scaler.transform(df_scaled_test[features_to_scale_3])

### or 

In [None]:
for group in feature_groups:
    df_test_final[group] = StandardScaler().fit_transform(df_test_final[group].values.T).T

### pca

In [182]:
pcaed_test = pca.transform(df_scaled_test)

### Prediction time

In [184]:
prediction = knn.predict(pcaed_test)

In [None]:
id_column = range(1, len(prediction) + 1)
output_df = pd.DataFrame({
    "ID": id_column,
    "Delay": prediction
})

In [186]:
output_df['Delay'].value_counts()

Delay
Short Delay       13061
No Delay           1015
Long Delay          758
Moderate Delay       76
Name: count, dtype: int64

In [187]:
bruh = output_df[:12914]

In [188]:
bruh['Delay'].value_counts()

Delay
Short Delay       11252
No Delay            925
Long Delay          666
Moderate Delay       71
Name: count, dtype: int64

In [189]:
bruh.to_csv('predictions.csv',index=False)