# Regression Prediction

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import numpy as np
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV

## ignore

In [2]:
df = pd.read_csv('../../Phase_1/train.csv')
df_test = pd.read_csv('../../Phase_1/test.csv')
common_airlines = set(df_test['airline_name']).intersection(set(df['airline_name']))

df['airline_name'] = df['airline_name'].apply(lambda x: x if x in common_airlines else 'other')
df_test['airline_name'] = df_test['airline_name'].apply(lambda x: x if x in common_airlines else 'other')

one_hot_encoded_train = pd.get_dummies(df['airline_name'], prefix='airline')
one_hot_encoded_test = pd.get_dummies(df_test['airline_name'], prefix='airline')

one_hot_encoded_train = one_hot_encoded_train[sorted(one_hot_encoded_train.columns)]
one_hot_encoded_test = one_hot_encoded_test[sorted(one_hot_encoded_test.columns)]

df_final = pd.concat([df, one_hot_encoded_train], axis=1)
df_test_final = pd.concat([df_test, one_hot_encoded_test], axis=1)
df_final.drop(columns=['airline_name','departure_scheduled_time','departure_actual_time','type','flight_number'
                       ,'arrival_iata_code','arrival_icao_code','arrival_estimated_time'],inplace=True)
df_test_final.drop(columns=['flight_number','airline_name','Unnamed: 0.1','type','arrival_iata_code','arrival_icao_code',
                            'departure_scheduled_time','Unnamed: 0'],inplace=True)

## use below

In [4]:
df = pd.read_csv('../new_train.csv')
df_test = pd.read_csv('../final_test.csv')
# one_hot_months = pd.get_dummies(df['Month'], prefix='month')
# df_final = pd.concat([df, one_hot_months], axis=1)
# one_hot_months = pd.get_dummies(df_test['Month'], prefix='month')
# df_test_final = pd.concat([df_test, one_hot_months], axis=1)

# one_hot_days = pd.get_dummies(df_final['day_of_week'], prefix='day')
# df_final = pd.concat([df_final, one_hot_days], axis=1)
# one_hot_days = pd.get_dummies(df_test_final['day_of_week'], prefix='day')
# df_test_final = pd.concat([df_test_final, one_hot_days], axis=1)

one_hot_icaos = pd.get_dummies(df['departure_icao_code'], prefix='icao')
df_final = pd.concat([df, one_hot_icaos], axis=1)
one_hot_icaos = pd.get_dummies(df_test['Departure ICAO Code'], prefix='icao')
df_test_final = pd.concat([df_test, one_hot_icaos], axis=1)

one_hot_iata = pd.get_dummies(df_final['departure_iata_code'], prefix='iata')
df_final = pd.concat([df_final, one_hot_iata], axis=1)
one_hot_iata = pd.get_dummies(df_test_final['Departure IATA Code'], prefix='iata')
df_test_final = pd.concat([df_test_final, one_hot_iata], axis=1)

one_hot_status = pd.get_dummies(df_final['status'], prefix='status')
df_final = pd.concat([df_final, one_hot_status], axis=1)
one_hot_status = pd.get_dummies(df_test_final['Status'], prefix='status')
df_test_final = pd.concat([df_test_final, one_hot_status], axis=1)

## Transforming `delay_time` to Percentage of a Day

I have scaled the `delay_time` column to represent it as a percentage of a day (24 hours) with the following approach.

---

1. **Each minute value** is represented as a proportion of a full day (24 hours).  
- Formula:  

  Percentage of a day = `delay_time / (24 * 60)`  

- For example:  
  - For a delay time of **15 minutes**, it becomes:  
  15 / (24 * 60) = 0.0104  

2. **For NaN values**, replace them with **1**, indicating a full day's proportion.


In [5]:
df_final['delay_time'] = df_final['delay_time'].clip(lower=0)
df_final.loc[df_final['status'] == 'active', 'delay_time'] = df_final.loc[df_final['status'] == 'active', 'delay_time'].fillna(
    df_final.loc[df_final['status'] == 'active', 'delay_time'].mean()
)
# df_final['delay_time'].fillna(df['delay_time'].mean(),inplace=True)
# df_final['delay_time'].fillna(24*60,inplace=True)
# df_final.dropna(inplace=True)
df_final['delay_time'] = df_final['delay_time'].apply(lambda x: x / (24 * 60) if pd.notnull(x) else 1)

In [6]:
df_final.drop(columns=['flight_number','airline_name','code_shared_flag','type','departure_scheduled_time', 'departure_actual_time',
       'arrival_iata_code', 'arrival_icao_code', 'arrival_estimated_time','Day','hour_of_day','Month','day_of_week','departure_icao_code','departure_iata_code','status'],inplace=True)
df_test_final.drop(columns=['Flight Number', 'Type', 'Status', 'Departure IATA Code',
       'Departure ICAO Code', 'Arrival IATA Code', 'Arrival ICAO Code',
       'Month', 'Day'],inplace=True)

In [7]:
df_final['delay_time']

0        0.000000
1        0.001354
2        0.000000
3        0.001354
4        0.001354
           ...   
51567    0.001354
51568    0.001354
51569    0.000000
51570    0.001354
51571    1.000000
Name: delay_time, Length: 51572, dtype: float64

## Feature scaling

In [8]:
features_to_scale = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min',
       'Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min',
       'Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min',
       'Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min',
       'Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']
features_to_scale_2 = [
    'Temperature (°F)_avg',
    'Dew Point (°F)_avg',
    'Humidity (%)_avg',
    'Wind Speed (mph)_avg',
    'Pressure (in)_avg'
]
features_to_scale_3 = [
    'Temperature (°F)_max',
    'Dew Point (°F)_max',
    'Humidity (%)_max',
    'Wind Speed (mph)_max',
    'Pressure (in)_max'
]
features_to_scale_4 = [
    'Temperature (°F)_min',
    'Dew Point (°F)_min',
    'Humidity (%)_min',
    'Wind Speed (mph)_min',
    'Pressure (in)_min'
]
scaler = StandardScaler()
df_scaled = df_final.copy()
df_scaled[features_to_scale] = scaler.fit_transform(df_scaled[features_to_scale])
joblib.dump(scaler,'scaler_binary.pkl')

['scaler_binary.pkl']

In [None]:
temperature_features = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min']
dew_point_features = ['Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min']
humidity_features = ['Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min']
wind_speed_features = ['Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min']
precipitation_features = ['Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']

feature_groups = [temperature_features, dew_point_features, humidity_features, wind_speed_features]

df_scaled_train = df_final.copy()

scalers = {}
for group in feature_groups:
    scaler = StandardScaler()
    df_scaled_train[group] = scaler.fit_transform(df_scaled_train[group].values.T).T
    scalers["_".join(group)] = scaler

joblib.dump(scalers, "group_scalers.pkl")

['group_scalers.pkl']

In [188]:
df_scaled.shape

(30205, 25)

## pca

In [9]:
X = df_scaled.drop(columns='delay_time')
y = df_scaled['delay_time']
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(51572, 9)

## Random forest regressor

In [234]:
X_standardized = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [201]:
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Square Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.014665979387571002
Root Mean Square Error (RMSE): 0.09531306958868757


## Hyper Parameter Tuning

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Best Parameters:", grid_search.best_params_)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Square Error (RMSE):", rmse)
joblib.dump(grid_search,'grid_search_rfr.pkl')

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; tot



[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  12.0s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  19.7s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  22.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  31.8s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  20.1s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  21.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  33.4s
[CV] END bootstrap=True, max_depth=30, max_fea

1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
647 fits failed with the following error:
Traceback (most recent call last):
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/zain/.pyenv/versions/3.11.9/envs/ml_old_python/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/zain/.pyenv/versions/

Best Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Mean Absolute Error (MAE): 0.0030467591702380155
Root Mean Square Error (RMSE): 0.023026400560060716


['grid_search_rfr.pkl']

## Linear regression

In [97]:
X = df_scaled.drop(columns='delay_time')
y = df_scaled['delay_time']

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [227]:
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Square Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.1303241590538353
Root Mean Square Error (RMSE): 0.25690805691787433


## Ridge Regression (hyper parameter tuning)

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
ridge = Ridge()
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_ridge = grid_search.best_estimator_
y_pred = best_ridge.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Best Alpha:", grid_search.best_params_)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Square Error (RMSE):", rmse)

Best Alpha: {'alpha': 100}
Mean Absolute Error (MAE): 0.12929074892368614
Root Mean Square Error (RMSE): 0.2548295986626984


## Getting test data ready

In [235]:
features_to_scale = ['Temperature (°F)_max', 'Temperature (°F)_avg', 'Temperature (°F)_min',
       'Dew Point (°F)_max', 'Dew Point (°F)_avg', 'Dew Point (°F)_min',
       'Humidity (%)_max', 'Humidity (%)_avg', 'Humidity (%)_min',
       'Wind Speed (mph)_max', 'Wind Speed (mph)_avg', 'Wind Speed (mph)_min',
       'Pressure (in)_max', 'Pressure (in)_avg', 'Pressure (in)_min']
features_to_scale_2 = ['Temperature (°F)_avg','Dew Point (°F)_avg','Humidity (%)_avg','Wind Speed (mph)_avg','Pressure (in)_avg']
features_to_scale_3 = [
    'Temperature (°F)_max',
    'Dew Point (°F)_max',
    'Humidity (%)_max',
    'Wind Speed (mph)_max',
    'Pressure (in)_max'
]
features_to_scale_4 = [
    'Temperature (°F)_min',
    'Dew Point (°F)_min',
    'Humidity (%)_min',
    'Wind Speed (mph)_min',
    'Pressure (in)_min'
]
scaler = joblib.load('scaler_binary.pkl')
df_scaled_test = df_test_final.copy()
df_scaled_test[features_to_scale] = scaler.transform(df_scaled_test[features_to_scale])

In [None]:
for group in feature_groups:
    df_test_final[group] = StandardScaler().fit_transform(df_test_final[group].values.T).T

## pca

In [236]:
pcaed_test = pca.transform(df_test_final)

In [237]:
prediction = model.predict(pcaed_test)
# prediction = [0 if i<=0 else i for i in prediction]

In [238]:
minutes = [i * (24 * 60) for i in prediction]

In [239]:
len(prediction),len(minutes)

(12914, 12914)

In [240]:
id_column = range(1, len(prediction) + 1)
output_df = pd.DataFrame({
    "ID": id_column,
    "Delay": minutes
})

In [241]:
output_df.to_csv('predictions.csv',index=False)

In [242]:
output_df['Delay']

0        1375.253149
1        1375.253149
2        1375.253149
3        1375.253149
4        1375.253149
            ...     
12909    1375.303200
12910    1375.303200
12911    1375.303200
12912    1411.228352
12913    1375.303200
Name: Delay, Length: 12914, dtype: float64

In [243]:
heavy = pd.read_csv('../heavy_clap.csv')
heavy['Delay']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
12909     0.0
12910    55.0
12911    23.0
12912    60.0
12913     0.0
Name: Delay, Length: 12914, dtype: float64