# Regression Classification

## Importing Libraries

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('encoded_train.csv')

In [3]:
df.head()

Unnamed: 0,airline_name,type,status,departure_iata_code,departure_icao_code,Month,Day,Temperature (°F)_max,Temperature (°F)_avg,Temperature (°F)_min,...,month_Sep,icao_opis,icao_opkc,icao_opla,iata_isb,iata_khi,iata_lhe,status_active,status_cancelled,status_unknown
0,airblue,departure,active,lhe,opla,Jan,31,61,56.8,52,...,False,False,False,True,False,False,True,True,False,False
1,flyjinnah,departure,active,lhe,opla,Jan,25,54,46.2,41,...,False,False,False,True,False,False,True,True,False,False
2,other,departure,active,lhe,opla,Jan,27,68,57.1,43,...,False,False,False,True,False,False,True,True,False,False
3,flyjinnah,departure,active,lhe,opla,Jan,19,57,47.4,41,...,False,False,False,True,False,False,True,True,False,False
4,other,departure,active,lhe,opla,Jan,25,54,46.2,41,...,False,False,False,True,False,False,True,True,False,False


## Replace negative delay times with zero

In [4]:
df['delay_time'] = df['delay_time'].clip(lower=0)

## Now fill nan values of active flight with mean

In [5]:
df['delay_time'].mean()

np.float64(13.484575669570829)

In [6]:
df.loc[df['status'] == 'active', 'delay_time'] = df.loc[df['status'] == 'active', 'delay_time'].fillna(
    df.loc[df['status'] == 'active', 'delay_time'].mean()
)

In [7]:
df['delay_time'].isnull().sum()

np.int64(3664)

## Transforming `delay_time` to Percentage of a Day

I have scaled the `delay_time` column to represent it as a percentage of a day (24 hours) with the following approach.

---

1. **Each minute value** is represented as a proportion of a full day (24 hours).  
- Formula:  

  Percentage of a day = `delay_time / (24 * 60)`  

- For example:  
  - For a delay time of **15 minutes**, it becomes:  
  15 / (24 * 60) = 0.0104  

2. **For NaN values**, replace them with **1**, indicating a full day's proportion.


In [8]:
df['delay_time'] = df['delay_time'].apply(lambda x: x / (24 * 60) if pd.notnull(x) else 1)

## Get the final_df

In [11]:
final_df = df[['Temperature (°F)_max',
       'Temperature (°F)_avg', 'Temperature (°F)_min', 'Dew Point (°F)_max',
       'Dew Point (°F)_avg', 'Dew Point (°F)_min', 'Humidity (%)_max',
       'Humidity (%)_avg', 'Humidity (%)_min', 'Wind Speed (mph)_max',
       'Wind Speed (mph)_avg', 'Wind Speed (mph)_min', 'Pressure (in)_max',
       'Pressure (in)_avg', 'Pressure (in)_min', 'delay_time',
       'hour_of_day', 'airline_airblue', 'airline_airsial',
       'airline_british airways', 'airline_emirates', 'airline_flyjinnah',
       'airline_klm', 'airline_oman air', 'airline_other',
       'airline_pakistan international airlines', 'airline_qatar airways',
       'airline_serene air', 'day_Friday', 'day_Monday', 'day_Saturday',
       'day_Sunday', 'day_Thursday', 'day_Tuesday', 'day_Wednesday',
       'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep', 'icao_opis', 'icao_opkc', 'icao_opla',
       'iata_isb', 'iata_khi', 'iata_lhe', 'status_active', 'status_cancelled',
       'status_unknown']]

## Random Forest Regressor Model prediction and analysis

In [None]:
X = final_df.drop(columns=['delay_time'])
y = final_df['delay_time']
X_standardized = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [17]:
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Square Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.0035524727177759065
Root Mean Square Error (RMSE): 0.005540618618585394


## Linear Regression Model prediction and analysis

In [22]:
X = final_df.drop(columns=['delay_time'])
y = final_df['delay_time']
X_standardized = StandardScaler().fit_transform(X)

In [31]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_standardized)

In [32]:
X_standardized.shape,X_pca.shape

((51572, 55), (51572, 33))

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [34]:
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Square Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.026078062673738234
Root Mean Square Error (RMSE): 0.03292988151682394
