In [9]:
import pandas as pd

In [10]:
Datasets_path="C:\\Users\\asus\\Desktop\\Data Science\\Machine Learning\\Project\\Datasets"
Data=pd.read_csv(f"{Datasets_path}\\delays_train.csv")

In [11]:
Data.dropna(inplace=True)

In [12]:
# Using pd.get_dummies() for one-hot encoding
Data = pd.get_dummies(Data, columns=['Arrival_State'], drop_first=True)
Data = pd.get_dummies(Data, columns=['Departure_State'], drop_first=True)
Data = pd.get_dummies(Data, columns=['Marketing_Airline'], drop_first=True)

In [7]:
import numpy as np 
def handle_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    # Impute upper outliers with upper limit
    series = np.where(series > upper_limit, upper_limit, series)
    # Impute lower outliers with lower limit
    series = np.where(series < lower_limit, lower_limit, series)
    return series

numeric_features = Data.select_dtypes(include=[np.number]).columns
Data[numeric_features] = Data[numeric_features].apply(handle_outliers)

In [13]:
import numpy as np
numeric_columns = Data.select_dtypes(include=[np.number]).columns
Q1 = Data[numeric_columns].quantile(0.25)
Q3 = Data[numeric_columns].quantile(0.75)
IQR = Q3 - Q1
# Define the threshold for outliers
threshold = 1.5
# Filter out rows with outliers in any numeric column
Data = Data[~((Data[numeric_columns] < (Q1 - threshold * IQR)) | (Data[numeric_columns] > (Q3 + threshold * IQR))).any(axis=1)]

### Feature Selection

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Assuming you have your dataset loaded into a DataFrame called 'data'
# Split the data into features (X) and target variable (y)
X = Data.drop(columns=['Arrival_Delay'])
y = Data['Arrival_Delay']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the Lasso model
lasso = Lasso(alpha=0.1)
# Fit the Lasso model to the training data
lasso.fit(X_train_scaled, y_train)

# Select features with non-zero coefficients
selected_features = SelectFromModel(lasso, prefit=True).get_support()

# Get the names of the selected features
selected_feature_names = X.columns[selected_features]

print("Selected Features:")
print(selected_feature_names)

### GradientBoostingRegressor

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_predict
import numpy as np

# Assuming Data is your DataFrame where Arrival_Delay is one of the features
X = Data.drop(columns=['Arrival_Delay'])  # Features
y = Data['Arrival_Delay']  # Target variable

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [3, 4, 5]
}

# Initialize GridSearchCV with GradientBoostingRegressor
grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X_scaled, y)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_model = grid_search.best_estimator_

# Making predictions using cross-validation
y_pred_cv = cross_val_predict(best_model, X_scaled, y, cv=5)

# Calculating MAPE
mape = mean_absolute_percentage_error(y, y_pred_cv)

# Calculating MAE
mae = mean_absolute_error(y, y_pred_cv)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y, y_pred_cv))

print("MAPE:", mape)
print("MAE:", mae)
print("RMSE:", rmse)

### XGBRegressor

In [9]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Assuming Data_no_outliers is your DataFrame where Arrival_Delay is one of the features
X = Data.drop(columns=['Arrival_Delay'])  # Features
y = Data['Arrival_Delay']  # Target variable

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Setting up parameter grid for GridSearchCV
param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [3, 4, 5]}

# Initialize GridSearchCV with XGBRegressor
grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=KFold(n_splits=5, shuffle=True, random_state=42),
                           scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X_scaled, y)

# Get the best model
best_model = grid_search.best_estimator_

# Making predictions using cross-validation
y_pred_cv = cross_val_predict(best_model, X_scaled, y, cv=5)

# Calculating MAPE
mape = mean_absolute_percentage_error(y, y_pred_cv)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y, y_pred_cv))

# Calculating MAE
mae = mean_absolute_error(y, y_pred_cv)

print("MAPE:", mape)
print("RMSE:", rmse)
print("MAE:", mae)

MAPE: 17.075928632628965
RMSE: 5.1914302499063565
MAE: 4.027635404182541


In [14]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Assuming Data is your DataFrame where Arrival_Delay is one of the features
X = Data.drop(columns=['Arrival_Delay'])  # Features
y = Data['Arrival_Delay']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting the XGB Regressor
model = XGBRegressor()
model.fit(X_train_scaled, y_train)

# Making predictions
y_pred = model.predict(X_test_scaled)

# Calculating MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calculating MSE
mse = mean_squared_error(y_test, y_pred)

print("MAPE:", mape)
print("MSE:", mse)

MAPE: 4.016387786343856
MSE: 30.79189026983233
