In [None]:
import pandas as pd

In [None]:
import os

# Folder that holds the project's CSV files. Set the DATASETS_PATH environment
# variable to read from a different location; when it is unset the original
# local folder is used, so existing runs behave as before.
Datasets_path = os.environ.get(
    "DATASETS_PATH",
    "C:\\Users\\asus\\Desktop\\Data Science\\Machine Learning\\Project\\Datasets",
)

# os.path.join inserts the separator of the current OS instead of a hard-coded "\\".
Data = pd.read_csv(os.path.join(Datasets_path, "delays_train.csv"))

In [None]:
# Keep only the rows that have a value in every column.
Data = Data.dropna()

In [None]:
# Using pd.get_dummies() for one-hot encoding
Data = pd.get_dummies(Data, columns=['Arrival_State'], drop_first=True)
Data = pd.get_dummies(Data, columns=['Departure_State'], drop_first=True)
Data = pd.get_dummies(Data, columns=['Marketing_Airline'], drop_first=True)

In [None]:
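# Disabled alternative: drop every row that has an IQR outlier in any numeric column.
# (The active cell below caps outliers at the IQR fences instead of dropping rows.)
# import numpy as np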
# numeric_columns = Data.select_dtypes(include=[np.number]).columns
# Q1 = Data[numeric_columns].quantile(0.25)
# Q3 = Data[numeric_columns].quantile(0.75)
# IQR = Q3 - Q1
# # Define the threshold for outliers
# threshold = 1.5
# # Filter out rows with outliers in any numeric column
# Data = Data[~((Data[numeric_columns] < (Q1 - threshold * IQR)) | (Data[numeric_columns] > (Q3 + threshold * IQR))).any(axis=1)]

In [None]:
import numpy as np 
def handle_outliers(series):
    """Cap the values of a numeric Series at its 1.5 * IQR fences.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and values
    below ``Q1 - 1.5 * IQR`` by that lower fence. All other values, including
    NaN (which compares False against both fences), are left untouched.

    If the IQR is not strictly positive (Q1 equals Q3, or the quartiles are
    NaN) the values are returned uncapped. With a zero IQR both fences equal
    Q1, so capping would overwrite every value with that single number; a rare
    0/1 indicator column, for example, would become all zeros.

    Parameters
    ----------
    series : pandas.Series
        Numeric column to cap.

    Returns
    -------
    numpy.ndarray
        The capped values, in the same order as ``series``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1

    # `not iqr > 0` is also True for a NaN IQR (empty or all-NaN input).
    if not iqr > 0:
        return series.to_numpy(copy=True)

    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr

    # Replace upper outliers with the upper fence, then lower outliers with the lower fence.
    capped = np.where(series > upper_limit, upper_limit, series)
    return np.where(capped < lower_limit, lower_limit, capped)

# Cap outliers column by column in every numeric column of Data.
# NOTE(review): if 'Arrival_Delay' is numeric it is capped here as well, so the
# models further down would be scored against capped delays -- confirm this is intended.
numeric_features = Data.select_dtypes(include="number").columns
Data[numeric_features] = Data[numeric_features].apply(handle_outliers, axis=0)

### RandomForestRegressor

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
import numpy as np

# Arrival_Delay is the regression target; every other column of Data is a feature.
X = Data.drop(columns=['Arrival_Delay'])  # Features
y = Data['Arrival_Delay']  # Target variable

# The scaler is a pipeline step, so inside each CV fold it is fitted on that
# fold's training rows only. Fitting it on all of X before cross-validation
# would leak statistics of the held-out rows into training.
# random_state is fixed so that re-running the cell reproduces the same metrics.
scaler = StandardScaler()
model = make_pipeline(scaler, RandomForestRegressor(random_state=42))

# Out-of-fold predictions: every row is predicted by a model that never saw it.
y_pred_cv = cross_val_predict(model, X, y, cv=5)

# Calculating MAPE
# NOTE(review): MAPE divides by |y|; if Arrival_Delay contains zeros the value
# becomes extremely large -- confirm the target before relying on this metric.
mape = mean_absolute_percentage_error(y, y_pred_cv)

# Calculating MAE
mae = mean_absolute_error(y, y_pred_cv)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y, y_pred_cv))

print("MAPE:", mape)
print("MAE:", mae)
print("RMSE:", rmse)

### Decision Tree

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
import numpy as np

# Arrival_Delay is the regression target; every other column of Data is a feature.
X = Data.drop(columns=['Arrival_Delay'])  # Features
y = Data['Arrival_Delay']  # Target variable

# Scaling is a pipeline step, so inside each CV fold the scaler is fitted on
# that fold's training rows only (no leakage from the held-out rows).
# random_state is fixed so the search and the metrics are reproducible; the
# 'sqrt' / 'log2' max_features options sample features randomly.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeRegressor(random_state=42)),
])

# Define the parameter grid for hyperparameter tuning.
# The 'model__' prefix routes each parameter to the pipeline's 'model' step.
param_grid = {
    'model__max_depth': [None, 10, 20, 30, 40, 50],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': [None, 'sqrt', 'log2']
}

# Set up the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search (one 5-fold CV per parameter combination)
grid_search.fit(X, y)

# Get the best parameters and best score.
# The scorer is negated MSE, so flip the sign to report a plain MSE.
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best MSE (from cross-validation): {best_score}')

# Retrieve the pipeline that GridSearchCV refitted with the best parameters
best_dtr = grid_search.best_estimator_

# Out-of-fold predictions with the selected configuration.
# NOTE: the hyperparameters were chosen on this same X, y, so the metrics
# below are optimistic estimates of performance on unseen data.
y_pred_cv = cross_val_predict(best_dtr, X, y, cv=5)

# Calculate evaluation metrics
# NOTE(review): MAPE divides by |y|; if Arrival_Delay contains zeros the value
# becomes extremely large -- confirm the target before relying on this metric.
mape = mean_absolute_percentage_error(y, y_pred_cv)
mae = mean_absolute_error(y, y_pred_cv)
rmse = np.sqrt(mean_squared_error(y, y_pred_cv))

print("MAPE:", mape)
print("MAE:", mae)
print("RMSE:", rmse)
