In [4]:
import pandas as pd

In [5]:
# Folder on the local machine that holds the project's CSV files.
Datasets_path = "C:\\Users\\asus\\Desktop\\Data Science\\Machine Learning\\Project\\Datasets"

# Read delays_train.csv from that folder into the working DataFrame.
Data = pd.read_csv(filepath_or_buffer=f"{Datasets_path}\\delays_train.csv")

In [11]:
# Display the column labels of Data.
# NOTE(review): the saved output already lists one-hot columns such as
# 'Marketing_Airline_WN', so this cell was last executed after the encoding
# cell rather than in top-to-bottom order.
Data.columns

Index(['Weekday', 'Month_of_Year', 'Day_of_Month', 'Scheduled_Departure_Time',
       'Scheduled_Arrival_Time', 'Marketing_Airline_DOT_ID', 'Flight_Number',
       'Origin_Airport_ID', 'Destination_Airport_ID', 'Flight_Cancelled',
       ...
       'Departure_State_WY', 'Marketing_Airline_AS', 'Marketing_Airline_B6',
       'Marketing_Airline_DL', 'Marketing_Airline_F9', 'Marketing_Airline_G4',
       'Marketing_Airline_HA', 'Marketing_Airline_NK', 'Marketing_Airline_UA',
       'Marketing_Airline_WN'],
      dtype='object', length=138)

In [None]:
# Remove, in place, every row that has at least one missing value.
Data.dropna(axis=0, how="any", inplace=True)

In [8]:
# One-hot encode the three categorical columns in a single pass.
# drop_first=True omits the first level of each column, and the indicator
# columns are appended after the untouched columns in the order listed here.
Data = pd.get_dummies(
    Data,
    columns=['Arrival_State', 'Departure_State', 'Marketing_Airline'],
    drop_first=True,
)

In [38]:
# # Specify the columns to drop
# columns_to_drop = ['Arrival_State', 'Departure_State', 'Marketing_Airline']
# Data.drop(columns=columns_to_drop, inplace=True)

### Outlier Handling 

In [12]:
import numpy as np 
def handle_outliers(series):
    """Cap values lying outside the 1.5 * IQR fences of ``series``.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by the lower fence.

    Parameters
    ----------
    series : pandas.Series
        Numeric column to process; it is not modified.

    Returns
    -------
    numpy.ndarray
        Array of the same length as ``series`` holding the capped values.
        When the inter-quartile range is zero the values are returned
        unchanged (as floats).
    """
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1

    # With Q1 == Q3 both fences collapse onto the same value, so capping would
    # overwrite every differing entry and turn the column into a constant
    # (this wipes out 0/1 indicator columns, for example). Leave it untouched.
    if IQR == 0:
        return np.asarray(series, dtype=float)

    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Impute upper outliers with the upper limit.
    capped = np.where(series > upper_limit, upper_limit, series)
    # Impute lower outliers with the lower limit.
    capped = np.where(capped < lower_limit, lower_limit, capped)
    return capped

# Cap outliers column by column across every numeric column of Data.
# NOTE(review): this includes the target 'Arrival_Delay' if it is numeric;
# confirm that capping the target is intended.
numeric_features = Data.select_dtypes(include="number").columns
Data[numeric_features] = Data[numeric_features].apply(handle_outliers, axis=0)

In [None]:
# import numpy as np
# numeric_columns = Data.select_dtypes(include=[np.number]).columns
# Q1 = Data[numeric_columns].quantile(0.25)
# Q3 = Data[numeric_columns].quantile(0.75)
# IQR = Q3 - Q1
# # Define the threshold for outliers
# threshold = 1.5
# # Filter out rows with outliers in any numeric column
# Data = Data[~((Data[numeric_columns] < (Q1 - threshold * IQR)) | (Data[numeric_columns] > (Q3 + threshold * IQR))).any(axis=1)]

### Feature Selection 

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Separate the predictors from the target column 'Arrival_Delay' of Data.
X = Data.drop('Arrival_Delay', axis=1)
y = Data.loc[:, 'Arrival_Delay']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an L1-penalised linear model on the scaled training data.
lasso = Lasso(alpha=0.1).fit(X_train_scaled, y_train)

# Boolean mask, aligned with X.columns, of the features SelectFromModel keeps
# for the already-fitted Lasso model.
selected_features = SelectFromModel(lasso, prefit=True).get_support()

# Translate the mask into column names.
selected_feature_names = X.columns[selected_features]

print("Selected Features:")
print(selected_feature_names)

Selected Features:
Index(['Month_of_Year', 'Scheduled_Departure_Time', 'Flight_Number',
       'Departure_Delay', 'Taxi_Out_Time', 'Taxi_In_Time',
       'Actual_Departure_Time', 'Flight_Duration', 'Flight_Distance',
       'Arrival_State_AL', 'Arrival_State_AZ', 'Arrival_State_CA',
       'Arrival_State_CO', 'Arrival_State_CT', 'Arrival_State_FL',
       'Arrival_State_HI', 'Arrival_State_IL', 'Arrival_State_MD',
       'Arrival_State_ME', 'Arrival_State_NC', 'Arrival_State_NE',
       'Arrival_State_NV', 'Arrival_State_OH', 'Arrival_State_PA',
       'Arrival_State_PR', 'Arrival_State_RI', 'Arrival_State_SC',
       'Arrival_State_TX', 'Arrival_State_UT', 'Arrival_State_VA',
       'Arrival_State_VI', 'Arrival_State_VT', 'Arrival_State_WA',
       'Departure_State_AZ', 'Departure_State_CA', 'Departure_State_CO',
       'Departure_State_FL', 'Departure_State_HI', 'Departure_State_ID',
       'Departure_State_IL', 'Departure_State_KS', 'Departure_State_KY',
       'Departure_State_LA',

### SVM

In [None]:
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the target column 'Arrival_Delay' of Data.
X = Data.drop(columns=['Arrival_Delay'])  # Features
y = Data['Arrival_Delay']  # Target variable

# Standardizing the features.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# every validation fold has contributed to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Use only the significant features: `selected_features` is the boolean mask
# produced by the Lasso feature-selection cell. It lines up with the columns of
# X_scaled because X is built from Data with the same drop(...) expression.
X_significant = X_scaled[:, selected_features]

# Define the SVR model
svr = SVR()

# Parameter grid for hyperparameter tuning. The grid is split per kernel
# because `gamma` has no effect on the linear kernel; crossing it with
# 'linear' would only repeat identical fits.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]},
]

# Perform GridSearchCV on the same feature matrix that is evaluated below, so
# the chosen hyperparameters match the features the model actually sees.
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_significant, y)

# Get the best parameters
best_params = grid_search.best_params_

# Out-of-fold predictions of the best configuration on the significant features.
y_pred_cv = cross_val_predict(grid_search.best_estimator_, X_significant, y, cv=5)

# Calculate evaluation metrics.
# NOTE(review): MAPE divides by the true value, so rows where Arrival_Delay is
# 0 (or close to it) can dominate this number; confirm it is meaningful here.
mape = mean_absolute_percentage_error(y, y_pred_cv)
rmse = np.sqrt(mean_squared_error(y, y_pred_cv))
mae = mean_absolute_error(y, y_pred_cv)

print("Best Parameters:", best_params)
print("MAPE:", mape)
print("RMSE:", rmse)
print("MAE:", mae)

### KNN

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt  # Import sqrt function to calculate square root

# Data is the preprocessed DataFrame from the cells above.
# X contains features, y contains target variable
X = Data.drop(columns=['Arrival_Delay'])
y = Data['Arrival_Delay']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the dataset (statistics are learned from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the model to the data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score. The scorer is the negated MSE, so
# flip the sign to report a plain MSE.
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best MSE (from cross-validation): {best_score}')

# Take the estimator GridSearchCV kept for the best parameter combination
best_knn = grid_search.best_estimator_

# Predict on the test set
y_pred = best_knn.predict(X_test_scaled)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
# The test-set MSE must be computed before taking its square root for RMSE.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)  # Calculate RMSE using square root of MSE

print(f'Mean Absolute Error (MAE) on test set: {mae}')
print(f'Mean Absolute Percentage Error (MAPE) on test set: {mape}')
print(f'Root Mean Squared Error (RMSE) on test set: {rmse}')