In [2]:
# Baisc import
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import catboost
import lightgbm as lgb

from sklearn.metrics import (
    mean_absolute_error,       # MAE
    mean_squared_error,        # MSE
    mean_squared_log_error,    # MSLE
    median_absolute_error,     # MedAE
    r2_score,                  # R²
    explained_variance_score,  # EVS
    max_error                   # Max Error
)

import joblib

import warnings
warnings.filterwarnings('ignore')

In [3]:
notebook_dir = os.getcwd()

processed_data_path = os.path.join(notebook_dir, "data")
X=joblib.load(os.path.join(processed_data_path,'X'))
y=joblib.load(os.path.join(processed_data_path,'y'))

In [4]:
X

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,4,2,2,2,5,5,1,2.17,1
1,4,2,1,2,4,5,1,2.33,1
2,0,2,1,2,1,5,1,2.17,1
3,5,2,4,2,0,5,1,2.25,1
4,5,2,4,2,4,5,1,2.33,1
...,...,...,...,...,...,...,...,...,...
297812,5,1,4,0,2,3,0,10.08,49
297813,5,1,0,0,5,3,0,10.42,49
297814,5,1,1,0,5,3,0,13.83,49
297815,5,1,1,0,2,3,0,10.00,49


In [5]:
y

0          5953
1          5953
2          5956
3          5955
4          5955
          ...  
297812    69265
297813    77105
297814    79099
297815    81585
297816    81585
Name: price, Length: 297817, dtype: int64

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42)

In [7]:
y_train.shape,X_test.shape,y_train.shape,y_test.shape

((178690,), (119127, 9), (178690,), (119127,))

In [8]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rsme = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae, rsme, r2_score

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
models = {

    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest Regressor': RandomForestRegressor(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'XGBoost Regressor': xgb.XGBRegressor(),
    'CatBoost Regressor': catboost.CatBoostRegressor(verbose=False),
    'LightGBM Regressor': lgb.LGBMRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
    
}

In [16]:
results = []

for model_name, model in models.items():
    
    print(f"{model_name} in execution..")
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)  
    test_score = model.score(X_test, y_test)  
 
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
   
    results.append({
        'Model': model_name,
        'Train R² Score': train_score,
        'Test R² Score': test_score,
        'Mean Squared Error': mse,
        'Mean Absolute Error': mae
    })


Linear Regression in execution..
Ridge Regression in execution..
Lasso Regression in execution..
Random Forest Regressor in execution..
K-Nearest Neighbors in execution..
Gradient Boosting Regressor in execution..
XGBoost Regressor in execution..
CatBoost Regressor in execution..
LightGBM Regressor in execution..
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in the train set: 178690, number of used features: 9
[LightGBM] [Info] Start training from score 20950.359796
AdaBoost Regressor in execution..


In [17]:
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Model,Train R² Score,Test R² Score,Mean Squared Error,Mean Absolute Error
0,Linear Regression,0.907027,0.904719,49131810.0,4641.727169
1,Ridge Regression,0.907027,0.904719,49131820.0,4641.737551
2,Lasso Regression,0.907027,0.904719,49131840.0,4641.69452
3,Random Forest Regressor,0.997568,0.985051,7708487.0,1132.834388
4,K-Nearest Neighbors,0.98243,0.972473,14194300.0,1879.650351
5,Gradient Boosting Regressor,0.952916,0.952466,24511290.0,2976.048664
6,XGBoost Regressor,0.977656,0.975932,12410720.0,2062.96912
7,CatBoost Regressor,0.97548,0.974168,13320380.0,2125.680012
8,LightGBM Regressor,0.970795,0.969915,15513400.0,2356.796445
9,AdaBoost Regressor,0.935318,0.934827,33606420.0,3713.707108
