In [6]:
#Importing all the important Libraries

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [7]:
#Loading The Dataset
df = pd.read_csv("study.csv")

In [8]:
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [9]:
#Preparing Independent vs Dependent Features

X = df.drop(columns="math score", axis=1)
y = df['math score']

In [12]:
#Create a Column Transformer with 3 Types of Transformer

num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [13]:
#Let's apply it to our Independents Features present in X
X = preprocessor.fit_transform(X)

In [19]:
#Let's Split Our Dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [21]:
y_train.shape, y_test.shape

((800,), (200,))

Let's create a model pipeline without RandomSearchCV to see which model performs best on this dataset

In [22]:

def evaluate_model(true, predicted):
    mae=mean_absolute_error(true, predicted)
    mse=mean_squared_error(true, predicted)
    rmse=np.sqrt(mean_absolute_error(true, predicted))
    r2_square=r2_score(true, predicted)
    return mae, mse, rmse, r2_square


In [53]:
models={
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Trees": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBoost Regressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list=[]
r2_list_train=[]
r2_list_test=[]
model_r2_dict_train=[]
model_r2_dict_test=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) #Train the Model

    #make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Evaluate the model
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance on Training Dataset")
    print(f"MAE on Train Data {model_train_mae}")
    print(f"MSE on Train Data {model_train_mse}")
    print(f"RMSE on Train Data {model_train_rmse}")
    print(f"R2 Score on Train Data {model_train_r2}")

    print("-----------------------------------------------")

    print("Model Performance on Testing Dataset")
    print(f"MAE on Test Data {model_test_mae}")
    print(f"MSE on Test Data {model_test_mse}")
    print(f"RMSE on Test Data {model_test_rmse}")
    print(f"R2 Score on Test Data {model_test_r2}")
    
    r2_list_train.append(model_train_r2)
    r2_list_test.append(model_test_r2)

    print("="*35)
    print("\n")

model_r2_train = list(zip(model_list, r2_list_train))
model_r2_test = list(zip(model_list, r2_list_test))

Linear Regression
Model Performance on Training Dataset
MAE on Train Data 4.2697900390625
MSE on Train Data 28.36333822250366
RMSE on Train Data 2.066347027743041
R2 Score on Train Data 0.8741909314066747
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 4.23453125
MSE on Test Data 29.389923706054688
RMSE on Test Data 2.057797669840259
R2 Score on Test Data 0.8792220064484081


Lasso
Model Performance on Training Dataset
MAE on Train Data 5.206302661246528
MSE on Train Data 43.47840400585579
RMSE on Train Data 2.2817323816009902
R2 Score on Train Data 0.8071462015863456
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 5.157881810347763
MSE on Test Data 42.50641683841163
RMSE on Test Data 2.2710970499623664
R2 Score on Test Data 0.8253197323627852


Ridge
Model Performance on Training Dataset
MAE on Train Data 4.264987823725982
MSE on Train Data 28.33778823308244
RMSE on Train Data 2

Linear Regression
Model Performance on Training Dataset
MAE on Train Data 4.2697900390625
MSE on Train Data 28.36333822250366
RMSE on Train Data 2.066347027743041
R2 Score on Train Data 0.8741909314066747
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 4.23453125
MSE on Test Data 29.389923706054688
RMSE on Test Data 2.057797669840259
R2 Score on Test Data 0.8792220064484081
===================================

Lasso
Model Performance on Training Dataset
MAE on Train Data 5.206302661246528
MSE on Train Data 43.47840400585579
RMSE on Train Data 2.2817323816009902
R2 Score on Train Data 0.8071462015863456
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 5.157881810347763
MSE on Test Data 42.50641683841163
RMSE on Test Data 2.2710970499623664
R2 Score on Test Data 0.8253197323627852
===================================


Ridge
Model Performance on Training Dataset
MAE on Train Data 4.264987823725982
MSE on Train Data 28.33778823308244
RMSE on Train Data 2.0651846948217445
R2 Score on Train Data 0.8743042615212909
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 4.211100688014262
MSE on Test Data 29.056272192348317
RMSE on Test Data 2.052096656596434
R2 Score on Test Data 0.8805931485028737
===================================


K-Nearest Neighbors
Model Performance on Training Dataset
MAE on Train Data 4.51675
MSE on Train Data 32.57995
RMSE on Train Data 2.1252646893975347
R2 Score on Train Data 0.8554876322327585
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 5.621
MSE on Test Data 52.6066
RMSE on Test Data 2.3708648211148606
R2 Score on Test Data 0.7838129945787431
===================================


Decision Trees
Model Performance on Training Dataset
MAE on Train Data 0.01875
MSE on Train Data 0.078125
RMSE on Train Data 0.13693063937629152
R2 Score on Train Data 0.9996534669718089
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 6.1
MSE on Test Data 58.26
RMSE on Test Data 2.4698178070456938
R2 Score on Test Data 0.760580327642493
===================================


Random Forest Regressor
Model Performance on Training Dataset
MAE on Train Data 1.838490625
MSE on Train Data 5.296821777795493
RMSE on Train Data 1.3559095194739212
R2 Score on Train Data 0.9765052967622628
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 4.6335125
MSE on Test Data 36.245717184027775
RMSE on Test Data 2.1525595229865306
R2 Score on Test Data 0.8510480993380914
===================================


XGBoost Regressor
Model Performance on Training Dataset
MAE on Train Data 0.6874666035175323
MSE on Train Data 1.0146163961652577
RMSE on Train Data 0.8291360585076084
R2 Score on Train Data 0.9954995512962341
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 5.057730598449707
MSE on Test Data 41.90370826987466
RMSE on Test Data 2.2489398832449274
R2 Score on Test Data 0.8277965784072876
===================================

CatBoosting Regressor
Model Performance on Training Dataset
MAE on Train Data 2.405393926779502
MSE on Train Data 9.257805405523678
RMSE on Train Data 1.5509332438178962
R2 Score on Train Data 0.9589358676277713
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 4.612531714976557
MSE on Test Data 36.10365799356841
RMSE on Test Data 2.14768054304558
R2 Score on Test Data 0.8516318920747058
===================================


AdaBoost Regressor
Model Performance on Training Dataset
MAE on Train Data 4.768298582593765
MSE on Train Data 33.85991179648585
RMSE on Train Data 2.183643419286621
R2 Score on Train Data 0.8498102045552519
-----------------------------------------------
Model Performance on Testing Dataset
MAE on Test Data 4.584259536857235
MSE on Test Data 35.60061609180217
RMSE on Test Data 2.1410884000566712
R2 Score on Test Data 0.8536991445172559
===================================



In [54]:
#Checking how the model performed on Training Dataset
model_r2_train

[('Linear Regression', 0.8741909314066747),
 ('Lasso', 0.8071462015863456),
 ('Ridge', 0.8743042615212909),
 ('K-Nearest Neighbors', 0.8554876322327585),
 ('Decision Trees', 0.9996534669718089),
 ('Random Forest Regressor', 0.976420257176037),
 ('XGBoost Regressor', 0.9954995512962341),
 ('CatBoosting Regressor', 0.9589358676277713),
 ('AdaBoost Regressor', 0.8486120500818954)]

In [55]:
#Checking how the model performed on Testing Dataset
model_r2_test

[('Linear Regression', 0.8792220064484081),
 ('Lasso', 0.8253197323627852),
 ('Ridge', 0.8805931485028737),
 ('K-Nearest Neighbors', 0.7838129945787431),
 ('Decision Trees', 0.7270262317825711),
 ('Random Forest Regressor', 0.8524833571387738),
 ('XGBoost Regressor', 0.8277965784072876),
 ('CatBoosting Regressor', 0.8516318920747058),
 ('AdaBoost Regressor', 0.8455496980005138)]

In [62]:
#Let's Visualize in Proper Manner
train_result=pd.DataFrame(model_r2_train, columns=["Model Name", "R2 Score on Train Data"])
test_result=pd.DataFrame(model_r2_test, columns=["Model Name", "R2 Score on Test Data"])

In [68]:
train_result

Unnamed: 0,Model Name,R2 Score on Train Data
0,Linear Regression,0.874191
1,Lasso,0.807146
2,Ridge,0.874304
3,K-Nearest Neighbors,0.855488
4,Decision Trees,0.999653
5,Random Forest Regressor,0.97642
6,XGBoost Regressor,0.9955
7,CatBoosting Regressor,0.958936
8,AdaBoost Regressor,0.848612


In [69]:
test_result

Unnamed: 0,Model Name,R2 Score on Test Data
0,Linear Regression,0.879222
1,Lasso,0.82532
2,Ridge,0.880593
3,K-Nearest Neighbors,0.783813
4,Decision Trees,0.727026
5,Random Forest Regressor,0.852483
6,XGBoost Regressor,0.827797
7,CatBoosting Regressor,0.851632
8,AdaBoost Regressor,0.84555


In [75]:
#Let's make a final dataframe of result
model_result_df = pd.merge(train_result, test_result)

In [83]:
model_result_df

Unnamed: 0,Model Name,R2 Score on Train Data,R2 Score on Test Data
0,Linear Regression,0.874191,0.879222
1,Lasso,0.807146,0.82532
2,Ridge,0.874304,0.880593
3,K-Nearest Neighbors,0.855488,0.783813
4,Decision Trees,0.999653,0.727026
5,Random Forest Regressor,0.97642,0.852483
6,XGBoost Regressor,0.9955,0.827797
7,CatBoosting Regressor,0.958936,0.851632
8,AdaBoost Regressor,0.848612,0.84555


In [84]:
model_result_df_sort = model_result_df.sort_values(by="R2 Score on Test Data", ascending=False)

In [85]:
model_result_df_sort

Unnamed: 0,Model Name,R2 Score on Train Data,R2 Score on Test Data
2,Ridge,0.874304,0.880593
0,Linear Regression,0.874191,0.879222
5,Random Forest Regressor,0.97642,0.852483
7,CatBoosting Regressor,0.958936,0.851632
8,AdaBoost Regressor,0.848612,0.84555
6,XGBoost Regressor,0.9955,0.827797
1,Lasso,0.807146,0.82532
3,K-Nearest Neighbors,0.855488,0.783813
4,Decision Trees,0.999653,0.727026
