In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load the dataset from the CSV file into a DataFrame.
df = pd.read_csv('data/stud.csv')

In [4]:
# Preview the first two rows to check the columns that were loaded.
df.head(n=2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [5]:
# Features are every column except the target; the target is 'math score'.
X = df.drop(columns='math score')
Y = df['math score']

In [6]:
# Partition the feature columns by dtype: object-typed columns are treated
# as categorical, every other column as numeric.
num_features = X.select_dtypes(exclude=['object']).columns
cat_features = X.select_dtypes(include=['object']).columns

In [7]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones. Columns listed in neither group are passed
# through unchanged. The step names are runtime identifiers and are kept
# exactly as they were.
preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(), cat_features),
        ('StandardScalar', StandardScaler(), num_features),
    ],
    remainder='passthrough',
)

In [8]:
# Fit the preprocessor and encode the features. The raw features are rebuilt
# from `df` instead of reading `X`, so this cell can be re-run safely: the
# previous form overwrote its own input and failed on a second execution
# because `X` no longer held the raw columns.
# NOTE(review): the preprocessor is fitted on the full dataset before the
# train/test split, so the scaler statistics include the test rows. Consider
# splitting first and fitting on the training part only.
X = preprocessor.fit_transform(df.drop('math score', axis=1))

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# the same on every run.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [10]:
def evaluate_model(true,predict):
    """Score predictions against the ground truth.

    Args:
        true: observed target values.
        predict: predicted target values, aligned with ``true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, square root of the
        mean squared error, and the R2 score.
    """
    mae = mean_absolute_error(true, predict)
    rmse = np.sqrt(mean_squared_error(true, predict))
    r2 = r2_score(true, predict)
    return mae, rmse, r2

In [17]:
# Candidate regressors, keyed by display name. Every estimator that uses
# randomness is given a fixed seed (42, the same value the train/test split
# uses) so the reported scores are reproducible across runs.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'KNeighborsRegressor':KNeighborsRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor':RandomForestRegressor(random_state=42),
    'AdaBoostRegressor':AdaBoostRegressor(random_state=42),
    'CatBoostRegressor':CatBoostRegressor(verbose=False,random_seed=42),
    'XGBRegressor':XGBRegressor(random_state=42)
}

In [18]:
# Fit each candidate model, print its train/test metrics, and collect the
# model names and test R2 scores in two parallel lists.
model_list=[]
r2_list=[]
# Iterate over the dict directly instead of indexing into freshly built
# key/value lists on every pass.
for model_name,model in models.items():
    model.fit(X_Train,Y_Train)

    # Predict on both splits so the train and test scores can be compared.
    y_train_pred=model.predict(X_Train)
    y_test_pred=model.predict(X_Test)

    mae_train,rmse_train,r2_train=evaluate_model(Y_Train,y_train_pred)
    mae_test,rmse_test,r2_test=evaluate_model(Y_Test,y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for train')
    print(f'Train MAE - {mae_train:.4f}')
    print(f'Train RMSE - {rmse_train:.4f}')
    print(f'Train R2 - {r2_train:.4f}')

    print('-'*30)

    print('Model performance for test')
    print(f'Test MAE - {mae_test:.4f}')
    print(f'Test RMSE - {rmse_test:.4f}')
    print(f'Test R2 - {r2_test:.4f}')
    # Only the test R2 is kept; it sits at the same position as the name
    # appended to model_list above.
    r2_list.append(r2_test)

    print('='*30)
    print('\n')

LinearRegression
Model performance for train
Train MAE - 4.2667
Train RMSE - 5.3231
Train R2 - 0.8743
------------------------------
Model performance for test
Test MAE - 4.2148
Test RMSE - 5.3940
Test R2 - 0.8804


Lasso
Model performance for train
Train MAE - 5.2063
Train RMSE - 6.5938
Train R2 - 0.8071
------------------------------
Model performance for test
Test MAE - 5.1579
Test RMSE - 6.5197
Test R2 - 0.8253


Ridge
Model performance for train
Train MAE - 4.2650
Train RMSE - 5.3233
Train R2 - 0.8743
------------------------------
Model performance for test
Test MAE - 4.2111
Test RMSE - 5.3904
Test R2 - 0.8806


KNeighborsRegressor
Model performance for train
Train MAE - 4.5167
Train RMSE - 5.7077
Train R2 - 0.8555
------------------------------
Model performance for test
Test MAE - 5.6210
Test RMSE - 7.2530
Test R2 - 0.7838


DecisionTreeRegressor
Model performance for train
Train MAE - 0.0187
Train RMSE - 0.2795
Train R2 - 0.9997
------------------------------
Model performance