In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor

In [3]:
df = pd.read_csv('D:\ML_Projects\student_performance_regressor\\research\stud.csv')
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
x = df.drop(['writing_score'], axis=1)
y = df['writing_score']

In [10]:
num_feat = [i for i in x.columns if df[i].dtype != 'O']
cat_feat = [i for i in x.columns if df[i].dtype == 'O']
num_feat, cat_feat

(['math_score', 'reading_score'],
 ['gender',
  'race_ethnicity',
  'parental_level_of_education',
  'lunch',
  'test_preparation_course'])

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [11]:
onehot = OneHotEncoder()
stdSclr = StandardScaler()

processor = ColumnTransformer(
    [("OneHotEncoder",onehot,cat_feat),
     ("StandardScaler",stdSclr,num_feat)]
)

In [12]:
x = processor.fit_transform(x)

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=10)
x_train.shape, x_test.shape

((700, 19), (300, 19))

In [14]:
def validation(actual, pred):
    return (mean_absolute_error(actual, pred), np.sqrt(mean_squared_error(actual, pred)), r2_score(actual, pred))

In [15]:
from xgboost import XGBRegressor

In [16]:
models = {
    'LinearReg': LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "XGBoost": XGBRegressor(),
    "Adaboost": AdaBoostRegressor()
}

In [17]:
list(models)

['LinearReg',
 'Ridge',
 'Lasso',
 'DecisionTree',
 'RandomForest',
 'XGBoost',
 'Adaboost']

In [18]:
mode_list = []
r2_scores = []

for i in list(models):
    model = models[i]
    model.fit(x_train,y_train)
    
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)
    
    mae, mse, r2 = validation(y_train, train_pred)
    print("Model: {}: Evaluation on Train data: mae: {}, mse: {}, r2: {}".format(i,mae,mse,r2))
    
    mae, mse, r2 = validation(y_test, test_pred)
    print("Model: {}: Evaluation on Test data: mae: {}, mse: {}, r2: {}".format(i,mae,mse,r2))
    mode_list.append(i)
    r2_scores.append(r2)

Model: LinearReg: Evaluation on Train data: mae: 2.82613525390625, mse: 3.4847186672978383, r2: 0.948352745165251
Model: LinearReg: Evaluation on Test data: mae: 2.8164217122395834, mse: 3.483942169739039, r2: 0.9449060072360227
Model: Ridge: Evaluation on Train data: mae: 2.8253453374442627, mse: 3.484684977637065, r2: 0.9483537437939786
Model: Ridge: Evaluation on Test data: mae: 2.81479177290921, mse: 3.48192636067942, r2: 0.9449697435639943
Model: Lasso: Evaluation on Train data: mae: 3.773760260288974, mse: 4.715986508734068, r2: 0.905407455685144
Model: Lasso: Evaluation on Test data: mae: 3.4753602541152744, mse: 4.251551351821211, r2: 0.9179540343871557
Model: DecisionTree: Evaluation on Train data: mae: 0.011428571428571429, mse: 0.15583874449479593, r2: 0.9998967089528062
Model: DecisionTree: Evaluation on Test data: mae: 4.358333333333333, mse: 5.373934002820156, r2: 0.86891680152608
Model: RandomForest: Evaluation on Train data: mae: 1.2712964285714285, mse: 1.5640242328556

In [19]:
pd.DataFrame(list(zip(mode_list, r2_scores)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
1,Ridge,0.94497
0,LinearReg,0.944906
4,RandomForest,0.932342
6,Adaboost,0.924476
5,XGBoost,0.922869
2,Lasso,0.917954
3,DecisionTree,0.868917
