In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#models
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
#from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import warnings


In [2]:
# Location of the student-performance data file.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the file before running the notebook on another machine.
DATA_PATH = "C:/Users/shudh/OneDrive/Desktop/ML_OPS/data/student_performance.csv"

# The file is parsed with a tab delimiter (sep="\t").
dataset = pd.read_csv(DATA_PATH, sep="\t")
dataset.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Feature frame: every column of the dataset except the target, "math score".
X = dataset.drop(columns=["math score"])
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [4]:
# Target series: the math score that the regressors are fitted against.
y = dataset["math score"]
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Build a ColumnTransformer that routes columns to one of two transformers,
# chosen by dtype: object-dtype columns are treated as categorical, every
# other column as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# OneHotEncoder (sklearn.preprocessing) converts categorical data into a form
# that machine learning algorithms can consume: a binary matrix in which each
# column corresponds to one possible category value.
oh_transformer = OneHotEncoder()

# StandardScaler is applied to the numeric columns.
numeric_transformer = StandardScaler()

# The step names are runtime identifiers; the "StandardScalar" spelling is kept
# as-is so that anything keyed on the step name keeps working.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScalar", numeric_transformer, num_features),
    ]
)

In [25]:
# Fit the preprocessor and produce the encoded/scaled feature matrix.
#
# The input is rebuilt from `dataset` instead of being read from `X`, because
# this cell rebinds `X` to the transformed matrix. Passing `X` back in would
# make a second run of the cell hand already-transformed data to a preprocessor
# that selects its columns by name, so the cell could only ever be run once.
#
# NOTE(review): the transformers are fitted on every row, before the train/test
# split performed later in the notebook, so the fitted statistics include the
# rows that end up in the test set. For a strictly leak-free evaluation, fit on
# the training split only and call `transform` on the test split.
X = preprocessor.fit_transform(dataset.drop(columns=["math score"]))

In [27]:
# Inspect the (rows, columns) shape of the transformed feature matrix.
X.shape

(1000, 19)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((800, 19), (200, 19))

In [29]:
#create an evaluation function to give all metrics after model training
def evaluation_model(true, predicted):
    """Compute the regression metrics reported for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)``: mean absolute error, mean squared
        error, root mean squared error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above; reuse that value
    # instead of calling mean_squared_error a second time.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)

    return mae, mse, rmse, r2_square

In [35]:
# Candidate regressors, keyed by the display name used in the printed report
# and in the final ranking table.
# The tree-based and boosting estimators involve randomness during fitting, so
# they are seeded (with the same value as the train/test split) to make the
# reported scores repeatable across kernel restarts.
models={
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
}
# Names of the fitted models and their test-set R2 scores, collected in the
# same order so they can be zipped into a ranking table afterwards.
model_list=[]
r2_list=[]


# Walk the (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) and indexing into them on every
# iteration.
for model_name, model in models.items():
    #train model
    model.fit(X_train, y_train)

    #make prediction on both splits so train and test scores can be compared
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    #evaluate train and test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2_square=evaluation_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2_square=evaluation_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model performance for training dataset")

    print("Root mean squared error", model_train_rmse)
    print("Mean absolute error", model_train_mae)
    print("Mean squared error", model_train_mse)
    print("R2 score", model_train_r2_square)

    print("------------------------------------------------------------------")

    print("Model performance for test dataset")

    print("Root mean squared error", model_test_rmse)
    print("Mean absolute error", model_test_mae)
    print("Mean squared error", model_test_mse)
    print("R2 score", model_test_r2_square)
    # Only the test-set R2 is kept for the final ranking.
    r2_list.append(model_test_r2_square)

    print('='*35)
    print("\n")
    


Linear Regression
Model performance for training dataset
Root mean squared error 5.32433481852575
Mean absolute error 4.267109375
Mean squared error 28.348541259765625
R2 score 0.8742565651513869
------------------------------------------------------------------
Model performance for test dataset
Root mean squared error 5.3959872842671395
Mean absolute error 4.2158203125
Mean squared error 29.116678771972655
R2 score 0.8803449074540941


Lasso
Model performance for training dataset
Root mean squared error 6.593815587795566
Mean absolute error 5.206302661246526
Mean squared error 43.47840400585579
R2 score 0.8071462015863456
------------------------------------------------------------------
Model performance for test dataset
Root mean squared error 6.519694535667419
Mean absolute error 5.157881810347763
Mean squared error 42.5064168384116
R2 score 0.8253197323627853


Ridge
Model performance for training dataset
Root mean squared error 5.323324922741654
Mean absolute error 4.26498782372

In [36]:
# Rank the models by their test-set R2 score, best first.
model_scores = list(zip(model_list, r2_list))
pd.DataFrame(model_scores, columns=["Model Name", "R2_Score"]).sort_values(
    by="R2_Score", ascending=False
)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.880345
7,AdaBoostRegressor,0.850691
5,Random Forest Regressor,0.849899
6,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Neighbors Regressor,0.783813
4,Decision Tree,0.732821
