In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [2]:
df = pd.read_csv(r"E:\ml Projects\notebook\data\StudentsPerformance.csv")
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [4]:
X = df.drop(columns=['math score'],axis=1)
y = df['math score']

In [5]:
num_feature = X.select_dtypes(exclude=['object']).columns.to_list()
cat_feature = X.select_dtypes(include=['object']).columns.to_list()

num_transformation = StandardScaler()
cat_transformation = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",cat_transformation,cat_feature),
        ("StandardScaler",num_transformation,num_feature)
    ],
    remainder='passthrough'
)


In [6]:
X = preprocessor.fit_transform(X)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.2)

In [7]:
temp_model = {
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False),
    "RandomForestRegressor":RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    
}

In [13]:
model_list = []
r2_list = []

for i in range(len(list(temp_model))):
    model = list(temp_model.values())[i]
    
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_r2_score = r2_score(y_train,y_train_pred)
    test_r2_score = r2_score(y_test,y_test_pred)
    r2_list.append(test_r2_score)

    model_list.append(list(temp_model.keys())[i])
    print(model)
    print(f"Training Score : {train_r2_score}")
    print(f"Test Score : {test_r2_score}")
    print("==================================================================================================")

LinearRegression()
Training Score : 0.8742744782693375
Test Score : 0.8811710947341137
Lasso()
Training Score : 0.8071462015863456
Test Score : 0.8253197323627852
Ridge()
Training Score : 0.8743042615212909
Test Score : 0.8805931485028738
AdaBoostRegressor()
Training Score : 0.851196559185923
Test Score : 0.8503830694057439
<catboost.core.CatBoostRegressor object at 0x0000024E1A27C7C0>
Training Score : 0.9589358676277713
Test Score : 0.8516318920747058
RandomForestRegressor()
Training Score : 0.9761968403289446
Test Score : 0.8530596741627207
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None

In [19]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model',"Score"]).sort_values(by=["Score"],ascending=False)

Unnamed: 0,Model,Score
0,LinearRegression,0.881171
2,Ridge,0.880593
5,RandomForestRegressor,0.85306
4,CatBoostRegressor,0.851632
3,AdaBoostRegressor,0.850383
6,XGBRegressor,0.827797
1,Lasso,0.82532
7,DecisionTreeRegressor,0.757149


In [None]:
final_model = LinearRegression()
final_model = final_model.fit(X_train,y_train)
y_pred = final_model.predict(X_test)
print(r2_score(y_test,y_pred) * 100)


88.11710947341138
