In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv(r"E:\ML Project MK02\notebook\EDA_StudentPerformance.csv")
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_marks,average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,282,94.000000
996,male,group C,high school,free/reduced,none,62,55,55,172,57.333333
997,female,group C,high school,free/reduced,completed,59,71,65,195,65.000000
998,female,group D,some college,standard,completed,68,78,77,223,74.333333


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   math score                   1000 non-null   int64  
 6   reading score                1000 non-null   int64  
 7   writing score                1000 non-null   int64  
 8   total_marks                  1000 non-null   int64  
 9   average_score                1000 non-null   float64
dtypes: float64(1), int64(4), object(5)
memory usage: 78.2+ KB


In [4]:
X = df.drop(columns=['math score'],axis=1)
y = df['math score']

In [5]:
cat_feature = X.select_dtypes(include='object').columns.to_list()
num_feature = X.select_dtypes(exclude='object').columns.to_list()

scaler = StandardScaler()
encoder = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",encoder,cat_feature),
        ("StandardScaler",scaler,num_feature)
    ],
    remainder='passthrough'
)

In [6]:
X = preprocessor.fit_transform(X)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [7]:
temp_model = {
    "LinearRegression": LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "AdaBoost":AdaBoostRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False),
    "RandomForestRegressor":RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    "DecisionTress":DecisionTreeRegressor()
}

In [8]:
model_list =[]
r2_list = []

for i in range(len(list(temp_model))):
    model = list(temp_model.values())[i]

    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_r2_score = r2_score(y_train,y_train_pred)
    test_r2_score = r2_score(y_test,y_test_pred)
    r2_list.append(test_r2_score)

    model_list.append(list(temp_model.keys())[i])
    print(model)
    print(f"Training Score : ",train_r2_score)
    print(f"Testing Score : ",test_r2_score)
    print("="*40)


LinearRegression()
Training Score :  1.0
Testing Score :  1.0
Lasso()
Training Score :  0.9013344454535516
Testing Score :  0.9060946888526118
Ridge()
Training Score :  0.9994135133621143
Testing Score :  0.999430625155644
AdaBoostRegressor()
Training Score :  0.9348993336720133
Testing Score :  0.9143411853686553
<catboost.core.CatBoostRegressor object at 0x0000027F2DC13490>
Training Score :  0.9964941637643833
Testing Score :  0.9672407272698839
RandomForestRegressor()
Training Score :  0.9947400005253948
Testing Score :  0.9528505270484189
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None

In [9]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['model','score']).sort_values(by=["score"],ascending=False)

Unnamed: 0,model,score
0,LinearRegression,1.0
2,Ridge,0.999431
4,CatBoostRegressor,0.967241
6,XGBRegressor,0.966608
5,RandomForestRegressor,0.952851
3,AdaBoost,0.914341
7,DecisionTress,0.911439
1,Lasso,0.906095


In [10]:
final_model = LinearRegression()
final_model = final_model.fit(X_train,y_train)
y_pred = final_model.predict(X_test)
print(r2_score(y_test,y_pred) * 100)

100.0
