In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score    
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR    
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [10]:
df = pd.read_csv('data\stud.csv')

In [11]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [13]:
X = df.drop(columns=['math_score'],axis=1)
Y = df["math_score"]

In [14]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

from sklearn.compose import ColumnTransformer


numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', oh_transformer, cat_cols)
    ]
 )

In [15]:
X = preprocessor.fit_transform(X)   


In [16]:
X

array([[ 0.19399858,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.42747598,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.77010859,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.12547206, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60515772,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.15336989,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]], shape=(1000, 19))

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [37]:
X_train.shape , X_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

In [38]:
def evaluate_model(true , predicted):
    
    rmse = np.sqrt(mean_squared_error(true, predicted))
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2 = r2_score(true, predicted)
    
    print(f"MAE: {mae}, MSE: {mse}, R2: {r2} , RMSE: {rmse}")
    
    return mae, mse, r2 


In [43]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

models = {
    'Linear Regression': LinearRegression(),
    "Decision tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "CatBoost": CatBoostRegressor()
     
}

model_list = []
r2_list = []

for i in range(len(models)):
    model = models[list(models.keys())[i]]
    model.fit(X_train, Y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    model_train_mse, model_train_mae , model_train_r2 = evaluate_model(y_train , y_train_pred)
    model_test_mse, model_test_mae , model_test_r2 = evaluate_model(y_test , y_test_pred)
   
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance in training set: ")
    print("Train MSE: ", model_train_mse)
    print("Train MAE: ", model_train_mae)
    print("Train R2: ", model_train_r2)  
    
    print("------------------------------------------")

    print("Model Performance in test set: ")
    print("Test MSE: ", model_test_mse)
    print("Test MAE: ", model_test_mae)
    print("Test R2: ", model_test_r2)

    r2_list.append(model_test_r2)
    print("="*35)
    print("\n")


MAE: 4.266711846071956, MSE: 28.33487038064859, R2: 0.8743172040139593 , RMSE: 5.323050852720514
MAE: 4.214763142474849, MSE: 29.095169866715466, R2: 0.8804332983749565 , RMSE: 5.3939938697328405
Linear Regression
Model Performance in training set: 
Train MSE:  4.266711846071956
Train MAE:  28.33487038064859
Train R2:  0.8743172040139593
------------------------------------------
Model Performance in test set: 
Test MSE:  4.214763142474849
Test MAE:  29.095169866715466
Test R2:  0.8804332983749565


MAE: 0.01875, MSE: 0.078125, R2: 0.9996534669718089 , RMSE: 0.2795084971874737
MAE: 6.44, MSE: 64.61, R2: 0.7344849805867055 , RMSE: 8.038034585643434
Decision tree
Model Performance in training set: 
Train MSE:  0.01875
Train MAE:  0.078125
Train R2:  0.9996534669718089
------------------------------------------
Model Performance in test set: 
Test MSE:  6.44
Test MAE:  64.61
Test R2:  0.7344849805867055


MAE: 1.8136624999999997, MSE: 5.2272105347222215, R2: 0.9768140659764494 , RMSE: 2.2

In [45]:
pd.DataFrame({
    'Model': model_list,
    'R2': r2_list
}).sort_values(by='R2', ascending=False).reset_index(drop=True)

Unnamed: 0,Model,R2
0,Linear Regression,0.880433
1,Gradient Boosting,0.872476
2,CatBoost,0.851632
3,Random Forest,0.850656
4,AdaBoost,0.849403
5,XGBoost,0.821221
6,Decision tree,0.734485
