In [130]:
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [131]:
dataset = pd.read_csv('data/stud.csv', sep=',')



Numerical features: ['math_score', 'reading_score', 'writing_score']
Categorial features: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [None]:
#dataset['total_score'] = dataset['math_score'] + dataset['reading_score'] + dataset['writing_score']

#dataset['average_score'] = dataset['total_score'] / 3




In [132]:
# X= dataset.drop('total_score', axis=1)
# y= dataset['total_score']

X = dataset.drop(columns=['math_score'],axis=1)

y = dataset['math_score']

X.info()

y.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   reading_score                1000 non-null   int64 
 6   writing_score                1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB
<class 'pandas.core.series.Series'>
RangeIndex: 1000 entries, 0 to 999
Series name: math_score
Non-Null Count  Dtype
--------------  -----
1000 non-null   int64
dtypes: int64(1)
memory usage: 7.9 KB


In [None]:
# numeric_features = [ f for f in dataset.columns if dataset.dtypes[f] != 'object']

# categorial_features = [ f for f in dataset.columns if dataset.dtypes[f] == 'object']


# print("Numerical features:", numeric_features)
# print("Categorial features:", categorial_features)  

In [133]:
numeric_features = [ f for f in X.columns if X.dtypes[f] != 'object']

categorial_features = [ f for f in X.columns if X.dtypes[f] == 'object']


print("Numerical features:", numeric_features)
print("Categorial features:", categorial_features)  

Numerical features: ['reading_score', 'writing_score']
Categorial features: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [134]:
## satndard scaler

numeric_tranformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_tranformer, numeric_features),
        ('cat', categorical_transformer, categorial_features)
    ])

In [135]:
X= preprocessor.fit_transform(X)

In [96]:
print(X)

[[ 0.19399858  0.39149181  1.         ...  1.          0.
   1.        ]
 [ 1.42747598  1.31326868  1.         ...  1.          1.
   0.        ]
 [ 1.77010859  1.64247471  1.         ...  1.          0.
   1.        ]
 ...
 [ 0.12547206 -0.20107904  1.         ...  0.          1.
   0.        ]
 [ 0.60515772  0.58901542  1.         ...  1.          1.
   0.        ]
 [ 1.15336989  1.18158627  1.         ...  0.          0.
   1.        ]]


In [136]:
## train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [137]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(800, 19) (200, 19) (800,) (200,)


In [138]:
##evaluation function
def evaluate_model(true, predicted):
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return rmse, r2



In [144]:
##models)
from sklearn.linear_model import Lasso, Ridge


models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

print(type(models))
model_list = []
r2_list =[]

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)
    print(f"{model_name}:")
    print(f"Train RMSE: {train_rmse:.4f}, Train R2: {train_r2:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}, Test R2: {test_r2:.4f}")
    print("-" * 30)
    model_list.append(model_name)
    r2_list.append(test_r2)
  

<class 'dict'>
Linear Regression:
Train RMSE: 5.3231, Train R2: 0.8743
Test RMSE: 5.3940, Test R2: 0.8804
------------------------------
Lasso:
Train RMSE: 6.5938, Train R2: 0.8071
Test RMSE: 6.5197, Test R2: 0.8253
------------------------------
Ridge:
Train RMSE: 5.3233, Train R2: 0.8743
Test RMSE: 5.3904, Test R2: 0.8806
------------------------------
K-Neighbors Regressor:
Train RMSE: 5.7134, Train R2: 0.8552
Test RMSE: 7.2538, Test R2: 0.7838
------------------------------
Decision Tree:
Train RMSE: 0.2795, Train R2: 0.9997
Test RMSE: 7.9009, Test R2: 0.7435
------------------------------
Random Forest Regressor:
Train RMSE: 2.3192, Train R2: 0.9761
Test RMSE: 6.0666, Test R2: 0.8488
------------------------------
XGBRegressor:
Train RMSE: 1.0073, Train R2: 0.9955
Test RMSE: 6.5958, Test R2: 0.8212
------------------------------
CatBoosting Regressor:
Train RMSE: 3.0427, Train R2: 0.9589
Test RMSE: 6.0086, Test R2: 0.8516
------------------------------
AdaBoost Regressor:
Train RM