## Model Training

In [76]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [77]:
df = pd.read_csv('data/stud.csv')
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## Preparing X & y variables

In [78]:
X = df.drop(columns= 'math_score')
X.head()
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [79]:
y = df['math_score']
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [80]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  ",end=" ")
print(df['race_ethnicity'].unique())

print("Categories in'parental level of education' variable:",end=" " )
print(df['parental_level_of_education'].unique())

print("Categories in 'lunch' variable:     ",end=" " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     ",end=" " )
print(df['test_preparation_course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [81]:
### Create a column transformer with 3 types of transformer
num_features = X.select_dtypes(exclude= 'object').columns
onehot_column = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop= 'first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,onehot_column),
        ("StandardScaler", numeric_transformer,num_features)
    ], remainder= 'passthrough'
)

In [82]:
X = preprocessor.fit_transform(X)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.193999,0.391492
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.427476,1.313269
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.770109,1.642475
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.833899,-1.583744
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.605158,0.457333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.044215,1.774157
996,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.970952,-0.859491
997,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.125472,-0.201079
998,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.605158,0.589015


In [83]:
X.shape

(1000, 14)

## Train Test Split

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state= 42)
X_train.shape, X_test.shape

((800, 14), (200, 14))

## Model training and model selection

In [85]:
#### Create a function to evaluate model

def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mse)
    R2_score = r2_score(true,predicted)
    return mae,rmse,R2_score



In [90]:
models = {
                "LinearRegression" : LinearRegression(),
                "Lasso" : Lasso(),
                "Ridge": Ridge(),
                "K-Neighbors Regression": KNeighborsRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest Regressor": RandomForestRegressor(),
                "Adaboost Regressor": AdaBoostRegressor(),
                "Gradient Boost Regressor": GradientBoostingRegressor(),
                "Xgboost Regressor": XGBRegressor()
    }

model_list = []
r2_list =[]

for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train)  ## Train the model

        ##make Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        ##Evaluate train and test dataset

        model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
        model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
        
        print(list(models.keys())[i])
        model_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))
    
        print('----------------------------------')
        
        print('Model performance for Test set')
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        r2_list.append(model_test_r2)

        print('='*35)
        print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 5.3231
- Mean Absolute Error: 4.2667
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3940
- Mean Absolute Error: 4.2148
- R2 Score: 0.8804


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.5938
- Mean Absolute Error: 5.2063
- R2 Score: 0.8071
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.5197
- Mean Absolute Error: 5.1579
- R2 Score: 0.8253


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.3235
- Mean Absolute Error: 4.2650
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3936
- Mean Absolute Error: 4.2125
- R2 Score: 0.8805


K-Neighbors Regression
Model performance for Training set
- Root Mean Squared Error: 5.5678
- Mean Absolute Error: 4.4510
- R2 Score: 0.8625
-----------------------

In [91]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880451
0,LinearRegression,0.880433
7,Gradient Boost Regressor,0.872874
6,Adaboost Regressor,0.854346
5,Random Forest Regressor,0.852459
1,Lasso,0.82532
8,Xgboost Regressor,0.825046
3,K-Neighbors Regression,0.776335
4,Decision Tree,0.736766


In [97]:
### Performing Hyperparameter Tuning for Ridge, Linear Regression and Gradient Boosting

###initialize few params for Ridge, Gradient Boost

ridge_params = {'alpha': [1,3,5,7,10],
                'fit_intercept' : [True],
                'max_iter' : [1000,2000,3000,5000,7000,9000,11000,15000],
                'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']}
                

gboost_params = {'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
                 'learning_rate': [0.1,0.3,0.5,0.7,0.9,1],
                 'n_estimators' : [100,300,500,700,900,1000,1300,1500],
                 'criterion': ['friedman_mse','squared_error'],
                 'min_samples_split': [2,5,7,9,11,13,15,17,20],
                 'min_samples_leaf': [1,3,5,7,9,11,13,15,17,19],
                 'min_weight_fraction_leaf': [0.1,0.01,0.0001,0.2,0.002,0.004,0.4,0.5],
                 'max_depth': [3,5,7,9,11,13,15],
                 'random_state': [42,'None']
}






In [98]:
## Models list for Hyperparameter tuning
randomcv_models = [
                   ('ridge',Ridge(),ridge_params),
                    ('gradientboost',GradientBoostingRegressor(),gboost_params)
]

In [99]:
from sklearn.model_selection import RandomizedSearchCV
model_param = {}
for name,model,params in randomcv_models:
    random = RandomizedSearchCV(estimator= model,
                               param_distributions=params,
                               n_iter=100,
                               cv=3,
                               verbose=2,
                               n_jobs=-1)
    random.fit(X_train,y_train)
    model_param[name]= random.best_params_

for model_name in model_param:
    print(f"--------------------Best Params for {model_name}-----------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


30 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\linear_model\_ridge.py", line 1131, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\linear_model\_ridge.py", line 808, in fit
    raise

Fitting 3 folds for each of 100 candidates, totalling 300 fits


162 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "e:\project\mlproject\venv\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidPa

--------------------Best Params for ridge-----------------------
{'solver': 'saga', 'max_iter': 9000, 'fit_intercept': True, 'alpha': 1}
--------------------Best Params for gradientboost-----------------------
{'random_state': 42, 'n_estimators': 100, 'min_weight_fraction_leaf': 0.01, 'min_samples_split': 13, 'min_samples_leaf': 13, 'max_depth': 13, 'loss': 'absolute_error', 'learning_rate': 0.1, 'criterion': 'friedman_mse'}


In [101]:
## Retraining the model

models = {
                "Ridge Regressor": Ridge(solver='saga',max_iter=9000,fit_intercept=True, alpha=1),
                "Gradient Boost Regressor" : GradientBoostingRegressor(n_estimators= 100, random_state=42, min_weight_fraction_leaf=0.01,min_samples_split=13,min_samples_leaf=13,max_depth=13,loss="absolute_error",learning_rate=0.1,criterion="friedman_mse")
    }

for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train)  ## Train the model

        ##make Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        ##Evaluate train and test dataset

        model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
        model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
        
        print(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))
    
        print('----------------------------------')
        
        print('Model performance for Test set')
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        
        print('='*35)
        print('\n')


Ridge Regressor
Model performance for Training set
- Root Mean Squared Error: 5.3235
- Mean Absolute Error: 4.2650
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3937
- Mean Absolute Error: 4.2126
- R2 Score: 0.8804


Gradient Boost Regressor
Model performance for Training set
- Root Mean Squared Error: 3.8359
- Mean Absolute Error: 2.5747
- R2 Score: 0.9347
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.2416
- Mean Absolute Error: 4.7076
- R2 Score: 0.8399




In [None]:
#### Selecting Ridge Regression model