In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
!pip install catboost



In [4]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from catboost import CatBoostRegressor

In [5]:
# Load the raw dataset from the notebook-relative ./data folder.
df = pd.read_csv(filepath_or_buffer='./data/stud.csv')

In [6]:
# Separate the regression target (math_score) from the remaining columns,
# which become the predictors.
y = df["math_score"]
X = df.drop("math_score", axis=1)

In [7]:
# Preview the first five predictor rows.
X.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [8]:
# Display the target Series (pandas abbreviates the middle of long output).
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [9]:
# Partition the predictor columns by dtype: object columns are handled as
# categorical features, everything else as numeric features.
cat_features = X.select_dtypes(include=['object']).columns
num_features = X.select_dtypes(exclude=['object']).columns
(num_features, cat_features)

(Index(['reading_score', 'writing_score'], dtype='object'),
 Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
        'test_preparation_course'],
       dtype='object'))

In [10]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [11]:
# Unfitted transformer instances: one-hot encoding and standard scaling.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()


In [12]:
# Route each column group through its own transformer; every entry is a
# (step name, transformer, columns) triple.
preprocessor = ColumnTransformer(
  transformers=[
    ("OneHotEncoder", oh_transformer, cat_features),
    ("StandardScaler", numeric_transformer, num_features),
  ],
)

In [13]:
# Build the feature matrix straight from `df` instead of from `X` itself so this
# cell can be re-run: once `X` has been replaced by the transformed array it no
# longer carries the column names the ColumnTransformer selects on.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split, so the scaler statistics include the held-out rows — consider fitting
# on the training portion only.
X = preprocessor.fit_transform(df.drop(columns=["math_score"]))

In [14]:
# Inspect the transformed feature matrix (a NumPy array at this point).
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

In [16]:
def evaluate_model(true,predicted):
  """Compute regression error metrics for a set of predictions.

  Args:
    true: Ground-truth target values.
    predicted: Predicted values, aligned element-wise with ``true``.

  Returns:
    A tuple ``(mae, mse, rmse, r2)``: mean absolute error, mean squared error,
    root mean squared error and the R2 score, in that order.
  """
  mae = mean_absolute_error(true, predicted)
  mse = mean_squared_error(true, predicted)
  # RMSE is derived from the MSE computed above rather than recomputing it.
  rmse = np.sqrt(mse)
  r2 = r2_score(true, predicted)
  return mae, mse, rmse, r2

In [17]:
!pip install xgboost



In [18]:
from sklearn.linear_model import Lasso,Ridge
from xgboost import XGBRegressor
# Shared seed for every estimator that draws random numbers, so repeated runs of
# the comparison produce the same scores.
RANDOM_STATE = 42

# Candidate regressors keyed by the display name used in the printed reports.
models = {
  "Linear Regression":LinearRegression(),
  "Lasso":Lasso(),
  "Ridge":Ridge(),
  "K-Neighbors Regressor":KNeighborsRegressor(),
  "Decision Tree":DecisionTreeRegressor(random_state=RANDOM_STATE),
  "Random Forest Regressor":RandomForestRegressor(random_state=RANDOM_STATE),
  "XGBRegressor":XGBRegressor(random_state=RANDOM_STATE),
  "CatBoosting Regressor":CatBoostRegressor(verbose=False,random_seed=RANDOM_STATE),
  "AdaBoost Regressor":AdaBoostRegressor(random_state=RANDOM_STATE)
}


In [19]:
# Fit every candidate model, print its metrics on the training and test splits,
# and collect each model's name together with its test-set R2 score.
model_list = []
r2_list = []
# Iterate over the dict items directly instead of rebuilding key/value lists
# and indexing into them on every pass.
for model_name, model in models.items():
  model.fit(X_train,y_train)

  # Predict on both splits so training and test performance can be compared.
  y_train_pred = model.predict(X_train)
  y_test_pred = model.predict(X_test)

  model_train_mae,model_train_mse,model_train_rmse,model_train_r2 = evaluate_model(y_train,y_train_pred)
  model_test_mae,model_test_mse,model_test_rmse,model_test_r2 = evaluate_model(y_test,y_test_pred)

  print(model_name)
  model_list.append(model_name)
  r2_list.append(model_test_r2)

  print("Model Performance For Training Set")
  print(f"MAE = {model_train_mae}")
  print(f"MSE = {model_train_mse}")
  print(f"RMSE = {model_train_rmse}")
  print(f"R2 Score = {model_train_r2}")
  print("\n")
  print("----------------------------------")
  print("Model Performance For Test Set")
  print(f"MAE = {model_test_mae}")
  print(f"MSE = {model_test_mse}")
  print(f"RMSE = {model_test_rmse}")
  print(f"R2 Score = {model_test_r2}")
  print("\n")


Linear Regression
Model Performance For Training Set
MAE = 4.267109375
MSE = 28.348541259765625
RMSE = 5.32433481852575
R2 Score = 0.8742565651513869


----------------------------------
Model Performance For Test Set
MAE = 4.2158203125
MSE = 29.116678771972655
RMSE = 5.3959872842671395
R2 Score = 0.8803449074540941


Lasso
Model Performance For Training Set
MAE = 5.206302661246526
MSE = 43.47840400585579
RMSE = 6.593815587795566
R2 Score = 0.8071462015863456


----------------------------------
Model Performance For Test Set
MAE = 5.157881810347763
MSE = 42.5064168384116
RMSE = 6.519694535667419
R2 Score = 0.8253197323627853


Ridge
Model Performance For Training Set
MAE = 4.264987823725981
MSE = 28.33778823308244
RMSE = 5.323324922741654
R2 Score = 0.8743042615212909


----------------------------------
Model Performance For Test Set
MAE = 4.2111006880142625
MSE = 29.056272192348324
RMSE = 5.390387016935642
R2 Score = 0.8805931485028737




[WinError 2] The system cannot find the file specified
  File "c:\Users\hp\Desktop\mlproject\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\hp\Desktop\mlproject\venv\lib\subprocess.py", line 493, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\hp\Desktop\mlproject\venv\lib\subprocess.py", line 858, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\hp\Desktop\mlproject\venv\lib\subprocess.py", line 1327, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


K-Neighbors Regressor
Model Performance For Training Set
MAE = 4.516749999999999
MSE = 32.57765
RMSE = 5.707683417990174
R2 Score = 0.8554978341651085


----------------------------------
Model Performance For Test Set
MAE = 5.621
MSE = 52.6066
RMSE = 7.253040741647602
R2 Score = 0.7838129945787431


Decision Tree
Model Performance For Training Set
MAE = 0.01875
MSE = 0.078125
RMSE = 0.2795084971874737
R2 Score = 0.9996534669718089


----------------------------------
Model Performance For Test Set
MAE = 6.245
MSE = 61.285
RMSE = 7.828473669879717
R2 Score = 0.7481490796356021


Random Forest Regressor
Model Performance For Training Set
MAE = 1.8354031250000002
MSE = 5.317377069603883
RMSE = 2.305943856559366
R2 Score = 0.9764141212420616


----------------------------------
Model Performance For Test Set
MAE = 4.617895833333333
MSE = 35.709406711805556
RMSE = 5.975734826095077
R2 Score = 0.8532520690864839


XGBRegressor
Model Performance For Training Set
MAE = 0.6874666035175323
MSE 

In [20]:
# Rank the models by their test-set R2 score, best first.
pd.DataFrame(
  data=list(zip(model_list, r2_list)),
  columns=["Model Name", "R2_Score"],
).sort_values(by="R2_Score", ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.880345
5,Random Forest Regressor,0.853252
7,CatBoosting Regressor,0.851632
8,AdaBoost Regressor,0.848278
6,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Neighbors Regressor,0.783813
4,Decision Tree,0.748149


In [21]:
# Refit a plain linear regression on the training split and score it on the
# held-out split. NOTE: the figure printed below is the R2 score returned by
# r2_score, not a classification accuracy.
lin_model = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test, y_pred)
print("Accuracy of the model is %.2f" % score)

Accuracy of the model is 0.88


In [22]:
# Put actual and predicted values side by side; Difference is actual minus
# predicted for each held-out row.
pred_df = pd.DataFrame(
  {"Actual Value": y_test, "Predicted Value": y_pred}
).assign(
  Difference=lambda frame: frame["Actual Value"] - frame["Predicted Value"]
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
521,91,76.507812,14.492188
737,53,58.953125,-5.953125
740,80,76.960938,3.039062
660,74,76.757812,-2.757812
411,84,87.539062,-3.539062
...,...,...,...
408,52,43.546875,8.453125
332,62,62.031250,-0.031250
208,74,67.976562,6.023438
613,65,67.132812,-2.132812
