In [22]:
# importing libraries 
import pandas as pd 
import numpy as np 
import seaborn as sns 
from matplotlib import pyplot as plt 
# importing training functions 
from sklearn.linear_model import LinearRegression , Ridge , Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.metrics import mean_absolute_error ,r2_score , mean_squared_error
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV



In [23]:
df=pd.read_csv("D:/Work/NLP & ML/ML_Project_1/notebook/Data/stud.csv")
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [24]:
x=df.drop('math_score',axis=1)
y=df['math_score']
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [25]:
num_f=x.select_dtypes(exclude='object').columns
cat_f=x.select_dtypes(include='object').columns
# num_f
cat_f

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')

In [26]:
# using pipeline and columntransformer to transform the data 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

cat=OneHotEncoder()
num=StandardScaler()
preprocessor=ColumnTransformer(
    [('ohe',cat,cat_f),('standard',num,num_f)]
)

In [27]:
x=preprocessor.fit_transform(x)
x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [28]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [29]:
def get_performance(y_train,y_pred):
    r2=r2_score(y_train,y_pred)
    abosolute_error=mean_absolute_error(y_train,y_pred)
    squared_error=mean_squared_error(y_train,y_pred)

    return r2,abosolute_error,squared_error

In [30]:
models={'linearregression':LinearRegression(),'ridgeregression':Ridge(),'lassoregression':Lasso(),'knn':KNeighborsRegressor(),'decisiontree':DecisionTreeRegressor(),'adaboost':AdaBoostRegressor(),
        'randomforest':RandomForestRegressor(),'svm':SVR(),'xgboost':XGBRegressor(),'catboost':CatBoostRegressor()}

In [33]:
for i in range(len(list(models))) :
    model_name=list(models.keys())[i]
    model_func=list(models.values())[i]
    
    model_func.fit(x_train,y_train)

    y_pred_train=model_func.predict(x_train)
    y_pred_test=model_func.predict(x_test)

    r2train,absolutetrain,squaredtrain=get_performance(y_train,y_pred_train)
    r2test,absolutetest,squaredtest=get_performance(y_test,y_pred_test)

    print('-------------------------------------')

    print(model_name)

    print('train-parameter')
    print(f'r2_train = {r2train}\n')
    print(f'absolute_error = {absolutetrain}\n')
    print(f'squared_error = {squaredtrain}\n')

    print('**************')

    print('test-parameter')
    print(f'r2_test = {r2test}\n')
    print(f'absolute_test = {absolutetest}\n')
    print(f'squared_error = {squaredtest}\n')



-------------------------------------
linearregression
train-parameter
r2_train = 0.8751313605993936

absolute_error = 4.198519565142325

squared_error = 27.544689325007596

**************
test-parameter
r2_test = 0.8758630443016734

absolute_test = 4.418261320297315

squared_error = 30.886593188073082

-------------------------------------
ridgeregression
train-parameter
r2_train = 0.8751146197676473

absolute_error = 4.197676960902203

squared_error = 27.548382173842256

**************
test-parameter
r2_test = 0.8759067784434541

absolute_test = 4.41550580190255

squared_error = 30.87571166904426

-------------------------------------
lassoregression
train-parameter
r2_train = 0.807844422971275

absolute_error = 5.158226125142767

squared_error = 42.38746971802188

**************
test-parameter
r2_test = 0.8102810073900306

absolute_test = 5.392913577222256

squared_error = 47.20410059865967

-------------------------------------
knn
train-parameter
r2_train = 0.8538245568542916

abs

In [34]:
df_predicted = pd.DataFrame({'actual_value':y_test,'predicted_value':y_pred_test,'difference':y_test-y_pred_test})
df_predicted

Unnamed: 0,actual_value,predicted_value,difference
521,91,74.390762,16.609238
737,53,55.530637,-2.530637
740,80,76.404788,3.595212
660,74,76.340790,-2.340790
411,84,87.102284,-3.102284
...,...,...,...
468,77,71.463336,5.536664
935,70,61.108344,8.891656
428,65,60.235512,4.764488
7,40,46.077139,-6.077139
