# Model Training

### 1.1 Importing packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
# Load the student performance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/StudentsPerformance.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Total_score is the element-wise sum of the three subject scores.
df['Total_score'] = df['math score'].add(df['reading score']).add(df['writing score'])
# Average_score is that total divided by the number of subjects (3).
df['Average_score'] = df['Total_score'].div(3)

In [6]:
# Check that Total_score and Average_score were appended as new columns.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Total_score,Average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


## Feature Engineering
- Creating the total score and average score features
- Dropping the three individual subject score columns

In [7]:
# Remove the per-subject scores: Average_score is computed directly from them,
# so keeping them as features would let a model reconstruct the target exactly.
df = df.drop(columns=['math score', 'reading score', 'writing score'])

In [8]:
# Confirm the three subject-score columns are gone.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,Total_score,Average_score
0,female,group B,bachelor's degree,standard,none,218,72.666667
1,female,group C,some college,standard,completed,247,82.333333
2,female,group B,master's degree,standard,none,278,92.666667
3,male,group A,associate's degree,free/reduced,none,148,49.333333
4,male,group C,some college,standard,none,229,76.333333


## Preparing X (independent) and y (dependent) variables

In [9]:
# Target: the per-student average of the three subject scores.
y = df['Average_score']
# Features: every remaining column once both derived score columns are removed.
X = df.drop(columns=['Total_score', 'Average_score'])

In [10]:
# Preview the feature frame.
X.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [11]:
# Preview the target series.
y.head(n=5)

0    72.666667
1    82.333333
2    92.666667
3    49.333333
4    76.333333
Name: Average_score, dtype: float64

In [19]:
# One (label, column, separator) entry per categorical column. The separators
# are reproduced exactly as written so the printed text is unchanged: only the
# parental-education line puts a space between the label and the values.
category_reports = (
    ("Categories in Gender variable:", 'gender', ""),
    ("Categories in Race variable:", 'race/ethnicity', ""),
    ("Categories in Parental level of education variable:", 'parental level of education', " "),
    ("Categories in Lunch variable:", 'lunch', ""),
    ("Categories in Test preparation course variable:", 'test preparation course', ""),
)

# Print each label immediately followed by the distinct values found in its column.
for label, column, separator in category_reports:
    print(label, df[column].unique(), sep=separator)

Categories in Gender variable:['female' 'male']
Categories in Race variable:['group B' 'group C' 'group A' 'group D' 'group E']
Categories in Parental level of education variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in Lunch variable:['standard' 'free/reduced']
Categories in Test preparation course variable:['none' 'completed']


## Column Transformation for the Model

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object columns are handled as
# categorical, every other dtype as numerical.
categorical_features = X.select_dtypes(include="object").columns
numerical_features = X.select_dtypes(exclude="object").columns

# One transformer per column group.
categorical_transformer = OneHotEncoder()
numerical_transformer = StandardScaler()

# The order of the entries determines the output column order:
# one-hot encoded categoricals first, then the scaled numericals.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", categorical_transformer, categorical_features),
        ("StandardScaler", numerical_transformer, numerical_features),
    ]
)

## Insight 
- A `ColumnTransformer` has been created that one-hot encodes the categorical columns and standardizes the numerical columns.

In [22]:
# Fit the preprocessor and encode the features into the model matrix `X`.
#
# The input is selected from `df` by the feature names the preprocessor was
# configured with, rather than read back from `X`. This cell rebinds `X` to
# the encoded matrix, and the ColumnTransformer selects columns by name, so
# passing `X` in again on a second run would fail. Reading from `df` keeps
# the cell re-runnable and yields the same matrix.
#
# NOTE(review): the transformers are fitted on all rows here, before the
# train/test split further down. Consider fitting on the training rows only.
feature_columns = [*categorical_features, *numerical_features]
X = preprocessor.fit_transform(df[feature_columns])

In [25]:
# Shape of the encoded feature matrix as (rows, columns). In the recorded run
# the 17 columns match the number of distinct category values printed earlier
# (2 + 5 + 6 + 2 + 2), i.e. one indicator column per category.
X.shape

(1000, 17)

## Train-Test Split

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Display both shapes to confirm the split sizes.
(X_train.shape, X_test.shape)

((800, 17), (200, 17))

## Create and Eva