1. Import Libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

2. Read CSV as Dataframe

In [4]:
df = pd.read_csv('data/StudentsPerformance.csv')
df.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


3. Prepare X and Y variables. In this project, we would like to predict the math score of a student given other characteristics i.e. features.

In [5]:
X = df.drop(columns='math score', axis=1)
X

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [8]:
y = df['math score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

4. Showing Unique Values/Categories in each column

In [7]:
for col in X.columns:
    if col not in ['reading score', 'writing score']:
        print('Categories in {0} are {1}'.format(col, df[col].unique()))

Categories in gender are ['female' 'male']
Categories in race/ethnicity are ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in parental level of education are ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in lunch are ['standard' 'free/reduced']
Categories in test preparation course are ['none' 'completed']


5. Now, the next step to *transform* your data, both numeric and categorical.

In [10]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

col_transformer = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, cat_features), 
        ("StandardScalar", num_transformer, num_features)
    ]
)

In [11]:
X = col_transformer.fit_transform(X)
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [12]:
X.shape

(1000, 19)

6. Split the dataset into train and test

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'X_train shape: {X_train.shape}\ny_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}\ny_test shape: {y_test.shape}')

X_train shape: (800, 19)
y_train shape: (800,)
X_test shape: (200, 19)
y_test shape: (200,)


7. Create a evaluation function that calculates the values of all metrics.

In [26]:
def evaluate(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

8. Define models

In [37]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "Catboosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoostRegressor": AdaBoostRegressor()
}

9. Train models and evaluate them.

In [38]:
mae_list = []
rmse_list = []
r2_list = []

for key in models.keys():
    current_model = models[key]
    current_model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = current_model.predict(X_train)
    y_test_pred = current_model.predict(X_test)

    # Evaluate model on train and test sets
    train_mae, train_rmse, train_r2 = evaluate(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate(y_test, y_test_pred) 

    print(f'{list(models.keys())[i]} Performance for training set:')
    print(f'- Root Mean Squared Error: {round(train_rmse, 4)}')
    print(f'- Mean Absolute Error: {round(train_mae, 4)}')
    print(f'- R2 Score: {round(train_r2, 4)}')
    print('--------------------------------------------------------')
    print(f'{list(models.keys())[i]} Performance for test set:')
    print(f'- Root Mean Squared Error: {round(test_rmse, 4)}')
    print(f'- Mean Absolute Error: {round(test_mae, 4)}')
    print(f'- R2 Score: {round(test_r2, 4)}')

    mae_list.append(test_mae); rmse_list.append(test_rmse); r2_list.append(test_r2)
    print('='*55)

Linear Regression Performance for training set:
- Root Mean Squared Error: 5.3314
- Mean Absolute Error: 4.2744
- R2 Score: 0.8739
--------------------------------------------------------
Linear Regression Performance for test set:
- Root Mean Squared Error: 5.4519
- Mean Absolute Error: 4.2416
- R2 Score: 0.8779
Linear Regression Performance for training set:
- Root Mean Squared Error: 6.5938
- Mean Absolute Error: 5.2063
- R2 Score: 0.8071
--------------------------------------------------------
Linear Regression Performance for test set:
- Root Mean Squared Error: 6.5197
- Mean Absolute Error: 5.1579
- R2 Score: 0.8253
Linear Regression Performance for training set:
- Root Mean Squared Error: 5.3233
- Mean Absolute Error: 4.265
- R2 Score: 0.8743
--------------------------------------------------------
Linear Regression Performance for test set:
- Root Mean Squared Error: 5.3904
- Mean Absolute Error: 4.2111
- R2 Score: 0.8806
Linear Regression Performance for training set:
- Root M

10. Show Results

In [34]:
pd.DataFrame(list(zip(list(models.keys()), r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.877854
7,Catboosting Regressor,0.851632
5,Random Forest Regressor,0.849231
8,AdaBoostRegressor,0.84538
1,Lasso,0.82532
6,XGBRegressor,0.821589
3,K-Neighbors Regressor,0.783193
4,Decision Tree Regressor,0.74704
