In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings

Gender encoding: 0 = Female, 1 = Male.

In [None]:
# Load the student scores and discard two columns in one chained step.
# ('Unnamed: 0' is presumably an index column saved with the CSV — confirm.)
df = pd.read_csv('data/student_score.csv').drop(columns=['Unnamed: 0', 'Gender'])
df.head()

In [24]:
# The '1st' column is the regression target; every other remaining column is a feature.
y = df['1st']
X = df.drop(columns=['1st'])

In [25]:
# Min-max scale every feature column; X is replaced by the scaled NumPy array.
# Note: the scaler is fitted on all rows here, i.e. before the train/test split below.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [26]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only, so the min/max of the held-out rows never influence the scaling
# (avoids train/test leakage). This supersedes the scaler fitted on the full
# dataset in the previous cell. The same random_state and row count select the
# same train/test rows as splitting the pre-scaled array did.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['1st'], axis=1), y, train_size=0.9, random_state=42
)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# Reuse the training-set statistics; test values may fall slightly outside [0, 1].
X_test = scaler.transform(X_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((152, 6), (17, 6), (152,), (17,))

Models and evaluation metrics

In [28]:
# Shared seed so the stochastic learners produce the same scores on every
# Restart & Run All; 42 matches the random_state passed to train_test_split.
SEED = 42

# Candidate regressors, keyed by the display name used in the printed report
# and in the results table.
models = {
    'Decision Tree Regression' : DecisionTreeRegressor(random_state=SEED),
    'Random Forest Regression' : RandomForestRegressor(random_state=SEED),
    'AdaBoost Regression' : AdaBoostRegressor(random_state=SEED),
    'Linear Regression' : LinearRegression(),
    'Ridge Regression' : Ridge(),
    'Lasso Regression' : Lasso(),
    # verbose=0 silences CatBoost's per-iteration training log.
    'Cat Boost Regression' : CatBoostRegressor(random_state=SEED, verbose=0),
    'XGB Regression' : XGBRegressor(random_state=SEED),
    'K Nearest Regressor' : KNeighborsRegressor()
}

In [27]:
def evaluate_model(true, prediction):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    prediction : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, prediction)
    # Compute the MSE once and derive the RMSE from it (the MSE itself is not returned).
    mse = mean_squared_error(true, prediction)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, prediction)
    return mae, rmse, r2_square

Train and evaluate all algorithms

In [29]:
# Names and test-set R2 scores, collected in registry order for the summary table.
model_list = []
r2_list = []

# Iterate the registry directly instead of rebuilding list(models.keys()) and
# list(models.values()) and indexing into them on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test Data
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)


    print(model_name)
    model_list.append(model_name)


    print('Model Performance for training set')
    print(f'-Root mean squared error: {model_train_rmse}')
    print(f'-Mean Absolute error: {model_train_mae}')
    print(f'-R2 score: {model_train_r2}')

    print('----------------------------')

    print('Model Performance for testing set')
    print(f'-Root mean squared error: {model_test_rmse}')
    print(f'-Mean Absolute error: {model_test_mae}')
    print(f'-R2 score: {model_test_r2}')

    # Only the held-out R2 is kept for ranking the models.
    r2_list.append(model_test_r2)

    print('____________________________________________________________')
    print('____________________________________________________________')
    

Decision Tree Regression
Model Performance for training set
-Root mean squared error: 0.0
-Mean Absolute error: 0.0
-R2 score: 1.0
----------------------------
Model Performance for testing set
-Root mean squared error: 0.40423493460231896
-Mean Absolute error: 0.3464705882352939
-R2 score: 0.7084502225010003
____________________________________________________________
____________________________________________________________
Random Forest Regression
Model Performance for training set
-Root mean squared error: 0.16213519740279195
-Mean Absolute error: 0.12838157894736932
-R2 score: 0.9581246659062755
----------------------------
Model Performance for testing set
-Root mean squared error: 0.36133290174346516
-Mean Absolute error: 0.27752941176470586
-R2 score: 0.7670514318099881
____________________________________________________________
____________________________________________________________
AdaBoost Regression
Model Performance for training set
-Root mean squared error: 0.284

In [30]:
# Pair each model name with its test-set R2 and rank the models, best first.
score_rows = list(zip(model_list, r2_list))
score_table = pd.DataFrame(score_rows, columns=['Model Name', 'R2 Score'])
score_table.sort_values(by='R2 Score', ascending=False)

Unnamed: 0,Model Name,R2 Score
3,Linear Regression,1.0
7,XGB Regression,0.800833
1,Random Forest Regression,0.767051
6,Cat Boost Regression,0.76555
8,K Nearest Regressor,0.765272
0,Decision Tree Regression,0.70845
2,AdaBoost Regression,0.708215
4,Ridge Regression,0.690005
5,Lasso Regression,-0.06247


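## Linear Regression

**Note:** Linear Regression reaches a test R² of exactly 1.0 in the comparison above. A perfect score like this usually points to target leakage (for example, `1st` being an exact linear function of the remaining columns) rather than genuine predictive skill, so the features should be verified before this result is trusted.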

In [31]:
# Refit a standalone linear model on the training split and score it on the
# held-out split.
reg = LinearRegression(fit_intercept=True)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# r2_score is the coefficient of determination, not a classification accuracy,
# so it is reported under its real name (scaled to a percentage).
score = r2_score(y_test, y_pred)*100
print(f'R2 score of the model is {score}%')

Accuracy of the model is 100.0%


In [32]:
# Held-out targets next to the linear model's predictions, plus the residual
# (actual minus predicted) for each test row.
pred_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred})
pred_df['Difference'] = pred_df['Actual Value'] - pred_df['Predicted Value']
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
138,7.07,7.07,0.0
30,6.81,6.81,-8.881784e-16
119,5.78,5.78,0.0
29,5.74,5.74,-3.552714e-15
143,6.44,6.44,-8.881784e-16
162,7.33,7.33,-4.440892e-15
165,7.3,7.3,-2.664535e-15
51,7.26,7.26,-7.993606e-15
105,6.11,6.11,-8.881784e-16
60,7.15,7.15,-1.776357e-15
