Various ML models to predict high school final grades (G3)

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Read the semicolon-separated student CSV into `df`.
# NOTE(review): absolute Colab-style path — presumably requires Google Drive
# to be mounted at /content/drive; confirm before running elsewhere.
filepath = '/content/drive/MyDrive/student-mat.csv'
df = pd.read_csv(filepath, sep=';')

In [None]:
# Summary statistics of the numeric columns. `describe` has to be *called*;
# a bare `df.describe` only displays the bound-method repr.
df.describe()

In [None]:
# Columns of `df` selected as model inputs.
features = ['school', 'sex', 'age', 'address', 'famsize',
            'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
            'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
            'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
            'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']
# Explicit copy: the encoding loop below assigns columns on X, and doing that
# on a slice of `df` raises pandas' SettingWithCopyWarning.
X = df[features].copy()
y = df['G3']  # regression target

# Subset of `features` that is converted to integer codes below.
cat_features = ['school', 'sex', 'address', 'famsize',
                'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
                'schoolsup', 'famsup', 'paid', 'activities',
                'nursery', 'higher', 'internet', 'romantic']

# label encoding for categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# The single encoder is refit for every column, so once the loop finishes
# `le` only remembers the classes of the last column in `cat_features`.
for feature in cat_features:
  X[feature] = le.fit_transform(X[feature].astype(str))

# Hold out 40% of the rows for evaluation. The fixed seed makes the split,
# and therefore every metric computed from it, reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split and predict the held-out rows.
model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

from sklearn.metrics import r2_score, mean_absolute_error
# Both metrics are called as (y_true, y_pred), matching scikit-learn's
# signatures. MAE is symmetric, so its printed value is unaffected by the order.
print(r2_score(y_test, y_pred1))
print(mean_absolute_error(y_test, y_pred1))

0.11762830611961084
3.2858618603501415


Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Unconstrained regression tree; seeded so repeated runs give the same tree.
model2 = DecisionTreeRegressor(random_state=0)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

from sklearn.metrics import r2_score, mean_absolute_error
# r2_score is NOT symmetric: the ground truth must be the first argument,
# otherwise the score is computed relative to the variance of the predictions.
print(r2_score(y_test, y_pred2))
print(mean_absolute_error(y_test, y_pred2))

-0.368872957477806
3.689873417721519


In [None]:
# function for optimal amount of leaves
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
  """Return the MAE on (X_test, y_test) of a tree limited to `max_leaf_nodes` leaves.

  A DecisionTreeRegressor with random_state=0 is fitted on
  (X_train, y_train); its predictions for X_test are scored with
  mean_absolute_error against y_test.
  """
  tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
  tree.fit(X_train, y_train)
  return mean_absolute_error(y_test, tree.predict(X_test))

# Print one "<leaf limit> <MAE>" line per candidate.
candidates_leaves = [5, 25, 50, 100, 250, 500, 750, 800, 900]
for leaf_limit in candidates_leaves:
  my_mae = get_mae(leaf_limit, X_train, X_test, y_train, y_test)
  print(leaf_limit, my_mae)

5 3.260959959393261
25 3.306380678874017
50 3.7142436088359396
100 3.921589310829817
250 3.962025316455696
500 3.962025316455696
750 3.962025316455696
800 3.962025316455696
900 3.962025316455696


Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 500-tree forest; seeded so the bootstrap samples (and the scores printed
# below) are identical on every run.
model4 = RandomForestRegressor(n_estimators=500, random_state=0)
model4.fit(X_train, y_train)
y_pred4 = model4.predict(X_test)

from sklearn.metrics import r2_score, mean_absolute_error
# Both metrics are called as (y_true, y_pred), matching scikit-learn's signatures.
print(r2_score(y_test, y_pred4))
print(mean_absolute_error(y_test, y_pred4))

0.19352450834448764
3.1026962025316456


In [None]:
# write function for optimal amount of estimators
# NOTE(review): this reuses the name `get_mae`, so once this cell has run the
# decision-tree helper defined earlier under the same name is shadowed.
def get_mae(estimators, X_train, X_test, y_train, y_test):
  """Return the MAE on (X_test, y_test) of a forest with `estimators` trees.

  A RandomForestRegressor with random_state=0 is fitted on
  (X_train, y_train); its predictions for X_test are scored with
  mean_absolute_error against y_test.
  """
  forest = RandomForestRegressor(n_estimators=estimators, random_state=0)
  forest.fit(X_train, y_train)
  return mean_absolute_error(y_test, forest.predict(X_test))

# Original annotation: "250 is best".
# NOTE(review): confirm against the MAEs printed by the loop below.
candidates_estimators = [5, 25, 50, 100, 250, 500, 750, 800, 850]
for n_trees in candidates_estimators:
  my_mae = get_mae(n_trees, X_train, X_test, y_train, y_test)
  print(n_trees, my_mae)

5 3.259493670886076
25 3.1349367088607596
50 3.0564556962025318
100 3.0138607594936704
250 3.0260506329113923
500 3.0290759493670887
750 3.0443459915611815
800 3.042096518987342
850 3.041600893521965


In [None]:
# we use 250 estimators
# `fit` returns the estimator itself, so construction and training are chained.
model5 = RandomForestRegressor(n_estimators=250, random_state=0).fit(X_train, y_train)
y_pred5 = model5.predict(X_test)


In [None]:
!pip install eli5

In [None]:
# permutation importance
import eli5
from eli5.sklearn import PermutationImportance

# Wrap the fitted forest and compute importances on the held-out split.
perm = PermutationImportance(model5, random_state=1)
perm.fit(X_test, y_test)

# Last expression of the cell: renders the weight table, labelled with the
# column names of X_test.
eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.3335  ± 0.2263,failures
0.3008  ± 0.0694,absences
0.0191  ± 0.0401,sex
0.0171  ± 0.0071,schoolsup
0.0137  ± 0.0057,guardian
0.0134  ± 0.0043,school
0.0128  ± 0.0041,freetime
0.0113  ± 0.0264,reason
0.0100  ± 0.0062,activities
0.0068  ± 0.0245,Medu


Gradient Boosting

In [None]:
from xgboost import XGBRegressor

# Boosted-tree regressor: 500 rounds at learning rate 0.05, trained on the
# training split. `fit` returns the estimator, so the calls are chained.
model6 = XGBRegressor(n_estimators=500, learning_rate=0.05).fit(X_train, y_train)
y_pred6 = model6.predict(X_test)

# Mean absolute error on the held-out split.
mae = mean_absolute_error(y_test, y_pred6)
print(mae)

3.301335980620565
