# Student Performance - Models

## Set Up Environment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
scores = pd.read_csv('data/StudentsPerformance_Polished.csv')
scores.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,math score gender mean,math score gender median,...,reading score test preparation course min,reading score test preparation course max,reading score test preparation course stdev,reading score test preparation course count,writing score test preparation course mean,writing score test preparation course median,writing score test preparation course min,writing score test preparation course max,writing score test preparation course stdev,writing score test preparation course count
0,female,group B,bachelor's degree,standard,none,72,72,74,63.6,65.0,...,17,100,14.463885,642,64.5,65.0,10,100,14.999661,642
1,female,group C,some college,standard,completed,69,90,88,63.6,65.0,...,37,100,13.638384,358,74.4,76.0,36,100,13.375335,358
2,female,group B,master's degree,standard,none,90,95,93,63.6,65.0,...,17,100,14.463885,642,64.5,65.0,10,100,14.999661,642


## Define Classes

In [3]:
class Data_Processing:
    """
    This class is for cleaning data and preparing it for modeling.
    
    Attributes:
        data (dataframe): The pandas dataframe supplied when initiating a process_data object.
    """
    
    def __init__(self, data):
        """This method is the constructor for the process_data class."""
        self.data = data
    
    def label_encode(self, col, map_dict):
        """This method converts categorical variables to numeric based on hierarchy, defined by the map_dict parameter."""
        self.data[col] = self.data[col].map(map_dict)
    
    def one_hot_encode(self, col, drop_first=True):
        """This method converts non-hierarchical categorical variables to dummy variables."""
        self.data = pd.get_dummies(self.data, columns=[col], drop_first=drop_first)
        
    def drop_columns(self, cols):
        """This method drops a single or a list of columns."""
        self.data.drop(cols, axis=1, inplace=True)
    
    def transform(self, col, new_col, function):
        """This method uses a function to convert the values in an existing column to new values in a new column."""
        self.data[new_col] = self.data[col].apply(function)
    
    def convert_to_bool(self, string):
        """This method creates an indicator value from a string."""
        if string == 'NONE':
            return 0
        else:
            return 1
    
    def normalize(self, col, new_col):
        """This method scales numeric variables to 0-1 range."""
        self.data[new_col] = preprocessing.Normalizer(norm='max').transform([self.data[col]])[0]

In [4]:
class Model:
    """
    This class is for building a model, training it, and using it to make predictions.
    
    Attributes:
        model_type: The type of model used to make predictions.  This is provided when instantiating the object.
        data (dataframe): The pandas dataframe used to train the model and use it for predictions.
        features (list): The variables used by the model to make predictions.
        target: The value the model aims to predict.
        model: The model after it has been fitted to the training data.
    """
    
    def __init__(self, model_type, data, features, target, model_name):
        """This method is the constructor for the model class."""
        self.model = model_type
        self.data = data
        self.features = features
        self.target = target
        self.name = model_name
    
    def build_model(self, compare_df, test_size=.2):
        """This method splits the dataframe into training and test sets, trains the model on the training data, 
        makes predictions on the test set, and saves metrics in the compare_df dataframe."""
        
        # Split data set into training and test sets based on the test_size parameter
        features_train, features_test, target_train, target_test = train_test_split(
            self.data[self.features], self.data[self.target], test_size=test_size)
        
        # Fit the model to the training data and save the score as a variable
        self.model = self.model.fit(features_train, target_train)
        training_r2 = self.model.score(features_train, target_train)
        
        # Make predictions on the test set and save the R^2 and MSE scores as variables
        predicted_target = self.model.predict(features_test)
        test_r2 = r2_score(y_true=target_test, y_pred=predicted_target)
        mse = mean_squared_error(y_true=target_test, y_pred=predicted_target)
        rmse = mse**.5
        
        # Add the 3 scores to the compare_df (given as an argument)
        compare_df.loc[self.name] = [training_r2, test_r2, rmse, 0]
        
        return self.model
    
    def cross_validate(self, cv, compare_df, scoring='neg_mean_squared_error'):
        """This method runs cross validation on the dataset."""
        neg_mse = cross_val_score(self.model, self.data[self.features], self.data[self.target],  cv=cv, scoring=scoring)
        avg_mse = sum(neg_mse) / len(neg_mse) * -1.0
        avg_rmse = avg_mse**.5
        compare_df.loc[self.name, 'Cross Validation Score'] = avg_rmse
    
    def get_coefficients(self, test_size=.2):
        """This method gets the coefficients of the model and displays them as a series."""
        features_train, features_test, target_train, target_test = train_test_split(
            self.data[self.features], self.data[self.target], test_size=test_size)
        self.model = self.model.fit(features_train, target_train)
        coefficients = pd.Series(self.model.coef_, index=features_train.columns).sort_values(ascending=False)
        return coefficients
    
    def predict(self, data):
        """This method makes predictions on the given dataset."""
        predictions = self.model.predict(data)
        return predictions

In [5]:
class Compare:
    """
    This class is for comparing the results of different models.
    
    Attributes:
        data (dataframe): The pandas dataframe supplied when initiating a compare object.
    """
    
    def __init__(self, df):
        """This method is the constructor for the compare class."""
        self.data = df
    
    def graph_results(self, col, name, figsize=(7,4), left_margin=.4):
        """This method creates a bar graph of a given column of the dataframe and saves it as a jpg file."""
        
        metric_compare = self.data[col].sort_values()
        graph_title = col.replace('_',' ')
        file_name = 'charts/model_comparison_{}.jpg'.format(name)
        
        fig, ax = plt.subplots(figsize=figsize)
        fig.subplots_adjust(left=left_margin)
        ax.barh(metric_compare.index, metric_compare)
        ax.set_title(graph_title)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.savefig(file_name)

## Prepare Data for Modeling

In [6]:
category_cols = ['gender', 'race/ethnicity', 'lunch', 'test preparation course']

student_performance_cols = ['math score', 'reading score', 'writing score']
student_performance_norm_cols = ['math score norm', 'reading score norm', 'writing score norm']

group_metric_cols = list(scores.columns)
for col in (category_cols + student_performance_cols + ['parental level of education']):
    group_metric_cols.remove(col)

In [7]:
model_data = Data_Processing(scores)

for i in range(len(student_performance_cols)):
    model_data.normalize(student_performance_cols[i], student_performance_norm_cols[i])

for col in group_metric_cols:
    model_data.normalize(col, col)

level_of_education_map = {
    "some high school": 1,
    "high school": 2,
    "some college": 3,
    "associate's degree": 4,
    "bachelor's degree": 5,
    "master's degree": 6
}
model_data.label_encode('parental level of education', level_of_education_map)

for col in category_cols:
    model_data.one_hot_encode(col)

In [8]:
model_data.data.head(2)

Unnamed: 0,parental level of education,math score,reading score,writing score,math score gender mean,math score gender median,math score gender min,math score gender max,math score gender stdev,math score gender count,...,math score norm,reading score norm,writing score norm,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_standard,test preparation course_none
0,5,72,72,74,0.925764,0.942029,0.0,1.0,1.0,1.0,...,0.72,0.72,0.74,0,1,0,0,0,1,1
1,3,69,90,88,0.925764,0.942029,0.0,1.0,1.0,1.0,...,0.69,0.9,0.88,0,0,1,0,0,1,0


## Build Models

In [9]:
lin_reg = LinearRegression()
svm_reg = LinearSVR(epsilon=2, C=30)
rf_reg = RandomForestRegressor(n_estimators=25, max_depth=4, max_features=None)
gb_reg = GradientBoostingRegressor(n_estimators=25, max_depth=4)

feature_compare = pd.DataFrame(columns=['Training_R2', 'Test_R2', 'Root Mean Squared Error', 'Cross Validation Score'])
model_compare = pd.DataFrame(columns=['Training_R2', 'Test_R2', 'Root Mean Squared Error', 'Cross Validation Score'])

### Finding the Right Features

In [10]:
experimental_features1m = ['reading score norm', 'writing score norm']
experimental_features1r = ['math score norm', 'writing score norm']
experimental_features1w = ['math score norm', 'reading score norm']
experimental_features2 = ['gender_male', 'race/ethnicity_group B', 'race/ethnicity_group C', 'race/ethnicity_group D', 
                         'race/ethnicity_group E', 'parental level of education', 'lunch_standard', 
                         'test preparation course_none']
experimental_features3 = group_metric_cols

#### Predicting Math Score

In [11]:
math_features = list(model_data.data.columns)

remove_cols = ['math score', 'reading score', 'writing score', 'math score norm']
for col in remove_cols:
    math_features.remove(col)

math_target = 'math score'

In [12]:
lreg_m_e1 = Model(lin_reg, model_data.data, experimental_features1m, math_target, 'Linear Regression - Math - E1 - Other Scores')
lreg_m_e1.build_model(feature_compare)
lreg_m_e1.cross_validate(4, feature_compare)

lreg_m_e2 = Model(lin_reg, model_data.data, experimental_features2, math_target, 'Linear Regression - Math - E2 - Individual Demographic')
lreg_m_e2.build_model(feature_compare)
lreg_m_e2.cross_validate(4, feature_compare)

lreg_m_e3 = Model(lin_reg, model_data.data, experimental_features3, math_target, 'Linear Regression - Math - E3 - Group')
lreg_m_e3.build_model(feature_compare)
lreg_m_e3.cross_validate(4, feature_compare)

lreg_m_e12 = Model(lin_reg, model_data.data, (experimental_features1m + experimental_features2), math_target, 'Linear Regression - Math - E1 + E2')
lreg_m_e12.build_model(feature_compare)
lreg_m_e12.cross_validate(4, feature_compare)

lreg_m_e13 = Model(lin_reg, model_data.data, (experimental_features1m + experimental_features3), math_target, 'Linear Regression - Math - E1 + E3')
lreg_m_e13.build_model(feature_compare)
lreg_m_e13.cross_validate(4, feature_compare)

lreg_m_e23 = Model(lin_reg, model_data.data, (experimental_features2 + experimental_features3), math_target, 'Linear Regression - Math - E2 + E3')
lreg_m_e23.build_model(feature_compare)
lreg_m_e23.cross_validate(4, feature_compare)

lreg_m = Model(lin_reg, model_data.data, math_features, math_target, 'Linear Regression - Math - All')
lreg_m.build_model(feature_compare)
lreg_m.cross_validate(4, feature_compare)

In [13]:
model_compare

Unnamed: 0,Training_R2,Test_R2,Root Mean Squared Error,Cross Validation Score


In [14]:
# lreg_m_b.get_coefficients()

In [15]:
lreg_m.get_coefficients()

math score gender count                          1.622272e+14
math score parental level of education min       7.847690e+13
reading score parental level of education min    6.469358e+13
math score gender stdev                          5.735331e+13
writing score race/ethnicity min                 5.618206e+13
                                                     ...     
reading score test preparation course count     -3.943875e+13
writing score test preparation course count     -3.943875e+13
reading score test preparation course min       -5.174328e+13
writing score gender median                     -5.375424e+13
math score gender median                        -1.331619e+14
Length: 100, dtype: float64

#### Predicting Reading Score

In [16]:
reading_features = list(model_data.data.columns)

remove_cols = ['math score', 'reading score', 'writing score', 'reading score norm']
for col in remove_cols:
    reading_features.remove(col)

reading_target = 'reading score'

In [17]:
lreg_r_e1 = Model(lin_reg, model_data.data, experimental_features1r, reading_target, 'Linear Regression - Reading - E1 - Other Scores')
lreg_r_e1.build_model(feature_compare)
lreg_r_e1.cross_validate(4, feature_compare)

lreg_r_e2 = Model(lin_reg, model_data.data, experimental_features2, reading_target, 'Linear Regression - Reading - E2 - Individual Demographic')
lreg_r_e2.build_model(feature_compare)
lreg_r_e2.cross_validate(4, feature_compare)

lreg_r_e3 = Model(lin_reg, model_data.data, experimental_features3, reading_target, 'Linear Regression - Reading - E3 - Group')
lreg_r_e3.build_model(feature_compare)
lreg_r_e3.cross_validate(4, feature_compare)

lreg_r_e12 = Model(lin_reg, model_data.data, (experimental_features1r + experimental_features2), reading_target, 'Linear Regression - Reading - E1 + E2')
lreg_r_e12.build_model(feature_compare)
lreg_r_e12.cross_validate(4, feature_compare)

lreg_r_e13 = Model(lin_reg, model_data.data, (experimental_features1r + experimental_features3), reading_target, 'Linear Regression - Reading - E1 + E3')
lreg_r_e13.build_model(feature_compare)
lreg_r_e13.cross_validate(4, feature_compare)

lreg_r_e23 = Model(lin_reg, model_data.data, (experimental_features2 + experimental_features3), reading_target, 'Linear Regression - Reading - E2 + E3')
lreg_r_e23.build_model(feature_compare)
lreg_r_e23.cross_validate(4, feature_compare)

lreg_r = Model(lin_reg, model_data.data, reading_features, reading_target, 'Linear Regression - Reading - All')
lreg_r.build_model(feature_compare)
lreg_r.cross_validate(4, feature_compare)

In [18]:
model_compare

Unnamed: 0,Training_R2,Test_R2,Root Mean Squared Error,Cross Validation Score


In [19]:
# lreg_r_b.get_coefficients()

In [20]:
lreg_r.get_coefficients()

gender_male                                   2.060985e+13
reading score gender median                   6.056138e+12
math score gender median                      5.129107e+12
reading score gender max                      4.588321e+12
writing score gender max                      3.873236e+12
                                                  ...     
math score parental level of education min   -2.618543e+12
math score gender max                        -2.984443e+12
writing score lunch min                      -2.987795e+12
reading score gender min                     -3.959541e+12
math score gender min                        -1.851943e+13
Length: 100, dtype: float64

#### Predicting Writing Score

In [21]:
writing_features = list(model_data.data.columns)

remove_cols = ['math score', 'reading score', 'writing score', 'writing score norm']
for col in remove_cols:
    writing_features.remove(col)

writing_target = 'writing score'

In [22]:
lreg_w_e1 = Model(lin_reg, model_data.data, experimental_features1w, writing_target, 'Linear Regression - Writing - E1 - Other Scores')
lreg_w_e1.build_model(feature_compare)
lreg_w_e1.cross_validate(4, feature_compare)

lreg_w_e2 = Model(lin_reg, model_data.data, experimental_features2, writing_target, 'Linear Regression - Writing - E2 - Individual Demographic')
lreg_w_e2.build_model(feature_compare)
lreg_w_e2.cross_validate(4, feature_compare)

lreg_w_e3 = Model(lin_reg, model_data.data, experimental_features3, writing_target, 'Linear Regression - Writing - E3 - Group')
lreg_w_e3.build_model(feature_compare)
lreg_w_e3.cross_validate(4, feature_compare)

lreg_w_e12 = Model(lin_reg, model_data.data, (experimental_features1w + experimental_features2), writing_target, 'Linear Regression - Writing - E1 + E2')
lreg_w_e12.build_model(feature_compare)
lreg_w_e12.cross_validate(4, feature_compare)

lreg_w_e13 = Model(lin_reg, model_data.data, (experimental_features1w + experimental_features3), writing_target, 'Linear Regression - Writing - E1 + E3')
lreg_w_e13.build_model(feature_compare)
lreg_w_e13.cross_validate(4, feature_compare)

lreg_w_e23 = Model(lin_reg, model_data.data, (experimental_features2 + experimental_features3), writing_target, 'Linear Regression - Writing - E2 + E3')
lreg_w_e23.build_model(feature_compare)
lreg_w_e23.cross_validate(4, feature_compare)

lreg_w = Model(lin_reg, model_data.data, writing_features, writing_target, 'Linear Regression - Writing - All')
lreg_w.build_model(feature_compare)
lreg_w.cross_validate(4, feature_compare)

In [23]:
model_compare

Unnamed: 0,Training_R2,Test_R2,Root Mean Squared Error,Cross Validation Score


In [24]:
# lreg_w_b.get_coefficients()

In [25]:
lreg_w.get_coefficients()

gender_male                                      4.797787e+13
math score gender count                          3.744006e+13
math score gender mean                           3.045076e+13
math score gender median                         2.968024e+13
parental level of education                      2.677448e+13
                                                     ...     
reading score gender max                        -3.670179e+13
math score gender min                           -4.774803e+13
math score gender stdev                         -6.388830e+13
reading score parental level of education min   -6.757964e+13
math score parental level of education min      -9.853301e+13
Length: 100, dtype: float64

### Comparing Models

In [26]:
lreg_w = Model(lin_reg, model_data.data, writing_features, writing_target, 'Linear Regression - Writing')
lreg_w.build_model(model_compare)
lreg_w.cross_validate(4, model_compare)

svm_w = Model(svm_reg, model_data.data, writing_features, writing_target, 'Support Vector Machine - Writing')
svm_w.build_model(model_compare)
svm_w.cross_validate(4, model_compare)

rf_w = Model(rf_reg, model_data.data, writing_features, writing_target, 'Random Forest - Writing')
rf_w.build_model(model_compare)
rf_w.cross_validate(4, model_compare)

gb_w = Model(gb_reg, model_data.data, writing_features, writing_target, 'Gradient Boosting - Writing')
gb_w.build_model(model_compare)
gb_w.cross_validate(4, model_compare)



In [27]:
model_compare

Unnamed: 0,Training_R2,Test_R2,Root Mean Squared Error,Cross Validation Score
Linear Regression - Writing,0.948337,0.943697,3.667011,3.528955
Support Vector Machine - Writing,0.948071,0.933678,3.868197,3.651729
Random Forest - Writing,0.924069,0.919896,4.432069,4.512939
Gradient Boosting - Writing,0.944847,0.922375,4.305004,4.179977
