# Student Performance - Models

## Set Up Environment

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split

In [2]:
# Load the polished student-performance table and preview its first three rows
scores = pd.read_csv(filepath_or_buffer='data/StudentsPerformance_Polished.csv')
scores.iloc[:3]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


## Define Classes

In [None]:
class Data_Processing:
    """
    This class is for cleaning data and preparing it for modeling.

    Every method updates ``self.data``.  The dataframe passed to the
    constructor is copied first, so the caller's own object is never modified.

    Attributes:
        data (dataframe): Working copy of the pandas dataframe supplied when
            creating a Data_Processing object.
    """

    def __init__(self, data):
        """This method is the constructor for the Data_Processing class."""
        # Work on a copy: without it, columns added or dropped before the first
        # one_hot_encode call would also appear on / vanish from the caller's frame.
        self.data = data.copy()

    def label_encode(self, col, new_col, map_dict):
        """This method converts a categorical column to numeric codes based on a hierarchy.

        Args:
            col: Name of the existing categorical column.
            new_col: Name of the column that receives the codes.
            map_dict (dict): Mapping from category value to numeric code.
                Values missing from the mapping become NaN in new_col.
        """
        self.data[new_col] = self.data[col].map(map_dict)

    def one_hot_encode(self, col, drop_first=True):
        """This method converts a non-hierarchical categorical column to dummy variables.

        Args:
            col: Name of the column to encode; it is replaced by its dummy columns.
            drop_first (bool): Passed to pd.get_dummies; when True the first
                category level gets no dummy column.
        """
        self.data = pd.get_dummies(self.data, columns=[col], drop_first=drop_first)

    def drop_columns(self, cols):
        """This method drops a single column or a list of columns.

        Args:
            cols: A column label or a list of column labels.
        """
        self.data = self.data.drop(columns=cols)

    def transform(self, col, new_col, function):
        """This method applies a function to every value of an existing column
        and stores the results in a new column.

        Args:
            col: Name of the source column.
            new_col: Name of the column that receives the results.
            function: Callable applied to each value of col.
        """
        self.data[new_col] = self.data[col].apply(function)

    def convert_to_bool(self, string):
        """This method creates an indicator value from a string.

        Args:
            string: The value to test.

        Returns:
            int: 0 when the value is the text 'none' in any letter case, otherwise 1.
        """
        # Case-insensitive on purpose: the previewed data stores this marker in
        # lowercase, which an exact comparison against 'NONE' never matched.
        if isinstance(string, str) and string.upper() == 'NONE':
            return 0
        return 1

    def normalize(self, col, new_col):
        """This method scales a numeric column by its largest absolute value.

        For a column without negative values the result lies in the 0-1 range.

        Args:
            col: Name of the numeric column to scale.
            new_col: Name of the column that receives the scaled values.
        """
        # Normalizer works per sample (row), so the whole column is passed in as
        # one sample and the single scaled row is read back out.
        scaler = preprocessing.Normalizer(norm='max')
        self.data[new_col] = scaler.transform([self.data[col]])[0]

In [None]:
class Model:
    """
    This class is for building a model, training it, and using it to make predictions.

    Attributes:
        model: The estimator supplied as model_type when instantiating the object;
            build_model replaces it with the result of fitting it.
        data (dataframe): The pandas dataframe used to train the model and use it for predictions.
        features (list): The variables used by the model to make predictions.
        target: The value the model aims to predict.
        name: The label under which this model's metrics are stored in the comparison dataframe.
        compare_df: The comparison dataframe most recently passed to build_model (None until then).
        cv_rmse: The RMSE from the most recent cross_validate call (None until then).
    """

    def __init__(self, model_type, data, features, target, model_name):
        """This method is the constructor for the Model class."""
        self.model = model_type
        self.data = data
        self.features = features
        self.target = target
        self.name = model_name
        self.compare_df = None
        self.cv_rmse = None

    def build_model(self, compare_df, test_size=.2, random_state=None):
        """This method splits the dataframe into training and test sets, trains the model
        on the training data, makes predictions on the test set, and saves metrics in the
        compare_df dataframe.

        Args:
            compare_df (dataframe): Receives one row, indexed by this model's name, holding
                [training score, test R^2, test RMSE, placeholder for the CV RMSE].
            test_size: Passed to train_test_split.
            random_state: Passed to train_test_split; set it for a reproducible split.

        Returns:
            The fitted model.
        """
        # Split data set into training and test sets based on the test_size parameter
        features_train, features_test, target_train, target_test = train_test_split(
            self.data[self.features],
            self.data[self.target],
            test_size=test_size,
            random_state=random_state,
        )

        # Fit the model to the training data and save the score as a variable
        self.model = self.model.fit(features_train, target_train)
        training_r2 = self.model.score(features_train, target_train)

        # Make predictions on the test set and save the R^2 and RMSE scores as variables
        predicted_target = self.model.predict(features_test)
        test_r2 = r2_score(y_true=target_test, y_pred=predicted_target)
        mse = mean_squared_error(y_true=target_test, y_pred=predicted_target)
        rmse = mse**.5

        # Add the scores to the compare_df.  The cross-validation slot is left as NaN
        # (not 0, which would read as a perfect score) until cross_validate fills it.
        compare_df.loc[self.name] = [training_r2, test_r2, rmse, float('nan')]

        # Remember the frame so cross_validate can write to the same place
        self.compare_df = compare_df

        return self.model

    def cross_validate(self, cv, scoring='neg_mean_squared_error', compare_df=None):
        """This method runs cross validation on the dataset and stores the resulting RMSE.

        The score conversion negates the mean score and takes its square root, so it is
        only meaningful for a negated-MSE scoring such as the default.

        Args:
            cv: Passed to cross_val_score.
            scoring: Passed to cross_val_score.
            compare_df (dataframe): Where to store the result, in column 'cv_rmse' of the
                row indexed by this model's name.  Defaults to the dataframe given to
                the last build_model call.

        Raises:
            ValueError: If no compare_df is given and build_model has not been called.
        """
        if compare_df is None:
            compare_df = getattr(self, 'compare_df', None)
        if compare_df is None:
            raise ValueError('No comparison dataframe available: pass compare_df or call build_model first.')

        neg_mse = cross_val_score(self.model, self.data[self.features], self.data[self.target], cv=cv, scoring=scoring)
        avg_mse = sum(neg_mse) / len(neg_mse) * -1.0
        avg_rmse = avg_mse**.5

        self.cv_rmse = avg_rmse
        compare_df.loc[self.name, 'cv_rmse'] = avg_rmse

    def predict(self, data):
        """This method makes predictions on the given dataset and returns them."""
        return self.model.predict(data)

In [None]:
class Compare:
    """
    This class is for comparing the results of different models.

    Attributes:
        data (dataframe): The pandas dataframe supplied when initiating a Compare object.
    """

    def __init__(self, df):
        """This method is the constructor for the Compare class."""
        self.data = df

    def graph_results(self, col, name, figsize=(7,4), left_margin=.4):
        """This method creates a horizontal bar graph of a given column of the dataframe
        and saves it as a jpg file.

        Args:
            col (str): Column of the dataframe to plot; underscores are replaced by
                spaces to form the graph title.
            name: Inserted into the output file name, charts/model_comparison_<name>.jpg.
            figsize (tuple): Figure size passed to plt.subplots.
            left_margin (float): Left margin of the figure, leaving room for the bar labels.
        """
        metric_compare = self.data[col].sort_values()
        graph_title = col.replace('_',' ')
        file_name = 'charts/model_comparison_{}.jpg'.format(name)

        # Saving fails if the output folder is missing, so create it on demand
        os.makedirs(os.path.dirname(file_name), exist_ok=True)

        fig, ax = plt.subplots(figsize=figsize)
        fig.subplots_adjust(left=left_margin)
        ax.barh(metric_compare.index, metric_compare)
        ax.set_title(graph_title)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Save through the figure itself rather than pyplot's "current figure" state
        fig.savefig(file_name)

## Prepare Data for Modeling

In [None]:
# Work on a copy so the raw `scores` frame stays intact and this cell can be re-run
model_data = Data_Processing(scores.copy())

# Add a scaled copy of every score column
model_data.normalize('math score', 'math score norm')
model_data.normalize('reading score', 'reading score norm')
model_data.normalize('writing score', 'writing score norm')

# Education levels are ordered, so they are encoded as ranks instead of dummies
level_of_education_map = {
    "some high school": 1,
    "high school": 2,
    "some college": 3,
    "associate's degree": 4,
    "bachelor's degree": 5,
    "master's degree": 6
}
model_data.label_encode('parental level of education', 'parental level of education code', level_of_education_map)

# Remove columns that must not reach the models.  The feature lists below are built
# from every remaining column, so anything left here becomes a predictor:
#   - 'Unnamed: 0' is the row index saved into the CSV, not a real variable
#   - 'parental level of education' is the raw text, now replaced by its numeric code
model_data.drop_columns(['Unnamed: 0', 'parental level of education'])

# The remaining categorical columns have no natural order: one-hot encode them
category_cols = ['gender', 'race/ethnicity', 'lunch', 'test preparation course']
for col in category_cols:
    model_data.one_hot_encode(col)

## Build Models

In [None]:
# One unfitted estimator per model family, all with default settings
lin_reg = LinearRegression()
svm_reg = LinearSVR()
knn_reg = KNeighborsRegressor()
rf_reg = RandomForestRegressor()
gb_reg = GradientBoostingRegressor()

# Collects one row of metrics per model; rows are added by Model.build_model
model_compare = pd.DataFrame(columns=['training_r2', 'test_r2', 'rmse', 'cv_rmse'])

### Predicting Math Score

In [None]:
# The column the math models predict
math_target = 'math score'

# Columns kept out of the predictors: the three raw scores and the scaled
# copy of the target itself
remove_cols = ['math score', 'reading score', 'writing score', 'math score norm']

# Start from every prepared column, then strip the excluded ones
math_features = model_data.data.columns.tolist()
for col in remove_cols:
    math_features.remove(col)

In [None]:
# Linear regression on the math score: fit, record metrics, then cross-validate
lreg = Model(
    model_type=lin_reg,
    data=model_data.data,
    features=math_features,
    target=math_target,
    model_name='Linear Regression - Math',
)
lreg.build_model(compare_df=model_compare)
lreg.cross_validate(cv=4)

In [None]:
# Show the metrics collected so far
model_compare

### Predicting Reading Score

In [None]:
# The column the reading models predict
reading_target = 'reading score'

# Columns kept out of the predictors: the three raw scores and the scaled
# copy of the target itself
remove_cols = ['math score', 'reading score', 'writing score', 'reading score norm']

# Start from every prepared column, then strip the excluded ones
reading_features = model_data.data.columns.tolist()
for col in remove_cols:
    reading_features.remove(col)

In [None]:
# Linear regression on the reading score: fit, record metrics, then cross-validate
lreg = Model(
    model_type=lin_reg,
    data=model_data.data,
    features=reading_features,
    target=reading_target,
    model_name='Linear Regression - Reading',
)
lreg.build_model(compare_df=model_compare)
lreg.cross_validate(cv=4)

In [None]:
# Show the metrics collected so far
model_compare

### Predicting Writing Score

In [None]:
# The column the writing models predict
writing_target = 'writing score'

# Columns kept out of the predictors: the three raw scores and the scaled
# copy of the target itself
remove_cols = ['math score', 'reading score', 'writing score', 'writing score norm']

# Start from every prepared column, then strip the excluded ones
writing_features = model_data.data.columns.tolist()
for col in remove_cols:
    writing_features.remove(col)

In [None]:
# Linear regression on the writing score: fit, record metrics, then cross-validate
lreg = Model(
    model_type=lin_reg,
    data=model_data.data,
    features=writing_features,
    target=writing_target,
    model_name='Linear Regression - Writing',
)
lreg.build_model(compare_df=model_compare)
lreg.cross_validate(cv=4)

In [None]:
# Show the metrics collected so far
model_compare