## Final Project - Student Predictive Model

#### Imports

In [127]:
import csv
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap        
import numpy as np
import scipy as sp
import random
import pandas as pd
from math import sqrt
from sklearn import datasets
from sklearn.svm import SVC 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import seaborn as sb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

#### Import Data from CSV File

In [2]:
#Reading in Data
pre_processed_data = pd.read_csv('StudentsPerformance.csv', delimiter=',')
pre_processed_data.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


#### Global Constants

In [3]:
PARAMETER_ORDER = ['gender', 'race_ethnicity', 'parent_education', 'lunch', 'test_prep', 'math_score', 'reading_score', 'writing_score']

#### Helper Functions

In [17]:
def simple_plot(name, x_label, x, y_label, y):
    plt.scatter(x,y)
    plt.title(name)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
    
# Categorical to Numerical - Requires correct order - FIX  THIS
def convert_to_integer(data):
    return np.array([
        1,
        G_integer_map[data['gender']],
        RE_integer_map[data['race_ethnicity']],
        PE_integer_map[data['parent_education']],
        L_integer_map[data['lunch']],
        TP_integer_map[data['test_prep']]
    ]).reshape(1,-1)

def convert_to_onehot(data):
    preProcess = [
        1,
        G_onehot_map[data['gender']],
        RE_onehot_map[data['race_ethnicity']],
        PE_onehot_map[data['parent_education']],
        L_onehot_map[data['lunch']],
        TP_onehot_map[data['test_prep']]
    ]
    postProcess = np.concatenate(([np.array(arr) for arr in preProcess]), axis=None)
    return postProcess.reshape(1,-1)

def convert_to_binary(data):
    preProcess = [
        1,
        G_binary_map[data['gender']],
        RE_binary_map[data['race_ethnicity']],
        PE_binary_map[data['parent_education']],
        L_binary_map[data['lunch']],
        TP_binary_map[data['test_prep']]
    ]
    postProcess = np.concatenate(([np.array(arr) for arr in preProcess]), axis=None)
    return postProcess.reshape(1,-1)


#### Category Mapping (Integer Mapping)

In [5]:
# Gender Mapping
G_integer_map = {
    'female' : 1,
    'male' : 0
}

# Race/Ethnicity Mapping
RE_integer_map = {
    'group A' : 0,
    'group B' : 1,
    'group C' : 2,
    'group D' : 3,
    'group E' : 4
}

# Parental Education Mapping
PE_integer_map = {
    'some college' :        0,
    'associate\'s degree' : 1,
    'high school' :         2,
    'some high school' :    3,
    'bachelor\'s degree' :  4,
    'master\'s degree' :    5
}

# Lunch Mapping
L_integer_map = {
    'standard' : 0,
    'free/reduced' : 1
}

# Test Prep Mapping

TP_integer_map = {
    'none' : 0,
    'completed' : 1
}

#### One Hot Encoding

In [6]:
# Gender Mapping
G_onehot_map = {
    'female' : [1, 0],
    'male' : [0, 1]
}

# Race/Ethnicity Mapping
RE_onehot_map = {
    'group A' : [1, 0, 0, 0, 0],
    'group B' : [0, 1, 0, 0, 0],
    'group C' : [0, 0, 1, 0, 0],
    'group D' : [0, 0, 0, 1, 0],
    'group E' : [0, 0, 0, 0, 1]
}

# Parental Education Mapping
PE_onehot_map = {
    'some college' :        [1, 0, 0, 0, 0, 0],
    'associate\'s degree' : [0, 1, 0, 0, 0, 0],
    'high school' :         [0, 0, 1, 0, 0, 0],
    'some high school' :    [0, 0, 0, 1, 0, 0],
    'bachelor\'s degree' :  [0, 0, 0, 0, 1, 0],
    'master\'s degree' :    [0, 0, 0, 0, 0, 1]
}

# Lunch Mapping
L_onehot_map = {
    'standard' : [1, 0],
    'free/reduced' : [0, 1]
}

# Test Prep Mapping

TP_onehot_map = {
    'none' : [1, 0],
    'completed' : [0, 1]
}

#### Binary Encoding

In [7]:
# Gender Mapping
G_binary_map = {
    'female' : [0],
    'male' : [1]
}

# Race/Ethnicity Mapping
RE_binary_map = {
    'group A' : [0, 0, 0],
    'group B' : [0, 0, 1],
    'group C' : [0, 1, 0],
    'group D' : [0, 1, 1],
    'group E' : [1, 0, 0]
}

# Parental Education Mapping
PE_binary_map = {
    'some college' :        [0, 0, 0],
    'associate\'s degree' : [0, 0, 1],
    'high school' :         [0, 1, 0],
    'some high school' :    [0, 1, 1],
    'bachelor\'s degree' :  [1, 0, 0],
    'master\'s degree' :    [1, 0, 1]
}

# Lunch Mapping
L_binary_map = {
    'standard' : [0],
    'free/reduced' : [1]
}

# Test Prep Mapping

TP_binary_map = {
    'none' : [0],
    'completed' : [1]
}

#### Visualization of Data

In [8]:
#Visualizing Data

#Pre-Processed Individual Data
math = np.array(pre_processed_data["math score"])
reading = np.array(pre_processed_data["reading score"])
writing = np.array(pre_processed_data['writing score'])
gender = np.array(pre_processed_data['gender'])
race_ethnicity = np.array(pre_processed_data['race/ethnicity'])
parent_education = np.array(pre_processed_data['parental level of education'])
lunch = np.array(pre_processed_data['lunch'])
test_prep = np.array(pre_processed_data['test preparation course'])

# Mapped Individual Data

mapped_gender = np.array([G_integer_map[value] for value in gender])
mapped_race_ethnicity = np.array([RE_integer_map[value] for value in race_ethnicity])
mapped_parent_education = np.array([PE_integer_map[value] for value in parent_education])
mapped_lunch = np.array([L_integer_map[value] for value in lunch])
mapped_test_prep = np.array([TP_integer_map[value] for value in test_prep])

# Processed 
average_score = (math + reading + writing) / 3
intersect = np.ones(len(average_score))

# MAPPED DATA MATRICIES

N = len(average_score)

#Integer Mapped
integer_mapped_data_matrix = np.column_stack((intersect, mapped_gender, 
                               mapped_race_ethnicity, 
                               mapped_parent_education, 
                               mapped_lunch, 
                               mapped_test_prep))

#One Hot Encoded
onehot_mapped_data_matrix = np.vstack(([convert_to_onehot({'gender':gender[i],
                                                          'race_ethnicity':race_ethnicity[i],
                                                          'parent_education':parent_education[i],
                                                          'lunch':lunch[i],
                                                          'test_prep':test_prep[i]}) for i in range(N)]))

#Binary Encoded
binary_mapped_data_matrix = np.vstack(([convert_to_binary({'gender':gender[i],
                                                          'race_ethnicity':race_ethnicity[i],
                                                          'parent_education':parent_education[i],
                                                          'lunch':lunch[i],
                                                          'test_prep':test_prep[i]}) for i in range(N)]))

integer_reg = LinearRegression().fit(integer_mapped_data_matrix, average_score)
onehot_reg = LinearRegression().fit(onehot_mapped_data_matrix, average_score)
binary_reg = LinearRegression().fit(binary_mapped_data_matrix, average_score)

dict1 = {'gender':'female', 
            'race_ethnicity':'group B', 
            'parent_education':'bachelor\'s degree', 
            'lunch':'standard', 
            'test_prep':'none'}

dict2 = {'gender':'male', 
            'race_ethnicity':'group A', 
            'parent_education':'associate\'s degree', 
            'lunch':'free/reduced', 
            'test_prep':'none'}

integer_converted1 = convert_to_integer(dict1)
onehot_converted1 = convert_to_onehot(dict1)
binary_converted1 = convert_to_binary(dict1)

integer_converted_2 = convert_to_integer(dict2)
onehot_converted2 = convert_to_onehot(dict2)
binary_converted2 = convert_to_binary(dict2)

print('INTEGER')
print(integer_reg.predict(integer_converted1))
print(integer_reg.predict(integer_converted_2))

print('ONEHOT')
print(onehot_reg.predict(onehot_converted1))
print(onehot_reg.predict(onehot_converted2))

print('BINARY')
print(binary_reg.predict(binary_converted1))
print(binary_reg.predict(binary_converted2))

INTEGER
[68.29569006]
[52.58197253]
ONEHOT
[72.125]
[55.375]
BINARY
[72.61428369]
[55.01559059]


#### Visualizing a 2-Dimensional Plot With Classes (Classification)

In [84]:
GENDER_COL = 0
RACE_COL = 1
PARENTAL_EDU_COL = 2
LUNCH_COL = 3
TEST_PREP_COL = 4
MATH_SCORE_COL = 5
READING_SCORE_COL = 6
WRITING_SCORE_COL = 7

# math = np.array(raw_data["math score"])
# reading = np.array(raw_data["reading score"])
# simple_plot(math, reading)

# def class_plot(class_name, class1, class2, feature1, feature2):
#     plt.scatter(df2[feature1][(class_name == class1) | (class_name == class1)],
#             df2[feature2][(class_name == class1) | (class_name == class1)],
#            color='red',
#            label=class1.capitalize() + " Scores")
#     plt.scatter(df2[feature1][class_name == class2],
#             df2[feature2][class_name == class2],
#            color='blue',
#            label=class2.capitalize() + " Scores")

#     plt.title("Classification Plot based on " + class_name.name.capitalize())
#     plt.xlabel(feature1.capitalize())
#     plt.ylabel(feature2.capitalize())
#     plt.legend()
    
# class_plot(pre_processed_data.gender, 'male', 'female', 'writing score', 'reading score')



#### Comparing Integer vs. Binary vs. One-Hot Encoding

In [113]:
import warnings
warnings.filterwarnings('ignore')
class K_Fold():
    def __init__(self, X, y, k):
        self.X = X
        self.y = y
        self.k = k
        
    def k_fold_test(self):
        #Split up data into folds
        listX = np.array_split(self.X, self.k, axis=0)
        listY = np.array_split(self.y, self.k)

        #Set up SVC
        model = LinearRegression()
        
        #get average accuracy
        total = 0
        for i in range(self.k):
            #Partition matricies to use
            training_X = np.vstack(np.delete(listX, i, 0))
            training_Y = np.vstack(np.delete(listY, i, 0))
            
            #Test Fit
            model.fit(training_X, np.ravel(training_Y))
            accuracy = model.score(listX[i], listY[i])
            total += accuracy
        return total/self.k

In [129]:
# Split into test and train data and shuffle (80/20 split)
int_x_shuffled, int_y_shuffled = shuffle(integer_mapped_data_matrix, average_score)
binary_x_shuffled, binary_y_shuffled = shuffle(binary_mapped_data_matrix, average_score)
OH_x_shuffled, OH_y_shuffled = shuffle(onehot_mapped_data_matrix, average_score)

x_int_train, x_int_test, y_int_train, y_int_test = train_test_split(int_x_shuffled, int_y_shuffled, test_size = 0.2, random_state = 37)
x_binary_train, x_binary_test, y_binary_train, y_binary_test = train_test_split(binary_x_shuffled, binary_y_shuffled, test_size = 0.2, random_state = 37)
x_OH_train, x_OH_test, y_OH_train, y_OH_test = train_test_split(OH_x_shuffled, OH_y_shuffled, test_size = 0.2, random_state = 37)

int_reg = LinearRegression().fit(x_int_train, y_int_train)
OH_reg = LinearRegression().fit(x_OH_train, y_OH_train)
binary_reg = LinearRegression().fit(x_binary_train, y_binary_train)

# Create K-Fold Classes for each Encoding Method
kf_int = K_Fold(int_x_shuffled, int_y_shuffled, k=5)
kf_OH = K_Fold(OH_x_shuffled, OH_y_shuffled, k=5)
kf_binary = K_Fold(binary_x_shuffled, binary_y_shuffled, k=5)

print(kf_int.k_fold_test())
print(kf_OH.k_fold_test())
print(kf_binary.k_fold_test())

0.18619955812345346
0.2152502516660701
0.22068128863305167
