# Student Performance - Feature Engineering

## Set Up Environment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the raw data set (path is relative to the notebook's working directory)
# and preview its first three rows.
scores = pd.read_csv('data/StudentsPerformance.csv')
scores.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


## Define Classes

In [3]:
class Group_Metrics:
    """Attach per-group summary statistics to a DataFrame as new columns.

    The wrapped frame is stored by reference in ``data`` and is modified in
    place by ``insert_group_metric``.
    """

    def __init__(self, df):
        # Kept by reference (no copy): columns added later are also visible
        # through the caller's own variable for the same frame.
        self.data = df

    def insert_group_metric(self, groups, metric_col, metric_name, func):
        """Add a column holding the group-level aggregate for every row.

        Parameters
        ----------
        groups : str or iterable of str
            Column name, or several column names, that define the groups.
        metric_col : str
            Column whose values are aggregated.
        metric_name : str
            Label for the statistic; used only in the new column's name.
        func : callable or str
            Aggregation handed to ``DataFrame.pivot_table`` as ``aggfunc``.

        The new column is named ``'<metric_col> <groups> <metric_name>'``,
        with ``groups`` formatted exactly as it was passed in (a tuple keeps
        its tuple repr). Returns ``None``; ``self.data`` is modified in place.
        """
        # A single column name is used as-is; anything else becomes a list so
        # that set_index treats it as several keys.
        keys = groups if isinstance(groups, str) else list(groups)

        # One aggregated value per distinct group, indexed by the group key(s).
        group_values = self.data.pivot_table(
            index=keys, values=metric_col, aggfunc=func)[metric_col]

        column_name = '{} {} {}'.format(metric_col, groups, metric_name)

        # Look up each row's group key in the aggregate. The result is an
        # Index, which is assigned positionally. It is written to the wrapped
        # frame itself rather than to a notebook-level global.
        self.data[column_name] = self.data.set_index(keys).index.map(group_values)

    def _slice_df(self, filters):
        """Return the rows of ``data`` that satisfy every equality filter.

        Parameters
        ----------
        filters : dict
            Maps a column name to the value that column must equal.

        With an empty ``filters`` the wrapped frame itself is returned.
        """
        subset_df = self.data
        for col, val in filters.items():
            subset_df = subset_df[subset_df[col] == val]
        return subset_df

In [4]:
# Wrap the loaded frame. Group_Metrics stores it by reference (no copy), so
# `group_scores.data` and `scores` refer to the same DataFrame object.
group_scores = Group_Metrics(scores)

In [5]:
# Display the wrapped frame before any group-statistic columns are added.
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [6]:
# Grouping columns and the score columns to summarise.
groups = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
metric_cols = ['math score', 'reading score', 'writing score']

# Parallel lists: metric_names[k] labels the column produced by functions[k].
metric_names = ['mean', 'median', 'min', 'max', 'stdev', 'count']
functions = [np.mean, np.median, np.min, np.max, np.std, len]

# Add one derived column per (grouping column, score column, statistic).
for group in groups:
    for m_col in metric_cols:
        for stat_name, stat_func in zip(metric_names, functions):
            group_scores.insert_group_metric(group, m_col, stat_name, stat_func)

In [7]:
# Display the frame after the single-column group statistics were inserted.
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,math score gender mean,math score gender median,...,reading score test preparation course min,reading score test preparation course max,reading score test preparation course stdev,reading score test preparation course count,writing score test preparation course mean,writing score test preparation course median,writing score test preparation course min,writing score test preparation course max,writing score test preparation course stdev,writing score test preparation course count
0,female,group B,bachelor's degree,standard,none,72,72,74,63.633205,65,...,17,100,14.463885,642,64.504673,65,10,100,14.999661,642
1,female,group C,some college,standard,completed,69,90,88,63.633205,65,...,37,100,13.638384,358,74.418994,76,36,100,13.375335,358
2,female,group B,master's degree,standard,none,90,95,93,63.633205,65,...,17,100,14.463885,642,64.504673,65,10,100,14.999661,642
3,male,group A,associate's degree,free/reduced,none,47,57,44,68.728216,69,...,17,100,14.463885,642,64.504673,65,10,100,14.999661,642
4,male,group C,some college,standard,none,76,78,75,68.728216,69,...,17,100,14.463885,642,64.504673,65,10,100,14.999661,642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,63.633205,65,...,37,100,13.638384,358,74.418994,76,36,100,13.375335,358
996,male,group C,high school,free/reduced,none,62,55,55,68.728216,69,...,17,100,14.463885,642,64.504673,65,10,100,14.999661,642
997,female,group C,high school,free/reduced,completed,59,71,65,63.633205,65,...,37,100,13.638384,358,74.418994,76,36,100,13.375335,358
998,female,group D,some college,standard,completed,68,78,77,63.633205,65,...,37,100,13.638384,358,74.418994,76,36,100,13.375335,358


In [8]:
from itertools import combinations

# Every grouping made of two or more columns, ordered by size and, within a
# size, in the order `combinations` yields them. The sizes are derived from
# len(groups) so the list stays complete if `groups` gains or loses entries.
group_combos = [
    combo
    for size in range(2, len(groups) + 1)
    for combo in combinations(groups, size)
]
group_combos

[('gender', 'race/ethnicity'),
 ('gender', 'parental level of education'),
 ('gender', 'lunch'),
 ('gender', 'test preparation course'),
 ('race/ethnicity', 'parental level of education'),
 ('race/ethnicity', 'lunch'),
 ('race/ethnicity', 'test preparation course'),
 ('parental level of education', 'lunch'),
 ('parental level of education', 'test preparation course'),
 ('lunch', 'test preparation course'),
 ('gender', 'race/ethnicity', 'parental level of education'),
 ('gender', 'race/ethnicity', 'lunch'),
 ('gender', 'race/ethnicity', 'test preparation course'),
 ('gender', 'parental level of education', 'lunch'),
 ('gender', 'parental level of education', 'test preparation course'),
 ('gender', 'lunch', 'test preparation course'),
 ('race/ethnicity', 'parental level of education', 'lunch'),
 ('race/ethnicity', 'parental level of education', 'test preparation course'),
 ('race/ethnicity', 'lunch', 'test preparation course'),
 ('parental level of education', 'lunch', 'test preparation 

In [9]:
# Repeat the per-group statistics for every multi-column grouping.
for group in group_combos:
    for m_col in metric_cols:
        for stat_name, stat_func in zip(metric_names, functions):
            group_scores.insert_group_metric(group, m_col, stat_name, stat_func)

In [10]:
# Display the frame after the multi-column group statistics were inserted.
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,math score gender mean,math score gender median,...,"reading score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') min","reading score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') max","reading score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') stdev","reading score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') count","writing score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') mean","writing score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') median","writing score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') min","writing score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') max","writing score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') stdev","writing score ('gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course') count"
0,female,group B,bachelor's degree,standard,none,72,72,74,63.633205,65,...,65,97,11.758685,6,78.666667,77.0,69,96,10.112698,6
1,female,group C,some college,standard,completed,69,90,88,63.633205,65,...,71,95,8.112841,11,84.818182,84.0,76,94,6.896639,11
2,female,group B,master's degree,standard,none,90,95,93,63.633205,65,...,90,95,3.535534,2,88.500000,88.5,84,93,6.363961,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,68.728216,69,...,57,61,2.828427,2,49.500000,49.5,44,55,7.778175,2
4,male,group C,some college,standard,none,76,78,75,68.728216,69,...,39,87,14.826778,13,58.153846,60.0,37,81,14.478986,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,63.633205,65,...,99,99,0.000000,2,97.500000,97.5,95,100,3.535534,2
996,male,group C,high school,free/reduced,none,62,55,55,68.728216,69,...,34,72,12.106669,7,52.571429,55.0,36,59,7.849780,7
997,female,group C,high school,free/reduced,completed,59,71,65,63.633205,65,...,66,79,6.557439,3,71.000000,65.0,64,84,11.269428,3
998,female,group D,some college,standard,completed,68,78,77,63.633205,65,...,75,97,7.565586,7,86.428571,83.0,77,98,8.482475,7


In [11]:
# Write the augmented frame to CSV; index=False leaves the row index out of
# the file.
group_scores.data.to_csv('data/StudentsPerformance_Polished.csv', index=False)