# Student Performance - Feature Engineering

## Set Up Environment

In [1]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load the raw student-performance data and preview the first three rows.
scores = pd.read_csv('data/StudentsPerformance.csv')
scores.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


In [3]:
# Mean of each score column per (gender, race/ethnicity) pair.
# numeric_only=True skips the remaining string columns (parental education,
# lunch, test prep); without it pandas >= 2.0 raises TypeError on them.
scores.groupby(['gender', 'race/ethnicity']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,math score,reading score,writing score
gender,race/ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,group A,58.527778,69.0,67.861111
female,group B,61.403846,71.076923,70.048077
female,group C,62.033333,71.944444,71.777778
female,group D,65.248062,74.046512,75.023256
female,group E,70.811594,75.84058,75.536232
male,group A,63.735849,61.735849,59.150943
male,group B,65.930233,62.848837,60.22093
male,group C,67.611511,65.42446,62.71223
male,group D,69.413534,66.135338,65.413534
male,group E,76.746479,70.295775,67.394366


## Define Classes

In [4]:
class Group_Metrics:
    """Attach per-group summary statistics to a DataFrame as new columns.

    The frame handed to the constructor is stored by reference in ``self.data``
    (no copy is made) and is modified in place by ``insert_group_metric``.
    """

    def __init__(self, df):
        """Keep ``df`` as the frame that metric columns are added to."""
        self.data = df

    def insert_group_metric(self, group, metric_col, metric_name, func):
        """Add a column holding ``func``'s value for the group each row belongs to.

        Parameters
        ----------
        group : column label or list of column labels
            A single label groups by that column. A list groups by every
            distinct combination of values found in the listed columns.
        metric_col : column label
            Column handed to ``func`` as the one to summarise.
        metric_name : str
            Text used only to build the new column's name.
        func : callable
            Called as ``func(subset_df, metric_col)`` once per group; its
            return value is broadcast to every row of that group.

        The new column is named ``'<metric_col> <group> <metric_name>'``; for a
        list ``group`` the labels are joined with ``'-'``.

        Raises
        ------
        ValueError
            If ``group`` is an empty list.
        """
        if not isinstance(group, list):
            group_values = {}
            for g in self.data[group].unique():
                subset_df = self._slice_df({group: g})
                group_values[g] = func(subset_df, metric_col)
            new_values = self.data[group].map(group_values)
            group_label = group
        else:
            if not group:
                raise ValueError('group must name at least one column')
            # One tuple of grouping values per row, in row order.
            row_keys = list(zip(*(self.data[col] for col in group)))
            group_values = {}
            # dict.fromkeys de-duplicates the combinations while keeping
            # first-seen order, so func runs once per distinct combination.
            for combo in dict.fromkeys(row_keys):
                subset_df = self._slice_df(dict(zip(group, combo)))
                group_values[combo] = func(subset_df, metric_col)
            new_values = [group_values[key] for key in row_keys]
            group_label = '-'.join(str(col) for col in group)
        self.data['{} {} {}'.format(metric_col, group_label, metric_name)] = new_values

    def _slice_df(self, filters):
        """Return the rows of ``self.data`` where every ``column == value`` pair in ``filters`` holds."""
        subset_df = self.data
        for col, val in filters.items():
            subset_df = subset_df[subset_df[col] == val]
        return subset_df

    def _calc_mean(self, df, col):
        """Mean of ``df[col]``, rounded to one decimal place."""
        return round(df[col].mean(), 1)

    def _calc_median(self, df, col):
        """Median of ``df[col]``."""
        return df[col].median()

    def _calc_min(self, df, col):
        """Minimum of ``df[col]``."""
        return df[col].min()

    def _calc_max(self, df, col):
        """Maximum of ``df[col]``."""
        return df[col].max()

    def _calc_stdev(self, df, col):
        """Standard deviation of ``df[col]`` as computed by ``Series.std()``."""
        return df[col].std()
    
#     def insert_group_metric(self, groups, metric_col, metric_name):
#         group_values = self.data.groupby(groups).mean()[metric_col]
#         group_values = {}
#         for g in self.data[group].unique():
#             subset_df = self.slice_df({group: g})
#             value = func(subset_df, metric_col)
#             group_values[g] = value
#         self.data['{} {} {}'.format(metric_col, group, metric_name)] = self.data[group].map(group_values)
    
    

In [5]:
# Wrap a copy so the metric columns added later do not mutate the raw
# `scores` frame; earlier cells that read `scores` stay re-runnable.
group_scores = Group_Metrics(scores.copy())

In [6]:
# Show the frame held by the Group_Metrics wrapper before any metric columns are added.
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [7]:
# Categorical columns to group by, score columns to summarise, and the
# metric labels paired position-by-position with the functions computing them.
groups = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
metric_cols = ['math score', 'reading score', 'writing score']
metric_names = ['mean', 'median', 'min', 'max', 'stdev']
functions = [
    group_scores._calc_mean,
    group_scores._calc_median,
    group_scores._calc_min,
    group_scores._calc_max,
    group_scores._calc_stdev,
]

# One new column per (grouping column, score column, metric) triple.
for group in groups:
    for m_col in metric_cols:
        for m_name, m_func in zip(metric_names, functions):
            group_scores.insert_group_metric(group, m_col, m_name, m_func)

In [8]:
# Show the frame again, now including the per-group metric columns inserted by the loop above.
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,math score gender mean,math score gender median,...,reading score test preparation course mean,reading score test preparation course median,reading score test preparation course min,reading score test preparation course max,reading score test preparation course stdev,writing score test preparation course mean,writing score test preparation course median,writing score test preparation course min,writing score test preparation course max,writing score test preparation course stdev
0,female,group B,bachelor's degree,standard,none,72,72,74,63.6,65.0,...,66.5,67.0,17,100,14.463885,64.5,65.0,10,100,14.999661
1,female,group C,some college,standard,completed,69,90,88,63.6,65.0,...,73.9,75.0,37,100,13.638384,74.4,76.0,36,100,13.375335
2,female,group B,master's degree,standard,none,90,95,93,63.6,65.0,...,66.5,67.0,17,100,14.463885,64.5,65.0,10,100,14.999661
3,male,group A,associate's degree,free/reduced,none,47,57,44,68.7,69.0,...,66.5,67.0,17,100,14.463885,64.5,65.0,10,100,14.999661
4,male,group C,some college,standard,none,76,78,75,68.7,69.0,...,66.5,67.0,17,100,14.463885,64.5,65.0,10,100,14.999661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,63.6,65.0,...,73.9,75.0,37,100,13.638384,74.4,76.0,36,100,13.375335
996,male,group C,high school,free/reduced,none,62,55,55,68.7,69.0,...,66.5,67.0,17,100,14.463885,64.5,65.0,10,100,14.999661
997,female,group C,high school,free/reduced,completed,59,71,65,63.6,65.0,...,73.9,75.0,37,100,13.638384,74.4,76.0,36,100,13.375335
998,female,group D,some college,standard,completed,68,78,77,63.6,65.0,...,73.9,75.0,37,100,13.638384,74.4,76.0,36,100,13.375335


In [9]:
# Pairwise group features: every unordered pair of grouping columns.
group_combos = list(combinations(groups, 2))

# The original loop iterated over each member of a pair separately, which only
# re-inserted the single-column metrics that already exist. Group by both
# columns of the pair so each combination of values gets its own statistic.
for combo in group_combos:
    combo_cols = list(combo)
    combo_label = '-'.join(combo_cols)
    for m_col in metric_cols:
        # Only the first four metrics (mean, median, min, max) are used, as in
        # the original range(4); 'stdev' is skipped -- presumably because small
        # pair groups make it unreliable (TODO confirm).
        for m_name in metric_names[:4]:
            values = group_scores.data.groupby(combo_cols)[m_col].transform(m_name)
            if m_name == 'mean':
                # Match the one-decimal rounding used by Group_Metrics._calc_mean.
                values = values.round(1)
            group_scores.data['{} {} {}'.format(m_col, combo_label, m_name)] = values

In [10]:
# Inspect the MultiIndex of (gender, race/ethnicity) combinations present in the data.
group_scores.data.groupby(['gender', 'race/ethnicity']).min().index

MultiIndex([('female', 'group A'),
            ('female', 'group B'),
            ('female', 'group C'),
            ('female', 'group D'),
            ('female', 'group E'),
            (  'male', 'group A'),
            (  'male', 'group B'),
            (  'male', 'group C'),
            (  'male', 'group D'),
            (  'male', 'group E')],
           names=['gender', 'race/ethnicity'])

In [11]:
# hip_hop = pd.DataFrame(
#     {'mc': ['kane','kane','g rap','g rap'], 
#      'producer': ['marley marl','rick rubin','marley marl','rick rubin']})
# # hip_hop
# hh_dict = {}
# for i in range(len(combinations(, 2)))
# for m in hip_hop.mc.unique():
#     for p in hip_hop.producer.unique():
#         hh_dict['mc-producer'] = m
#         hh_dict['producer'] = p
# hh_dict

In [12]:
# Write the frame, including the added metric columns, to CSV; index=False leaves out the row index.
group_scores.data.to_csv('data/StudentsPerformance_Polished.csv', index=False)