# Student Performance - Feature Engineering

## Set Up Environment

In [1]:
# Import libraries.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read CSV file into a dataframe.
scores = pd.read_csv('data/StudentsPerformance.csv')
scores.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


## Define Classes

In [3]:
class Group_Metrics:
    """
    The Group_Metrics class is for generating group metrics for a dataframe.  These metrics can be added to the original
    dataframe or used to create summary tables for specific groups.
    
    Attributes:
        self.data: The dataframe that is passed in that group metrics can be calculated from.
        self.group_cols: A list of categorical columns in the dataframe passed in for which group metrics can be calculated.
    """
    
    def __init__(self, df, group_cols, numeric_cols):
        """The constructor method takes in a dataframe and saves it as an attribute."""
        self.data = df
        self.group_cols = group_cols
        self.numeric_cols = numeric_cols
    
    def insert_group_metric(self, groups, metric_col, metric_name, func):
        """This method takes the dataframe attribute, calculates a metric for each group of a categorical column, and
        adds the resulting values as a new column in the dataframe."""
        group_values = self.data.pivot_table(index=groups, values=metric_col, aggfunc=func)[metric_col]
        if type(groups) == str:
            scores['{} {} {}'.format(metric_col, groups, metric_name)] = self.data.set_index(groups).index.map(group_values)
        else:
            scores['{} {} {}'.format(metric_col, groups, metric_name)] = self.data.set_index(
                list(groups)).index.map(group_values)
    
    def gen_summary_table(self, groups, metrics, metric_names):
        categorical_cols = self.group_cols
        categorical_cols.remove(groups)
        
        df_list = []
        
        for col in categorical_cols:
            group_values = pd.crosstab(index=self.data[groups], columns=self.data[col])
            df_list.append(group_values)
        
        for col in self.numeric_cols:
            for i in range(len(metrics)):
                group_values = self.data.pivot_table(index=groups, values=col, aggfunc=metrics[i])
                group_values.rename(columns={col: '{} {}'.format(metric_names[i], col)}, inplace=True)
                df_list.append(group_values)
        
        summary_table = pd.concat(df_list, axis=1)
        return summary_table
    
    def _slice_df(self, filters):
        """This method subsets the dataframe attribute by the filters that are passed in."""
        cols = list(filters)
        vals = list(filters.values())
        subset_df = self.data
        for i in range(len(filters)):
            subset_df = subset_df[subset_df[cols[i]] == vals[i]]
        return subset_df

In [4]:
groups = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
metric_cols = ['math score', 'reading score', 'writing score']
group_scores = Group_Metrics(scores, groups, metric_cols)

In [5]:
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [6]:
metric_names = ['mean', 'median', 'min', 'max', 'stdev', 'count']
functions = [np.mean, np.median, np.min, np.max, np.std, len]

In [8]:
group_scores.gen_summary_table('gender', functions, metric_names)

Unnamed: 0_level_0,group A,group B,group C,group D,group E,associate's degree,bachelor's degree,high school,master's degree,some college,...,min reading score,max reading score,stdev reading score,count reading score,mean writing score,median writing score,min writing score,max writing score,stdev writing score,count writing score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
female,36,104,180,129,69,116,63,94,36,118,...,17,100,14.378245,518,72.467181,74,10,100,14.844842,518
male,53,86,139,133,71,106,55,102,23,108,...,23,100,13.931832,482,63.311203,64,15,100,14.113832,482


In [9]:
group_scores.gen_summary_table('race/ethnicity', functions, metric_names)

Unnamed: 0_level_0,associate's degree,bachelor's degree,high school,master's degree,some college,some high school,free/reduced,standard,mean math score,median math score,...,min reading score,max reading score,stdev reading score,count reading score,mean writing score,median writing score,min writing score,max writing score,stdev writing score,count writing score
race/ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
group A,14,12,18,3,18,24,36,53,61.629213,61.0,...,23,100,15.543762,89,62.674157,62,19,97,15.468278,89
group B,41,20,48,6,37,38,69,121,63.452632,63.0,...,24,97,15.177499,190,65.6,67,15,96,15.625173,190
group C,78,40,64,19,69,49,114,205,64.46395,65.0,...,17,100,13.997033,319,67.827586,68,10,100,14.983378,319
group D,50,28,44,23,67,50,95,167,67.362595,69.0,...,31,100,13.895306,262,70.145038,72,32,100,14.367707,262
group E,39,18,22,8,35,18,41,99,73.821429,74.5,...,26,100,14.874024,140,71.407143,72,22,100,15.113906,140


In [10]:
group_scores.gen_summary_table('parental level of education', functions, metric_names)

Unnamed: 0_level_0,free/reduced,standard,mean math score,median math score,min math score,max math score,stdev math score,count math score,mean reading score,median reading score,min reading score,max reading score,stdev reading score,count reading score,mean writing score,median writing score,min writing score,max writing score,stdev writing score,count writing score
parental level of education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
associate's degree,77,145,67.882883,67.0,26,100,15.112093,222,70.927928,72.5,31,100,13.868948,222,69.896396,70.5,35,100,14.311122,222
bachelor's degree,44,74,69.389831,68.0,29,100,14.943789,118,73.0,73.0,41,100,14.28525,118,73.381356,74.0,38,100,14.728262,118
high school,70,126,62.137755,63.0,8,99,14.539651,196,64.704082,66.0,24,99,14.13213,196,62.44898,64.0,15,100,14.085907,196
master's degree,24,35,69.745763,73.0,40,95,15.153915,59,75.372881,76.0,42,100,13.775163,59,75.677966,75.0,46,100,13.730711,59
some college,79,147,67.128319,67.5,19,100,14.312897,226,69.460177,70.5,23,100,14.057049,226,68.840708,70.0,19,99,15.012331,226
some high school,61,118,63.497207,65.0,0,97,15.927989,179,66.938547,67.0,17,100,15.479295,179,64.888268,66.0,10,100,15.736197,179


In [11]:
group_scores.gen_summary_table('lunch', functions, metric_names)

Unnamed: 0_level_0,mean math score,median math score,min math score,max math score,stdev math score,count math score,mean reading score,median reading score,min reading score,max reading score,stdev reading score,count reading score,mean writing score,median writing score,min writing score,max writing score,stdev writing score,count writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
free/reduced,58.921127,60,0,100,15.159956,355,64.653521,65,17,100,14.895339,355,63.022535,64,10,100,15.433823,355
standard,70.034109,69,19,100,13.653501,645,71.654264,72,26,100,13.830602,645,70.823256,72,22,100,14.339487,645


In [7]:
group_scores.gen_summary_table('test preparation course', functions, metric_names)

Unnamed: 0_level_0,female,male,group A,group B,group C,group D,group E,associate's degree,bachelor's degree,high school,...,min reading score,max reading score,stdev reading score,count reading score,mean writing score,median writing score,min writing score,max writing score,stdev writing score,count writing score
test preparation course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
completed,184,174,31,68,117,82,60,82,46,56,...,37,100,13.638384,358,74.418994,76,36,100,13.375335,358
none,334,308,58,122,202,180,80,140,72,140,...,17,100,14.463885,642,64.504673,65,10,100,14.999661,642


In [12]:
for group in groups:
    for m_col in metric_cols:
        for i in range(len(metric_names)):
            group_scores.insert_group_metric(group, m_col, metric_names[i], functions[i])

In [13]:
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [14]:
from itertools import combinations
group_combos = list(combinations(groups, 2))
group_combos

[]

In [15]:
for group in group_combos:
    for m_col in metric_cols:
        for i in range(len(metric_names)):
            group_scores.insert_group_metric(group, m_col, metric_names[i], functions[i])

In [16]:
group_scores.data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [17]:
group_scores.data.to_csv('data/StudentsPerformance_Polished.csv', index=False)