In [8]:
import numpy as np
import pandas as pd

In [9]:
# Load the StudentsPerformance dataset and replace the space-separated column
# names with snake_case ones so they can be used as attributes (df.math_score).
# Columns without spaces (gender, race/ethnicity, lunch) keep their names.
students_performance = (
    pd.read_csv('https://stepik.org/media/attachments/course/4852/StudentsPerformance.csv')
    .rename(
        {
            'parental level of education': 'parental_level_of_education',
            'test preparation course': 'test_preparation_course',
            'math score': 'math_score',
            'reading score': 'reading_score',
            'writing score': 'writing_score',
        },
        axis='columns',
    )
)

In [10]:
# Preview the first five rows to confirm the renamed columns.
students_performance.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [11]:
# Mean of every numeric score column per gender.
# numeric_only=True is required on pandas >= 2.0, where the default became False
# and the remaining string columns (race/ethnicity, lunch, ...) would raise a
# TypeError; older pandas versions accept the flag and give the same result.
students_performance.groupby("gender").mean(numeric_only=True)

Unnamed: 0_level_0,math_score,reading_score,writing_score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


In [12]:
# Per-gender mean of just the math and reading scores; the dict maps each
# selected column to the aggregation applied to it.
(
    students_performance
    .groupby("gender")
    .agg(dict.fromkeys(['math_score', 'reading_score'], 'mean'))
)

Unnamed: 0_level_0,reading_score,math_score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,72.608108,63.633205
male,65.473029,68.728216


In [13]:
# Same aggregation, but as_index=False keeps `gender` as a regular column
# instead of the index; the math column is then renamed to mark it as a mean.
(
    students_performance
    .groupby("gender", as_index=False)
    .agg({'math_score': 'mean', 'reading_score': 'mean'})
    .rename({'math_score': 'mean_math_score'}, axis='columns')
)

Unnamed: 0,gender,reading_score,mean_math_score
0,female,72.608108,63.633205
1,male,65.473029,68.728216


In [19]:
# Mean math and reading scores for every (gender, race/ethnicity) combination.
# Grouping by two keys produces a frame indexed by a two-level MultiIndex.
mean_scores = (
    students_performance
    .groupby(["gender", "race/ethnicity"])
    .agg({'math_score': 'mean', 'reading_score': 'mean'})
)

In [20]:
# Display the grouped means: one row per (gender, race/ethnicity) pair.
mean_scores

Unnamed: 0_level_0,Unnamed: 1_level_0,reading_score,math_score
gender,race/ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1
female,group A,69.0,58.527778
female,group B,71.076923,61.403846
female,group C,71.944444,62.033333
female,group D,74.046512,65.248062
female,group E,75.84058,70.811594
male,group A,61.735849,63.735849
male,group B,62.848837,65.930233
male,group C,65.42446,67.611511
male,group D,66.135338,69.413534
male,group E,70.295775,76.746479


In [25]:
# Inspect the index of the grouped result: grouping by two keys yields a
# MultiIndex whose levels are named 'gender' and 'race/ethnicity'.
mean_scores.index

MultiIndex(levels=[['female', 'male'], ['group A', 'group B', 'group C', 'group D', 'group E']],
           codes=[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]],
           names=['gender', 'race/ethnicity'])

In [31]:
# Select specific MultiIndex rows by passing a list of (gender, race/ethnicity)
# tuples to .loc; here the 'female' rows for groups A and B.
mean_scores.loc[[('female', group) for group in ('group A', 'group B')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,reading_score,math_score
gender,race/ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1
female,group A,69.0,58.527778
female,group B,71.076923,61.403846


In [33]:
# Number of distinct math_score values inside each (gender, race/ethnicity) group.
(
    students_performance
    .groupby(["gender", "race/ethnicity"])['math_score']
    .nunique()
)

gender  race/ethnicity
female  group A           29
        group B           51
        group C           59
        group D           53
        group E           44
male    group A           38
        group B           43
        group C           56
        group D           49
        group E           38
Name: math_score, dtype: int64

In [35]:
# Top five math scores within each gender: order rows by gender and math_score
# (both descending), then keep the first five rows of every gender group.
(
    students_performance
    .sort_values(by=['gender', 'math_score'], ascending=[False, False])
    .groupby('gender')
    .head(n=5)
)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
149,male,group E,associate's degree,free/reduced,completed,100,100,93
623,male,group A,some college,standard,completed,100,96,86
625,male,group D,some college,standard,completed,100,97,99
916,male,group E,bachelor's degree,standard,completed,100,100,100
306,male,group E,some college,standard,completed,99,87,81
451,female,group E,some college,standard,none,100,92,97
458,female,group E,bachelor's degree,standard,none,100,100,100
962,female,group E,associate's degree,standard,none,100,100,100
114,female,group E,bachelor's degree,standard,completed,99,100,100
263,female,group E,high school,standard,none,99,93,90


In [36]:
# Add a combined score column. Note: despite the name, total_score is
# math + reading only; writing_score is not part of the sum.
students_performance['total_score'] = (
    students_performance['math_score'] + students_performance['reading_score']
)

In [37]:
# Check the new total_score column on the first five rows (head defaults to 5).
students_performance.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score
0,female,group B,bachelor's degree,standard,none,72,72,74,144
1,female,group C,some college,standard,completed,69,90,88,159
2,female,group B,master's degree,standard,none,90,95,93,185
3,male,group A,associate's degree,free/reduced,none,47,57,44,104
4,male,group C,some college,standard,none,76,78,75,154


In [39]:
# Add the natural logarithm of total_score as a new column. assign() returns a
# new frame, so the result is bound back to the same name.
students_performance = students_performance.assign(
    total_score_log=lambda frame: np.log(frame['total_score'])
)

In [40]:
# Check the new total_score_log column on the first five rows.
students_performance.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,total_score_log
0,female,group B,bachelor's degree,standard,none,72,72,74,144,4.969813
1,female,group C,some college,standard,completed,69,90,88,159,5.068904
2,female,group B,master's degree,standard,none,90,95,93,185,5.220356
3,male,group A,associate's degree,free/reduced,none,47,57,44,104,4.644391
4,male,group C,some college,standard,none,76,78,75,154,5.036953


In [41]:
# Show the frame without total_score. drop() returns a new frame and the result
# is not assigned, so students_performance itself still contains the column.
students_performance.drop(columns=['total_score'])

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score_log
0,female,group B,bachelor's degree,standard,none,72,72,74,4.969813
1,female,group C,some college,standard,completed,69,90,88,5.068904
2,female,group B,master's degree,standard,none,90,95,93,5.220356
3,male,group A,associate's degree,free/reduced,none,47,57,44,4.644391
4,male,group C,some college,standard,none,76,78,75,5.036953
5,female,group B,associate's degree,standard,none,71,83,78,5.036953
6,female,group B,some college,standard,completed,88,95,92,5.209486
7,male,group B,some college,free/reduced,none,40,43,39,4.418841
8,male,group D,high school,free/reduced,completed,64,64,67,4.852030
9,female,group B,high school,free/reduced,none,38,60,50,4.584967
