In [12]:
import pandas as pd
import numpy as np

In [13]:
# Read the student performance CSV into a DataFrame and preview the first rows.
csv_path = 'datasets/StudentsPerformance.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## Describe Method
The `describe` method computes descriptive statistics. By default, it only
applies to the numeric columns.

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## Aggregation methods

In [4]:
# Column-wise sum over every column. For the text columns this concatenates the
# strings (see the output below), so only the three score totals are meaningful.
df.sum()

gender                         femalefemalefemalemalemalefemalefemalemalemale...
race/ethnicity                 group Bgroup Cgroup Bgroup Agroup Cgroup Bgrou...
parental level of education    bachelor's degreesome collegemaster's degreeas...
lunch                          standardstandardstandardfree/reducedstandardst...
test preparation course        nonecompletednonenonenonenonecompletednonecomp...
math score                                                                 66089
reading score                                                              69169
writing score                                                              68054
dtype: object

In [5]:
# Column-wise maximum. Text columns are compared as strings, so their "max" is
# the alphabetically last value (e.g. 'male' for gender), not a ranking.
df.max()

gender                                     male
race/ethnicity                          group E
parental level of education    some high school
lunch                                  standard
test preparation course                    none
math score                                  100
reading score                               100
writing score                               100
dtype: object

In [6]:
# Column-wise minimum. Text columns are compared as strings, so their "min" is
# the alphabetically first value (e.g. 'female' for gender), not a ranking.
df.min()

gender                                     female
race/ethnicity                            group A
parental level of education    associate's degree
lunch                                free/reduced
test preparation course                 completed
math score                                      0
reading score                                  17
writing score                                  10
dtype: object

### Sum by Rows
Setting the `axis` parameter to 1 tells pandas to sum across the columns, producing one total per row.

In [7]:
# ? Row-wise total: add up the numeric columns of each row
df.sum(axis='columns', numeric_only=True)

0      218
1      247
2      278
3      148
4      229
      ... 
995    282
996    172
997    195
998    223
999    249
Length: 1000, dtype: int64

## Conditionals using `np.where`

In [8]:
# Average the three exam scores of each student. The score columns are named
# explicitly (instead of averaging every numeric column) so that re-running this
# cell, or adding another numeric column to df, cannot change the result.
score_columns = ['math score', 'reading score', 'writing score']
df['average'] = df[score_columns].mean(axis=1)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667
1,female,group C,some college,standard,completed,69,90,88,82.333333
2,female,group B,master's degree,standard,none,90,95,93,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333
4,male,group C,some college,standard,none,76,78,75,76.333333


In [9]:
# Label each student 'Pass' when the average is strictly above 70, otherwise 'Fail'.
is_passing = df['average'].gt(70)
df['Pass/Fail'] = np.where(is_passing, 'Pass', 'Fail')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,Pass/Fail
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,Pass
1,female,group C,some college,standard,completed,69,90,88,82.333333,Pass
2,female,group B,master's degree,standard,none,90,95,93,92.666667,Pass
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,Fail
4,male,group C,some college,standard,none,76,78,75,76.333333,Pass


## Multiple Conditions

In [10]:
# Map the average score to a letter grade. Each condition describes one
# half-open band [lower, upper); np.select assigns the value paired with the
# first condition that is True for a row.
conditions = [
    (df['average'] >= 90),
    (df['average'] >= 80) & (df['average'] < 90),
    (df['average'] >= 70) & (df['average'] < 80),
    (df['average'] >= 60) & (df['average'] < 70),
    (df['average'] >= 50) & (df['average'] < 60),
    (df['average'] < 50)
]

values = ['A', 'B', 'C', 'D', 'E', 'F']

# Pass an explicit string default for rows that match no condition (e.g. a
# missing average). Without it np.select falls back to the integer 0, which
# has no common dtype with the string grades: recent NumPy versions raise a
# TypeError, and older ones silently label such rows '0'.
df['grade'] = np.select(conditions, values, default='N/A')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average,Pass/Fail,grade
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,Pass,C
1,female,group C,some college,standard,completed,69,90,88,82.333333,Pass,B
2,female,group B,master's degree,standard,none,90,95,93,92.666667,Pass,A
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,Fail,F
4,male,group C,some college,standard,none,76,78,75,76.333333,Pass,C


## Conditional indexing

In [14]:
# ? Keep only the rows whose gender is 'female'
is_female = df['gender'] == 'female'
df[is_female]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
...,...,...,...,...,...,...,...,...
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
995,female,group E,master's degree,standard,completed,88,99,95
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


### Using conditional indexing to count the non-null values in each column for female students

In [16]:
# Non-null count of every column, restricted to the rows where gender is 'female'.
female_rows = df['gender'].eq('female')
df.loc[female_rows].count()

gender                         518
race/ethnicity                 518
parental level of education    518
lunch                          518
test preparation course        518
math score                     518
reading score                  518
writing score                  518
dtype: int64