# Groupby 

A groupby operation involves some combination of splitting the
object, applying a function, and combining the results. This can be
used to group large amounts of data and compute operations on these
groups.

Refs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

In [1]:
import pandas as pd 

In [2]:
# Toy enrollment table: one entry per (department, cohort) pair.
# Each department contributes an Undergraduate row followed by a Graduate row.
data = {
    'Department': ['CS&IS', 'CS&IS', 'Biology', 'Biology', 'English', 'English'],
    'Cohort': ['Undergraduate', 'Graduate'] * 3,
    'Student': [300, 150, 100, 70, 120, 60],
}
data

{'Department': ['CS&IS', 'CS&IS', 'Biology', 'Biology', 'English', 'English'],
 'Cohort': ['Undergraduate',
  'Graduate',
  'Undergraduate',
  'Graduate',
  'Undergraduate',
  'Graduate'],
 'Student': [300, 150, 100, 70, 120, 60]}

In [3]:
# Build a DataFrame from the dict: each key becomes a column, each list position a row.
df = pd.DataFrame(data)
df

Unnamed: 0,Department,Cohort,Student
0,CS&IS,Undergraduate,300
1,CS&IS,Graduate,150
2,Biology,Undergraduate,100
3,Biology,Graduate,70
4,English,Undergraduate,120
5,English,Graduate,60


In [4]:
# Group the rows by the 'Department' column.
# groupby() does not compute anything yet: it returns a DataFrameGroupBy object,
# and results only appear once an aggregation (mean, std, ...) is called on it.
df_by_dept = df.groupby('Department')
df_by_dept

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe409b217f0>

In [5]:
# Average enrollment per department.
# 'Cohort' holds strings and has no mean: pandas >= 2.0 raises a TypeError if it is
# left in, so select the numeric 'Student' column explicitly. The double brackets
# keep the result a DataFrame.
df_by_dept[['Student']].mean()

Unnamed: 0_level_0,Student
Department,Unnamed: 1_level_1
Biology,85
CS&IS,225
English,90


In [6]:
# Sample standard deviation of enrollment per department.
# As with mean(), the string column 'Cohort' must be excluded; selecting 'Student'
# explicitly works on every pandas version (std() only accepts numeric_only from 1.5).
df_by_dept[['Student']].std()

Unnamed: 0_level_0,Student
Department,Unnamed: 1_level_1
Biology,21.213203
CS&IS,106.066017
English,42.426407


In [7]:
# Smallest value of each column within each department.
# For the string column 'Cohort' "smallest" means first in alphabetical order.
df_by_dept.min()

Unnamed: 0_level_0,Cohort,Student
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Biology,Graduate,70
CS&IS,Graduate,150
English,Graduate,60


In [8]:
# Largest value of each column within each department.
# For the string column 'Cohort' "largest" means last in alphabetical order.
df_by_dept.max()

Unnamed: 0_level_0,Cohort,Student
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Biology,Undergraduate,100
CS&IS,Undergraduate,300
English,Undergraduate,120


In [9]:
# Number of entries in each column for every department (two rows per department here).
df_by_dept.count()

Unnamed: 0_level_0,Cohort,Student
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Biology,2,2
CS&IS,2,2
English,2,2


In [10]:
# describe() reports the summary statistics in one table:
# count, mean, std, min, the 25%/50%/75% quantiles and max, per department.
# Only the numeric 'Student' column appears in the result.
df_by_dept.describe()

Unnamed: 0_level_0,Student,Student,Student,Student,Student,Student,Student,Student
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Biology,2.0,85.0,21.213203,70.0,77.5,85.0,92.5,100.0
CS&IS,2.0,225.0,106.066017,150.0,187.5,225.0,262.5,300.0
English,2.0,90.0,42.426407,60.0,75.0,90.0,105.0,120.0


In [11]:
# Flip the summary so departments become columns and the statistics become rows.
df_by_dept.describe().T

Unnamed: 0,Department,Biology,CS&IS,English
Student,count,2.0,2.0,2.0
Student,mean,85.0,225.0,90.0
Student,std,21.213203,106.066017,42.426407
Student,min,70.0,150.0,60.0
Student,25%,77.5,187.5,75.0
Student,50%,85.0,225.0,90.0
Student,75%,92.5,262.5,105.0
Student,max,100.0,300.0,120.0


In [12]:
# Show only the CS&IS statistics: pick that department's row out of the summary table.
df_by_dept.describe().loc['CS&IS']

Student  count      2.000000
         mean     225.000000
         std      106.066017
         min      150.000000
         25%      187.500000
         50%      225.000000
         75%      262.500000
         max      300.000000
Name: CS&IS, dtype: float64

### Exercise: Group by cohort

In [13]:
# Same idea as before, but grouping on 'Cohort' (Undergraduate vs. Graduate) instead of department.
df_by_cohort = df.groupby('Cohort')

In [14]:
# Summary statistics of enrollment for each cohort, across the three departments.
df_by_cohort.describe()

Unnamed: 0_level_0,Student,Student,Student,Student,Student,Student,Student,Student
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Cohort,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Graduate,3.0,93.333333,49.328829,60.0,65.0,70.0,110.0,150.0
Undergraduate,3.0,173.333333,110.151411,100.0,110.0,120.0,210.0,300.0


In [15]:
# Total enrollment per cohort.
# Select 'Student' explicitly: on pandas >= 2.0 a bare sum() would also "add up" the
# string column 'Department' by concatenating the department names.
df_by_cohort[['Student']].sum()

Unnamed: 0_level_0,Student
Cohort,Unnamed: 1_level_1
Graduate,280
Undergraduate,520
