[Source](https://www.marsja.se/python-pandas-groupby-tutorial-examples/)

In [1]:
import pandas as pd

data_url = 'http://vincentarelbundock.github.io/Rdatasets/csv/carData/Salaries.csv'
df = pd.read_csv(data_url, index_col=0)

df.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
1,Prof,B,19,18,Male,139750
2,Prof,B,20,16,Male,173200
3,AsstProf,B,4,3,Male,79750
4,Prof,B,45,39,Male,115000
5,Prof,B,40,41,Male,141500


In [2]:
import IPython

# Grouping by one factor
df_rank = df.groupby('rank')

# Getting all methods from the groupby object:
meth = [method_name for method_name in dir(df_rank)
 if callable(getattr(df_rank, method_name)) & ~method_name.startswith('_')]

# Printing the result
print(IPython.utils.text.columnize(meth))

agg        corr      cumsum     get_group  mean     pct_change  sem    transform
aggregate  corrwith  describe   head       median   pipe        shift  tshift   
all        count     diff       hist       min      plot        size   var      
any        cov       expanding  idxmax     ngroup   prod        skew 
apply      cumcount  ffill      idxmin     nth      quantile    std  
backfill   cummax    fillna     last       nunique  rank        sum  
bfill      cummin    filter     mad        ohlc     resample    tail 
boxplot    cumprod   first      max        pad      rolling     take 



In [10]:
print(df_rank.groups)

{'AssocProf': Int64Index([  6,  11,  25,  40,  42,  55,  56,  58,  59,  61,  64,  66,  90,
             92,  93,  97, 105, 107, 108, 109, 112, 124, 131, 133, 139, 141,
            142, 154, 157, 159, 163, 169, 175, 177, 178, 183, 187, 189, 194,
            195, 196, 215, 218, 219, 223, 228, 232, 256, 258, 261, 285, 286,
            294, 300, 317, 322, 323, 329, 335, 364, 368, 371, 380, 383],
           dtype='int64'), 'AsstProf': Int64Index([  3,  12,  13,  14,  28,  29,  32,  34,  35,  36,  38,  50,  53,
             60,  62,  65,  76,  79,  80,  84,  88,  91,  96, 113, 119, 120,
            128, 130, 134, 144, 147, 150, 152, 155, 158, 161, 164, 165, 171,
            180, 197, 198, 201, 209, 211, 227, 235, 238, 241, 254, 259, 273,
            274, 275, 288, 290, 307, 309, 316, 326, 349, 355, 360, 377, 378,
            381, 397],
           dtype='int64'), 'Prof': Int64Index([  1,   2,   4,   5,   7,   8,   9,  10,  15,  16,
            ...
            387, 388, 389, 390, 391, 392, 393

In [6]:
df_rank.get_group('AsstProf').head()

Unnamed: 0,discipline,yrs.since.phd,yrs.service,sex,salary
3,B,4,3,Male,79750
12,B,7,2,Male,79800
13,B,1,1,Male,77700
14,B,2,0,Male,78000
28,B,5,3,Male,82379


In [11]:
df_rank.size()

# Output:
#
# rank
# AssocProf     64
# AsstProf      67
# Prof         266
# dtype: int64

rank
AssocProf     64
AsstProf      67
Prof         266
dtype: int64

In [12]:
df_rank.count()

Unnamed: 0_level_0,discipline,yrs.since.phd,yrs.service,sex,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AssocProf,64,64,64,64,64
AsstProf,67,67,67,67,67
Prof,266,266,266,266,266


In [16]:
df.groupby(['rank', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,discipline,yrs.since.phd,yrs.service,salary
rank,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AssocProf,Female,10,10,10,10
AssocProf,Male,54,54,54,54
AsstProf,Female,11,11,11,11
AsstProf,Male,56,56,56,56
Prof,Female,18,18,18,18
Prof,Male,248,248,248,248


In [22]:
from IPython.display import HTML, display
display(df_rank.nunique())

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AssocProf,1,2,24,21,2,63
AsstProf,1,2,11,7,2,53
Prof,1,2,42,50,2,261


In [36]:
import numpy as np
# put nan in 10% of of values in df
df_null = df.mask(np.random.random(df.shape) < .1)
df_null.isnull().sum().to_frame('N Missing values')

Unnamed: 0,N Missing values
rank,45
discipline,43
yrs.since.phd,35
yrs.service,38
sex,37
salary,38


In [38]:
df_null.groupby('rank').nunique()

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AssocProf,1,2,22,18,2,48
AsstProf,1,2,11,7,2,44
Prof,1,2,41,49,2,213


In [44]:
display(df_rank['salary'].mean().reset_index(name='Mean Salary'))
display(df_rank['salary'].mean().reset_index().rename(columns={'rank':'Rank','salary' : 'Mean Salary'}))


Unnamed: 0,rank,Mean Salary
0,AssocProf,93876.4375
1,AsstProf,80775.985075
2,Prof,126772.109023


Unnamed: 0,Rank,Mean Salary
0,AssocProf,93876.4375
1,AsstProf,80775.985075
2,Prof,126772.109023


In [46]:
df.groupby('discipline')['salary'].median().reset_index().rename(
    columns={'rank':'Rank','salary' : 'MedianSalary'})

Unnamed: 0,discipline,MedianSalary
0,A,104350.0
1,B,113018.5


In [47]:
df_rank['salary'].agg(['mean', 'median', 'std', 'min', 'max']).reset_index()

Unnamed: 0,rank,mean,median,std,min,max
0,AssocProf,93876.4375,95626.5,13831.699844,62884,126431
1,AsstProf,80775.985075,79800.0,8174.112637,63100,97032
2,Prof,126772.109023,123321.5,27718.674999,57800,231545


In [50]:
def salary_range(df):
    mini = df.min()
    maxi = df.max()
    rang = '%s - %s' % (mini, maxi)
    
    return rang

df_descriptive = df_rank['salary'].agg(['mean', 'median', 'std', salary_range]).reset_index()
df_descriptive

Unnamed: 0,rank,mean,median,std,salary_range
0,AssocProf,93876.4375,95626.5,13831.699844,62884 - 126431
1,AsstProf,80775.985075,79800.0,8174.112637,63100 - 97032
2,Prof,126772.109023,123321.5,27718.674999,57800 - 231545


In [51]:
# Renaming Pandas Dataframe Columns
df_descriptive.rename(columns={'rank':'Rank', 'mean':'Mean', 'median':'Median', 
                               'std':'Standard Deviation', 'salary_range':'Range'})

Unnamed: 0,Rank,Mean,Median,Standard Deviation,Range
0,AssocProf,93876.4375,95626.5,13831.699844,62884 - 126431
1,AsstProf,80775.985075,79800.0,8174.112637,63100 - 97032
2,Prof,126772.109023,123321.5,27718.674999,57800 - 231545


In [52]:
from scipy.stats.mstats import gmean, hmean

df_descriptive = df_rank['salary'].agg(['mean', 'median', hmean, gmean]).reset_index()
df_descriptive

Unnamed: 0,rank,mean,median,hmean,gmean
0,AssocProf,93876.4375,95626.5,91784.174692,92844.275276
1,AsstProf,80775.985075,79800.0,79958.539934,80367.469951
2,Prof,126772.109023,123321.5,120947.872093,123836.10808


In [53]:
df_grp = df.groupby(['rank', 'discipline'])
df_grp.size().reset_index(name='count')

Unnamed: 0,rank,discipline,count
0,AssocProf,A,26
1,AssocProf,B,38
2,AsstProf,A,24
3,AsstProf,B,43
4,Prof,A,131
5,Prof,B,135


In [62]:
# Get two groups
display(df_grp.get_group(('AssocProf', 'A')).head())

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
25,AssocProf,A,13,8,Female,74830
105,AssocProf,A,18,10,Male,83850
107,AssocProf,A,11,8,Male,82099
108,AssocProf,A,10,8,Male,82600
109,AssocProf,A,15,8,Male,81500


In [58]:
df_3grps = df.groupby(['rank', 'discipline', 'sex'])
df_n_per_group = df_3grps.size().reset_index(name='n')
df_n_per_group

Unnamed: 0,rank,discipline,sex,n
0,AssocProf,A,Female,4
1,AssocProf,A,Male,22
2,AssocProf,B,Female,6
3,AssocProf,B,Male,32
4,AsstProf,A,Female,6
5,AsstProf,A,Male,18
6,AsstProf,B,Female,5
7,AsstProf,B,Male,38
8,Prof,A,Female,8
9,Prof,A,Male,123


In [100]:
perc = df.groupby(['rank', 'discipline', 'sex'])['salary'].size()
display(perc)

# Give the percentage on the level of Rank:
percbyrank = perc.groupby(level='rank').apply(lambda x: 100 * x / float(x.sum()))

display(percbyrank)
print('Total percentage in group AssocProf. ',
      percbyrank.reset_index().query('rank == "AssocProf"')['salary'].sum())

rank       discipline  sex   
AssocProf  A           Female      4
                       Male       22
           B           Female      6
                       Male       32
AsstProf   A           Female      6
                       Male       18
           B           Female      5
                       Male       38
Prof       A           Female      8
                       Male      123
           B           Female     10
                       Male      125
Name: salary, dtype: int64

rank       discipline  sex   
AssocProf  A           Female     6.250000
                       Male      34.375000
           B           Female     9.375000
                       Male      50.000000
AsstProf   A           Female     8.955224
                       Male      26.865672
           B           Female     7.462687
                       Male      56.716418
Prof       A           Female     3.007519
                       Male      46.240602
           B           Female     3.759398
                       Male      46.992481
Name: salary, dtype: float64

Total percentage in group AssocProf.  100.0


In [77]:
N = perc.reset_index()['salary'].sum()
totalperc =  perc.groupby(level=0).apply(lambda x: 100 * x / N).reset_index(name='% of total n')
totalperc.reset_index()

Unnamed: 0,index,rank,discipline,sex,% of total n
0,0,AssocProf,A,Female,1.007557
1,1,AssocProf,A,Male,5.541562
2,2,AssocProf,B,Female,1.511335
3,3,AssocProf,B,Male,8.060453
4,4,AsstProf,A,Female,1.511335
5,5,AsstProf,A,Male,4.534005
6,6,AsstProf,B,Female,1.259446
7,7,AsstProf,B,Male,9.571788
8,8,Prof,A,Female,2.015113
9,9,Prof,A,Male,30.982368


In [102]:
df_rn = df.groupby(['rank', 'discipline']).mean()
df_rn

Unnamed: 0_level_0,Unnamed: 1_level_0,yrs.since.phd,yrs.service,salary
rank,discipline,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AssocProf,A,17.846154,13.5,83061.115385
AssocProf,B,13.815789,10.894737,101276.394737
AsstProf,A,5.666667,2.416667,73935.541667
AsstProf,B,4.790698,2.348837,84593.906977
Prof,A,30.48855,24.442748,119948.274809
Prof,B,26.177778,21.237037,133393.755556


In [103]:
df_rn.index

MultiIndex(levels=[['AssocProf', 'AsstProf', 'Prof'], ['A', 'B']],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['rank', 'discipline'])

In [105]:
df_rn = df_rn.add_suffix('_Mean').reset_index()
type(df_rn)

# Output: pandas.core.frame.DataFrame
df_rn.head()

Unnamed: 0,index,rank_Mean,discipline_Mean,yrs.since.phd_Mean_Mean,yrs.service_Mean_Mean,salary_Mean_Mean
0,0,AssocProf,A,17.846154,13.5,83061.115385
1,1,AssocProf,B,13.815789,10.894737,101276.394737
2,2,AsstProf,A,5.666667,2.416667,73935.541667
3,3,AsstProf,B,4.790698,2.348837,84593.906977
4,4,Prof,A,30.48855,24.442748,119948.274809


In [111]:
df_stats = df.groupby(['rank', 'discipline', 'sex']).agg(['mean', 'median', 'std'])
df_stats.columns = ["_".join(x) for x in df_stats.columns.ravel()]

df_stats.iloc[:,0:6].reset_index()
#df_stats.reset_index()

Unnamed: 0,rank,discipline,sex,yrs.since.phd_mean,yrs.since.phd_median,yrs.since.phd_std,yrs.service_mean,yrs.service_median,yrs.service_std
0,AssocProf,A,Female,18.5,19.0,8.185353,15.5,15.0,8.698659
1,AssocProf,A,Male,17.727273,12.5,12.209215,13.136364,8.0,12.302905
2,AssocProf,B,Female,13.5,12.5,2.880972,8.833333,9.5,1.94079
3,AssocProf,B,Male,13.875,10.5,8.507113,11.28125,8.0,9.585723
4,AsstProf,A,Female,4.833333,4.5,2.316607,2.5,2.5,2.073644
5,AsstProf,A,Male,5.944444,5.5,2.508157,2.388889,3.0,1.539247
6,AsstProf,B,Female,6.6,5.0,3.646917,2.6,3.0,1.81659
7,AsstProf,B,Male,4.552632,4.0,2.344601,2.315789,2.5,1.39701
8,Prof,A,Female,26.5,28.0,9.023778,16.125,14.5,11.319231
9,Prof,A,Male,30.747967,31.0,10.152601,24.98374,25.0,12.010913
