# Chapter 10 - Data Aggregation and Group Operations

## Data Aggregation

In [1]:
import pandas as pd
# Additional imports
import seaborn

In [2]:
# Load the loans dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('dataset-A3-loans.csv')
# Preview the first eight rows
display(df.head(8))

Unnamed: 0,id,funded_amount,term,interest_rate,grade,employee_length,home_ownership,annual_income,purpose,title
0,721751,7000.0,36 months,14.91,D,2 years,RENT,46000.0,debt_consolidation,Debt Removal
1,40277218,16800.0,60 months,16.49,D,4 years,RENT,45500.0,home_improvement,Home improvement
2,68416017,1500.0,36 months,9.17,B,10+ years,MORTGAGE,83000.0,major_purchase,Major purchase
3,59481461,8000.0,36 months,12.29,C,2 years,RENT,74000.0,debt_consolidation,Debt consolidation
4,73003,3200.0,36 months,9.96,B,< 1 year,MORTGAGE,150000.0,other,New Bathroom
5,55917749,5000.0,36 months,12.29,C,10+ years,MORTGAGE,55000.0,home_improvement,Home improvement
6,1149328,11500.0,36 months,16.29,C,2 years,MORTGAGE,68000.0,credit_card,Credit Card
7,1614457,6000.0,36 months,15.8,C,5 years,MORTGAGE,36000.0,debt_consolidation,Debt consolidation


The built-in aggregation functions available after using `.groupby()` include:

- `count()`, `sum()`, `mean()`, `std()`, `var()`, `min()`, `max()`

In [3]:
# Group the loans by grade. The double brackets select funded_amount as a
# one-column DataFrame, so aggregations on this object return DataFrames.
df_by_grade_grouped = df.groupby('grade')[['funded_amount']]

In [4]:
# Number of non-null funded_amount entries in each grade group
print(df_by_grade_grouped.count())

       funded_amount
grade               
A                  6
B                 21
C                 15
D                  4
E                  3
F                  1


In [5]:
# Total, average, standard deviation and variance of funded_amount per grade,
# printed one table after another.
print(
    df_by_grade_grouped.sum(),
    df_by_grade_grouped.mean(),
    df_by_grade_grouped.std(),
    df_by_grade_grouped.var(),
    sep='\n',
)

       funded_amount
grade               
A            72500.0
B           262875.0
C           160050.0
D            50275.0
E            56050.0
F            25000.0
       funded_amount
grade               
A       12083.333333
B       12517.857143
C       10670.000000
D       12568.750000
E       18683.333333
F       25000.000000
       funded_amount
grade               
A       11416.902674
B        7915.602247
C        7982.728007
D        8447.222793
E       13381.921885
F                NaN
       funded_amount
grade               
A       1.303457e+08
B       6.265676e+07
C       6.372395e+07
D       7.135557e+07
E       1.790758e+08
F                NaN


In [6]:
# Smallest and largest funded_amount per grade, printed one table after the other.
print(
    df_by_grade_grouped.min(),
    df_by_grade_grouped.max(),
    sep='\n',
)

       funded_amount
grade               
A             4000.0
B             1500.0
C             1000.0
D             4175.0
E             4800.0
F            25000.0
       funded_amount
grade               
A            34000.0
B            35000.0
C            26000.0
D            22300.0
E            31500.0
F            25000.0


While `.quantile()` is not explicitly implemented for `.groupby()`, it is a `Series` method and thus available for use. Internally, `.groupby()` efficiently slices up the Series into groups, calls `quantile(q)` on each slice (for example `quantile(0.25)` in the cell below), and then assembles those results into the final output. The same goes for `.median()`.

In [7]:
df_by_term = df.groupby('term')['funded_amount']

# Lower quartile, median and upper quartile of funded_amount for each loan term
print(
    df_by_term.quantile(0.25),
    df_by_term.median(),
    df_by_term.quantile(0.75),
    sep='\n',
)

term
 36 months     4800.0
 60 months    17500.0
Name: funded_amount, dtype: float64
term
 36 months     6500.0
 60 months    20375.0
Name: funded_amount, dtype: float64
term
 36 months    11075.0
 60 months    22950.0
Name: funded_amount, dtype: float64


To use a custom aggregation, define any function that reduces an array (or `Series`) to a single value, then pass that function to the `.agg()` method.

In [8]:
def diff_top_x(s):
    """
        Get difference between top and bottom value

        Missing values (NaN) are ignored. A Series without any non-missing
        value yields 0.0, which is what the earlier sort-based version
        returned for empty input.

        Parameters
        ----------
        s : pandas.Series
            The values of one group, as handed over by `.agg()`.

        Returns
        -------
        scalar
            Largest non-missing value minus smallest non-missing value.
    """
    values = s.dropna()
    if values.empty:
        return 0.0
    # max()/min() need no sort and can never pick up a NaN. The previous
    # sort_values()/tail(1) approach selected the NaN that sorting places
    # last, and .sum() then silently turned it into 0.
    return values.max() - values.min()

In [9]:
# Spread (largest minus smallest) of annual income within each grade,
# computed by passing the custom diff_top_x function to .agg()
df.groupby('grade')['annual_income'].agg(diff_top_x)

grade
A     43000.0
B    110000.0
C     79000.0
D     63250.0
E     84000.0
F         0.0
Name: annual_income, dtype: float64

Since `.describe()` is a `Series` method, it can also be used after a `.groupby()`.

In [10]:
# Summary statistics of annual income for each grade
df.groupby('grade')['annual_income'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,6.0,81166.666667,16166.838487,67000.0,71500.0,74500.0,86500.0,110000.0
B,21.0,75089.571429,36543.053832,40000.0,45000.0,65000.0,83000.0,150000.0
C,15.0,60502.666667,20152.244635,36000.0,48520.0,55000.0,70000.0,115000.0
D,4.0,51187.5,26574.874569,25000.0,40375.0,45750.0,56562.5,88250.0
E,3.0,91691.666667,42483.940593,46000.0,72537.5,99075.0,114537.5,130000.0
F,1.0,68000.0,,68000.0,68000.0,68000.0,68000.0,68000.0


<hr>

In [11]:
# Load seaborn's example `tips` dataset and add the tip as a fraction of the total bill
tips_df = seaborn.load_dataset('tips')
tips_df['pct_tip'] = tips_df['tip'] / tips_df['total_bill']
# .info() prints its summary itself and returns None, so it is called directly:
# wrapping it in display() would add a stray `None` to the cell output.
tips_df.info()
display(tips_df.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
pct_tip       244 non-null float64
dtypes: category(4), float64(3), int64(1)
memory usage: 9.1 KB


None

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,pct_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


To aggregate with `.agg()`, pass the name of the aggregation function as a string, like so.

In [12]:
# Group the tips by day and smoker flag; this GroupBy object is reused by later cells
by_day_by_smoker_groupby = tips_df.groupby(['day', 'smoker'])

In [13]:
# Find the mean tip fraction (tip / total_bill) for each day and smoker flag
by_day_by_smoker_groupby['pct_tip'].agg('mean')

day   smoker
Thur  Yes       0.163863
      No        0.160298
Fri   Yes       0.174783
      No        0.151650
Sat   Yes       0.147906
      No        0.158048
Sun   Yes       0.187250
      No        0.160113
Name: pct_tip, dtype: float64

Passing a list of functions yields a `DataFrame` with one column per function, each named after the function that produced it.

In [14]:
# Each aggregation can be given as a string name ('mean', 'median', 'std')
# or as a function object (diff_top_x); each one becomes a column of the result.
by_day_by_smoker_groupby['pct_tip'].agg(['mean', 'median', 'std', diff_top_x])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,std,diff_top_x
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Thur,Yes,0.163863,0.153846,0.039389,0.15124
Thur,No,0.160298,0.153492,0.038774,0.19335
Fri,Yes,0.174783,0.173913,0.051293,0.159925
Fri,No,0.15165,0.149241,0.028123,0.067349
Sat,Yes,0.147906,0.153624,0.061375,0.290095
Sat,No,0.158048,0.150152,0.039767,0.235193
Sun,Yes,0.18725,0.138122,0.154134,0.644685
Sun,No,0.160113,0.161665,0.042347,0.193226


In [15]:
# To choose the output column names, pass a list of (name, function) 2-tuples:
# the first element is the column name and the second is the aggregation to apply.
by_day_by_smoker_groupby['pct_tip'].agg([('average_tip_pct', 'mean'), ('standard_dev_pct', 'std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,average_tip_pct,standard_dev_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,Yes,0.163863,0.039389
Thur,No,0.160298,0.038774
Fri,Yes,0.174783,0.051293
Fri,No,0.15165,0.028123
Sat,Yes,0.147906,0.061375
Sat,No,0.158048,0.039767
Sun,Yes,0.18725,0.154134
Sun,No,0.160113,0.042347


In [16]:
# Aggregate several columns at once. The result's columns form a hierarchical index:
# the selected column on the outer level and the aggregation name on the inner level.
def q25(x):
    """Return the first quartile (25th percentile) of x."""
    return x.quantile(0.25)


def q75(x):
    """Return the third quartile (75th percentile) of x."""
    return x.quantile(0.75)


# Select the columns with a list ([[...]]): the bare-tuple form
# groupby['a', 'b'] is deprecated since pandas 1.0 and raises in pandas 2.0.
by_day_by_smoker_groupby[['pct_tip', 'total_bill']].agg(['mean', 'std', q25, q75])

Unnamed: 0_level_0,Unnamed: 1_level_0,pct_tip,pct_tip,pct_tip,pct_tip,total_bill,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,q25,q75,mean,std,q25,q75
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Thur,Yes,0.163863,0.039389,0.148038,0.194837,19.190588,8.355149,13.51,19.81
Thur,No,0.160298,0.038774,0.137741,0.184843,17.113111,7.721728,11.69,20.27
Fri,Yes,0.174783,0.051293,0.133739,0.20924,16.813333,9.086388,11.69,18.665
Fri,No,0.15165,0.028123,0.137239,0.163652,18.42,5.059282,15.1,22.555
Sat,Yes,0.147906,0.061375,0.091797,0.190502,21.276667,10.069138,13.405,26.7925
Sat,No,0.158048,0.039767,0.13624,0.183915,19.661778,8.939181,14.73,20.65
Sun,Yes,0.18725,0.154134,0.097723,0.215325,24.12,10.442511,17.165,32.375
Sun,No,0.160113,0.042347,0.13978,0.185185,20.506667,8.130189,14.78,25.0


In [17]:
# Passing a dict maps each column to the list of aggregations to apply to it.
# Columns are selected with a list ([[...]]); the bare-tuple form
# groupby['a', 'b'] is deprecated since pandas 1.0 and raises in pandas 2.0.
by_day_by_smoker_groupby[['pct_tip', 'total_bill']].agg({'pct_tip': ['sum', 'count'], 'total_bill': ['sum']})

Unnamed: 0_level_0,Unnamed: 1_level_0,pct_tip,pct_tip,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Thur,Yes,2.785676,17,326.24
Thur,No,7.213414,45,770.09
Fri,Yes,2.621746,15,252.2
Fri,No,0.606602,4,73.68
Sat,Yes,6.212055,42,893.62
Sat,No,7.112145,45,884.78
Sun,Yes,3.557756,19,458.28
Sun,No,9.126438,57,1168.88


In [18]:
# By default the group keys become the index of the aggregated result (first output).
# With as_index=False they are kept as a regular column and the result is a
# DataFrame with a default integer index (second output).
display(
    tips_df.groupby('day')['total_bill'].mean(),
    tips_df.groupby('day', as_index=False)['total_bill'].mean(),
)

day
Thur    17.682742
Fri     17.151579
Sat     20.441379
Sun     21.410000
Name: total_bill, dtype: float64

Unnamed: 0,day,total_bill
0,Thur,17.682742
1,Fri,17.151579
2,Sat,20.441379
3,Sun,21.41


In [19]:
# as_index=False works with multiple grouping keys too: each key becomes a regular column.
display(tips_df.groupby(['day', 'smoker'], as_index=False)['total_bill'].mean())

Unnamed: 0,day,smoker,total_bill
0,Thur,Yes,19.190588
1,Thur,No,17.113111
2,Fri,Yes,16.813333
3,Fri,No,18.42
4,Sat,Yes,21.276667
5,Sat,No,19.661778
6,Sun,Yes,24.12
7,Sun,No,20.506667


In [20]:
# Show the first three rows of tips_df again for reference
tips_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,pct_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


In [21]:
# The same holds for other aggregations, e.g. counting the total_bill values per (day, smoker) group.
display(tips_df.groupby(['day', 'smoker'], as_index=False)['total_bill'].count())

Unnamed: 0,day,smoker,total_bill
0,Thur,Yes,17
1,Thur,No,45
2,Fri,Yes,15
3,Fri,No,4
4,Sat,Yes,42
5,Sat,No,45
6,Sun,Yes,19
7,Sun,No,57


**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)