# Grouping and Aggregating Data

## Outline
* Splitting data into groups
* Operations on groupby objects


In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

attrition_csv = Path('data/employee_attrition.csv')
df = pd.read_csv(attrition_csv)

# Keep only the rows whose Age is between 21 and 28, both bounds inclusive
in_age_range = (df.Age >= 21) & (df.Age <= 28)
df = df[in_age_range]


## Splitting data into groups
Every grouping operation begins by calling the `groupby()` method of a DataFrame object.

Split the data into groups by the values of a single column, or by the values of multiple columns.

In [8]:

# Group by the values of a single column ...
grouped = df.groupby(by='Age')
# ... or by several columns at once (this assignment replaces the one above)
grouped = df.groupby(by=['Age', 'Attrition'])


Iterating over the groups yields `(groupName, group)` pairs. "groupName" is either the value of the single grouping column, or a tuple containing the values of the multiple grouping columns. "group" is a dataframe containing the rows of that group.

In [9]:
# Each iteration provides the group's key and the dataframe of rows in that group
for groupName, group in grouped:
    row_count = len(group)
    print(f'{groupName}: {row_count} rows')


(21, 'No'): 7 rows
(21, 'Yes'): 6 rows
(22, 'No'): 11 rows
(22, 'Yes'): 5 rows
(23, 'No'): 10 rows
(23, 'Yes'): 4 rows
(24, 'No'): 19 rows
(24, 'Yes'): 7 rows
(25, 'No'): 20 rows
(25, 'Yes'): 6 rows
(26, 'No'): 27 rows
(26, 'Yes'): 12 rows
(27, 'No'): 45 rows
(27, 'Yes'): 3 rows
(28, 'No'): 34 rows
(28, 'Yes'): 14 rows


Select a single group with `get_group()`, passing the value or a tuple of values.

In [10]:
grouped = df.groupby(by=['Age', 'Attrition', 'Gender'])

# With several grouping columns, the key is a tuple with one value per column
group_key = (24, 'No', 'Female')
grouped.get_group(group_key)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
20,24,No,Non-Travel,673,Research & Development,11,2,Other,1,26,...,4,80,1,5,5,2,4,2,1,3
96,24,No,Travel_Rarely,1353,Sales,3,2,Other,1,128,...,1,80,1,4,2,2,3,2,0,2
380,24,No,Travel_Rarely,1371,Sales,10,4,Marketing,1,507,...,4,80,1,5,2,4,5,2,0,3
724,24,No,Travel_Rarely,1206,Research & Development,17,1,Medical,1,1009,...,2,80,2,5,6,3,4,2,3,2
1025,24,No,Travel_Rarely,1476,Sales,4,1,Medical,1,1445,...,3,80,2,5,3,3,5,4,0,3
1061,24,No,Non-Travel,830,Sales,13,2,Life Sciences,1,1495,...,3,80,1,1,2,3,1,0,0,0
1168,24,No,Travel_Frequently,567,Research & Development,2,1,Technical Degree,1,1646,...,3,80,0,6,2,3,6,3,1,3


## Operations on groupby objects

After splitting the groups, we usually want to perform some operation on the groups of data returned.

### Aggregation

Aggregation can be performed either by using the built-in aggregating functions available on the `groupby` object, or by using the `aggregate()` method to apply arbitrary function logic.

In [11]:
grouped = df.groupby(by='Age')

# Selecting one column from the grouped data gives a pandas SeriesGroupBy object
dfh = grouped['DistanceFromHome']

# Built-in aggregations, computed per age group.
# Only the last expression of the cell is rendered, so the output shown is the std.
dfh.mean()
dfh.std()



Age
21    6.435080
22    7.201562
23    6.780710
24    8.496244
25    8.353719
26    9.683033
27    6.045924
28    8.539333
Name: DistanceFromHome, dtype: float64

In [12]:
# Several aggregations at once: pass a list and get one result column per entry.
# String aliases are used instead of np.mean / np.std because recent pandas
# versions emit a FutureWarning when NumPy callables are passed to a groupby
# aggregate; the aliases keep the current (groupby mean / sample std) results.
dfh.aggregate(['mean', 'std'])

# Arbitrary logic: any callable that reduces a group's values to a single value.
# This hand-written average gives the same numbers as dfh.mean().
dfh.aggregate(lambda x: sum(x) / len(x))

Age
21     9.076923
22     8.437500
23     9.142857
24    11.884615
25     8.769231
26    10.230769
27     7.500000
28     8.875000
Name: DistanceFromHome, dtype: float64

In [12]:
# Summary statistics of DistanceFromHome for each age group, in a single table
dfh.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
21,13.0,9.076923,6.43508,1.0,5.0,9.0,12.0,22.0
22,16.0,8.4375,7.201562,1.0,3.0,6.5,12.0,26.0
23,14.0,9.142857,6.78071,1.0,4.5,8.5,10.0,26.0
24,26.0,11.884615,8.496244,1.0,4.0,10.0,17.75,29.0
25,26.0,8.769231,8.353719,1.0,2.25,5.0,12.25,28.0
26,39.0,10.230769,9.683033,1.0,2.0,6.0,18.5,29.0
27,48.0,7.5,6.045924,1.0,2.0,6.0,10.0,23.0
28,48.0,8.875,8.539333,1.0,2.0,5.5,15.25,29.0


You can also transform the data of each group.

In [13]:
grouped = df.groupby('Age')

# Replace every DailyRate value with the mean DailyRate of its age group.
# transform() returns one value per original row, so the result lines up with df.
# The 'mean' alias is used instead of np.mean because recent pandas versions
# emit a FutureWarning when NumPy callables are passed to a groupby transform.
standardized_rates = grouped['DailyRate'].transform('mean')

# assign() returns a new dataframe with the column replaced; df itself is not modified
df.assign(DailyRate=standardized_rates)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
4,27,No,Travel_Rarely,907.333333,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
14,28,Yes,Travel_Rarely,927.500000,Research & Development,24,3,Life Sciences,1,19,...,2,80,0,6,4,3,4,2,0,3
17,22,No,Non-Travel,806.937500,Research & Development,16,2,Medical,1,22,...,2,80,2,1,2,2,1,0,0,0
20,24,No,Non-Travel,863.115385,Research & Development,11,2,Other,1,26,...,4,80,1,5,5,2,4,2,1,3
23,21,No,Travel_Rarely,762.846154,Research & Development,15,2,Life Sciences,1,30,...,4,80,0,0,6,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1433,25,No,Travel_Rarely,768.076923,Sales,8,2,Other,1,2018,...,2,80,1,6,3,2,5,3,0,4
1436,21,No,Travel_Rarely,762.846154,Sales,5,1,Medical,1,2021,...,4,80,0,2,6,3,2,2,1,2
1438,23,Yes,Travel_Frequently,707.928571,Sales,9,3,Marketing,1,2023,...,1,80,1,1,3,2,1,0,1,0
1464,26,No,Travel_Rarely,844.769231,Sales,5,3,Other,1,2060,...,4,80,0,5,2,3,4,2,0,0


In [24]:
# Display df itself: its DailyRate column still holds the original values,
# because the group means were only written to the dataframe returned by assign()
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
14,28,Yes,Travel_Rarely,103,Research & Development,24,3,Life Sciences,1,19,...,2,80,0,6,4,3,4,2,0,3
17,22,No,Non-Travel,1123,Research & Development,16,2,Medical,1,22,...,2,80,2,1,2,2,1,0,0,0
20,24,No,Non-Travel,673,Research & Development,11,2,Other,1,26,...,4,80,1,5,5,2,4,2,1,3
23,21,No,Travel_Rarely,391,Research & Development,15,2,Life Sciences,1,30,...,4,80,0,0,6,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1433,25,No,Travel_Rarely,1382,Sales,8,2,Other,1,2018,...,2,80,1,6,3,2,5,3,0,4
1436,21,No,Travel_Rarely,501,Sales,5,1,Medical,1,2021,...,4,80,0,2,6,3,2,2,1,2
1438,23,Yes,Travel_Frequently,638,Sales,9,3,Marketing,1,2023,...,1,80,1,1,3,2,1,0,1,0
1464,26,No,Travel_Rarely,1167,Sales,5,3,Other,1,2060,...,4,80,0,5,2,3,4,2,0,0


In [16]:
# Display the dataframe (the rendered table is truncated with "..." rows and columns)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
14,28,Yes,Travel_Rarely,103,Research & Development,24,3,Life Sciences,1,19,...,2,80,0,6,4,3,4,2,0,3
17,22,No,Non-Travel,1123,Research & Development,16,2,Medical,1,22,...,2,80,2,1,2,2,1,0,0,0
20,24,No,Non-Travel,673,Research & Development,11,2,Other,1,26,...,4,80,1,5,5,2,4,2,1,3
23,21,No,Travel_Rarely,391,Research & Development,15,2,Life Sciences,1,30,...,4,80,0,0,6,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1433,25,No,Travel_Rarely,1382,Sales,8,2,Other,1,2018,...,2,80,1,6,3,2,5,3,0,4
1436,21,No,Travel_Rarely,501,Sales,5,1,Medical,1,2021,...,4,80,0,2,6,3,2,2,1,2
1438,23,Yes,Travel_Frequently,638,Sales,9,3,Marketing,1,2023,...,1,80,1,1,3,2,1,0,1,0
1464,26,No,Travel_Rarely,1167,Sales,5,3,Other,1,2060,...,4,80,0,5,2,3,4,2,0,0
