In [1]:
import pandas as pd

### Example One: Group by

Typically, a groupby operation can be broken down into three steps:

- **Split** the object into different groups
- **Apply** a function to each group
- **Combine** the results

The following example is from [In-Depth Pandas Tutorial. The Ultimate Python library for working… | by Mandy Gu | Towards Data Science](https://towardsdatascience.com/in-depth-pandas-tutorial-5d896483ba8a).

In [2]:
# Toy customer table used by the examples below: one row per customer,
# one list per column (all lists have eight entries).
data = pd.DataFrame(
    {
        'customer_id': list(range(1, 9)),
        'age': [29, 43, 22, 82, 41, 33, 63, 57],
        'email_linked': [True, True, False, True, False, False, True, True],
        'occupation': [
            'teacher', 'highschool teacher', 'student', 'retired',
            'tutor', 'unemployed', 'entrepreneur', 'professor',
        ],
    }
)

In [3]:
# Preview the first five rows of the customer table.
data.head(n=5)

Unnamed: 0,customer_id,age,email_linked,occupation
0,1,29,True,teacher
1,2,43,True,highschool teacher
2,3,22,False,student
3,4,82,True,retired
4,5,41,False,tutor


In [5]:
def is_educator(occupation):
  """Return True when an occupation label describes an educator.

  A label counts as an educator role when it contains the word 'teacher'
  anywhere (so 'highschool teacher' matches), or when it is exactly one of
  'tutor', 'professor' or 'lecturer'. Matching ignores letter case and
  leading/trailing whitespace.

  Parameters
  ----------
  occupation : str
    Free-text occupation label. Any non-string value (for example the
    None/NaN that stands in for a missing entry) is treated as
    "not an educator".

  Returns
  -------
  bool
    True for educator roles, False otherwise.
  """
  if not isinstance(occupation, str):
    # Missing values have no .lower(); answer False instead of raising so the
    # function can be applied to a column that contains gaps.
    return False
  # Normalise once so the substring test and the exact-match test see the same text.
  normalized = occupation.strip().lower()
  return 'teacher' in normalized or normalized in ('tutor', 'professor', 'lecturer')

# Derive a boolean flag per customer by running is_educator on each occupation value.
data['is_educator'] = data['occupation'].map(is_educator)
data

Unnamed: 0,customer_id,age,email_linked,occupation,is_educator
0,1,29,True,teacher,True
1,2,43,True,highschool teacher,True
2,3,22,False,student,False
3,4,82,True,retired,False
4,5,41,False,tutor,True
5,6,33,False,unemployed,False
6,7,63,True,entrepreneur,False
7,8,57,True,professor,True


In [6]:
def is_educator_over_50(row):
  """Tell whether a customer record is an educator older than 50.

  `row` is indexed by the keys 'age' and 'occupation'. The occupation is
  only inspected (via is_educator) when the age check passes; otherwise the
  result of the age comparison is returned as-is.
  """
  over_50 = row['age'] > 50
  if not over_50:
    # Age check failed: skip the occupation lookup entirely.
    return over_50
  return is_educator(row['occupation'])

# Evaluate the predicate once per row; axis='columns' hands each row to the function.
data['is_educator_over_50'] = data.apply(is_educator_over_50, axis='columns')
data

Unnamed: 0,customer_id,age,email_linked,occupation,is_educator,is_educator_over_50
0,1,29,True,teacher,True,False
1,2,43,True,highschool teacher,True,False
2,3,22,False,student,False,False
3,4,82,True,retired,False,False
4,5,41,False,tutor,True,False
5,6,33,False,unemployed,False,False
6,7,63,True,entrepreneur,False,False
7,8,57,True,professor,True,True


In [7]:
# Split the rows by occupation, then average every remaining column within each group.
grouped_data = (
    data
    .groupby(['occupation'])
    .mean()
)
# Show what kind of object the aggregation hands back before displaying it.
print(type(grouped_data))
grouped_data

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,customer_id,age,email_linked,is_educator,is_educator_over_50
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
entrepreneur,7.0,63.0,1.0,0.0,0.0
highschool teacher,2.0,43.0,1.0,1.0,0.0
professor,8.0,57.0,1.0,1.0,1.0
retired,4.0,82.0,1.0,0.0,0.0
student,3.0,22.0,0.0,0.0,0.0
teacher,1.0,29.0,1.0,1.0,0.0
tutor,5.0,41.0,0.0,1.0,0.0
unemployed,6.0,33.0,0.0,0.0,0.0


In [8]:
# Group on two keys at once: the result is indexed by (email_linked, occupation) pairs.
(
    data
    .groupby(['email_linked', 'occupation'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_id,age,is_educator,is_educator_over_50
email_linked,occupation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,student,3.0,22.0,0.0,0.0
False,tutor,5.0,41.0,1.0,0.0
False,unemployed,6.0,33.0,0.0,0.0
True,entrepreneur,7.0,63.0,0.0,0.0
True,highschool teacher,2.0,43.0,1.0,0.0
True,professor,8.0,57.0,1.0,1.0
True,retired,4.0,82.0,0.0,0.0
True,teacher,1.0,29.0,1.0,0.0
