In [158]:
import pandas as pd
import numpy as np

In [159]:
#groupby() function takes a dataframe, splits it into relevant groups to perform operations on, and then combines the results back together
df = pd.read_csv('datasets/census.csv')
df = df[df['SUMLEV'] == 50]
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [160]:
%%timeit -n 3
# first example - iterate over all states and calculate average
states = sorted(set(df['STNAME']))
for i in states:
    avg = np.average(df.where(df['STNAME'] == i).dropna()['CENSUS2010POP'])
    print('counties in state', i, 'have average population', avg)

counties in state Alabama have average population 71339.34328358209
counties in state Alaska have average population 24490.724137931036
counties in state Arizona have average population 426134.4666666667
counties in state Arkansas have average population 38878.90666666667
counties in state California have average population 642309.5862068966
counties in state Colorado have average population 78581.1875
counties in state Connecticut have average population 446762.125
counties in state Delaware have average population 299311.3333333333
counties in state District of Columbia have average population 601723.0
counties in state Florida have average population 280616.5671641791
counties in state Georgia have average population 60928.63522012578
counties in state Hawaii have average population 272060.2
counties in state Idaho have average population 35626.86363636364
counties in state Illinois have average population 125790.50980392157
counties in state Indiana have average population 70476.10

In [161]:
%%timeit -n 3
# now, we do the same using the groupby function
for group, frame in df.groupby('STNAME'):
    avg = np.average(frame['CENSUS2010POP'])
    print('counties in state', group, 'have average population', avg)

counties in state Alabama have average population 71339.34328358209
counties in state Alaska have average population 24490.724137931036
counties in state Arizona have average population 426134.4666666667
counties in state Arkansas have average population 38878.90666666667
counties in state California have average population 642309.5862068966
counties in state Colorado have average population 78581.1875
counties in state Connecticut have average population 446762.125
counties in state Delaware have average population 299311.3333333333
counties in state District of Columbia have average population 601723.0
counties in state Florida have average population 280616.5671641791
counties in state Georgia have average population 60928.63522012578
counties in state Hawaii have average population 272060.2
counties in state Idaho have average population 35626.86363636364
counties in state Illinois have average population 125790.50980392157
counties in state Indiana have average population 70476.10

In [162]:
# we can also apply our own functions to group the data using groupby()

df = df.set_index('STNAME')
def set_batch_number(item):
    if item[0] < 'M':
        return 0
    elif item[0] < 'Q':
        return 1
    else:
        return 2
    
for group, frame in df.groupby(set_batch_number):
    print('There are', str(len(frame)), 'records in group', str(group), 'for processing')

# when groupby() doesn't have a column identifier, it will take the index as the input column

There are 1177 records in group 0 for processing
There are 1134 records in group 1 for processing
There are 831 records in group 2 for processing


In [163]:
# lets take the example of another dataset
df = pd.read_csv('datasets/listings.csv')
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,9.0,f,,,t,moderate,f,f,1,1.3
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,10.0,f,,,f,moderate,t,f,1,0.47
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,10.0,f,,,f,moderate,f,f,1,1.0
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,10.0,f,,,f,flexible,f,f,1,2.25


In [164]:
# now, we want to group by cancellation_policy AND review_scores_value, first approach
df = df.set_index(['cancellation_policy', 'review_scores_value']) # multilevel indexing
# now, while using groupby with multilevel indexes, we need to specify the levels we want to include in our grouping
for group, frame in df.groupby(level = (0,1)):
    print(group)

('flexible', 2.0)
('flexible', 4.0)
('flexible', 5.0)
('flexible', 6.0)
('flexible', 7.0)
('flexible', 8.0)
('flexible', 9.0)
('flexible', 10.0)
('moderate', 2.0)
('moderate', 4.0)
('moderate', 6.0)
('moderate', 7.0)
('moderate', 8.0)
('moderate', 9.0)
('moderate', 10.0)
('strict', 2.0)
('strict', 3.0)
('strict', 4.0)
('strict', 5.0)
('strict', 6.0)
('strict', 7.0)
('strict', 8.0)
('strict', 9.0)
('strict', 10.0)
('super_strict_30', 6.0)
('super_strict_30', 7.0)
('super_strict_30', 8.0)
('super_strict_30', 9.0)
('super_strict_30', 10.0)


In [165]:
# now, we want to put the reviews with value 10 and those with value under 10 in separate groups
def grouping_fn(item):
    if(item[1] == 10):
        return(item[0], '10.0')
    else:
        return(item[0], 'less than 10.0')

for group, frame in df.groupby(grouping_fn):        
    print(group)

('flexible', '10.0')
('flexible', 'less than 10.0')
('moderate', '10.0')
('moderate', 'less than 10.0')
('strict', '10.0')
('strict', 'less than 10.0')
('super_strict_30', '10.0')
('super_strict_30', 'less than 10.0')


### Aggregation of grouped data using groupby()

In [166]:
# aggregate by passing a dictionary containing the column names and the functions inside the agg() function
df = df.reset_index()
df.groupby('cancellation_policy').agg({'review_scores_value':np.average})

Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,
moderate,
strict,
super_strict_30,


In [167]:
# this happened because np.average does not ignore NaN values
df.groupby('cancellation_policy').agg(review_scores_value_mean = ('review_scores_value', np.mean)) # mean excludes NaN values

  df.groupby('cancellation_policy').agg(review_scores_value_mean = ('review_scores_value', np.mean)) # mean excludes NaN values


Unnamed: 0_level_0,review_scores_value_mean
cancellation_policy,Unnamed: 1_level_1
flexible,9.237421
moderate,9.307398
strict,9.081441
super_strict_30,8.537313


In [168]:
# we can also aggregate multiple functions to the formed groups
df.groupby('cancellation_policy').agg({'review_scores_value' : [np.mean, np.std],
                                       'reviews_per_month' : np.mean})
# use of dictionaries inside agg() function has been depracated

  df.groupby('cancellation_policy').agg({'review_scores_value' : [np.mean, np.std],
  df.groupby('cancellation_policy').agg({'review_scores_value' : [np.mean, np.std],
  df.groupby('cancellation_policy').agg({'review_scores_value' : [np.mean, np.std],


Unnamed: 0_level_0,review_scores_value,review_scores_value,reviews_per_month
Unnamed: 0_level_1,mean,std,mean
cancellation_policy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
flexible,9.237421,1.096271,1.82921
moderate,9.307398,0.859859,2.391922
strict,9.081441,1.040531,1.873467
super_strict_30,8.537313,0.840785,0.340143


### Transformation of grouped data using transform()

In [169]:
# this will return a dataframe with the same number of rows as the original dataframe, and every row will have the resultant value corresponding to which group it lies in
cols = ['cancellation_policy', 'review_scores_value']
transform_df = df[cols].groupby('cancellation_policy').transform(np.mean)
transform_df.rename({'review_scores_value' : 'mean_review_scores'}, axis = 1, inplace = True)
transform_df

  transform_df = df[cols].groupby('cancellation_policy').transform(np.mean)


Unnamed: 0,mean_review_scores
0,9.307398
1,9.307398
2,9.307398
3,9.307398
4,9.237421
...,...
3580,9.081441
3581,9.081441
3582,9.237421
3583,9.081441


In [170]:
# now, we merge the two dfs to include the new column in our new df
df = df.merge(transform_df, left_index = True, right_index = True)
df.head()
# this displays the average rating, grouped by the cancellation policy, for every entry in the dataframe

Unnamed: 0,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,description,...,review_scores_location,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores
0,moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",...,,f,,,f,f,f,1,,9.307398
1,moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,...,9.0,f,,,t,f,f,1,1.3,9.307398
2,moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",...,9.0,f,,,f,t,f,1,0.47,9.307398
3,moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,...,10.0,f,,,f,f,f,1,1.0,9.307398
4,flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",...,9.0,f,,,f,f,f,1,2.25,9.237421


In [171]:
df['mean_diff'] = np.absolute(df['review_scores_value'] - df['mean_review_scores'])
df['mean_diff'].head()

0         NaN
1    0.307398
2    0.692602
3    0.692602
4    0.762579
Name: mean_diff, dtype: float64

### Filtering of grouped data using transform()

In [172]:
# sometimes, we want to perform group operations on only a certain number of values that pass a certain parameter
df.groupby('cancellation_policy').filter(lambda x: np.mean(x['review_scores_value']) > 9.2)
# this resultant df will be only display those values that have average review scores grouped by cancellation policy > 9.2 (moderate and flexible)

Unnamed: 0,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,description,...,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores,mean_diff
0,moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",...,f,,,f,f,f,1,,9.307398,
1,moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,...,f,,,t,f,f,1,1.30,9.307398,0.307398
2,moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",...,f,,,f,t,f,1,0.47,9.307398,0.692602
3,moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,...,f,,,f,f,f,1,1.00,9.307398,0.692602
4,flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",...,f,,,f,f,f,1,2.25,9.237421,0.762579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3576,flexible,,14689681,https://www.airbnb.com/rooms/14689681,20160906204935,2016-09-07,Beautiful loft style bedroom with large bathroom,You'd be living on the top floor of a four sto...,,You'd be living on the top floor of a four sto...,...,f,,,f,f,f,1,,9.237421,
3577,flexible,,13750763,https://www.airbnb.com/rooms/13750763,20160906204935,2016-09-07,Comfortable Space in the Heart of Brookline,"Our place is close to Coolidge Corner, Allston...",This space consists of 2 Rooms and a private b...,"Our place is close to Coolidge Corner, Allston...",...,f,,,f,f,f,1,,9.237421,
3579,flexible,,14852179,https://www.airbnb.com/rooms/14852179,20160906204935,2016-09-07,Spacious Queen Bed Room Close to Boston Univer...,- Grocery: A full-size Star market is 2 minute...,,- Grocery: A full-size Star market is 2 minute...,...,f,,,f,f,f,1,,9.237421,
3582,flexible,,14585486,https://www.airbnb.com/rooms/14585486,20160906204935,2016-09-07,Gorgeous funky apartment,Funky little apartment close to public transpo...,Modern and relaxed space with many facilities ...,Funky little apartment close to public transpo...,...,f,,,f,f,f,1,,9.237421,


### Applying to grouped data using apply()

In [173]:
# we can use this to apply any arbitrary function to a group of data in a df
df = pd.read_csv('datasets/listings.csv')
df = df[['cancellation_policy', 'review_scores_value']]
df.head()

Unnamed: 0,cancellation_policy,review_scores_value
0,moderate,
1,moderate,9.0
2,moderate,10.0
3,moderate,10.0
4,flexible,10.0


In [174]:
# example - calculate the deviation from group mean of review scores for each entry in dataframe
def calc_mean_review_scores(group):
    avg = np.mean(group['review_scores_value'])
    group['review_scores_mean'] = np.abs(avg - group['review_scores_value'])
    return group

# now, we will apply this function to our grouped dataset
df.groupby('cancellation_policy').apply(calc_mean_review_scores).head()

  df.groupby('cancellation_policy').apply(calc_mean_review_scores).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,cancellation_policy,review_scores_value,review_scores_mean
cancellation_policy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
flexible,4,flexible,10.0,0.762579
flexible,5,flexible,10.0,0.762579
flexible,10,flexible,10.0,0.762579
flexible,11,flexible,9.0,0.237421
flexible,12,flexible,10.0,0.762579
