# Grouping DataFrames
### Grouping a DataFrame allows you to aggregate the data at a different level
* For example, transform daily data into monthly, roll up transaction level data by store, etc.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the retail CSV into a DataFrame.
retail = pd.read_csv(
    filepath_or_buffer="../Agg_&_Reshape_DataFrames/retail_2016_2017.csv"
)
# A bare name on the last line lets the notebook render the frame
# (pandas truncates the display of long frames).
retail

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.000,0
1,1945945,2016-01-01,1,BABY CARE,0.000,0
2,1945946,2016-01-01,1,BEAUTY,0.000,0
3,1945947,2016-01-01,1,BEVERAGES,0.000,0
4,1945948,2016-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
1054939,3000883,2017-08-15,9,POULTRY,438.133,0
1054940,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
1054941,3000885,2017-08-15,9,PRODUCE,2419.729,148
1054942,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


### To group data, use the .groupby() method and specify a column to group by
* The grouped column becomes the index of the result by default

In [5]:
# Without an aggregation, .groupby() returns a DataFrameGroupBy object
# rather than a table of results.
retail.groupby(by='family')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D195A6B350>

In [6]:
# To see the groups that were created, compute an aggregate statistic on them:
#   1. Select the column to use for the calculation ('sales').
#   2. Apply an aggregation method (.sum()).
# Single brackets around the column name return a Series;
# double brackets return a DataFrame.
(
    retail
    .groupby(by='family')['sales']
    .sum()
    .head()
)

family
AUTOMOTIVE       226139.0
BABY CARE          7903.0
BEAUTY           166189.0
BEVERAGES     105700279.0
BOOKS              6438.0
Name: sales, dtype: float64

### More examples

In [7]:
# Read the Premier League games workbook into a DataFrame.
premier_league = pd.read_excel(
    io="../Agg_&_Reshape_DataFrames/premier_league_games.xlsx"
)
# Preview the first five rows.
premier_league.head()

Unnamed: 0,id,league_name,season,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,4389,England Premier League,2015/2016,Arsenal,West Ham United,0,2
1,4390,England Premier League,2015/2016,Bournemouth,Aston Villa,0,1
2,4391,England Premier League,2015/2016,Chelsea,Swansea City,2,2
3,4392,England Premier League,2015/2016,Everton,Watford,2,2
4,4393,England Premier League,2015/2016,Leicester City,Sunderland,4,2


In [17]:
# Mean home goals per home team, sorted from highest to lowest.
# Double brackets around 'HomeGoals' make the result a DataFrame.
(
    premier_league
    .groupby(by='HomeTeam')[['HomeGoals']]
    .mean()
    .sort_values(by='HomeGoals', ascending=False)
)

Unnamed: 0_level_0,HomeGoals
HomeTeam,Unnamed: 1_level_1
Manchester City,2.473684
Southampton,2.052632
Tottenham Hotspur,1.842105
Everton,1.842105
Leicester City,1.842105
West Ham United,1.789474
Liverpool,1.736842
Newcastle United,1.684211
Chelsea,1.684211
Arsenal,1.631579


# Grouping Multiple Columns
### You can group by multiple columns by passing the list of columns into .groupby()
* This creates a multi-index result, with one index level for each column the data was grouped by
* Specify as_index=False to prevent the grouped columns from becoming indices

In [25]:
# Sum of sales for each combination of 'family' and 'store_nbr'.
# as_index=False keeps the grouped columns as regular columns,
# so the result has a plain numeric index.
(
    retail
    .groupby(by=['family', 'store_nbr'], as_index=False)[['sales']]
    .sum()
)

Unnamed: 0,family,store_nbr,sales
0,AUTOMOTIVE,1,2524.000000
1,AUTOMOTIVE,2,3918.000000
2,AUTOMOTIVE,3,6790.000000
3,AUTOMOTIVE,4,2565.000000
4,AUTOMOTIVE,5,3667.000000
...,...,...,...
1777,SEAFOOD,50,12773.966999
1778,SEAFOOD,51,34250.948976
1779,SEAFOOD,52,1219.475999
1780,SEAFOOD,53,3745.180001


In [24]:
# Sum of sales for each combination of 'family' and 'store_nbr'.
# With the default index behaviour, the two grouped columns form
# a multi-index on the resulting DataFrame.
(
    retail
    .groupby(by=['family', 'store_nbr'])[['sales']]
    .sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,store_nbr,Unnamed: 2_level_1
AUTOMOTIVE,1,2524.000000
AUTOMOTIVE,2,3918.000000
AUTOMOTIVE,3,6790.000000
AUTOMOTIVE,4,2565.000000
AUTOMOTIVE,5,3667.000000
...,...,...
SEAFOOD,50,12773.966999
SEAFOOD,51,34250.948976
SEAFOOD,52,1219.475999
SEAFOOD,53,3745.180001


### More Examples

In [27]:
# Read the full Premier League games workbook (multiple seasons) into a DataFrame.
premier_league_full = pd.read_excel(
    io="../Agg_&_Reshape_DataFrames/premier_league_games_full.xlsx"
)
# Preview the first five rows.
premier_league_full.head()

Unnamed: 0,id,league_name,season,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,1729,England Premier League,2008/2009,Manchester United,Newcastle United,1,1
1,1730,England Premier League,2008/2009,Arsenal,West Bromwich Albion,1,0
2,1731,England Premier League,2008/2009,Sunderland,Liverpool,0,1
3,1732,England Premier League,2008/2009,West Ham United,Wigan Athletic,2,1
4,1733,England Premier League,2008/2009,Aston Villa,Manchester City,4,2


In [35]:
# Total home goals for each (HomeTeam, season) pair.
# The grouped columns become a two-level index on the result.
(
    premier_league_full
    .groupby(by=['HomeTeam', 'season'])[['HomeGoals']]
    .sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,HomeGoals
HomeTeam,season,Unnamed: 2_level_1
Arsenal,2008/2009,31
Arsenal,2009/2010,48
Arsenal,2010/2011,33
Arsenal,2011/2012,39
Arsenal,2012/2013,47
...,...,...
Wigan Athletic,2011/2012,22
Wigan Athletic,2012/2013,26
Wolverhampton Wanderers,2009/2010,13
Wolverhampton Wanderers,2010/2011,30


In [38]:
# Total home goals for each (season, HomeTeam) pair.
# as_index=False keeps 'season' and 'HomeTeam' as regular columns.
(
    premier_league_full
    .groupby(by=['season', 'HomeTeam'], as_index=False)[['HomeGoals']]
    .sum()
)

Unnamed: 0,season,HomeTeam,HomeGoals
0,2008/2009,Arsenal,31
1,2008/2009,Aston Villa,27
2,2008/2009,Blackburn Rovers,22
3,2008/2009,Bolton Wanderers,21
4,2008/2009,Chelsea,33
...,...,...,...
155,2015/2016,Swansea City,20
156,2015/2016,Tottenham Hotspur,35
157,2015/2016,Watford,20
158,2015/2016,West Bromwich Albion,20


In [39]:
# Total home goals per (season, HomeTeam), then keep only Arsenal's rows.
# The filter runs after the aggregation, so the surviving rows keep the
# row labels they had in the full aggregated table.
(
    premier_league_full
    .groupby(by=['season', 'HomeTeam'], as_index=False)[['HomeGoals']]
    .sum()
    .query("HomeTeam == 'Arsenal'")
)

Unnamed: 0,season,HomeTeam,HomeGoals
0,2008/2009,Arsenal,31
20,2009/2010,Arsenal,48
40,2010/2011,Arsenal,33
60,2011/2012,Arsenal,39
80,2012/2013,Arsenal,47
100,2013/2014,Arsenal,36
120,2014/2015,Arsenal,41
140,2015/2016,Arsenal,31
