In [2]:
import pandas as pd
import numpy as np


In [3]:
# Example frame: three label columns (key1..key3) to group by and three
# columns of random values (data1..data3) to aggregate.
# NOTE(review): no random seed is set in the visible cells, so the data
# values differ on every run.
group_keys = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'key3': [1, 2, 2, 1, 2],
}
random_values = {column: np.random.randn(5) for column in ('data1', 'data2', 'data3')}
df = pd.DataFrame({**group_keys, **random_values})

df

Unnamed: 0,key1,key2,key3,data1,data2,data3
0,a,one,1,-0.471045,-1.405259,1.49593
1,a,two,2,-1.433884,-1.191686,-0.641039
2,b,one,2,-0.72879,0.706079,-1.470238
3,b,two,1,-1.964638,-1.00953,-0.446941
4,a,one,2,0.787157,0.008433,1.042879


In [4]:
# Let's compute the mean of the data1 column using the labels from key1
grouped=df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fbd5b293400>

In [5]:
# It is now a GroupBy object. It has not computed anything yet. To compute the group means, we can call the GroupBy's mean method.
grouped.mean()

key1
a   -0.372591
b   -1.346714
Name: data1, dtype: float64

In [6]:
# Let's compute the means of two other data columns: data2 grouped by key2 and data3 grouped by key3
grouped2=df['data2'].groupby(df['key2'])
grouped3=df['data3'].groupby(df['key3'])

print(grouped2.mean())
print()
print(grouped3.mean())

key2
one   -0.230249
two   -1.100608
Name: data2, dtype: float64

key3
1    0.524494
2   -0.356133
Name: data3, dtype: float64


In [7]:
# we can also get the mean of a data based on different keys
means=df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.158056
      two    -1.433884
b     one    -0.728790
      two    -1.964638
Name: data1, dtype: float64

In [8]:
#let's compute the means of data 1 based on all three keys
means=df['data1'].groupby([df['key1'], df['key2'], df['key3']]).mean()
means

key1  key2  key3
a     one   1      -0.471045
            2       0.787157
      two   2      -1.433884
b     one   2      -0.728790
      two   1      -1.964638
Name: data1, dtype: float64

In [9]:
means.unstack()

Unnamed: 0_level_0,key3,1,2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.471045,0.787157
a,two,,-1.433884
b,one,,-0.72879
b,two,-1.964638,


In [10]:
#Let's work on another example
states=np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years=np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()

California  2005   -1.433884
            2006   -0.728790
Ohio        2005   -1.217842
            2006    0.787157
Name: data1, dtype: float64

In [11]:
# Average every numeric column within each key1 group. numeric_only=True keeps
# the string column key2 out of the aggregation; recent pandas releases raise a
# TypeError for it instead of silently dropping it.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,key3,data1,data2,data3
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1.666667,-0.372591,-0.862837,0.63259
b,1.5,-1.346714,-0.151726,-0.95859


In [12]:
df.groupby(['key1', 'key2', 'key3']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,data1,data2,data3
key1,key2,key3,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,one,1,-0.471045,-1.405259,1.49593
a,one,2,0.787157,0.008433,1.042879
a,two,2,-1.433884,-1.191686,-0.641039
b,one,2,-0.72879,0.706079,-1.470238
b,two,1,-1.964638,-1.00953,-0.446941


# Iterating Over Groups

In [13]:

# Iterating over a GroupBy object yields 2-tuples: the group name followed by
# the chunk of data that belongs to that group.
key1_groups = df.groupby('key1')
for name, group in key1_groups:
    # end='\n\n' emits the same blank separator line as a bare print().
    print('group:', name, end='\n\n')
    print('group:', group)

group: a

group:   key1 key2  key3     data1     data2     data3
0    a  one     1 -0.471045 -1.405259  1.495930
1    a  two     2 -1.433884 -1.191686 -0.641039
4    a  one     2  0.787157  0.008433  1.042879
group: b

group:   key1 key2  key3     data1     data2     data3
2    b  one     2 -0.728790  0.706079 -1.470238
3    b  two     1 -1.964638 -1.009530 -0.446941


In [14]:
# in case of multiple keys, the first element in the tuple will be a tuple of key values
for (k1, k2, k3), group in df.groupby(['key1', 'key2', 'key3']):
    print((k1, k2, k3))
    print()
    print(group)

('a', 'one', 1)

  key1 key2  key3     data1     data2    data3
0    a  one     1 -0.471045 -1.405259  1.49593
('a', 'one', 2)

  key1 key2  key3     data1     data2     data3
4    a  one     2  0.787157  0.008433  1.042879
('a', 'two', 2)

  key1 key2  key3     data1     data2     data3
1    a  two     2 -1.433884 -1.191686 -0.641039
('b', 'one', 2)

  key1 key2  key3    data1     data2     data3
2    b  one     2 -0.72879  0.706079 -1.470238
('b', 'two', 1)

  key1 key2  key3     data1    data2     data3
3    b  two     1 -1.964638 -1.00953 -0.446941


In [15]:
# We can also group the columns by their dtype. The column labels are grouped
# here (instead of df.groupby(df.dtypes, axis=1), whose axis argument is
# deprecated in newer pandas releases) and each group of labels is then used to
# select the matching sub-frame, so every column keeps its original dtype.
grouped = df.columns.to_series().groupby(df.dtypes)
for dtype, column_names in grouped:
    print(dtype)
    print()
    print(df[column_names.tolist()])

int64

   key3
0     1
1     2
2     2
3     1
4     2
float64

      data1     data2     data3
0 -0.471045 -1.405259  1.495930
1 -1.433884 -1.191686 -0.641039
2 -0.728790  0.706079 -1.470238
3 -1.964638 -1.009530 -0.446941
4  0.787157  0.008433  1.042879
object

  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# Selecting a Column or Subset of Columns

In [16]:
# Especially for large datasets, it may be desirable to aggregate only a few columns. For example:
s_grouped=df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fbd5b4370a0>

In [17]:
s_grouped.mean()

key1  key2
a     one    -0.698413
      two    -1.191686
b     one     0.706079
      two    -1.009530
Name: data2, dtype: float64

# Grouping with Dicts and Series

In [18]:
# Grouping information may exist in a form other than an array. Let's take
# another example DataFrame: five people (rows) and five columns of random values.
row_labels = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']
column_labels = ['a', 'b', 'c', 'd', 'e']
people = pd.DataFrame(np.random.randn(5, 5), index=row_labels, columns=column_labels)
# Blank out columns b and c of Wes's row so the frame contains some NA values.
people.loc['Wes', ['b', 'c']] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,-0.454184,-1.348076,0.246432,1.413905,-1.455935
Steve,-0.375097,0.088262,1.122622,-1.27284,-0.642754
Wes,0.544384,,,0.166389,0.156337
Jim,0.755577,0.698193,1.177249,1.302848,0.366729
Travis,-0.09334,2.14589,-1.174867,-0.152671,0.41733


In [19]:
# Suppose we have a group correspondence for the columns and want to sum the
# columns together by group. The key 'f' has no matching column in people and
# simply goes unused.
mapping={'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}

# The dict can be passed to groupby directly; there is no need to build an array
# from it. The frame is transposed so that its columns become rows, which avoids
# groupby(..., axis=1) (deprecated in newer pandas releases); the summed result
# is transposed back so that people are on the rows again.
by_column=people.T.groupby(mapping)
by_column.sum().T


Unnamed: 0,blue,red
Joe,1.660337,-3.258195
Steve,-0.150218,-0.929589
Wes,0.166389,0.700721
Jim,2.480097,1.8205
Travis,-1.327537,2.46988


# Grouping with Functions

In [20]:
# Let's group by the length of the names this time
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.845777,-0.649883,1.423681,2.883143,-0.932869
5,-0.375097,0.088262,1.122622,-1.27284,-0.642754
6,-0.09334,2.14589,-1.174867,-0.152671,0.41733


# Grouping by Index Levels

In [21]:
# A final convenience for hierarchically indexed datasets is the ability to aggregate
#using one of the levels of an axis index. Let’s look at an example:
columns=pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], 
                                 [1, 3, 5, 1, 3]],
                                 names=['cty', 'tenor'])
hier_df=pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.77572,0.162355,-0.814815,1.058243,0.34578
1,-0.443724,0.450158,0.413546,-0.42509,-0.837479
2,-0.6817,0.134821,-0.33401,1.118165,0.878573
3,1.639553,0.638787,-0.812455,1.168142,1.088765


In [22]:
# To group by level, just pass the level number or name using the level keyword.
# The frame is transposed first so that the column level 'cty' can be grouped
# without groupby(..., axis=1), which newer pandas releases deprecate; the
# counts are transposed back to the original orientation.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# Data Aggregation

In [23]:
df.head()

Unnamed: 0,key1,key2,key3,data1,data2,data3
0,a,one,1,-0.471045,-1.405259,1.49593
1,a,two,2,-1.433884,-1.191686,-0.641039
2,b,one,2,-0.72879,0.706079,-1.470238
3,b,two,1,-1.964638,-1.00953,-0.446941
4,a,one,2,0.787157,0.008433,1.042879


In [24]:
grouped=df.groupby('key1')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbd5b40c550>

In [25]:
grouped['data1'].quantile(0.9)

key1
a    0.535517
b   -0.852375
Name: data1, dtype: float64

In [26]:
#agg method
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Restrict the aggregation to the numeric columns: peak_to_peak subtracts
# values, which is not defined for the string column key2.
grouped[['key3', 'data1', 'data2', 'data3']].agg(peak_to_peak)

Unnamed: 0_level_0,key3,data1,data2,data3
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1,2.221042,1.413692,2.136969
b,1,1.235848,1.715609,1.023297


In [27]:
#You may notice that some methods like describe also work, even though they are not
#aggregations, strictly speaking:
grouped.describe()

Unnamed: 0_level_0,key3,key3,key3,key3,key3,key3,key3,key3,data1,data1,...,data2,data2,data3,data3,data3,data3,data3,data3,data3,data3
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,3.0,1.666667,0.57735,1.0,1.5,2.0,2.0,2.0,3.0,-0.372591,...,-0.591627,0.008433,3.0,0.63259,1.126016,-0.641039,0.20092,1.042879,1.269405,1.49593
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,-1.346714,...,0.277176,0.706079,2.0,-0.95859,0.72358,-1.470238,-1.214414,-0.95859,-0.702766,-0.446941


# Column-Wise and Multiple Function Application

In [29]:
tips=pd.read_csv('tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [31]:
tips['tip%']=tips['tip']/tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip%
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [32]:
#Let's group the tips based on day and smoker
grouped=tips.groupby(['day', 'smoker'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbd5b43fbe0>

In [33]:
# Now we can find the mean tip percentage for each combination of day and smoker
group_pct=grouped['tip%']
group_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip%, dtype: float64

# Returning Aggregated Data Without Row Indexes

In [34]:
# Usually the aggregated data comes back with an index, potentially hierarchical,
# composed from the unique group key combinations. Since this is not always
# desirable, it can be disabled in most cases by passing as_index=False to groupby.
# numeric_only=True restricts the mean to the numeric columns; the text columns
# (sex, time) cannot be averaged and make recent pandas releases raise otherwise.
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip%
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


# Apply: General split-apply-combine

In [36]:
# The most general-purpose GroupBy method is apply. Apply splits the object being manipulated
# into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces together.
# Suppose we want to select the top five tip percentage values by group. To do that, we first need to write a function.
def top(df, n=5, column='tip%'):
    """Return the ``n`` rows of ``df`` that have the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : name of the column to rank by (default ``'tip%'``).

    Returns
    -------
    DataFrame sorted in ascending order of ``column``, so the largest value
    is in the last row.
    """
    # tail(n) rather than [-n:]: the slice [-0:] would return every row for
    # n=0, whereas tail(0) correctly returns an empty frame.
    return df.sort_values(by=column).tail(n)

top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip%
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [37]:
# if we group by smoker, then we call apply with this function.
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip%
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [41]:
# Let's pass additional arguments and keywords to the function this time.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip%
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


# Pivot Tables and Cross-Tabulation

In [43]:
# A pivot table is a data summarization tool frequently found in spreadsheet programs and other data analysis 
#software. It aggregates a table of data by one or more keys, arranging the data in a rectangle with some of the
#group keys along the rows and some along the columns. Pivot tables in Python with pandas are made possible
#through the groupby facility. 

# Suppose we want to compute a table of group means arranged by day and smoker on the rows.
# The numeric columns are listed explicitly so that the default mean aggregation is
# never applied to the text columns (sex, time), which recent pandas releases reject.
tips.pivot_table(values=['size', 'tip', 'tip%', 'total_bill'], index=['day', 'smoker'])



Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip%,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [45]:
# it can also be done by groupby method. suppose we want to aggregate only by tip% and size and group it by time. 
#I will put smoker in the table columns and day in the rows.
tips.pivot_table(['tip%', 'size'], index=['time', 'day'], columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip%,tip%
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [46]:
# to have a cross-tabulation, we can use a different aggregation function by passing it to aggfunc. 
#we can put the days in columns and time and smoker in rows
tips.pivot_table('tip%', index=['time', 'smoker'], columns='day', aggfunc=len, margins=True)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


In [49]:
# we can see here that some values are empty. we can fill them with 0
tips.pivot_table('tip%', index=['time', 'size','smoker'], columns='day', aggfunc=len, margins=True, fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur,All
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dinner,1.0,No,0,1,0,0,1.0
Dinner,1.0,Yes,0,1,0,0,1.0
Dinner,2.0,No,3,25,27,1,56.0
Dinner,2.0,Yes,8,28,12,0,48.0
Dinner,3.0,No,0,12,11,0,23.0
Dinner,3.0,Yes,0,6,4,0,10.0
Dinner,4.0,No,0,7,16,0,23.0
Dinner,4.0,Yes,1,6,2,0,9.0
Dinner,5.0,No,0,0,2,0,2.0
Dinner,5.0,Yes,0,1,1,0,2.0
