# Data Aggregation and Group Operations

In [10]:
import pandas as pd
import numpy as np

## GroupBy mechanics

In [11]:
# Build a small random demo frame. Every column is sized from `nrows`, so the
# row count can be changed in one place without a length mismatch.
# NOTE: no random seed is set, so the values differ on every run.
nrows = 10
df = pd.DataFrame({
    'company': np.random.choice(list('ab'), nrows),
    'data1': np.random.randn(nrows) * 50 + 100,
    'city': np.random.choice(list('MP'), nrows),
    'income': np.random.randn(nrows) * 30000 + 50000,
})
df

Unnamed: 0,company,data1,city,income
0,a,120.145881,M,48182.938964
1,a,106.84464,M,29041.026809
2,a,77.455915,M,28361.943148
3,b,154.81862,P,87851.260264
4,a,170.918383,P,-15630.651336
5,a,81.977516,P,52102.323278
6,b,104.064474,P,42158.775972
7,a,164.957737,P,42686.01937
8,a,1.03292,M,57344.473314
9,b,58.381416,P,9954.471506


In [15]:

# Build the GroupBy object once and reuse it for both aggregations.
groupedCompanies = df.groupby('company')
# Not displayed: a notebook cell only renders its last expression.
groupedCompanies.sum()
# numeric_only=True skips the string column 'city'. pandas 2.x raises on
# non-numeric columns instead of silently dropping them.
groupedCompanies.mean(numeric_only=True)

Unnamed: 0_level_0,data1,income
company,Unnamed: 1_level_1,Unnamed: 2_level_1
a,103.333284,34584.010507
b,105.754836,46654.835914


In [20]:
# Mean income for each (company, city) pair; the result is a Series with a
# two-level index.
income_by_company_city = df.groupby(['company', 'city'])['income']
means = income_by_company_city.mean()
means

company  city
a        M       40732.595559
         P       26385.897104
b        P       46654.835914
Name: income, dtype: float64

In [19]:
# Look up a single value by its (company, city) index key.
means.loc[('a', 'P')]

26385.89710394877

### Iterating over groups

In [21]:
# Iterating a GroupBy yields (key, sub-frame) pairs; only the sub-frame is printed.
company_groups = df.groupby('company')
for name, group in company_groups:
    print(group)

  company       data1 city        income
0       a  120.145881    M  48182.938964
1       a  106.844640    M  29041.026809
2       a   77.455915    M  28361.943148
4       a  170.918383    P -15630.651336
5       a   81.977516    P  52102.323278
7       a  164.957737    P  42686.019370
8       a    1.032920    M  57344.473314
  company       data1 city        income
3       b  154.818620    P  87851.260264
6       b  104.064474    P  42158.775972
9       b   58.381416    P   9954.471506


In [24]:
# Map each company key to the sub-frame holding its rows.
all_data = {key: frame for key, frame in df.groupby('company')}
all_data

{'a':   company       data1 city        income
 0       a  120.145881    M  48182.938964
 1       a  106.844640    M  29041.026809
 2       a   77.455915    M  28361.943148
 4       a  170.918383    P -15630.651336
 5       a   81.977516    P  52102.323278
 7       a  164.957737    P  42686.019370
 8       a    1.032920    M  57344.473314,
 'b':   company       data1 city        income
 3       b  154.818620    P  87851.260264
 6       b  104.064474    P  42158.775972
 9       b   58.381416    P   9954.471506}

### Selecting a column or subset of columns

In [26]:
by_company = df.groupby('company')
# Selecting one column gives a grouped Series (not displayed: only the last
# expression of the cell is rendered).
by_company['data1']
# Selecting a list of columns gives a grouped DataFrame.
by_company[['city', 'income']].max()

Unnamed: 0_level_0,city,income
company,Unnamed: 1_level_1,Unnamed: 2_level_1
a,P,57344.473314
b,P,87851.260264


## Data aggregation

In [30]:
# Summary statistics per city; the columns form a (column, statistic) MultiIndex.
city_groups = df.groupby('city')
stats = city_groups.describe()
stats

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,income,income,income,income,income,income,income,income
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
M,4.0,76.369839,53.297549,1.03292,58.350166,92.150277,110.16995,120.145881,4.0,40732.595559,14389.671299,28361.943148,28871.255894,38611.982886,50473.322551,57344.473314
P,6.0,122.519691,47.507101,58.381416,87.499255,129.441547,162.422957,170.918383,6.0,36520.366509,35701.373661,-15630.651336,18005.547622,42422.397671,49748.247301,87851.260264


In [31]:
# Pick one statistic of one column using its (column, statistic) key.
stats[('data1', 'std')]

city
M    53.297549
P    47.507101
Name: (data1, std), dtype: float64

In [32]:
!wget https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv

--2019-11-22 21:19:56--  https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.132.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.132.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943 (7.8K) [text/plain]
Saving to: ‘tips.csv’


2019-11-22 21:19:57 (11.7 MB/s) - ‘tips.csv’ saved [7943/7943]



In [33]:
# Long listing of the working directory (presumably to check that tips.csv
# was downloaded). NOTE(review): the output exposes local file and user names.
%ll

total 100768
-rwxrwxrwx 1 carcrupe   278482 Nov 22 18:42 01-numpy_pandas_introduction.inclass.ipynb*
-rwxrwxrwx 1 carcrupe    48019 Nov 22 19:40 02-loading_and_saving_data.empty.ipynb*
-rwxrwxrwx 1 carcrupe    37713 Nov 22 20:52 03-merge_concatenate_transform.empty.ipynb*
-rwxrwxrwx 1 carcrupe    19570 Nov 22 21:18 04-group_by.empty.ipynb*
-rwxrwxrwx 1 carcrupe    25333 Nov 22 16:49 05-visualization_introduction.empty.ipynb*
-rwxrwxrwx 1 carcrupe 51575427 Nov 22 19:03 914310910_T_T100_SEGMENT_ALL_CARRIER_2015_All.csv*
-rwxrwxrwx 1 carcrupe     4341 Nov 22 19:03 914310910_T_T100_SEGMENT_ALL_CARRIER_ReadMe.csv*
-rwxrwxrwx 1 carcrupe     4268 Nov 22 19:03 914310910_T_T100_SEGMENT_ALL_CARRIER_Terms.csv*
-rw-rw-rw- 1 carcrupe    74700 Nov 16 12:58 Matplotlib.ipynb
-rw-rw-rw- 1 carcrupe    27669 Nov 16 13:55 Pandas.ipynb
-rw-r--r-- 1 carcrupe 38543360 Nov 22 19:22 newdb.db
-rw-rw-r

In [36]:
# Load the tipping dataset from the working directory and preview the first rows.
tips = pd.read_csv(filepath_or_buffer='tips.csv')
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [40]:
# Tip as a fraction of the bill, stored as a new column on `tips`.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [41]:
# Average of every numeric column per sex. numeric_only=True skips the string
# columns (smoker, day, time); pandas 2.x raises on them instead of silently
# dropping them.
tips.groupby('sex').mean(numeric_only=True)

Unnamed: 0_level_0,total_bill,tip,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,18.056897,2.833448,2.45977,0.166491
Male,20.744076,3.089618,2.630573,0.157651


In [42]:
# Several aggregations at once. The numeric columns are selected explicitly
# because pandas 2.x no longer drops string columns that cannot be averaged.
tips.groupby('sex')[['total_bill', 'tip', 'size', 'tip_pct']].agg(['mean', 'std'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,18.056897,8.009209,2.833448,1.159495,2.45977,0.937644,0.166491,0.053632
Male,20.744076,9.246469,3.089618,1.489102,2.630573,0.955997,0.157651,0.064778


In [43]:
# Group by two keys and compute three statistics per numeric column. The
# columns are selected explicitly because pandas 2.x no longer drops string
# columns that cannot be averaged.
tips.groupby(['sex', 'smoker'])[['total_bill', 'tip', 'size', 'tip_pct']].agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Female,No,18.105185,7.286455,54,2.773519,1.128425,54,2.592593,1.073146,54,0.156921,0.036421,54
Female,Yes,17.977879,9.189751,33,2.931515,1.219916,33,2.242424,0.613917,33,0.18215,0.071595,33
Male,No,19.791237,8.726566,97,3.113402,1.489559,97,2.71134,0.989094,97,0.160669,0.041849,97
Male,Yes,22.2845,9.911845,60,3.051167,1.50012,60,2.5,0.89253,60,0.152771,0.090588,60


In [45]:
# External callables can be passed to agg() in place of string aliases.
# The numeric columns are selected explicitly because pandas 2.x no longer
# drops string columns that the callables cannot handle.
# NOTE: np.count_nonzero counts non-zero entries, which differs from 'count'
# (non-null entries) whenever a column contains zeros.
tips.groupby(['sex', 'smoker'])[['total_bill', 'tip', 'size', 'tip_pct']].agg([np.mean, np.std, np.count_nonzero])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count_nonzero,mean,std,count_nonzero,mean,std,count_nonzero,mean,std,count_nonzero
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Female,No,18.105185,7.286455,54.0,2.773519,1.128425,54.0,2.592593,1.073146,54,0.156921,0.036421,54.0
Female,Yes,17.977879,9.189751,33.0,2.931515,1.219916,33.0,2.242424,0.613917,33,0.18215,0.071595,33.0
Male,No,19.791237,8.726566,97.0,3.113402,1.489559,97.0,2.71134,0.989094,97,0.160669,0.041849,97.0
Male,Yes,22.2845,9.911845,60.0,3.051167,1.50012,60.0,2.5,0.89253,60,0.152771,0.090588,60.0


In [47]:
# Aggregate just one of the columns after grouping: the dict maps a column
# name to the list of statistics to compute for it.
bill_aggregations = {'total_bill': ['mean', 'std', 'count']}
tips.groupby(['sex', 'smoker']).agg(bill_aggregations)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,No,18.105185,7.286455,54
Female,Yes,17.977879,9.189751,33
Male,No,19.791237,8.726566,97
Male,Yes,22.2845,9.911845,60


### Column-wise and multiple function application

In [50]:
# Mean and standard deviation of the tip fraction for each smoker category.
tip_pct_by_smoker = tips.groupby('smoker')['tip_pct']
tips_by_smoker = tip_pct_by_smoker.agg(['mean', 'std'])
tips_by_smoker

Unnamed: 0_level_0,mean,std
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.159328,0.03991
Yes,0.163196,0.085119


In [53]:
# Attach the per-smoker 'mean' and 'std' columns to every row by matching
# the 'smoker' column against the index of tips_by_smoker.
merged = pd.merge(
    tips,
    tips_by_smoker,
    left_on='smoker',
    right_index=True,
)

## Group-wise operations and transformations

### Apply: General split-apply-combine

In [54]:
def top(df, n=3, col='tip_pct'):
    """Return the first `n` rows of `df` after sorting by `col`, largest first.

    Parameters
    ----------
    df : DataFrame to rank.
    n : number of leading rows to keep (default 3).
    col : column to sort by (default 'tip_pct').

    Returns
    -------
    A DataFrame with the selected rows, keeping their original index labels.
    """
    ranked = df.sort_values(col, ascending=False)
    return ranked.iloc[:n]

# Rows with the highest tip fraction across the whole merged frame.
top(df=merged)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345,0.163196,0.085119
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667,0.163196,0.085119
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733,0.163196,0.085119


In [55]:
# Run top() on each sex group separately; the per-group results are stacked
# under the group key.
by_sex = merged.groupby('sex')
by_sex.apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Female,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667,0.163196,0.085119
Female,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733,0.163196,0.085119
Female,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525,0.163196,0.085119
Male,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345,0.163196,0.085119
Male,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199,0.159328,0.03991
Male,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535,0.163196,0.085119


#### Suppressing the group keys

### Quantile and bucket analysis

In [56]:
# Classify each bill into one of 5 equal-width intervals.
pd.cut(merged['total_bill'], bins=5)

0      (12.618, 22.166]
1       (3.022, 12.618]
2      (12.618, 22.166]
3      (22.166, 31.714]
4      (22.166, 31.714]
             ...       
234    (12.618, 22.166]
236     (3.022, 12.618]
237    (31.714, 41.262]
240    (22.166, 31.714]
241    (22.166, 31.714]
Name: total_bill, Length: 244, dtype: category
Categories (5, interval[float64]): [(3.022, 12.618] < (12.618, 22.166] < (22.166, 31.714] < (31.714, 41.262] < (41.262, 50.81]]

In [59]:
# Tip-fraction statistics per bill-size bucket (5 equal-width intervals).
# observed=False is passed explicitly: the grouper is categorical, and newer
# pandas versions warn that the default for `observed` is changing. Pinning it
# keeps one row per bucket, including any empty bucket.
bill_bucket = pd.cut(merged['total_bill'], 5)
merged.groupby(bill_bucket, observed=False)['tip_pct'].agg(['mean', 'std'])

Unnamed: 0_level_0,mean,std
total_bill,Unnamed: 1_level_1,Unnamed: 2_level_1
"(3.022, 12.618]",0.190854,0.094609
"(12.618, 22.166]",0.163942,0.041264
"(22.166, 31.714]",0.143799,0.051131
"(31.714, 41.262]",0.12153,0.042792
"(41.262, 50.81]",0.125121,0.05265


## Pivot tables and Cross-tabulation

In [61]:
# Pivot: 'size' values become the row index, 'smoker' values become the
# columns, and each cell aggregates 'tip_pct'.
tips.pivot_table(
    values='tip_pct',
    index='size',
    columns='smoker',
)

smoker,No,Yes
size,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.159829,0.274755
2,0.164996,0.166706
3,0.149671,0.157543
4,0.147604,0.142036
5,0.178415,0.086116
6,0.156229,


In [65]:
# Turn the values of the chosen columns into index levels: (size, sex) on the
# rows and (smoker, time) on the columns, summing 'total_bill' in each cell.
tips.pivot_table(
    values='total_bill',
    index=['size', 'sex'],
    columns=['smoker', 'time'],
    aggfunc='sum',
)

Unnamed: 0_level_0,smoker,No,No,Yes,Yes
Unnamed: 0_level_1,time,Dinner,Lunch,Dinner,Lunch
size,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,Female,7.25,10.07,3.07,
1,Male,,,,8.58
2,Female,263.62,237.11,274.23,114.73
2,Male,635.86,244.22,618.07,178.05
3,Female,139.05,34.62,111.52,16.47
3,Male,349.76,22.82,191.6,18.71
4,Female,140.35,58.91,30.14,43.11
4,Male,495.54,27.2,242.92,20.53
5,Female,29.85,,,
5,Male,20.69,41.19,58.61,
