In [1]:
import numpy as np
import pandas as pd

## Group Transforms and “Unwrapped” GroupBys

In [2]:
# Twelve-row demo frame: the keys cycle a, b, c four times and the values
# run from 0 to 11.
df = pd.DataFrame(
    {
        'key': ['a', 'b', 'c'] * 4,
        'value': np.arange(12),
    }
)

In [3]:
df

Unnamed: 0,key,value
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5
6,a,6
7,b,7
8,c,8
9,a,9


In [4]:
# Group the rows of df by their 'key' label; the resulting GroupBy object is
# reused by the aggregation, transform, and apply cells that follow.
g = df.groupby('key')

In [6]:
# Aggregation: each group collapses to a single mean, indexed by key.
g['value'].mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [7]:
def get_mean(group):
    """Return the mean of ``group``.

    Parameters
    ----------
    group : any object exposing a ``.mean()`` method
        In this notebook it is the per-key data handed over by
        ``GroupBy.transform``.

    Returns
    -------
    Whatever ``group.mean()`` returns.
    """
    group_mean = group.mean()
    return group_mean
# transform broadcasts each group's result back onto the original row
# positions, so the output has one row per input row rather than one per key.
g.transform(get_mean)

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [8]:
# Built-in aggregations can be passed by name; the output matches the
# get_mean version.
g.transform('mean')

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [9]:
def times_two(group):
    """Return ``group`` multiplied by 2.

    Parameters
    ----------
    group : any object supporting ``* 2``
        In this notebook it is the per-key data handed over by
        ``GroupBy.transform``.

    Returns
    -------
    The result of ``group * 2``.
    """
    doubled = group * 2
    return doubled
# The transform function may also return an object the same size as its group:
# each value is doubled and stays at its original row label.
g.transform(times_two)

Unnamed: 0,value
0,0
1,2
2,4
3,6
4,8
5,10
6,12
7,14
8,16
9,18


In [11]:
def normalize(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    Parameters
    ----------
    x : object exposing ``.mean()`` and ``.std()`` and supporting ``-`` and ``/``
        In this notebook it is the per-key data handed over by
        ``GroupBy.transform`` / ``GroupBy.apply``.

    Returns
    -------
    ``(x - x.mean()) / x.std()``. No guard is applied for a zero standard
    deviation.
    """
    centered = x - x.mean()
    return centered / x.std()
# Standardize 'value' within each key group (group mean removed, scaled by the
# group standard deviation); results stay aligned with the original rows.
g.transform(normalize)

Unnamed: 0,value
0,-1.161895
1,-1.161895
2,-1.161895
3,-0.387298
4,-0.387298
5,-0.387298
6,0.387298
7,0.387298
8,0.387298
9,1.161895


In [12]:
# apply computes the same per-group values, but (as the output shows) the
# result is indexed by (key, original row label) and ordered group by group.
# NOTE(review): on pandas versions where apply also hands the grouping column
# to the function, normalize would receive the non-numeric 'key' column --
# confirm the pandas version this notebook targets.
g.apply(normalize)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,-1.161895
a,3,-0.387298
a,6,0.387298
a,9,1.161895
b,1,-1.161895
b,4,-0.387298
b,7,0.387298
b,10,1.161895
c,2,-1.161895
c,5,-0.387298


In [13]:
# Same call as the earlier g.transform('mean') cell, repeated here: the group
# means broadcast back to the original rows.
g.transform('mean')

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


## Pivot Tables and Cross-Tabulation

In [22]:
# Load the tipping data and derive the tip as a fraction of the total bill in
# one chained step, so the cell yields the same frame every time it is run.
tips = pd.read_csv("examples/tips.csv").assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)

In [23]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


### Pivot Table in `pandas`

#### Purpose
To summarize and aggregate data by grouping it based on specific columns.

#### Syntax
```python
pd.pivot_table(data, values, index, columns, aggfunc)
```

#### Parameters
- **data**: DataFrame to pivot.
- **values**: Column(s) to aggregate.
- **index**: Column(s) to group by as rows.
- **columns**: Column(s) to group by as columns.
- **aggfunc**: Aggregation function(s) (e.g., 'sum', 'mean', 'count'); defaults to 'mean' when omitted.

In [25]:
# Mean tip and mean total bill for every (day, smoker) combination.
# aggfunc='mean' is also the pandas default; it is spelled out for clarity.
pivot_table = tips.pivot_table(
    values=['total_bill', 'tip'],
    index=['day', 'smoker'],
    aggfunc='mean'
)

# End the cell on the bare expression so Jupyter renders the rich table
# instead of the plain-text repr that print() produces.
pivot_table

                  tip  total_bill
day  smoker                      
Fri  No      2.812500   18.420000
     Yes     2.714000   16.813333
Sat  No      3.102889   19.661778
     Yes     2.875476   21.276667
Sun  No      3.167895   20.506667
     Yes     3.516842   24.120000
Thur No      2.673778   17.113111
     Yes     3.030000   19.190588


In [27]:
 # Average party size and tip percentage, with (time, day) as rows and smoker
 # as columns; margins=True adds the "All" row and columns.
 tips.pivot_table(
     values=["tip_pct", "size"],
     index=["time", "day"],
     columns="smoker",
     margins=True,
 )

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [28]:
 # Count the observations in each (time, day) x smoker cell: aggfunc=len
 # reports the group size; margins=True adds the "All" totals.
 tips.pivot_table(
     values=["tip_pct", "size"],
     index=["time", "day"],
     columns="smoker",
     aggfunc=len,
     margins=True,
 )

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,3.0,9.0,12,3.0,9.0,12
Dinner,Sat,45.0,42.0,87,45.0,42.0,87
Dinner,Sun,57.0,19.0,76,57.0,19.0,76
Dinner,Thur,1.0,,1,1.0,,1
Lunch,Fri,1.0,6.0,7,1.0,6.0,7
Lunch,Thur,44.0,17.0,61,44.0,17.0,61
All,,151.0,93.0,244,151.0,93.0,244


In [29]:
 # Tip percentage (default aggregation) with (time, size, smoker) as rows and
 # day as columns; combinations with no observations show 0 via fill_value.
 tips.pivot_table(
     values="tip_pct",
     index=["time", "size", "smoker"],
     columns="day",
     fill_value=0,
 )

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,0.0,0.137931,0.0,0.0
Dinner,1,Yes,0.0,0.325733,0.0,0.0
Dinner,2,No,0.139622,0.162705,0.168859,0.159744
Dinner,2,Yes,0.171297,0.148668,0.207893,0.0
Dinner,3,No,0.0,0.154661,0.152663,0.0
Dinner,3,Yes,0.0,0.144995,0.15266,0.0
Dinner,4,No,0.0,0.150096,0.148143,0.0
Dinner,4,Yes,0.11775,0.124515,0.19337,0.0
Dinner,5,No,0.0,0.0,0.206928,0.0
Dinner,5,Yes,0.0,0.106572,0.06566,0.0


![pivot](Assets/pivot.png)

## Cross-Tabulations: Crosstab

In [30]:
from io import StringIO

In [31]:
# Raw, whitespace-delimited sample table (columns: Sample, Nationality,
# Handedness); the gaps between fields vary in width.
data = """Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed
"""

In [32]:
# Parse the whitespace-delimited text into a DataFrame.
# The separator is a raw string so the regex reaches pandas unchanged; in a
# plain string literal the backslash-s pair is an invalid escape sequence,
# which emits a SyntaxWarning on Python 3.12+.
# NOTE: this rebinds `data` from str to DataFrame, so the cell that defines the
# text must be re-run before this cell can be executed again.
data = pd.read_table(StringIO(data), sep=r"\s+")

In [33]:
data

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right-handed
1,2,Japan,Left-handed
2,3,USA,Right-handed
3,4,Japan,Right-handed
4,5,Japan,Left-handed
5,6,Japan,Right-handed
6,7,USA,Right-handed
7,8,USA,Left-handed
8,9,Japan,Right-handed
9,10,USA,Right-handed


In [34]:
 # Frequency table of nationality (rows) against handedness (columns);
 # margins=True appends the "All" totals.
 pd.crosstab(
     index=data["Nationality"],
     columns=data["Handedness"],
     margins=True,
 )

Handedness,Left-handed,Right-handed,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10
