In [3]:
import pandas as pd
import numpy as np

In [4]:
payment_df = pd.read_csv("./data/payment.csv")

In [5]:
df = pd.DataFrame(
    np.random.rand(4, 5),
    index=["Apple", "Banana", "Cocoa", "Coconut"],
)

In [6]:
df

Unnamed: 0,0,1,2,3,4
Apple,0.544257,0.96781,0.244875,0.939738,0.636421
Banana,0.246099,0.726031,0.434151,0.330755,0.084674
Cocoa,0.286717,0.386161,0.175282,0.014846,0.972476
Coconut,0.835001,0.583742,0.297761,0.948548,0.305059


In [7]:
def get_first_char(char):
    return char[0]

In [8]:
df.groupby(get_first_char).size()

A    1
B    1
C    2
dtype: int64

In [9]:
df.groupby(lambda x: x[0]).size()

A    1
B    1
C    2
dtype: int64

In [10]:
df.groupby(lambda x: len(x)).size()

5    2
6    1
7    1
dtype: int64

In [11]:
mapping = {
    "Apple": "Fruit",
    "Banana": "Fruit",
    "Cocoa": "Not Fruit",
    "Coconut": "Fruit",
}

In [12]:
df.groupby(
    lambda x : "Fruit" if x in ["Apple", "Banana", "Coconut"] else "Not Fruit"
).size()

Fruit        3
Not Fruit    1
dtype: int64

In [13]:
payment_df["month"] = payment_df.payment_date.apply(lambda x: str(x)[:7])

In [14]:
payment_df.groupby(["customer_id", "month"]).agg("sum").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,payment_id,staff_id,rental_id,amount
customer_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2005-05,3,2,649.0,3.98
1,2005-06,42,10,13763.0,31.93
1,2005-07,186,18,83104.0,50.88
1,2005-08,297,17,143621.0,31.89
2,2005-05,33,1,320.0,4.99


In [15]:
df = pd.DataFrame(
    np.random.randn(4, 4),
    index=list("ABCD"),
)

In [16]:
df.T.groupby(lambda x: int(x) % 2 == 0).size()

False    2
True     2
dtype: int64

In [17]:
payment_df

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update,month
0,1,1,1,76.0,2.99,2005-05-25 11:30,2006-02-15 22:12,2005-05
1,2,1,1,573.0,0.99,2005-05-28 10:35,2006-02-15 22:12,2005-05
2,3,1,1,1185.0,5.99,2005-06-15 0:54,2006-02-15 22:12,2005-06
3,4,1,2,1422.0,0.99,2005-06-15 18:02,2006-02-15 22:12,2005-06
4,5,1,2,1476.0,9.99,2005-06-15 21:08,2006-02-15 22:12,2005-06
5,6,1,1,1725.0,4.99,2005-06-16 15:18,2006-02-15 22:12,2005-06
6,7,1,1,2308.0,4.99,2005-06-18 8:41,2006-02-15 22:12,2005-06
7,8,1,2,2363.0,0.99,2005-06-18 13:33,2006-02-15 22:12,2005-06
8,9,1,1,3284.0,3.99,2005-06-21 6:24,2006-02-15 22:12,2005-06
9,10,1,2,4526.0,5.99,2005-07-08 3:17,2006-02-15 22:12,2005-07


In [18]:
temp_df = payment_df.amount\
    .groupby(payment_df.customer_id)\
    .agg(["mean", "count"])\
    .add_prefix("customer_amount_")

In [19]:
temp_df.head(1)

Unnamed: 0_level_0,customer_amount_mean,customer_amount_count
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.70875,32


In [20]:
pd.merge(
    payment_df,
    temp_df,
    left_on="customer_id",
    right_index=True,
)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update,month,customer_amount_mean,customer_amount_count
0,1,1,1,76.0,2.99,2005-05-25 11:30,2006-02-15 22:12,2005-05,3.708750,32
1,2,1,1,573.0,0.99,2005-05-28 10:35,2006-02-15 22:12,2005-05,3.708750,32
2,3,1,1,1185.0,5.99,2005-06-15 0:54,2006-02-15 22:12,2005-06,3.708750,32
3,4,1,2,1422.0,0.99,2005-06-15 18:02,2006-02-15 22:12,2005-06,3.708750,32
4,5,1,2,1476.0,9.99,2005-06-15 21:08,2006-02-15 22:12,2005-06,3.708750,32
5,6,1,1,1725.0,4.99,2005-06-16 15:18,2006-02-15 22:12,2005-06,3.708750,32
6,7,1,1,2308.0,4.99,2005-06-18 8:41,2006-02-15 22:12,2005-06,3.708750,32
7,8,1,2,2363.0,0.99,2005-06-18 13:33,2006-02-15 22:12,2005-06,3.708750,32
8,9,1,1,3284.0,3.99,2005-06-21 6:24,2006-02-15 22:12,2005-06,3.708750,32
9,10,1,2,4526.0,5.99,2005-07-08 3:17,2006-02-15 22:12,2005-07,3.708750,32


In [21]:
temp_df = payment_df["amount"].groupby(payment_df.customer_id).transform("sum")

In [22]:
pd.concat(
    [payment_df, temp_df],
    axis=1,
)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update,month,amount.1
0,1,1,1,76.0,2.99,2005-05-25 11:30,2006-02-15 22:12,2005-05,118.68
1,2,1,1,573.0,0.99,2005-05-28 10:35,2006-02-15 22:12,2005-05,118.68
2,3,1,1,1185.0,5.99,2005-06-15 0:54,2006-02-15 22:12,2005-06,118.68
3,4,1,2,1422.0,0.99,2005-06-15 18:02,2006-02-15 22:12,2005-06,118.68
4,5,1,2,1476.0,9.99,2005-06-15 21:08,2006-02-15 22:12,2005-06,118.68
5,6,1,1,1725.0,4.99,2005-06-16 15:18,2006-02-15 22:12,2005-06,118.68
6,7,1,1,2308.0,4.99,2005-06-18 8:41,2006-02-15 22:12,2005-06,118.68
7,8,1,2,2363.0,0.99,2005-06-18 13:33,2006-02-15 22:12,2005-06,118.68
8,9,1,1,3284.0,3.99,2005-06-21 6:24,2006-02-15 22:12,2005-06,118.68
9,10,1,2,4526.0,5.99,2005-07-08 3:17,2006-02-15 22:12,2005-07,118.68


In [23]:
temp_df = payment_df.amount.groupby(payment_df.customer_id).transform(
    lambda x: x - x.mean()
)
pd.concat([payment_df, temp_df], axis=1)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update,month,amount.1
0,1,1,1,76.0,2.99,2005-05-25 11:30,2006-02-15 22:12,2005-05,-0.718750
1,2,1,1,573.0,0.99,2005-05-28 10:35,2006-02-15 22:12,2005-05,-2.718750
2,3,1,1,1185.0,5.99,2005-06-15 0:54,2006-02-15 22:12,2005-06,2.281250
3,4,1,2,1422.0,0.99,2005-06-15 18:02,2006-02-15 22:12,2005-06,-2.718750
4,5,1,2,1476.0,9.99,2005-06-15 21:08,2006-02-15 22:12,2005-06,6.281250
5,6,1,1,1725.0,4.99,2005-06-16 15:18,2006-02-15 22:12,2005-06,1.281250
6,7,1,1,2308.0,4.99,2005-06-18 8:41,2006-02-15 22:12,2005-06,1.281250
7,8,1,2,2363.0,0.99,2005-06-18 13:33,2006-02-15 22:12,2005-06,-2.718750
8,9,1,1,3284.0,3.99,2005-06-21 6:24,2006-02-15 22:12,2005-06,0.281250
9,10,1,2,4526.0,5.99,2005-07-08 3:17,2006-02-15 22:12,2005-07,2.281250


In [24]:
payment_df

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update,month
0,1,1,1,76.0,2.99,2005-05-25 11:30,2006-02-15 22:12,2005-05
1,2,1,1,573.0,0.99,2005-05-28 10:35,2006-02-15 22:12,2005-05
2,3,1,1,1185.0,5.99,2005-06-15 0:54,2006-02-15 22:12,2005-06
3,4,1,2,1422.0,0.99,2005-06-15 18:02,2006-02-15 22:12,2005-06
4,5,1,2,1476.0,9.99,2005-06-15 21:08,2006-02-15 22:12,2005-06
5,6,1,1,1725.0,4.99,2005-06-16 15:18,2006-02-15 22:12,2005-06
6,7,1,1,2308.0,4.99,2005-06-18 8:41,2006-02-15 22:12,2005-06
7,8,1,2,2363.0,0.99,2005-06-18 13:33,2006-02-15 22:12,2005-06
8,9,1,1,3284.0,3.99,2005-06-21 6:24,2006-02-15 22:12,2005-06
9,10,1,2,4526.0,5.99,2005-07-08 3:17,2006-02-15 22:12,2005-07
