# https://www.youtube.com/watch?v=w26V2p9fC5A&list=PLo9Vi5B84_dfAuwJqNYG4XhZMrGTF3sBx&index=5

In [20]:
import polars as pl
import pandas as pd
import numpy as np
import pyarrow
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_path = r'F:\DataSpell\Polars_library_learn\Youtube Lessons\Polars Tutorial\data\2019-Nov.parquet'

In [3]:
df = pl.read_parquet(df_path)

In [4]:
df

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-01 00:00:00 UTC""","""view""",1003461,2053013555631882655,"""electronics.smartphone""","""xiaomi""",489.07,520088904,"""4d3b30da-a5e4-49df-b1a8-ba5943…"
"""2019-11-01 00:00:00 UTC""","""view""",5000088,2053013566100866035,"""appliances.sewing_machine""","""janome""",293.65,530496790,"""8e5f4f83-366c-4f70-860e-ca7417…"
"""2019-11-01 00:00:01 UTC""","""view""",17302664,2053013553853497655,,"""creed""",28.31,561587266,"""755422e7-9040-477b-9bd2-6a6e8f…"
"""2019-11-01 00:00:01 UTC""","""view""",3601530,2053013563810775923,"""appliances.kitchen.washer""","""lg""",712.87,518085591,"""3bfb58cd-7892-48cc-8020-2f17e6…"
"""2019-11-01 00:00:01 UTC""","""view""",1004775,2053013555631882655,"""electronics.smartphone""","""xiaomi""",183.27,558856683,"""313628f1-68b8-460d-84f6-cec7a8…"
…,…,…,…,…,…,…,…,…
"""2019-11-30 23:59:58 UTC""","""view""",15700137,2053013559733912211,,,277.74,532714000,"""02b4131c-0112-4231-aafa-ceaa08…"
"""2019-11-30 23:59:58 UTC""","""view""",28719425,2053013565639492569,"""apparel.shoes""","""baden""",62.81,545223467,"""734c5eef-0742-4f8b-9d22-48f75b…"
"""2019-11-30 23:59:59 UTC""","""view""",1004833,2053013555631882655,"""electronics.smartphone""","""samsung""",167.03,557794415,"""6fecf566-ebb0-4e70-a243-cdc13c…"
"""2019-11-30 23:59:59 UTC""","""view""",2701706,2053013563911439225,"""appliances.kitchen.refrigerato…","""samsung""",566.27,531607492,"""368ddc8b-5db9-40fb-b7ff-b6582a…"


In [7]:
df = df.with_columns(
    pl.col('event_time').str.strptime(pl.Datetime, format='%Y-%m-%d %H:%M:%S %Z')
)

In [10]:
df.filter(pl.col('user_id') == 520088904).to_pandas().sample(5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
16,2019-11-01 00:44:03,view,1701474,2053013553031414015,computers.peripherals.monitor,acer,227.03,520088904,bbd7ce29-5406-4c52-b239-f0dd3b12427e
19,2019-11-03 07:55:11,view,1480706,2053013561092866779,computers.desktop,pulser,530.23,520088904,4a2763a0-a7ec-40e8-89f8-317b84f6658f
27,2019-11-13 03:16:09,view,4800675,2053013554658804075,electronics.audio.headphone,sony,14.93,520088904,e9882787-2f34-4cbd-80b0-70a8d076971f
41,2019-11-19 08:18:37,view,9101644,2053013552888807671,computers.peripherals.mouse,msi,16.96,520088904,f443e47a-f346-47b5-82fb-6a2c40058d28
35,2019-11-16 06:46:29,view,4802351,2053013554658804075,electronics.audio.headphone,xiaomi,12.84,520088904,a5660afe-6e0c-4555-ba81-be21c587e2bb


# 1. Keep rows with purchases only

In [11]:
df_purchases = df.filter(pl.col('event_type') == 'purchase')

In [13]:
df_purchases.shape

(916939, 9)

# 2. Let`s compute RFM

In [15]:
df_purchases = df_purchases.select(
  ['event_time', 'user_id', 'price']
).unique()

In [16]:
df_purchases.shape

(916930, 3)

In [18]:
df_purchases

event_time,user_id,price
datetime[μs],i64,f64
2019-11-17 13:08:00,516272854,177.07
2019-11-30 17:39:48,571609631,195.54
2019-11-27 10:49:58,556031520,125.85
2019-11-05 21:04:42,550594573,1422.31
2019-11-09 00:23:41,522633425,437.57
…,…,…
2019-11-14 15:17:49,556716479,159.33
2019-11-28 08:33:01,518131448,916.37
2019-11-16 14:01:16,534087565,617.75
2019-11-29 16:14:04,574055245,55.78


# Compute time difference

In [21]:
anchor_date = dt.datetime(2019, 11, 30)

In [23]:
df_purchases = df_purchases.with_columns(
    (anchor_date - pl.col('event_time')).alias('date_diff') / (1e6 * 3600 * 24)
)

In [25]:
df_purchases

event_time,user_id,price,date_diff
datetime[μs],i64,f64,f64
2019-11-17 13:08:00,516272854,177.07,12.452778
2019-11-30 17:39:48,571609631,195.54,-0.735972
2019-11-27 10:49:58,556031520,125.85,2.548634
2019-11-05 21:04:42,550594573,1422.31,24.121736
2019-11-09 00:23:41,522633425,437.57,20.983553
…,…,…,…
2019-11-14 15:17:49,556716479,159.33,15.362627
2019-11-28 08:33:01,518131448,916.37,1.643738
2019-11-16 14:01:16,534087565,617.75,13.415787
2019-11-29 16:14:04,574055245,55.78,0.323565


In [27]:
df_agg = df_purchases.group_by('user_id').agg([
  pl.col('date_diff').min().alias('recency'),
  pl.len().alias('frequency'),
  pl.col('price').sum().alias('monetary')
]
)

In [28]:
df_agg

user_id,recency,frequency,monetary
i64,f64,u32,f64
547116191,16.772454,1,280.57
550956182,17.428194,1,88.64
554831299,15.71316,2,159.08
512424706,12.291771,11,1442.26
542775483,21.589734,2,856.68
…,…,…,…
526642800,0.611343,2,481.82
517444664,22.334606,1,261.91
518449391,-0.228137,2,256.84
516437314,3.080104,1,82.37


# 3. Convert to pandas for analysis and plotting

In [29]:
df_agg = df_agg.to_pandas()

In [31]:
breaks = np.arange(0, 1.1, 0.1)
num_vars = ['recency', 'frequency', 'monetary']
df_agg[num_vars].quantile([0, 0.25, 0.5, 0.75, 1])

Unnamed: 0,recency,frequency,monetary
0.0,-0.999815,1.0,0.77
0.25,6.570278,1.0,108.6
0.5,12.492106,1.0,246.04
0.75,17.780564,2.0,616.9175
1.0,28.999525,519.0,203986.07
