# https://www.youtube.com/watch?v=UbcKZKPXhDA&list=PLo9Vi5B84_dfAuwJqNYG4XhZMrGTF3sBx&index=4

# Group-By in Polars

In [64]:
import polars as pl
import pandas as pd
import numpy as np
import pyarrow

import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
df_path = r'F:\DataSpell\Polars_library_learn\Youtube Lessons\Polars Tutorial\data\2019-Nov.parquet'

In [66]:
df = pl.read_parquet(df_path)

In [67]:
df.group_by('brand').agg([
  pl.len()
]).sort('len').reverse()

brand,len
str,u32
,9218235
"""samsung""",7889245
"""apple""",6259379
"""xiaomi""",4638062
"""huawei""",1410126
…,…
"""mirex""",1
"""zazu""",1
"""dogrular""",1
"""slovo""",1


In [68]:
df_count = df.group_by('brand').agg([
  pl.len()
]).to_pandas().sort_values('len', ascending=False)

df_count

Unnamed: 0,brand,len
1629,,9218235
3408,samsung,7889245
3112,apple,6259379
2117,xiaomi,4638062
996,huawei,1410126
...,...,...
671,nordica,1
1610,casato,1
2749,individuum,1
3268,reshin,1


In [69]:
(
  df.group_by('brand').agg([
    pl.len()
  ])
  .sort('len').reverse()
  .with_columns([
      (pl.col('len') / pl.col('len').sum()).alias('percentage')
  ])
)

brand,len,percentage
str,u32,f64
,9218235,0.136562
"""samsung""",7889245,0.116874
"""apple""",6259379,0.092729
"""xiaomi""",4638062,0.06871
"""huawei""",1410126,0.02089
…,…,…
"""invotone""",1,1.4814e-8
"""reshin""",1,1.4814e-8
"""emily""",1,1.4814e-8
"""interlink""",1,1.4814e-8


In [70]:
df.sample(5)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-09 04:17:32 UTC""","""view""",5700785,2053013553970938175,"""auto.accessories.player""",,272.83,539617147,"""45ecb36d-c3d6-4b9a-9f97-40c2e1…"
"""2019-11-25 08:27:32 UTC""","""view""",1005119,2053013555631882655,"""electronics.smartphone""","""apple""",1003.6,576364160,"""86b3b570-116d-4185-84c7-5040bf…"
"""2019-11-16 05:33:02 UTC""","""view""",1005105,2053013555631882655,"""electronics.smartphone""","""apple""",1364.0,516506587,"""998f960b-6bbf-4399-acbc-2f6916…"
"""2019-11-16 13:12:12 UTC""","""view""",12703015,2053013553559896355,,"""cordiant""",42.99,572174557,"""eb1a8a44-7327-401b-a8ea-04611e…"
"""2019-11-05 07:59:35 UTC""","""view""",1802070,2053013554415534427,"""electronics.video.tv""","""philips""",566.27,514524841,"""d6aa2a75-aacb-45dd-9825-b8fe0b…"


In [71]:
df['event_type'].value_counts()

event_type,count
str,u32
"""cart""",3028930
"""view""",63556110
"""purchase""",916939


In [72]:
df.group_by('event_type').agg(pl.len())

event_type,len
str,u32
"""purchase""",916939
"""cart""",3028930
"""view""",63556110


# 2. Group_by - pl.len (two groups

In [73]:
df_brand_event = df.group_by(['brand', 'event_type']).agg([
  pl.len()
  ]).sort('len').reverse()

df_brand_event

brand,event_type,len
str,str,u32
,"""view""",8886720
"""samsung""","""view""",7091998
"""apple""","""view""",5603650
"""xiaomi""","""view""",4309751
"""huawei""","""view""",1312663
…,…,…
"""hercules""","""cart""",1
"""smarttrike""","""cart""",1
"""teknetics""","""cart""",1
"""indiana""","""cart""",1


In [74]:
# Convert from long to wide format

df_brand_event_wide = df_brand_event.pivot(
  values='len',
  index='brand',
  columns='event_type'
)

df_brand_event_wide

brand,view,cart,purchase
str,u32,u32,u32
,8886720,258242,73273
"""samsung""",7091998,597220,200027
"""apple""",5603650,489665,166064
"""xiaomi""",4309751,260019,68292
"""huawei""",1312663,73760,23703
…,…,…,…
"""casato""",1,,
"""della""",1,,
"""build-a-bear""",1,,
"""grifon""",1,,


In [84]:
# Pivot and sorting

df_brand_event_wide = (
  df_brand_event
  .pivot(values='len', index='brand', columns='event_type')
  .sort('purchase').reverse()
)

df_brand_event_wide

brand,view,cart,purchase
str,u32,u32,u32
"""samsung""",7091998,597220,200027
"""apple""",5603650,489665,166064
,8886720,258242,73273
"""xiaomi""",4309751,260019,68292
"""huawei""",1312663,73760,23703
…,…,…,…
"""roblox""",1857,16,
"""tec""",2118,3,
"""wera""",4872,14,
"""certina""",6723,4,


In [85]:
# Keep original column and compute % values

agg_performance = df_brand_event_wide.with_columns([
    (pl.col('cart') / pl.col('view')).alias('cart_by_views'),
    (pl.col('purchase') / pl.col('cart')).alias('buy_by_cart'),
    (pl.col('purchase') / pl.col('view')).alias('buy_by_views')
])

agg_performance

brand,view,cart,purchase,cart_by_views,buy_by_cart,buy_by_views
str,u32,u32,u32,f64,f64,f64
"""samsung""",7091998,597220,200027,0.08421,0.33493,0.028205
"""apple""",5603650,489665,166064,0.087383,0.339138,0.029635
,8886720,258242,73273,0.029059,0.283738,0.008245
"""xiaomi""",4309751,260019,68292,0.060333,0.262642,0.015846
"""huawei""",1312663,73760,23703,0.056191,0.321353,0.018057
…,…,…,…,…,…,…
"""roblox""",1857,16,,0.008616,,
"""tec""",2118,3,,0.001416,,
"""wera""",4872,14,,0.002874,,
"""certina""",6723,4,,0.000595,,


In [78]:
agg_performance.filter(pl.col('buy_by_cart') < 1) \
    .select(['cart_by_views', 'buy_by_cart', 'buy_by_views']) \
    .to_pandas().quantile([0, 0.25, 0.5, 0.75, 0.99, 1])
 # .boxplot())

Unnamed: 0,cart_by_views,buy_by_cart,buy_by_views
0.0,0.001115,0.02681,0.000353
0.25,0.01464,0.210685,0.003704
0.5,0.024419,0.279579,0.006616
0.75,0.03722,0.343597,0.010874
0.99,0.121112,0.75,0.043861
1.0,0.4,0.9,0.2


# 3. Group_by - UserId + 2 Groups

In [79]:
df_user_brand_event = df.group_by(['user_id', 'brand', 'event_type']).agg([pl.len()]).sort('len').reverse()

df_user_brand_event

user_id,brand,event_type,len
i64,str,str,u32
568778435,,"""view""",5850
569335945,"""belecoo""","""view""",3651
512365995,"""samsung""","""view""",3619
569335945,"""wingoffly""","""view""",3327
573277455,,"""view""",2467
…,…,…,…
516708803,"""nika""","""view""",1
542998789,"""elenberg""","""view""",1
570421579,"""lenovo""","""view""",1
565492053,"""huawei""","""view""",1


In [90]:
# pivot and sort

df_user_brand_event = df_brand_event_wide.with_columns([
    pl.col('purchase').fill_null(strategy='zero'),
    pl.col('view').fill_null(strategy='zero'),
    pl.col('cart').fill_null(strategy='zero')
])


In [94]:
df_user_brand_event = df_brand_event_wide.with_columns(
    (pl.col('purchase') / pl.col('view')).alias('pct_buy_views'))

df_user_brand_event

brand,view,cart,purchase,pct_buy_views
str,u32,u32,u32,f64
"""samsung""",7091998,597220,200027,0.028205
"""apple""",5603650,489665,166064,0.029635
,8886720,258242,73273,0.008245
"""xiaomi""",4309751,260019,68292,0.015846
"""huawei""",1312663,73760,23703,0.018057
…,…,…,…,…
"""roblox""",1857,16,,
"""tec""",2118,3,,
"""wera""",4872,14,,
"""certina""",6723,4,,


In [96]:
df_user_brand_event = df_user_brand_event.with_columns([
  pl.when(pl.col('pct_buy_views').is_infinite())
  .then(None)
  .otherwise(pl.col('pct_buy_views')).alias('pct_buy_views')
])

df_user_brand_event

brand,view,cart,purchase,pct_buy_views
str,u32,u32,u32,f64
"""samsung""",7091998,597220,200027,0.028205
"""apple""",5603650,489665,166064,0.029635
,8886720,258242,73273,0.008245
"""xiaomi""",4309751,260019,68292,0.015846
"""huawei""",1312663,73760,23703,0.018057
…,…,…,…,…
"""roblox""",1857,16,,
"""tec""",2118,3,,
"""wera""",4872,14,,
"""certina""",6723,4,,
