<a href="https://colab.research.google.com/github/cbonnin88/EDA_Projects/blob/main/Polars_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import polars as pl
import numpy as np
import json
import random
from datetime import datetime, timedelta
import plotly.express as px

In [2]:
np.random.seed(42)
n_rows = 10000
user_ids = [f'u_{i}' for i in range(100,500)]
events = ['login','view_item','add_to_cart','checkout_start','purchase','error']

In [3]:
def dirty_price():
  val = round(random.uniform(10,500),2)
  if random.random() < 0.05: return f'€{val}'
  if random.random() < 0.05: return None
  return val

In [4]:
def generate_metadata():
  meta = {'browser': random.choice(['Chrome','Firefox','Safari']),
          'is_mobile': random.choice(['True','False'])}
  if random.random() < 0.1: return None
  return json.dumps(meta)

In [5]:
product_data = {
    'event_id': [f'evt_{i}' for i in range(n_rows)],
    'user_id': np.random.choice(user_ids + [None], n_rows),
    'event_name': np.random.choice(events, n_rows, p=[0.2,0.4,0.2,0.1,0.05,0.05]),
    'timestamp': [datetime(2024,1,1) + timedelta(minutes=random.randint(0,40000)) for _ in range(n_rows)],
    'price': [dirty_price() for _ in range(n_rows)],
    'metadata': [generate_metadata() for _ in range(n_rows)],
    'session_source': np.random.choice(['google','direct','meta','Google','META'], n_rows)
}

df_product = pd.DataFrame(product_data)

In [6]:
df_product = pd.concat([df_product,df_product.sample(200)])

In [7]:
df_product.to_csv('raw_product_data.csv', index=False)
print('Files ready: raw_product_data.csv')

Files ready: raw_product_data.csv


# **Data Cleaning**

In [8]:
# Lazy Evaluation (Scan)
q1 = (
    pl.scan_csv('raw_product_data.csv')
    .unique()
    .filter(pl.col('user_id').is_not_null())
    .with_columns([
        pl.col('price')
        .str.replace(r'€','')
        .cast(pl.Float64,strict=False)
        .alias('price_clean'),

        pl.col('timestamp').str.to_datetime()
    ])
    .with_columns(
        pl.col('metadata')
        .str.json_decode(pl.Struct({'browser':pl.String,'is_mobile':pl.String}))
        .alias('meta_struct')
    )
    .unnest('meta_struct')
    .with_columns(
        pl.when(pl.col('is_mobile') == 'True')
        .then(pl.lit(True))
        .otherwise(pl.lit(False))
        .alias('is_mobile')
    )
)

df_product_clean = q1.collect()
display(df_product_clean.head())

event_id,user_id,event_name,timestamp,price,metadata,session_source,price_clean,browser,is_mobile
str,str,str,datetime[μs],str,str,str,f64,str,bool
"""evt_9256""","""u_309""","""login""",2024-01-01 14:19:00,"""73.39""","""{""browser"": ""Chrome"", ""is_mobi…","""direct""",73.39,"""Chrome""",True
"""evt_7479""","""u_173""","""view_item""",2024-01-13 12:56:00,"""407.06""","""{""browser"": ""Firefox"", ""is_mob…","""direct""",407.06,"""Firefox""",True
"""evt_6423""","""u_250""","""view_item""",2024-01-17 13:14:00,"""174.81""","""{""browser"": ""Safari"", ""is_mobi…","""meta""",174.81,"""Safari""",True
"""evt_692""","""u_244""","""login""",2024-01-18 14:07:00,"""288.64""","""{""browser"": ""Firefox"", ""is_mob…","""Google""",288.64,"""Firefox""",False
"""evt_2195""","""u_473""","""view_item""",2024-01-12 08:44:00,"""334.78""",,"""google""",334.78,,False


# **Metrics**

In [9]:
# Simple Funnel
funnel = (
    df_product_clean
    .filter(pl.col('event_name').is_in(['view_item','add_to_cart','purchase']))
    .group_by('event_name')
    .agg(pl.col('user_id').n_unique().alias('user_count'))
    .with_columns(
        pl.col('event_name')
        .replace_strict({'view_item': 1, 'add_to_cart':2, 'purchase':3}, default=99)
        .alias('step_order')
    )
    .sort('step_order')
)
display('Funnel:', funnel)

'Funnel:'

event_name,user_count,step_order
str,u32,i64
"""view_item""",400,1
"""add_to_cart""",396,2
"""purchase""",287,3


In [10]:
# AOV
aov = (
    df_product_clean
    .filter(pl.col('event_name')== 'purchase')
    .select(pl.col('price_clean').mean().round(0).alias('average_order_value'))
)
display('AOV:',aov)

'AOV:'

average_order_value
f64
251.0


In [12]:
# User Sessions
sessions = (
    df_product_clean
    .sort(['user_id','timestamp'])
    .with_columns([
        pl.col('timestamp').diff().over('user_id').alias('time_diff')
    ])
    .with_columns([
        pl.when((pl.col('time_diff').dt.total_seconds()> 1800) | (pl.col('time_diff').is_null()))
        .then(1)
        .otherwise(0)
        .alias('is_new_session')
    ])
    .with_columns([
        pl.col('is_new_session').cum_sum().alias('global_session_id')
    ])
)

display(sessions.select(['user_id','timestamp','is_new_session','global_session_id']).head(10))

user_id,timestamp,is_new_session,global_session_id
str,datetime[μs],i32,i32
"""u_100""",2024-01-01 03:44:00,1,1
"""u_100""",2024-01-02 01:44:00,1,2
"""u_100""",2024-01-02 16:07:00,1,3
"""u_100""",2024-01-03 11:19:00,1,4
"""u_100""",2024-01-03 22:26:00,1,5
"""u_100""",2024-01-05 07:53:00,1,6
"""u_100""",2024-01-06 05:30:00,1,7
"""u_100""",2024-01-06 20:16:00,1,8
"""u_100""",2024-01-08 16:09:00,1,9
"""u_100""",2024-01-09 18:43:00,1,10


# **Plotly**

In [14]:
# Aggregate Session Metrics
# Grouping by the calculate 'global_session_id
session_metrics = (
    sessions
    .group_by(['user_id','global_session_id'])
    .agg([
        pl.col('timestamp').min().alias('session_start'),
        pl.col('timestamp').max().alias('session_end'),
        pl.col('event_name').count().alias('event_count')
    ])
    .with_columns(
        (pl.col('session_end') - pl.col('session_start')).dt.total_minutes().alias('duration_minutes')
    )
    .filter(pl.col('duration_minutes') > 0)
    .sort('session_start')
)

display(session_metrics.head())

user_id,global_session_id,session_start,session_end,event_count,duration_minutes
str,i32,datetime[μs],datetime[μs],u32,i64
"""u_314""",5258,2024-01-01 01:36:00,2024-01-01 01:53:00,2,17
"""u_382""",6883,2024-01-01 07:00:00,2024-01-01 07:22:00,2,22
"""u_120""",513,2024-01-01 09:19:00,2024-01-01 09:22:00,2,3
"""u_425""",7934,2024-01-01 11:18:00,2024-01-01 11:35:00,2,17
"""u_379""",6814,2024-01-01 18:48:00,2024-01-01 19:15:00,2,27


In [16]:
# Converting to Pandas because Plotly works better with Pandas
plot_df = session_metrics.select('duration_minutes').to_pandas()

In [17]:
fig_sessions = px.histogram(
    plot_df,
    x='duration_minutes',
    nbins=50,
    title='Session Distribution Lengths',
    labels= {'duration_minutes': 'Session Duration (Minutes)'},
    template = 'plotly_white'
)

median_val = plot_df['duration_minutes'].median()
fig_sessions.add_vline(x=median_val,line_dash='dash',line_color='red',annotation_text=f'Median:{median_val:.1f}m')

fig_sessions.show()

# **User Activity Gantt Chart**

In [20]:
# Filter for top 5 user so the chart is readable
top_users = (
    session_metrics
    .group_by('user_id')
    .agg(pl.count('global_session_id').alias('total_sessions'))
    .top_k(5, by='total_sessions')
    .select('user_id')
)

display(top_users)

user_id
str
"""u_210"""
"""u_262"""
"""u_336"""
"""u_250"""
"""u_461"""


In [21]:
timeline_df = (
    session_metrics
    .join(top_users, on='user_id',how='inner')
    .to_pandas()
)

display(timeline_df)

Unnamed: 0,user_id,global_session_id,session_start,session_end,event_count,duration_minutes
0,u_210,2697,2024-01-02 05:02:00,2024-01-02 05:11:00,2,9
1,u_210,2702,2024-01-05 18:47:00,2024-01-05 18:56:00,2,9
2,u_336,5811,2024-01-07 18:36:00,2024-01-07 18:45:00,2,9
3,u_262,4001,2024-01-10 06:59:00,2024-01-10 07:08:00,2,9
4,u_250,3695,2024-01-10 21:09:00,2024-01-10 21:24:00,2,15
5,u_210,2706,2024-01-15 06:22:00,2024-01-15 06:30:00,2,8
6,u_262,4007,2024-01-18 14:56:00,2024-01-18 15:10:00,2,14
7,u_461,8801,2024-01-18 21:49:00,2024-01-18 21:54:00,2,5
8,u_461,8803,2024-01-20 09:09:00,2024-01-20 09:25:00,2,16
9,u_336,5835,2024-01-21 19:51:00,2024-01-21 20:19:00,2,28


In [23]:
fig_timeline = px.timeline(
    timeline_df,
    x_start='session_start',
    x_end='session_end',
    y='user_id',
    color='event_count',
    title='User Session Timeline (Top 5 Users)',
    labels={'user_id':'User ID','event_count':'Events in Session'},
    template='plotly_white'
)

fig_timeline.update_yaxes(categoryorder='total ascending')

fig_timeline.show()

# **Time of Day Challenge**

In [25]:
tod_analytics = (
    session_metrics
    .with_columns(
        pl.col('session_start').dt.hour().alias('hour')
    )
    .with_columns(
        pl.when((pl.col('hour') >= 5) & (pl.col('hour') < 12))
        .then(pl.lit('Morning'))
        .when((pl.col('hour') >= 12) & (pl.col('hour') < 17))
        .then(pl.lit('Afternoon'))
        .when((pl.col('hour') >= 17) & (pl.col('hour') < 21))
        .then(pl.lit('Evening'))
        .otherwise(pl.lit('Night'))
        .alias('time_of_day')
    )
)

display(tod_analytics)

user_id,global_session_id,session_start,session_end,event_count,duration_minutes,hour,time_of_day
str,i32,datetime[μs],datetime[μs],u32,i64,i8,str
"""u_314""",5258,2024-01-01 01:36:00,2024-01-01 01:53:00,2,17,1,"""Night"""
"""u_382""",6883,2024-01-01 07:00:00,2024-01-01 07:22:00,2,22,7,"""Morning"""
"""u_120""",513,2024-01-01 09:19:00,2024-01-01 09:22:00,2,3,9,"""Morning"""
"""u_425""",7934,2024-01-01 11:18:00,2024-01-01 11:35:00,2,17,11,"""Morning"""
"""u_379""",6814,2024-01-01 18:48:00,2024-01-01 19:15:00,2,27,18,"""Evening"""
…,…,…,…,…,…,…,…
"""u_486""",9458,2024-01-27 05:48:00,2024-01-27 06:17:00,2,29,5,"""Morning"""
"""u_228""",3164,2024-01-27 09:09:00,2024-01-27 09:36:00,2,27,9,"""Morning"""
"""u_340""",5923,2024-01-27 22:19:00,2024-01-27 22:35:00,2,16,22,"""Night"""
"""u_453""",8620,2024-01-28 11:01:00,2024-01-28 11:17:00,2,16,11,"""Morning"""


In [28]:
tod_stats = (
    tod_analytics
    .group_by('time_of_day')
    .agg([
        pl.col('duration_minutes').median().alias('median_duration'),
        pl.col('duration_minutes').count().alias('session_count')
    ])
    .with_columns(
        pl.col('time_of_day')
        .replace_strict({'Morning':1,'Afternoon':2,'Evening':3,'Night':4},default=5)
        .cast(pl.Int8)
        .alias('order')
    )
    .sort('order')
)
display(tod_stats)

time_of_day,median_duration,session_count,order
str,f64,u32,i8
"""Morning""",16.0,64,1
"""Afternoon""",11.0,38,2
"""Evening""",19.5,38,3
"""Night""",14.5,56,4


In [30]:
fig_tod = px.box(
    tod_analytics.to_pandas(),
    x='time_of_day',
    y='duration_minutes',
    color='time_of_day',
    category_orders={'time_of_day':['Morning','Afternoon','Evening','Night']},
    title='Session Duration by Time of Day',
    labels={'duration_minutes':'Minutes','time_of_day':'Time of Day'},
    points='outliers'
)

fig_tod.show()