In [75]:
# Importing Libraries
import pandas as pd

This notebook uses a processed subset of the original Kaggle dataset created in data_preparation.ipynb to ensure reproducibility and efficient analysis.

In [76]:
# Loading dataset
df = pd.read_csv("../data/processed/funnel_events.csv")

df.head()

Unnamed: 0,event_time,event_type,product_id,category_code,price,user_id
0,2019-10-01 00:00:00+00:00,view,44600062,,35.79,541312140
1,2019-10-01 00:00:00+00:00,view,3900821,appliances.environment.water_heater,33.2,554748717
2,2019-10-01 00:00:01+00:00,view,17200506,furniture.living_room.sofa,543.1,519107250
3,2019-10-01 00:00:01+00:00,view,1307067,computers.notebook,251.74,550050854
4,2019-10-01 00:00:04+00:00,view,1004237,electronics.smartphone,1081.98,535871217


In [77]:
# Converting "event_time" to datetime format and dropping null conversions
df["event_time"] = pd.to_datetime(df["event_time"], errors="coerce")
df= df.dropna(subset=["event_time"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6610211 entries, 0 to 6610210
Data columns (total 6 columns):
 #   Column         Dtype              
---  ------         -----              
 0   event_time     datetime64[ns, UTC]
 1   event_type     object             
 2   product_id     int64              
 3   category_code  object             
 4   price          float64            
 5   user_id        int64              
dtypes: datetime64[ns, UTC](1), float64(1), int64(2), object(2)
memory usage: 302.6+ MB


In [78]:
df["event_type"].value_counts()

event_type
view        6354871
cart         130600
purchase     124740
Name: count, dtype: int64

In [79]:
df["user_id"].nunique()

200000

In [80]:
steps = ["view", "cart", "purchase"]

# Create per-user flag
user_steps = (
    df[df["event_type"].isin(steps)]
    .assign(flag=1)
    .pivot_table(
        index="user_id",
        columns="event_type",
        values="flag",
        aggfunc="max",
        fill_value=0,
    )
)

# Ensure all step columns exist
for s in steps:
    if s not in user_steps.columns:
        user_steps[s] = 0

# Sequential funnel counts
view_users = (user_steps["view"] == 1).sum()
cart_users = ((user_steps["view"] == 1) & (user_steps["cart"] == 1)).sum()
purchase_users = (
    (user_steps["view"] == 1)
    & (user_steps["cart"] == 1)
    & (user_steps["purchase"] == 1)
).sum()

funnel_counts_seq = pd.DataFrame(
    {"event_type": steps, "users": [view_users, cart_users, purchase_users]}
)

funnel_counts_seq

Unnamed: 0,event_type,users
0,view,199994
1,cart,35506
2,purchase,23472


In [81]:
funnel_counts_seq["step_conversion"] = funnel_counts_seq["users"].div(funnel_counts_seq["users"].shift(1))
funnel_counts_seq.loc[funnel_counts_seq["event_type"] == "view", "step_conversion"] = 1.0

funnel_counts_seq["drop_off_users"] = (funnel_counts_seq["users"].shift(1) - funnel_counts_seq["users"])
funnel_counts_seq.loc[funnel_counts_seq["event_type"] == "view", "drop_off_users"] = 0

funnel_counts_seq["drop_off_rate"] = 1 - funnel_counts_seq["step_conversion"]
funnel_counts_seq.loc[funnel_counts_seq["event_type"] == "view", "drop_off_rate"] = 0

funnel_counts_seq

Unnamed: 0,event_type,users,step_conversion,drop_off_users,drop_off_rate
0,view,199994,1.0,0.0,0.0
1,cart,35506,0.177535,164488.0,0.822465
2,purchase,23472,0.661071,12034.0,0.338929


In [82]:
overall_conversion = (
    funnel_counts_seq.loc[funnel_counts_seq["event_type"] == "purchase", "users"].iloc[0]
    / funnel_counts_seq.loc[funnel_counts_seq["event_type"] == "view", "users"].iloc[0]
)
overall_conversion

0.11736352090562717

In [83]:
df["event_date"] = df["event_time"].dt.date

daily_funnel = (
    df[df["event_type"].isin(steps)]
    .groupby(["event_date", "event_type"])["user_id"]
    .nunique()
    .reset_index(name="users")
)

daily_funnel.head()

Unnamed: 0,event_date,event_type,users
0,2019-10-01,cart,7743
1,2019-10-01,purchase,12394
2,2019-10-01,view,166027
3,2019-10-02,cart,3521
4,2019-10-02,purchase,5999


In [84]:
df["event_time"].dt.date.min(), df["event_time"].dt.date.max()
df["event_time"].dt.date.nunique()

31

In [85]:
daily_funnel_pivot = daily_funnel.pivot(
    index="event_date", columns="event_type", values="users"
).reindex(columns=steps)
daily_funnel_pivot = daily_funnel_pivot.fillna(0)

daily_funnel_pivot["cart_rate"] = (
    daily_funnel_pivot["cart"] / daily_funnel_pivot["view"]
)
daily_funnel_pivot["purchase_rate"] = (
    daily_funnel_pivot["purchase"] / daily_funnel_pivot["view"]
)

daily_funnel_pivot.head()

event_type,view,cart,purchase,cart_rate,purchase_rate
event_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-01,166027,7743,12394,0.046637,0.074651
2019-10-02,62669,3521,5999,0.056184,0.095725
2019-10-03,26778,2150,3056,0.08029,0.114124
2019-10-04,29169,4345,3862,0.14896,0.132401
2019-10-05,24127,2989,2995,0.123886,0.124135


In [86]:
funnel_counts.to_csv("../data/processed/funnel_counts_summary.csv", index=False)
daily_funnel_pivot.reset_index().to_csv(
    "../data/processed/daily_funnel_metrics.csv", index=False
)