In [23]:
# Importing libraries
import pandas as pd

In [24]:
# Settings for loading the dataset in chunks (avoids crashing on the full file)

# Tunables
chunksize = 500_000  # rows per chunk; adjust if needed: 200k–1M
target_users = 200_000  # how many users to keep in subset

# Columns requested from the raw CSV
use_cols = ["user_id", "event_type", "event_time", "product_id", "category_code", "price"]

# Event types that count as funnel steps
steps = {"view", "cart", "purchase"}

# Accumulators filled by Pass 1 (user ids) and Pass 2 (filtered chunks)
kept_chunks = []
user_pool = set()

In [25]:
# Pass 1: collect user_ids from funnel events (without loading full file)

# Start from an empty pool on every run so that re-executing this cell
# (e.g. after changing `steps` or `target_users`) cannot carry over IDs
# gathered by an earlier run.
user_pool = set()

# The chunked reader is used as a context manager (supported since pandas 1.2)
# so the underlying file is closed even when the loop exits early via `break`.
with pd.read_csv(
    "../data/raw/2019-Oct.csv", usecols=use_cols, chunksize=chunksize
) as reader:
    for chunk in reader:
        # Only funnel events contribute users to the pool
        chunk = chunk[chunk["event_type"].isin(steps)]
        user_pool.update(chunk["user_id"].dropna().astype("int64").unique())

        # Stop reading as soon as enough distinct users have been seen
        if len(user_pool) >= target_users:
            break

# The last chunk read can push the pool past the target, so cap it at
# `target_users` IDs.
# NOTE(review): which IDs survive the cap follows set iteration order; it is
# not a random draw.
sample_users = set(list(user_pool)[:target_users])

In [26]:
# Pass 2: pull only rows for sampled users (again in chunks)

# Reset the accumulator in this cell: it is only ever appended to below, so
# without the reset a second run of this cell would concatenate every row twice.
kept_chunks = []

for chunk in pd.read_csv(
    "../data/raw/2019-Oct.csv", usecols=use_cols, chunksize=chunksize
):
    # Keep a row only if it is a funnel event AND belongs to a sampled user
    keep_mask = chunk["event_type"].isin(steps) & chunk["user_id"].isin(sample_users)
    chunk = chunk[keep_mask]

    kept_chunks.append(chunk)

# Stitch the filtered chunks into one frame with a fresh 0..n-1 index
df_subset = pd.concat(kept_chunks, ignore_index=True)

In [27]:
# Convert event_time to datetimes; values that fail to parse become NaT
# (errors="coerce") and those rows are then removed.
df_subset = df_subset.assign(
    event_time=pd.to_datetime(df_subset["event_time"], errors="coerce")
).dropna(subset=["event_time"])

In [29]:
# Write the processed subset to disk (row index is not written)
processed_path = "../data/processed/funnel_events.csv"
df_subset.to_csv(processed_path, index=False)

# Display (rows, columns) of the frame that was saved
df_subset.shape

(6610211, 6)

Create a smaller sample of funnel_events.csv that can be pushed to GitHub.

In [31]:
# Build a reduced copy of the processed events by sampling whole users
# (all rows of a sampled user are kept).
df = pd.read_csv("../data/processed/funnel_events.csv")

# One entry per distinct user, so each user is equally likely to be drawn
unique_users = df["user_id"].dropna().drop_duplicates()

# Cap the request at the number of users available: Series.sample raises
# ValueError when n is larger than the population (sampling without
# replacement), which would happen if the processed file holds < 35000 users.
n_sample_users = min(35000, len(unique_users))

# NOTE(review): this rebinds `sample_users`, which Pass 2 also reads; re-run
# Pass 1 before re-running Pass 2 after this cell has executed.
sample_users = unique_users.sample(n_sample_users, random_state=42)
df_small = df[df["user_id"].isin(sample_users)].copy()

df_small.to_csv("../data/processed/funnel_events_sample.csv", index=False)

df_small.shape

(1155774, 6)