In [4]:
import pandas as pd

# Read raw clickstream CSVs
df_views = pd.read_csv("./data/raw/product_views.csv")
df_cart = pd.read_csv("./data/raw/add_to_cart.csv")
df_ads = pd.read_csv("./data/raw/ad_clicks.csv")

print(f"Loaded product_views: {df_views.shape}")
print(f"Loaded add_to_cart: {df_cart.shape}")
print(f"Loaded ad_clicks: {df_ads.shape}")

# Example: Load user profile
# Replace with your actual user profile file
df_profile = pd.DataFrame({
    "user_id": df_views["user_id"].unique(),
    "loyalty_status": ["Gold"] * len(df_views["user_id"].unique()),
    "location": ["US"] * len(df_views["user_id"].unique()),
    "device_type": ["mobile"] * len(df_views["user_id"].unique())
})

print(f"Generated dummy user profile: {df_profile.shape}")

# Merge events: example features
# 1) Count page views per user
views_per_user = df_views.groupby("user_id").agg(
    views_count=("event_id", "count")
).reset_index()

# 2) Count add-to-cart events per user
cart_per_user = df_cart.groupby("user_id").agg(
    cart_count=("event_id", "count")
).reset_index()

# 3) Count ad clicks per user
ads_per_user = df_ads.groupby("user_id").agg(
    ad_click_count=("event_id", "count")
).reset_index()

# Merge all counts
features = df_profile \
    .merge(views_per_user, on="user_id", how="left") \
    .merge(cart_per_user, on="user_id", how="left") \
    .merge(ads_per_user, on="user_id", how="left")

# Fill NaNs with zero where events didn’t occur
features.fillna(0, inplace=True)

# Add simple binary label for demonstration
# Example: if user did any add-to-cart → label = 1
features["label"] = features["cart_count"].apply(lambda x: 1 if x > 0 else 0)

print("Final feature snapshot:")
print(features.head())

# Save features as Parquet for train.py
features.to_parquet("./data/processed/features.parquet", index=False)
print("Features saved to data/processed/features.parquet")


Loaded product_views: (10000, 8)
Loaded add_to_cart: (10000, 9)
Loaded ad_clicks: (10000, 8)
Generated dummy user profile: (10000, 4)
Final feature snapshot:
                                user_id loyalty_status location device_type  \
0  1061988f-b92e-4f6c-b95b-5ccb02446533           Gold       US      mobile   
1  7d82f638-48de-4fe1-86f1-9940465ae416           Gold       US      mobile   
2  687e14d2-4f6f-4766-ab6b-b18f7e72838e           Gold       US      mobile   
3  5acc33ec-68d4-47a9-93e2-fdbaa957a273           Gold       US      mobile   
4  53f56fa2-bde7-4c7a-bffc-c7966216819b           Gold       US      mobile   

   views_count  cart_count  ad_click_count  label  
0            1         0.0             0.0      0  
1            1         0.0             0.0      0  
2            1         0.0             0.0      0  
3            1         0.0             0.0      0  
4            1         0.0             0.0      0  
Features saved to data/processed/features.parquet
