In [None]:
import os

os.environ['KAGGLE_USERNAME'] = 'USERNAME_GOES_HERE'
os.environ['KAGGLE_KEY'] = 'KEY_GOES_HERE'

!pip install kaggle
!pip install pandas

In [None]:
!kaggle datasets download retailrocket/ecommerce-dataset

In [19]:
import zipfile
with zipfile.ZipFile("./ecommerce-dataset.zip","r") as zip_ref:
    zip_ref.extractall("ecommerce-dataset")

In [24]:
import pandas as pd
import os

# Step 1: Load 1 million rows from events.csv
events_path = os.path.expanduser("events.csv")
events = pd.read_csv(events_path, nrows=1_000_000)

# Step 2: Filter for "view" events
filtered_events = events[events["event"] == "view"]

# Step 3: Load 1 million rows from both parts of item_properties
prop1_path = os.path.expanduser("item_properties_part1.csv")
prop2_path = os.path.expanduser("item_properties_part2.csv")

prop1 = pd.read_csv(prop1_path, nrows=1_000_000)
prop2 = pd.read_csv(prop2_path, nrows=1_000_000)

item_features = pd.concat([prop1, prop2], ignore_index=True)

# Drop timestamp and keep only features of items seen in filtered_events
item_features = item_features.drop(columns=["timestamp"], errors="ignore")
item_features = item_features[item_features["itemid"].isin(filtered_events["itemid"].unique())]

# Pivot to get wide format item features
pivoted_item_features = item_features.pivot_table(
    index="itemid",
    columns="property",
    values="value",
    aggfunc="first"  # take the first non-null value if duplicates
).reset_index()

# Output the shapes to confirm
print("Filtered Events:", filtered_events.shape)
print("Pivoted Item Features:", pivoted_item_features.shape)

Filtered Events: (966283, 5)
Pivoted Item Features: (120034, 998)


In [None]:
import numpy as np

# Convert timestamp to datetime
filtered_events["event_time"] = pd.to_datetime(filtered_events["timestamp"], unit='ms')

# Sort by user and time
filtered_events = filtered_events.sort_values(["visitorid", "event_time"])

# Feature 1: User activity level
user_activity = (
    filtered_events.groupby("visitorid")
    .size()
    .rename("user_event_count")
    .reset_index()
)

# Feature 2: Item popularity
item_popularity = (
    filtered_events.groupby("itemid")
    .size()
    .rename("item_view_count")
    .reset_index()
)

# Feature 3: Time since last event for user
filtered_events["time_diff_sec"] = (
    filtered_events.groupby("visitorid")["event_time"]
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

# Merge user and item features back to filtered_events
filtered_events = filtered_events.merge(user_activity, on="visitorid", how="left")
filtered_events = filtered_events.merge(item_popularity, on="itemid", how="left")

# Output sample
print(filtered_events.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_events["event_time"] = pd.to_datetime(filtered_events["timestamp"], unit='ms')


       timestamp  visitorid event  itemid  transactionid  \
0  1439487966444          1  view   72028            NaN   
1  1438969904567          2  view  325215            NaN   
2  1438970013790          2  view  325215            NaN   
3  1438970212664          2  view  259884            NaN   
4  1438970468920          2  view  216305            NaN   

               event_time  time_diff_sec  user_event_count  item_view_count  
0 2015-08-13 17:46:06.444          0.000                 1               17  
1 2015-08-07 17:51:44.567          0.000                 8               30  
2 2015-08-07 17:53:33.790        109.223                 8               30  
3 2015-08-07 17:56:52.664        198.874                 8               81  
4 2015-08-07 18:01:08.920        256.256                 8              254  


In Summary
| **Stage**             | **What We Did**                          | **Why It Matters**                                          |
|-----------------------|------------------------------------------|--------------------------------------------------------------|
| Raw Events            | Loaded raw clickstream data              | Realistic user behavior source                               |
| View Filtering        | Focused on view events                   | Primary signal for recommendation                            |
| Time Conversion       | Converted UNIX to datetime               | Enables recency, session modeling                            |
| Metadata Join         | Added item-level features (pivoted)      | Supports content-based and hybrid recommendations            |
| Feature Engineering   | Created user/item/time features          | Empowers predictive models with behavioral signals           |

In [None]:
import pandas as pd

# Convert 'timestamp' to datetime if not already done
filtered_events['event_time'] = pd.to_datetime(filtered_events['timestamp'], unit='s')


# Step 4.1: Calculate time difference between consecutive events per user
SESSION_TIMEOUT = 30 * 60  # 30 minutes in seconds

filtered_events = filtered_events.sort_values(['visitorid', 'event_time'])
filtered_events['time_gap'] = (
    filtered_events.groupby('visitorid')['event_time']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

# Step 4.2: Mark start of new sessions
filtered_events['new_session'] = (filtered_events['time_gap'] > SESSION_TIMEOUT).astype(int)

# Step 4.3: Assign session IDs per user
filtered_events['session_id'] = filtered_events.groupby('visitorid')['new_session'].cumsum()

# Step 4.4: Create a unique session key combining user and session ID
filtered_events['session'] = filtered_events['visitorid'].astype(str) + '_' + filtered_events['session_id'].astype(str)

# Check number of sessions
print("Number of sessions:", filtered_events['session'].nunique())

# Preview sessions
print(filtered_events[['visitorid', 'event_time', 'session', 'itemid']].head(10))