In [7]:
import pandas as pd
import numpy as np

# Read the summary table written by the EDA stage and parse its `timestamp`
# column, interpreting the raw values as seconds since the Unix epoch.
# NOTE(review): the saved preview shows every timestamp only a few milliseconds
# after 1970-01-01, which suggests the raw column may not really be in seconds.
# Confirm the unit against the upstream step before trusting time-based features.
df = pd.read_csv("../data/eda_summary.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit="s")
)

# Preview the leading rows
df.head()


Unnamed: 0,user_id,event,content_id,timestamp
0,65,like,content_6,1970-01-01 00:00:00.000000000
1,9,like,content_12,1970-01-01 00:00:00.000536525
2,18,like,content_9,1970-01-01 00:00:00.001076357
3,18,share,content_24,1970-01-01 00:00:00.001618562
4,13,like,content_32,1970-01-01 00:00:00.004856023


In [8]:
# Calendar features derived from the event timestamp.
# `dayofweek` is 0 for Monday through 6 for Sunday, so values >= 5 are Saturday/Sunday.
df["hour_of_day"] = df["timestamp"].dt.hour
df["day_of_week"] = df["timestamp"].dt.dayofweek

# Vectorized weekend flag (1 = weekend, 0 = weekday). The comparison runs on the
# whole column at once instead of calling a Python lambda per row; missing
# timestamps compare False and therefore map to 0, exactly as before.
df["is_weekend"] = (df["day_of_week"] >= 5).astype("int64")

# Display updated dataframe
df[["timestamp", "hour_of_day", "day_of_week", "is_weekend"]].head()

Unnamed: 0,timestamp,hour_of_day,day_of_week,is_weekend
0,1970-01-01 00:00:00.000000000,0,3,0
1,1970-01-01 00:00:00.000536525,0,3,0
2,1970-01-01 00:00:00.001076357,0,3,0
3,1970-01-01 00:00:00.001618562,0,3,0
4,1970-01-01 00:00:00.004856023,0,3,0


In [9]:
# Per-user activity features, broadcast back onto every row of that user.
user_key = df["user_id"]

# How many (non-null) events the user generated in total
df["user_event_count"] = df["event"].groupby(user_key).transform("count")

# How many distinct content items the user touched
df["unique_content_count"] = df["content_id"].groupby(user_key).transform("nunique")

# Preview the new columns
df.head()[["user_id", "user_event_count", "unique_content_count"]]

Unnamed: 0,user_id,user_event_count,unique_content_count
0,65,6,6
1,9,7,7
2,18,14,12
3,18,14,12
4,13,14,12


In [10]:
# Average number of seconds between consecutive interactions of the same user,
# broadcast onto every row of that user. Users with a single event get NaN
# because they have no gap to average.
#
# Gaps are only meaningful between chronologically adjacent events, so they are
# computed on a time-ordered view instead of relying on the file's row order.
# The stable sort keeps the original order of tied timestamps. Results are
# aligned back to `df` by index label, which requires a unique index (true for
# the default RangeIndex produced by `read_csv`).
events_by_time = df.sort_values("timestamp", kind="stable")
seconds_since_prev = (
    events_by_time["timestamp"]
    - events_by_time.groupby("user_id")["timestamp"].shift()
).dt.total_seconds()

# The first event of each user has no predecessor (NaN gap); "mean" skips NaN.
df["avg_time_between_interactions"] = (
    seconds_since_prev.groupby(events_by_time["user_id"]).transform("mean")
)

# Display updated dataframe
df[["user_id", "avg_time_between_interactions"]].head()

Unnamed: 0,user_id,avg_time_between_interactions
0,65,0.193228
1,9,0.166577
2,18,0.076304
3,18,0.076304
4,13,0.072331


In [11]:
# Per-content popularity features, broadcast onto every row of that content item.

# Total number of (non-null) events recorded for the content item
df["content_interaction_count"] = df.groupby("content_id")["event"].transform("count")

# Share ratio: fraction of the item's rows whose event is "share".
# The mean of a boolean mask is that fraction, so the built-in group mean
# replaces the per-group Python lambda while producing the same values.
is_share = df["event"].eq("share")
df["content_share_ratio"] = is_share.groupby(df["content_id"]).transform("mean")

# Display updated dataframe
df[["content_id", "content_interaction_count", "content_share_ratio"]].head()

Unnamed: 0,content_id,content_interaction_count,content_share_ratio
0,content_6,12,0.083333
1,content_12,20,0.05
2,content_9,9,0.111111
3,content_24,10,0.2
4,content_32,14,0.0


In [12]:
# Sessionization: a user's session ends once they are inactive for more than
# `session_threshold`; the next event of that user opens a new session.
session_threshold = pd.Timedelta(minutes=30)

# Session breaks must be detected between chronologically adjacent events, so
# the gaps are computed on a time-ordered view rather than trusting row order.
# The stable sort preserves the original order of tied timestamps. Results are
# aligned back to `df` by index label (requires a unique index).
events_by_time = df.sort_values("timestamp", kind="stable")
gap_since_prev = (
    events_by_time["timestamp"]
    - events_by_time.groupby("user_id")["timestamp"].shift()
)

# A user's first event has no predecessor (NaT gap), which compares False,
# so it never counts as a break and lands in session 0.
starts_new_session = gap_since_prev > session_threshold

# Running count of breaks per user gives a 0-based session number.
# Note: the number is only unique within a user; (user_id, session_id)
# together identify a session.
df["session_id"] = starts_new_session.groupby(events_by_time["user_id"]).cumsum()

# Count number of events in each (user, session) pair
df["session_length"] = df.groupby(["user_id", "session_id"])["event"].transform("count")

# Display updated dataframe
df[["user_id", "session_id", "session_length"]].head()

Unnamed: 0,user_id,session_id,session_length
0,65,0,6
1,9,0,7
2,18,0,14
3,18,0,14
4,13,0,14


In [13]:
# Persist the engineered feature table next to the input data.
# The positional index is left out of the file because it carries no information.
features_path = "../data/features.csv"
df.to_csv(features_path, index=False)

# Let the reader know the export finished
print("Feature engineering complete. Saved as features.csv!")

Feature engineering complete. Saved as features.csv!
