In [1]:
import pandas as pd

# Load the sample interaction log from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact location exists — consider a configurable data dir.
file_path = r"C:\Users\anujp\Desktop\Data-Visualization-Final-Project\data\user_behavior_sample_data.csv"
df = pd.read_csv(file_path)

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so appends a stray "None" line.
print("✅ Basic Info:")
df.info()
print("\n✅ First 5 rows:")
print(df.head())

# Data-quality overview: duplicated rows, nulls, and column cardinalities.

# Rows that are exact repeats of an earlier row across every column.
duplicates = df.duplicated().sum()
print(f"\n✅ Number of duplicate rows: {duplicates}")

# Null count for each column.
missing_per_column = df.isnull().sum()
print("\n✅ Missing values per column:", missing_per_column, sep="\n")

# Number of distinct values in each identifier / label column.
print("\n✅ Unique values:")
for label, column in (
    ("Unique Users:", 'UserID'),
    ("Unique Items:", 'ItemID'),
    ("Unique Categories:", 'CategoryID'),
    ("Unique Behavior Types:", 'BehaviorType'),
):
    print(label, df[column].nunique())

# How often each behavior type occurs, most frequent first.
print("\n✅ Behavior Type Counts:")
behavior_counts = df['BehaviorType'].value_counts()
print(behavior_counts)

# Parse the Timestamp column once, store the parsed values back on the frame,
# and derive the calendar features from that same parsed Series.
parsed_ts = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_ts
# NOTE(review): .dt.date produces Python date objects (object dtype), which is
# memory-heavy on a frame of this size — keep only if downstream needs dates.
df['Date'] = parsed_ts.dt.date
df['Hour'] = parsed_ts.dt.hour          # hour of day
df['Weekday'] = parsed_ts.dt.day_name() # weekday name

# Row counts per entity; value_counts() sorts descending, so head() is the
# five most frequent values of each column.

# Per user
user_interactions = df['UserID'].value_counts()
print("\n✅ Top 5 most active users:", user_interactions.head(), sep="\n")

# Per item
item_interactions = df['ItemID'].value_counts()
print("\n✅ Top 5 most interacted items:", item_interactions.head(), sep="\n")

# Per category
category_interactions = df['CategoryID'].value_counts()
print("\n✅ Top 5 most interacted categories:", category_interactions.head(), sep="\n")

# Behavior-type counts per time bucket: one row per bucket value, one column
# per behavior type, zero where a combination never occurs. Only the first
# five rows of each table are shown.
for heading, bucket in (
    ("\n✅ Behavior Type Counts by Date:", 'Date'),
    ("\n✅ Behavior Type Counts by Hour of Day:", 'Hour'),
):
    print(heading)
    bucket_counts = df.groupby([bucket, 'BehaviorType']).size().unstack(fill_value=0)
    print(bucket_counts.head())

# Behavior funnel: how many distinct users performed each behavior type.
print("\n✅ Users per Behavior Type:")
users_per_behavior = df.groupby('BehaviorType')['UserID'].nunique()
print(users_per_behavior)

# Per-user funnel: number of events of each behavior type for every user
# (0 where the user never performed that behavior).
behavior_pivot = df.pivot_table(
    index='UserID',
    columns='BehaviorType',
    aggfunc='size',
    fill_value=0,
)
print("\n✅ Sample User Behavior Funnel:", behavior_pivot.head(), sep="\n")

# Elapsed time between consecutive events of the same (user, item) pair.
# Rows are ordered by user, item, then time, so diff() within each pair
# subtracts the previous event's timestamp; the first event of a pair has no
# predecessor and therefore gets a missing value.
pair_keys = ['UserID', 'ItemID']
df_sorted = df.sort_values(by=pair_keys + ['Timestamp'])
df_sorted['TimeDiff'] = df_sorted.groupby(pair_keys)['Timestamp'].diff()

sample_columns = ['UserID', 'ItemID', 'BehaviorType', 'Timestamp', 'TimeDiff']
print("\n✅ Time difference between actions (sample):")
print(df_sorted.head(10)[sample_columns])


✅ Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86953525 entries, 0 to 86953524
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   UserID        int64 
 1   ItemID        int64 
 2   CategoryID    int64 
 3   BehaviorType  object
 4   Timestamp     object
dtypes: int64(3), object(2)
memory usage: 3.2+ GB
None

✅ First 5 rows:
   UserID   ItemID  CategoryID BehaviorType            Timestamp
0       1  2576651      149192           pv  2017-11-25 01:21:25
1       1  3830808     4181361           pv  2017-11-25 07:04:53
2       1  4365585     2520377           pv  2017-11-25 07:49:06
3       1  4606018     2735466           pv  2017-11-25 13:28:01
4       1   230380      411153           pv  2017-11-25 21:22:22

✅ Number of duplicate rows: 47

✅ Missing values per column:
UserID          0
ItemID          0
CategoryID      0
BehaviorType    0
Timestamp       0
dtype: int64

✅ Unique values:
Unique Users: 987982
Unique Items: 3962559
Un

In [2]:
import pandas as pd

# Load the full interaction log from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact location exists — consider a configurable data dir.
file_path = r"C:\Users\anujp\Desktop\Data-Visualization-Final-Project\data\UserBehavior\UserBehavior.csv"

# The file is read with header=None (no header row), so the column names are
# supplied directly to read_csv instead of being patched onto the frame after.
column_names = ['UserID', 'ItemID', 'CategoryID', 'BehaviorType', 'Timestamp']
df = pd.read_csv(file_path, header=None, names=column_names)

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so appends a stray "None" line.
print("✅ Basic Info:")
df.info()
print("\n✅ First 5 rows:")
print(df.head())

# Data-quality overview: duplicated rows, nulls, and column cardinalities.

# Rows that are exact repeats of an earlier row across every column.
duplicates = df.duplicated().sum()
print(f"\n✅ Number of duplicate rows: {duplicates}")

# Null count for each column.
missing_per_column = df.isnull().sum()
print("\n✅ Missing values per column:", missing_per_column, sep="\n")

# Number of distinct values in each identifier / label column.
print("\n✅ Unique values:")
for label, column in (
    ("Unique Users:", 'UserID'),
    ("Unique Items:", 'ItemID'),
    ("Unique Categories:", 'CategoryID'),
    ("Unique Behavior Types:", 'BehaviorType'),
):
    print(label, df[column].nunique())

# How often each behavior type occurs, most frequent first.
print("\n✅ Behavior Type Counts:")
behavior_counts = df['BehaviorType'].value_counts()
print(behavior_counts)

# Interpret the integer Timestamp column as seconds since the Unix epoch,
# store the parsed values back on the frame, and derive calendar features
# from that same parsed Series.
# NOTE(review): unit='s' yields timezone-naive values on the UTC clock; if the
# events should be analysed in a local timezone, Date/Hour/Weekday will be
# offset accordingly — confirm against the dataset's documentation.
parsed_ts = pd.to_datetime(df['Timestamp'], unit='s')
df['Timestamp'] = parsed_ts
# NOTE(review): .dt.date produces Python date objects (object dtype), which is
# memory-heavy on a frame of this size — keep only if downstream needs dates.
df['Date'] = parsed_ts.dt.date
df['Hour'] = parsed_ts.dt.hour          # hour of day
df['Weekday'] = parsed_ts.dt.day_name() # weekday name

# Row counts per entity; value_counts() sorts descending, so head() is the
# five most frequent values of each column.

# Per user
user_interactions = df['UserID'].value_counts()
print("\n✅ Top 5 most active users:", user_interactions.head(), sep="\n")

# Per item
item_interactions = df['ItemID'].value_counts()
print("\n✅ Top 5 most interacted items:", item_interactions.head(), sep="\n")

# Per category
category_interactions = df['CategoryID'].value_counts()
print("\n✅ Top 5 most interacted categories:", category_interactions.head(), sep="\n")

# Behavior-type counts per time bucket: one row per bucket value, one column
# per behavior type, zero where a combination never occurs. Only the first
# five rows of each table are shown.
for heading, bucket in (
    ("\n✅ Behavior Type Counts by Date:", 'Date'),
    ("\n✅ Behavior Type Counts by Hour of Day:", 'Hour'),
):
    print(heading)
    bucket_counts = df.groupby([bucket, 'BehaviorType']).size().unstack(fill_value=0)
    print(bucket_counts.head())

# Behavior funnel: how many distinct users performed each behavior type.
print("\n✅ Users per Behavior Type:")
users_per_behavior = df.groupby('BehaviorType')['UserID'].nunique()
print(users_per_behavior)

# Per-user funnel: number of events of each behavior type for every user
# (0 where the user never performed that behavior).
behavior_pivot = df.pivot_table(
    index='UserID',
    columns='BehaviorType',
    aggfunc='size',
    fill_value=0,
)
print("\n✅ Sample User Behavior Funnel:", behavior_pivot.head(), sep="\n")

# Elapsed time between consecutive events of the same (user, item) pair.
# Rows are ordered by user, item, then time, so diff() within each pair
# subtracts the previous event's timestamp; the first event of a pair has no
# predecessor and therefore gets a missing value.
pair_keys = ['UserID', 'ItemID']
df_sorted = df.sort_values(by=pair_keys + ['Timestamp'])
df_sorted['TimeDiff'] = df_sorted.groupby(pair_keys)['Timestamp'].diff()

sample_columns = ['UserID', 'ItemID', 'BehaviorType', 'Timestamp', 'TimeDiff']
print("\n✅ Time difference between actions (sample):")
print(df_sorted.head(10)[sample_columns])


✅ Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100150807 entries, 0 to 100150806
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   UserID        int64 
 1   ItemID        int64 
 2   CategoryID    int64 
 3   BehaviorType  object
 4   Timestamp     int64 
dtypes: int64(4), object(1)
memory usage: 3.7+ GB
None

✅ First 5 rows:
   UserID   ItemID  CategoryID BehaviorType   Timestamp
0       1  2268318     2520377           pv  1511544070
1       1  2333346     2520771           pv  1511561733
2       1  2576651      149192           pv  1511572885
3       1  3830808     4181361           pv  1511593493
4       1  4365585     2520377           pv  1511596146

✅ Number of duplicate rows: 49

✅ Missing values per column:
UserID          0
ItemID          0
CategoryID      0
BehaviorType    0
Timestamp       0
dtype: int64

✅ Unique values:
Unique Users: 987994
Unique Items: 4162024
Unique Categories: 9439
Unique Behavior Types: 4

✅ Be