In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import ast
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
import pickle

# Silence all Python warnings from this point on
warnings.filterwarnings('ignore')

# Plot appearance: style sheet, default canvas size, default font size
plt.style.use('seaborn-v0_8')
plt.rcParams.update({
    'figure.figsize': (16, 10),
    'font.size': 12,
})

# Make sure every output folder exists before anything is written to it
for _output_dir in (
    'outputs/figures',
    'outputs/models',
    'outputs/data',
    'outputs/data/processed',
):
    os.makedirs(_output_dir, exist_ok=True)

# Folder with the raw CSV files; file names are appended directly,
# so the trailing slash is required
DATA_PATH = "KuaiRec/data/"

# Read each raw table into its own DataFrame
print("Loading datasets...")
big_matrix = pd.read_csv(DATA_PATH + "big_matrix.csv")
small_matrix = pd.read_csv(DATA_PATH + "small_matrix.csv")
social_network = pd.read_csv(DATA_PATH + "social_network.csv")
user_features = pd.read_csv(DATA_PATH + "user_features.csv")
item_daily_features = pd.read_csv(DATA_PATH + "item_daily_features.csv")
item_categories = pd.read_csv(DATA_PATH + "item_categories.csv")

# Report the row counts of the two interaction tables
print(f"Loaded {big_matrix.shape[0]} interactions from big matrix")
print(f"Loaded {small_matrix.shape[0]} interactions from small matrix")


Loading datasets...
Loaded 12530806 interactions from big matrix
Loaded 4676570 interactions from small matrix


In [8]:
# Clean and preprocess data
def preprocess_data(df):
    """Clean an interaction table and report how many rows were discarded.

    Rows containing any missing value, exact duplicate rows, and rows whose
    ``timestamp`` is not strictly positive are removed. A one-line summary
    with the number and share of removed rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction records; must contain a ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        The rows that survived all filters. The input frame itself is not
        modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``timestamp`` column.
    """
    initial_size = len(df)
    # Remove missing values and duplicates
    cleaned = df.dropna().drop_duplicates()
    # Keep only strictly positive timestamps (zero and negative values are dropped)
    cleaned = cleaned[cleaned['timestamp'] > 0]
    removed = initial_size - len(cleaned)
    # An empty input has no meaningful share; avoid dividing by zero
    removed_share = removed / initial_size if initial_size else 0.0
    print(f"Removed {removed} rows ({removed_share:.2%})")
    return cleaned

# Process interaction data
print("Preprocessing interaction data...")
big_matrix = preprocess_data(big_matrix)
small_matrix = preprocess_data(small_matrix)


def _parse_list_literal(value):
    """Parse a stringified Python literal, leaving already-parsed lists alone.

    Strings are evaluated with ``ast.literal_eval``. Values that are already
    lists are returned unchanged, so re-running this cell on the same frame
    does not wipe previously parsed data. Anything else (e.g. NaN) becomes an
    empty list.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, list):
        return value
    return []


# Process social network data
print("Processing social network data...")
social_network['friend_list'] = social_network['friend_list'].apply(_parse_list_literal)

# Process item categories
print("Processing item categories...")
item_categories['feat'] = item_categories['feat'].apply(_parse_list_literal)

# Fill missing values in user and item features
# (-1 is used as the "missing" marker in every column, whatever its dtype)
user_features.fillna(-1, inplace=True)
item_daily_features.fillna(-1, inplace=True)

# Basic statistics
# Count distinct users and videos once; each nunique() scans the whole table.
n_users = big_matrix['user_id'].nunique()
n_videos = big_matrix['video_id'].nunique()
print(f"\nNumber of unique users: {n_users}")
print(f"Number of unique videos: {n_videos}")
print(f"Average watch ratio: {big_matrix['watch_ratio'].mean():.4f}")

# Calculate sparsity: 1 - (interaction rows / size of the user x video grid).
# NOTE(review): this uses the row count, so repeated (user, video) rows, if
# any remain after cleaning, are each counted -- confirm that is intended.
total_possible_interactions = n_users * n_videos
# An empty table has no grid to measure; report NaN instead of dividing by zero.
if total_possible_interactions:
    sparsity = 1 - (len(big_matrix) / total_possible_interactions)
else:
    sparsity = float('nan')
print(f"Sparsity: {sparsity:.6f}")


Preprocessing interaction data...
Removed 965819 rows (7.71%)
Removed 181992 rows (3.89%)
Processing social network data...
Processing item categories...

Number of unique users: 7176
Number of unique videos: 10728
Average watch ratio: 0.9466
Sparsity: 0.849774


In [9]:
# Turn each video's list of labels in `feat` into multi-hot indicator columns
print("Creating one-hot encoded category features...")
mlb = MultiLabelBinarizer()
category_indicator_matrix = mlb.fit_transform(item_categories['feat'])
# One column per label seen during fitting, named category_<label>
category_column_names = ['category_' + str(label) for label in mlb.classes_]
item_categories_encoded = pd.DataFrame(
    category_indicator_matrix,
    index=item_categories['video_id'],
    columns=category_column_names,
)

# Process item daily features
print("Processing item daily features...")
# For every video keep the single row whose `date` value is the largest
latest_row_labels = item_daily_features.groupby('video_id')['date'].idxmax()
item_daily_latest = item_daily_features.loc[latest_row_labels].reset_index(drop=True)

# Numeric columns that are carried over as-is
numerical_features = [
    'video_duration', 'video_width', 'video_height',
    'play_cnt', 'play_user_num', 'play_duration',
    'complete_play_cnt', 'valid_play_cnt', 'long_time_play_cnt',
    'show_cnt', 'show_user_num', 'like_cnt', 'comment_cnt',
    'share_cnt', 'download_cnt',
]

# Columns that get expanded into one indicator column per observed value
categorical_features = ['video_type', 'upload_type', 'visible_status']

# Fit the encoder on the latest rows and wrap the dense result in a DataFrame
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_values = encoder.fit_transform(item_daily_latest[categorical_features])
categorical_encoded = pd.DataFrame(
    encoded_values,
    index=item_daily_latest.index,
    columns=encoder.get_feature_names_out(categorical_features),
)

# Place the video id + numeric columns side by side with the indicator columns
numeric_part = item_daily_latest[['video_id', *numerical_features]]
item_features_combined = pd.concat(
    [numeric_part, categorical_encoded.reset_index(drop=True)],
    axis=1,
)

# Merge with category features
# Left join keeps every video from the daily-feature table, including any
# that have no entry in item_categories.
item_features_df = pd.merge(
    item_features_combined,
    item_categories_encoded.reset_index(),
    on='video_id',
    how='left'
)

# Videos without a category row come out of the left join as NaN in every
# category_* column (which also turns those columns into floats). A missing
# indicator means "not in this category", so store 0 and restore integers.
category_columns = [
    col for col in item_categories_encoded.columns if col in item_features_df.columns
]
if category_columns:
    item_features_df[category_columns] = (
        item_features_df[category_columns].fillna(0).astype(int)
    )

# Process user features
print("Processing user features...")
# Columns to keep: the id, the listed profile columns, and onehot_feat0..onehot_feat17
base_user_cols = [
    'user_id', 'user_active_degree', 'follow_user_num', 'fans_user_num',
    'friend_user_num', 'register_days', 'is_lowactive_period',
    'is_live_streamer', 'is_video_author',
]
onehot_user_cols = [f'onehot_feat{i}' for i in range(18)]
# Work on a copy so the assignments below do not touch user_features
user_features_selected = user_features[base_user_cols + onehot_user_cols].copy()

# Replace each listed column by the integer codes of its categorical encoding
categorical_user_cols = ['user_active_degree']
for col in categorical_user_cols:
    as_category = user_features_selected[col].astype('category')
    user_features_selected[col] = as_category.cat.codes


Creating one-hot encoded category features...
Processing item daily features...
Processing user features...


In [10]:
# Write every processed table to its CSV file (row index omitted)
print("Saving processed data...")
processed_outputs = (
    ('outputs/data/processed/big_matrix_processed.csv', big_matrix),
    ('outputs/data/processed/small_matrix_processed.csv', small_matrix),
    ('outputs/data/processed/item_features_processed.csv', item_features_df),
    ('outputs/data/processed/user_features_processed.csv', user_features_selected),
)
for csv_path, frame in processed_outputs:
    frame.to_csv(csv_path, index=False)

print("Data preprocessing complete!")


Saving processed data...
Data preprocessing complete!
