In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pickle
import os
from scipy import sparse

# Silence all library warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults (style sheet, figure size, font size)
plt.style.use('seaborn-v0_8')
plt.rcParams.update({
    'figure.figsize': (16, 10),
    'font.size': 12,
})

# Read the four preprocessed CSV tables: the two interaction matrices,
# then the item feature table, then the user feature table
print("Loading processed data...")
big_matrix, small_matrix, item_features_df, user_features_selected = (
    pd.read_csv(csv_path)
    for csv_path in (
        'outputs/data/processed/big_matrix_processed.csv',
        'outputs/data/processed/small_matrix_processed.csv',
        'outputs/data/processed/item_features_processed.csv',
        'outputs/data/processed/user_features_processed.csv',
    )
)

print(f"Loaded {len(big_matrix)} interactions from big matrix")


Loading processed data...
Loaded 11564987 interactions from big matrix


In [2]:
# Create user and video mappings
print("Creating user and video mappings...")
# Cast the id columns to categorical dtype; the integer category codes serve
# as the dense user/video indices used in the rest of the notebook
big_matrix['user_id'] = big_matrix['user_id'].astype('category')
big_matrix['video_id'] = big_matrix['video_id'].astype('category')

# Per-interaction working copy carrying both the raw ids and their codes
interaction_matrix = big_matrix[['user_id', 'video_id', 'watch_ratio', 'play_duration', 'video_duration']].copy()
interaction_matrix['user_idx'] = interaction_matrix['user_id'].cat.codes
interaction_matrix['video_idx'] = interaction_matrix['video_id'].cat.codes

# Build the lookup tables from the category lists (one entry per distinct id)
# rather than zipping every interaction row: position i in `.cat.categories`
# is exactly the code i that `.cat.codes` assigns, so the resulting
# dictionaries are the same while the work drops from O(rows) to O(unique ids).
# NOTE(review): a missing id (code -1) is not given an entry here; the
# processed data is assumed to contain no missing ids — confirm.
idx_to_user_id = dict(enumerate(interaction_matrix['user_id'].cat.categories.tolist()))
idx_to_video_id = dict(enumerate(interaction_matrix['video_id'].cat.categories.tolist()))
user_id_to_idx = {raw_id: idx for idx, raw_id in idx_to_user_id.items()}
video_id_to_idx = {raw_id: idx for idx, raw_id in idx_to_video_id.items()}

# Persist the mappings as (id -> idx, idx -> id) tuples for later use
with open('outputs/data/processed/user_mapping.pkl', 'wb') as f:
    pickle.dump((user_id_to_idx, idx_to_user_id), f)

with open('outputs/data/processed/video_mapping.pkl', 'wb') as f:
    pickle.dump((video_id_to_idx, idx_to_video_id), f)


Creating user and video mappings...


In [3]:
# Validate user and video mappings
# This cell used to be a verbatim copy of the mapping cell directly above it:
# it re-cast the id columns, re-copied the interaction frame, rebuilt the four
# dictionaries and re-wrote both pickle files with identical content.
# All of those objects and files already exist once the cell above has run, so
# the redundant work is replaced by a cheap consistency check of the mappings.
print("Validating user and video mappings...")

for mapping_label, id_to_idx, idx_to_id in (
    ('user', user_id_to_idx, idx_to_user_id),
    ('video', video_id_to_idx, idx_to_video_id),
):
    # Forward and backward tables must have the same number of entries ...
    assert len(id_to_idx) == len(idx_to_id), \
        f"{mapping_label} mappings differ in size"
    # ... and must be exact inverses of each other
    assert all(idx_to_id[idx] == raw_id for raw_id, idx in id_to_idx.items()), \
        f"{mapping_label} mappings are not inverse of each other"

# The index columns added by the cell above must be present on the working frame
assert {'user_idx', 'video_idx'} <= set(interaction_matrix.columns)


Creating user and video mappings...


In [4]:
# Calculate engagement metrics per user-video pair
print("Creating enhanced interaction features...")

# One row per (user_idx, video_idx) pair. Named aggregation yields flat column
# names directly, so no manual flattening of a MultiIndex header is needed.
interaction_agg = (
    interaction_matrix
    .groupby(['user_idx', 'video_idx'])
    .agg(
        watch_ratio_mean=('watch_ratio', 'mean'),
        watch_ratio_sum=('watch_ratio', 'sum'),
        interaction_count=('watch_ratio', 'count'),
        play_duration_mean=('play_duration', 'mean'),
        play_duration_sum=('play_duration', 'sum'),
        video_duration_mean=('video_duration', 'mean'),
    )
    .reset_index()
)

# Completion rate: mean play time over mean video length, capped at 1.0.
# A zero duration with positive play time divides to +inf and is capped to 1.0
# (same as before). A 0/0 division yields NaN, which would otherwise propagate
# into the confidence scores and the sparse training matrix built from them,
# so it is treated as "no completion evidence" (0.0).
raw_completion = interaction_agg['play_duration_mean'] / interaction_agg['video_duration_mean']
interaction_agg['completion_rate'] = raw_completion.clip(upper=1.0).fillna(0.0)

# Replay factor: this pair's interaction count relative to the average count
interaction_agg['replay_factor'] = interaction_agg['interaction_count'] / interaction_agg['interaction_count'].mean()

# Cap extremely high watch_ratio values
max_watch_ratio = 3.0
interaction_agg['watch_ratio_capped'] = interaction_agg['watch_ratio_mean'].clip(upper=max_watch_ratio)


Creating enhanced interaction features...


In [5]:
# Calculate advanced confidence scores
print("Calculating advanced confidence scores...")


def _confidence_score(agg, base, alpha, beta, gamma):
    """Return the per-pair confidence for one parameter set.

    confidence = base
                 + alpha * log1p(watch_ratio_capped)
                         * (1 + beta  * completion_rate)
                         * (1 + gamma * log1p(replay_factor))

    Parameters
    ----------
    agg : DataFrame with 'watch_ratio_capped', 'completion_rate' and
        'replay_factor' columns.
    base : constant added to every pair.
    alpha : weight of the (log-scaled) capped watch ratio.
    beta : weight of the completion rate multiplier.
    gamma : weight of the (log-scaled) replay factor multiplier.

    Returns
    -------
    Series aligned with ``agg``.
    """
    return base + \
        alpha * np.log1p(agg['watch_ratio_capped']) * \
        (1 + beta * agg['completion_rate']) * \
        (1 + gamma * np.log1p(agg['replay_factor']))


# Recall-focused parameter set (larger base value and weights)
base_confidence = 2.0  # Higher base value
alpha = 60  # Much higher weight for watch_ratio
beta = 0.8   # Higher weight for completion_rate
gamma = 0.5  # Higher weight for replay_factor

interaction_agg['confidence_recall'] = _confidence_score(
    interaction_agg, base_confidence, alpha, beta, gamma
)

# Precision-focused parameter set (more conservative values)
base_confidence_precision = 1.0
alpha_precision = 40
beta_precision = 0.5
gamma_precision = 0.3

interaction_agg['confidence_precision'] = _confidence_score(
    interaction_agg, base_confidence_precision, alpha_precision,
    beta_precision, gamma_precision
)

# Use the recall-focused confidence as the default
interaction_agg['confidence'] = interaction_agg['confidence_recall']

# Visualize confidence distribution.
# The figure directory is created first: savefig raises FileNotFoundError when
# the target directory does not exist (e.g. on a fresh checkout).
os.makedirs('outputs/figures', exist_ok=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(interaction_agg['confidence'], bins=50, ax=ax)
ax.set_title('Distribution of Confidence Scores')
ax.set_xlabel('Confidence Score')
ax.set_ylabel('Count')
fig.savefig('outputs/figures/confidence_distribution.png')
plt.close(fig)


Calculating advanced confidence scores...


In [6]:
# Sample interactions for training and testing
print("Sampling interactions for training and testing...")
SAMPLE_SIZE = 2_000_000  # Number of interactions to sample

# Sample from big matrix for training (the whole matrix if it is smaller)
train_interactions = big_matrix.sample(n=min(SAMPLE_SIZE, len(big_matrix)), random_state=42)

# Get unique users and videos from training
train_users = set(train_interactions['user_id'])
train_videos = set(train_interactions['video_id'])

# Keep only small-matrix rows whose user AND video both occur in the training
# sample, so every test row has a known user and video
test_candidates = small_matrix[
    small_matrix['user_id'].isin(train_users) &
    small_matrix['video_id'].isin(train_videos)
]

# The sample size must be bounded by the FILTERED frame. Bounding it by
# len(small_matrix) (as before) makes .sample() raise ValueError as soon as
# the filter leaves fewer rows than the requested n.
test_interactions = test_candidates.sample(
    n=min(SAMPLE_SIZE, len(test_candidates)), random_state=42
)

# Add user and video indices to test interactions via the id -> idx mappings
test_interactions['user_idx'] = test_interactions['user_id'].map(user_id_to_idx)
test_interactions['video_idx'] = test_interactions['video_id'].map(video_id_to_idx)

print(f"Training set: {len(train_interactions)} interactions")
print(f"Testing set: {len(test_interactions)} interactions")


Sampling interactions for training and testing...
Training set: 2000000 interactions
Testing set: 2000000 interactions


In [7]:
# Extract user and item features
print("Extracting features for users and items...")


def _gather_features(feature_table, ids):
    """Return one feature row per entry of ``ids`` as a 2-D float array.

    Parameters
    ----------
    feature_table : DataFrame indexed by the entity id (index must be unique),
        holding only the feature columns.
    ids : sequence of ids, one per interaction (may repeat).

    Returns
    -------
    ndarray of shape (len(ids), n_features), dtype float64. Ids that are absent
    from ``feature_table`` get an all-zero row (the same default the previous
    dict-based lookup used); NaN values stored in the table are kept as NaN.
    """
    positions = feature_table.index.get_indexer(np.asarray(ids))
    if len(feature_table) == 0:
        # Nothing to look up: every id is unknown
        return np.zeros((len(positions), feature_table.shape[1]))
    # Fancy indexing copies, so the zero-fill below never touches the table.
    # Position -1 (unknown id) temporarily picks the last row and is then zeroed.
    gathered = feature_table.to_numpy(dtype=float)[positions]
    gathered[positions < 0] = 0
    return gathered


# Per-id dictionaries {id: {feature: value}}; no longer needed for the lookup
# below but still defined in case other cells reference them
user_features_dict = user_features_selected.set_index('user_id').to_dict('index')
item_features_dict = item_features_df.set_index('video_id').to_dict('index')

# Extract user features with a vectorised index lookup instead of one Python
# dict access per (interaction, column) pair
user_feature_cols = [col for col in user_features_selected.columns if col != 'user_id']
user_feature_table = user_features_selected.set_index('user_id')[user_feature_cols]
user_features_train = _gather_features(user_feature_table, train_interactions['user_id'])
user_features_test = _gather_features(user_feature_table, test_interactions['user_id'])

# Extract item features the same way
item_feature_cols = [col for col in item_features_df.columns if col != 'video_id']
item_feature_table = item_features_df.set_index('video_id')[item_feature_cols]
item_features_train = _gather_features(item_feature_table, train_interactions['video_id'])
item_features_test = _gather_features(item_feature_table, test_interactions['video_id'])

print(f"User features shape: {user_features_train.shape}")
print(f"Item features shape: {item_features_train.shape}")


Extracting features for users and items...
User features shape: (2000000, 26)
Item features shape: (2000000, 69)


In [8]:
# Standardise the feature/target arrays, then compress the features with PCA
print("Scaling features and applying dimensionality reduction...")

# --- Standardisation: every scaler is fitted on the training split only and
# --- then applied unchanged to the test split

# User features
user_scaler = StandardScaler()
user_features_train_scaled = user_scaler.fit_transform(user_features_train)
user_features_test_scaled = user_scaler.transform(user_features_test)

# Item features
item_scaler = StandardScaler()
item_features_train_scaled = item_scaler.fit_transform(item_features_train)
item_features_test_scaled = item_scaler.transform(item_features_test)

# Target (watch_ratio), kept 2-D as a single-column array for the scaler
y_train = train_interactions[['watch_ratio']].values
y_test = test_interactions[['watch_ratio']].values
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

# --- PCA: keep as many components as needed to explain 95% of the variance,
# --- again fitted on the training split only

# User features
pca_user = PCA(n_components=0.95)
user_features_train_reduced = pca_user.fit_transform(user_features_train_scaled)
user_features_test_reduced = pca_user.transform(user_features_test_scaled)

# Item features
pca_item = PCA(n_components=0.95)
item_features_train_reduced = pca_item.fit_transform(item_features_train_scaled)
item_features_test_reduced = pca_item.transform(item_features_test_scaled)

# Report the reduced dimensionality and the variance actually retained
print(f"Reduced user features shape: {user_features_train_reduced.shape}")
print(f"Reduced item features shape: {item_features_train_reduced.shape}")
print(f"Explained variance (user): {sum(pca_user.explained_variance_ratio_):.4f}")
print(f"Explained variance (item): {sum(pca_item.explained_variance_ratio_):.4f}")


Scaling features and applying dimensionality reduction...
Reduced user features shape: (2000000, 23)
Reduced item features shape: (2000000, 51)
Explained variance (user): 0.9639
Explained variance (item): 0.9543


In [10]:
# Create neural network features
print("Creating neural network features...")

# Create user and video indices if they don't exist.
# The categories are pinned explicitly to those of big_matrix so the codes are
# guaranteed to match the indices used by interaction_agg and the id -> idx
# mappings. Calling astype('category') on the sample alone only yields the
# same codes when the sample happens to inherit big_matrix's categorical
# dtype; on a plain id column it would renumber the ids per sample.
if 'user_idx' not in train_interactions.columns:
    print("Creating user and video indices...")
    train_interactions['user_id'] = pd.Categorical(
        train_interactions['user_id'],
        categories=big_matrix['user_id'].cat.categories,
    )
    train_interactions['video_id'] = pd.Categorical(
        train_interactions['video_id'],
        categories=big_matrix['video_id'].cat.categories,
    )
    train_interactions['user_idx'] = train_interactions['user_id'].cat.codes
    train_interactions['video_idx'] = train_interactions['video_id'].cat.codes

# Create DataFrames with reduced features.
# NOTE(review): these frames hold one row per sampled interaction (not one row
# per distinct user/video), so ids repeat — confirm downstream readers expect that.
user_features_nn_df = pd.DataFrame(
    user_features_train_reduced,
    columns=[f'pca_user_{i}' for i in range(user_features_train_reduced.shape[1])]
)
user_features_nn_df['user_idx'] = train_interactions['user_idx'].values

item_features_nn_df = pd.DataFrame(
    item_features_train_reduced,
    columns=[f'pca_item_{i}' for i in range(item_features_train_reduced.shape[1])]
)
item_features_nn_df['video_idx'] = train_interactions['video_idx'].values

# Save neural network features
user_features_nn_df.to_csv('outputs/data/processed/user_nn_features.csv', index=False)
item_features_nn_df.to_csv('outputs/data/processed/video_nn_features.csv', index=False)

# Matrix dimensions for the sparse matrices: largest index + 1.
# Series.max() avoids iterating the column in Python; the int() cast turns the
# (possibly small-width) NumPy integer into a Python int so the
# n_users * n_items product below cannot overflow.
n_users = int(interaction_agg['user_idx'].max()) + 1
n_items = int(interaction_agg['video_idx'].max()) + 1

# Create training matrix with confidence values (one entry per aggregated pair)
train_sparse = sparse.csr_matrix(
    (interaction_agg['confidence'], (interaction_agg['user_idx'], interaction_agg['video_idx'])),
    shape=(n_users, n_items)
)

# Create test matrix with watch_ratio for evaluation.
# NOTE(review): this constructor sums duplicate (row, col) entries, so repeated
# user-video pairs in test_interactions would have their watch_ratio added up —
# confirm the test set is unique per pair.
test_sparse = sparse.csr_matrix(
    (test_interactions['watch_ratio'], (test_interactions['user_idx'], test_interactions['video_idx'])),
    shape=(n_users, n_items)
)

# Create binary matrix for BPR model (1.0 wherever a pair was observed)
train_binary = sparse.csr_matrix(
    (np.ones(len(interaction_agg)), (interaction_agg['user_idx'], interaction_agg['video_idx'])),
    shape=(n_users, n_items)
)

# Save sparse matrices
sparse.save_npz('outputs/data/processed/train_sparse.npz', train_sparse)
sparse.save_npz('outputs/data/processed/test_sparse.npz', test_sparse)
sparse.save_npz('outputs/data/processed/train_binary.npz', train_binary)

print(f"Created sparse matrices with {n_users} users and {n_items} items")
print(f"Sparsity: {1.0 - (len(interaction_agg) / float(n_users * n_items)):.6f}")


Creating neural network features...
Creating user and video indices...
Created sparse matrices with 7176 users and 10728 items
Sparsity: 0.866194


In [11]:
# Create sequence features
print("Creating interaction sequences for sequential models...")

# Sort interactions by timestamp for each user
sorted_interactions = big_matrix.sort_values(['user_id', 'timestamp'])

# Create sequences of video interactions per user
max_seq_length = 20  # Maximum sequence length (cap applied to every user)
user_sequences = {}

for user_id, group in sorted_interactions.groupby('user_id'):
    if user_id in user_id_to_idx:
        user_idx = user_id_to_idx[user_id]
        videos = group['video_id'].map(video_id_to_idx).dropna().astype(int).tolist()

        # Keep only the most recent interactions; a negative slice returns the
        # whole list unchanged when it is already short enough
        user_sequences[user_idx] = videos[-max_seq_length:]

# Calculate sequence statistics.
# The observed maximum gets its own name so the `max_seq_length` cap above is
# no longer overwritten by a statistic; `default=0` keeps the cell from
# crashing with ValueError when no sequence was built.
seq_lengths = [len(seq) for seq in user_sequences.values()]
avg_seq_length = np.mean(seq_lengths)
median_seq_length = np.median(seq_lengths)
observed_max_seq_length = max(seq_lengths, default=0)

print(f"Average sequence length: {avg_seq_length:.2f}")
print(f"Median sequence length: {median_seq_length:.2f}")
print(f"Max sequence length: {observed_max_seq_length}")

# Save sequences
with open('outputs/data/processed/user_sequences.pkl', 'wb') as f:
    pickle.dump(user_sequences, f)

# Save train and test interactions
train_interactions.to_csv('outputs/data/processed/train_interactions.csv', index=False)
test_interactions.to_csv('outputs/data/processed/test_interactions.csv', index=False)

# Save interaction aggregations
interaction_agg.to_csv('outputs/data/processed/interaction_agg.csv', index=False)

# Save scalers and PCA objects, one pickle file per fitted object
# (file name = '<key>.pkl' inside outputs/data/processed)
fitted_artifacts = {
    'user_scaler': user_scaler,
    'item_scaler': item_scaler,
    'target_scaler': target_scaler,
    'pca_user': pca_user,
    'pca_item': pca_item,
}
for artifact_name, artifact in fitted_artifacts.items():
    with open(f'outputs/data/processed/{artifact_name}.pkl', 'wb') as f:
        pickle.dump(artifact, f)

print("Feature engineering complete!")


Creating interaction sequences for sequential models...
Average sequence length: 20.00
Median sequence length: 20.00
Max sequence length: 20
Feature engineering complete!
