In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pickle
from scipy import sparse
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, ndcg_score, mean_absolute_error

# For Two-Tower model
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Layer, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Suppress warnings
warnings.filterwarnings('ignore')

# Set visualization parameters
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (16, 10)
plt.rcParams['font.size'] = 12

# Create directories if they don't exist
os.makedirs('outputs/models', exist_ok=True)
os.makedirs('outputs/figures', exist_ok=True)
os.makedirs('outputs/recommendations', exist_ok=True)

# Load data
print("Loading data...")
train_interactions = pd.read_csv('outputs/data/processed/train_interactions.csv')
test_interactions = pd.read_csv('outputs/data/processed/test_interactions.csv')
interaction_agg = pd.read_csv('outputs/data/processed/interaction_agg.csv')

# Load sparse matrices
train_sparse = sparse.load_npz('outputs/data/processed/train_sparse.npz')
test_sparse = sparse.load_npz('outputs/data/processed/test_sparse.npz')

# Load mappings
with open('outputs/data/processed/user_mapping.pkl', 'rb') as f:
    user_id_to_idx, idx_to_user_id = pickle.load(f)
    
with open('outputs/data/processed/video_mapping.pkl', 'rb') as f:
    video_id_to_idx, idx_to_video_id = pickle.load(f)

# Load neural network features
user_nn_features = pd.read_csv('outputs/data/processed/user_nn_features.csv')
video_nn_features = pd.read_csv('outputs/data/processed/video_nn_features.csv')

# Load scalers and PCA objects
with open('outputs/data/processed/target_scaler.pkl', 'rb') as f:
    target_scaler = pickle.load(f)

n_users = train_sparse.shape[0]
n_items = train_sparse.shape[1]

print(f"Loaded data with {n_users} users and {n_items} items")
print(f"Sparsity: {1.0 - (train_sparse.count_nonzero() / (n_users * n_items)):.6f}")


Loading data...
Loaded data with 7176 users and 10728 items
Sparsity: 0.866194


In [21]:
# Create a custom L2 normalization layer for Two-Tower model
class L2NormalizationLayer(Layer):
    def call(self, inputs):
        return tf.nn.l2_normalize(inputs, axis=1)

# Define Two-Tower model architecture
def create_two_tower_model(user_dim, item_dim, embedding_dim=64):
    # User tower
    user_input = Input(shape=(user_dim,), name='user_input')
    user_tower = Sequential([
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(embedding_dim, activation='relu', name='user_embedding'),
        L2NormalizationLayer()
    ], name='user_tower')
    user_embedding = user_tower(user_input)
    
    # Item tower
    item_input = Input(shape=(item_dim,), name='item_input')
    item_tower = Sequential([
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(embedding_dim, activation='relu', name='item_embedding'),
        L2NormalizationLayer()
    ], name='item_tower')
    item_embedding = item_tower(item_input)
    
    # Combine towers
    merged = Concatenate()([user_embedding, item_embedding])
    x = Dense(128, activation='relu')(merged)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(1, activation='linear')(x)
    
    # Create model
    model = Model(inputs=[user_input, item_input], outputs=output, name='TwoTowerRecommender')
    
    return model, user_tower, item_tower


In [22]:
# Check if reduced feature files exist, if not, create them
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Check if the files exist
if (not os.path.exists('outputs/data/processed/user_features_train_reduced.npy') or
    not os.path.exists('outputs/data/processed/item_features_train_reduced.npy') or
    not os.path.exists('outputs/data/processed/user_features_test_reduced.npy') or
    not os.path.exists('outputs/data/processed/item_features_test_reduced.npy')):
    
    print("Reduced feature files not found. Generating them now...")
    
    # Load necessary data
    train_interactions = pd.read_csv('outputs/data/processed/train_interactions.csv')
    test_interactions = pd.read_csv('outputs/data/processed/test_interactions.csv')
    user_features = pd.read_csv('outputs/data/processed/user_features_processed.csv')
    item_features = pd.read_csv('outputs/data/processed/item_features_processed.csv')
    
    # Create mapping dictionaries for faster lookups
    user_features_dict = user_features.set_index('user_id').to_dict('index')
    item_features_dict = item_features.set_index('video_id').to_dict('index')
    
    # Extract user features
    user_feature_cols = [col for col in user_features.columns if col != 'user_id']
    user_features_train = np.array([
        [user_features_dict.get(uid, {}).get(col, 0) for col in user_feature_cols]
        for uid in train_interactions['user_id']
    ])
    user_features_test = np.array([
        [user_features_dict.get(uid, {}).get(col, 0) for col in user_feature_cols]
        for uid in test_interactions['user_id']
    ])
    
    # Extract item features
    item_feature_cols = [col for col in item_features.columns if col != 'video_id']
    item_features_train = np.array([
        [item_features_dict.get(vid, {}).get(col, 0) for col in item_feature_cols]
        for vid in train_interactions['video_id']
    ])
    item_features_test = np.array([
        [item_features_dict.get(vid, {}).get(col, 0) for col in item_feature_cols]
        for vid in test_interactions['video_id']
    ])
    
    # Scale features
    user_scaler = StandardScaler()
    item_scaler = StandardScaler()
    
    user_features_train_scaled = user_scaler.fit_transform(user_features_train)
    item_features_train_scaled = item_scaler.fit_transform(item_features_train)
    user_features_test_scaled = user_scaler.transform(user_features_test)
    item_features_test_scaled = item_scaler.transform(item_features_test)
    
    # Apply PCA
    pca_user = PCA(n_components=0.95)
    pca_item = PCA(n_components=0.95)
    
    user_features_train_reduced = pca_user.fit_transform(user_features_train_scaled)
    item_features_train_reduced = pca_item.fit_transform(item_features_train_scaled)
    user_features_test_reduced = pca_user.transform(user_features_test_scaled)
    item_features_test_reduced = pca_item.transform(item_features_test_scaled)
    
    # Save the reduced features
    np.save('outputs/data/processed/user_features_train_reduced.npy', user_features_train_reduced)
    np.save('outputs/data/processed/item_features_train_reduced.npy', item_features_train_reduced)
    np.save('outputs/data/processed/user_features_test_reduced.npy', user_features_test_reduced)
    np.save('outputs/data/processed/item_features_test_reduced.npy', item_features_test_reduced)
    
    # Save scalers and PCA objects
    with open('outputs/data/processed/user_scaler.pkl', 'wb') as f:
        pickle.dump(user_scaler, f)
    with open('outputs/data/processed/item_scaler.pkl', 'wb') as f:
        pickle.dump(item_scaler, f)
    with open('outputs/data/processed/pca_user.pkl', 'wb') as f:
        pickle.dump(pca_user, f)
    with open('outputs/data/processed/pca_item.pkl', 'wb') as f:
        pickle.dump(pca_item, f)
    
    print(f"Created reduced features: User shape={user_features_train_reduced.shape}, Item shape={item_features_train_reduced.shape}")
else:
    print("Loading existing reduced feature files...")

# Now load the files (they should exist now)
user_features_train_reduced = np.load('outputs/data/processed/user_features_train_reduced.npy')
item_features_train_reduced = np.load('outputs/data/processed/item_features_train_reduced.npy')
user_features_test_reduced = np.load('outputs/data/processed/user_features_test_reduced.npy')
item_features_test_reduced = np.load('outputs/data/processed/item_features_test_reduced.npy')


Loading existing reduced feature files...


In [23]:
# Prepare data for Two-Tower model
print("Preparing data for Two-Tower model...")

# Get user and item dimensions
user_dim = len([col for col in user_nn_features.columns if col.startswith('pca_user_')])
item_dim = len([col for col in video_nn_features.columns if col.startswith('pca_item_')])

# Create user and item feature matrices
user_features_matrix = user_nn_features[[col for col in user_nn_features.columns if col.startswith('pca_user_')]].values
item_features_matrix = video_nn_features[[col for col in video_nn_features.columns if col.startswith('pca_item_')]].values

# Extract target values
y_train = train_interactions[['watch_ratio']].values
y_test = test_interactions[['watch_ratio']].values
y_train_scaled = target_scaler.transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

print(f"User features shape: {user_features_train_reduced.shape}")
print(f"Item features shape: {item_features_train_reduced.shape}")


Preparing data for Two-Tower model...
User features shape: (2000000, 23)
Item features shape: (2000000, 51)


In [24]:
# Train Two-Tower model
print("Building and training Two-Tower model...")

# Define model parameters
embedding_dim = 64
learning_rate = 0.001
batch_size = 512
epochs = 30

# Build model
two_tower_model, user_tower, item_tower = create_two_tower_model(user_dim, item_dim, embedding_dim)
two_tower_model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='mse',
    metrics=['mae']
)

# Define callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

model_checkpoint = ModelCheckpoint(
    'outputs/models/two_tower_model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Train model
history = two_tower_model.fit(
    [user_features_train_reduced, item_features_train_reduced],
    y_train_scaled,
    validation_data=([user_features_test_reduced, item_features_test_reduced], y_test_scaled),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1
)

# Plot training history
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='Training MAE')
plt.plot(history.history['val_mae'], label='Validation MAE')
plt.title('Model MAE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend()

plt.tight_layout()
plt.savefig('outputs/figures/two_tower_training_history.png')
plt.close()

# Save the models
user_tower.save('outputs/models/user_tower.h5')
item_tower.save('outputs/models/item_tower.h5')


Building and training Two-Tower model...
Epoch 1/30
[1m3906/3907[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.9741 - mae: 0.3520
Epoch 1: val_loss improved from inf to 0.58865, saving model to outputs/models/two_tower_model.h5




[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5ms/step - loss: 0.9741 - mae: 0.3520 - val_loss: 0.5887 - val_mae: 0.2414
Epoch 2/30
[1m3896/3907[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.9890 - mae: 0.3362
Epoch 2: val_loss improved from 0.58865 to 0.58492, saving model to outputs/models/two_tower_model.h5




[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.9889 - mae: 0.3362 - val_loss: 0.5849 - val_mae: 0.2427
Epoch 3/30
[1m3903/3907[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.9742 - mae: 0.3319
Epoch 3: val_loss did not improve from 0.58492
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.9742 - mae: 0.3319 - val_loss: 0.5849 - val_mae: 0.2482
Epoch 4/30
[1m3905/3907[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.9553 - mae: 0.3294
Epoch 4: val_loss did not improve from 0.58492
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.9553 - mae: 0.3294 - val_loss: 0.5911 - val_mae: 0.2652
Epoch 5/30
[1m3896/3907[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.9431 - mae: 0.3285
Epoch 5: val_loss did not improve from 0.58492
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/ste



In [25]:
# Make predictions with Two-Tower model
print("Generating predictions...")
test_predictions = two_tower_model.predict([user_features_test_reduced, item_features_test_reduced])
y_pred = target_scaler.inverse_transform(test_predictions)

# Create results dataframe
results_df = pd.DataFrame({
    'user_id': test_interactions['user_id'],
    'video_id': test_interactions['video_id'],
    'true_watch_ratio': test_interactions['watch_ratio'],
    'predicted_watch_ratio': y_pred.flatten()
})

# Calculate error metrics
results_df['absolute_error'] = abs(results_df['predicted_watch_ratio'] - results_df['true_watch_ratio'])

# Calculate baseline (mean) prediction
mean_watch_ratio = results_df['true_watch_ratio'].mean()
baseline_mae = mean_absolute_error(results_df['true_watch_ratio'], np.full_like(results_df['true_watch_ratio'], mean_watch_ratio))
model_mae = results_df['absolute_error'].mean()

print(f"Baseline MAE: {baseline_mae:.4f}")
print(f"Model MAE: {model_mae:.4f}")
print(f"Improvement: {(baseline_mae - model_mae) / baseline_mae:.2%}")


Generating predictions...
[1m62500/62500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 872us/step
Baseline MAE: 0.4866
Model MAE: 0.4078
Improvement: 16.19%


In [26]:
# Define relevance threshold and top-k
RELEVANCE_THRESHOLD = 0.8
K = 25

# Mark relevant items
results_df['relevant'] = (results_df['true_watch_ratio'] >= RELEVANCE_THRESHOLD).astype(int)

# Group by user and rank predictions
results_df['rank'] = results_df.groupby('user_id')['predicted_watch_ratio'].rank(method='dense', ascending=False)

# Filter to top-K recommendations per user
top_k_recommendations = results_df[results_df['rank'] <= K]

# Calculate metrics properly
precision_values = []
recall_values = []
ndcg_values = []

for user_id in top_k_recommendations['user_id'].unique():
    # Get all recommendations for this user
    user_recs = top_k_recommendations[top_k_recommendations['user_id'] == user_id]
    user_recs = user_recs.sort_values('rank')
    
    # Get all relevant items for this user (from the entire results_df, not just top-K)
    all_user_items = results_df[results_df['user_id'] == user_id]
    total_relevant_items = all_user_items['relevant'].sum()
    
    if total_relevant_items == 0:
        continue  # Skip users with no relevant items
    
    # Calculate precision and recall for this user
    relevant_in_topk = user_recs['relevant'].sum()
    precision = relevant_in_topk / min(K, len(user_recs))
    recall = relevant_in_topk / total_relevant_items
    
    precision_values.append(precision)
    recall_values.append(recall)
    
    # Calculate NDCG for this user
    if relevant_in_topk > 0:
        # Create arrays for NDCG calculation without using pivot
        # This avoids the duplicate index error
        y_true = np.zeros(K)
        y_score = np.zeros(K)
        
        # Fill arrays with values from user_recs
        for i, (_, row) in enumerate(user_recs.iterrows()):
            if i < K:
                rank_idx = min(int(row['rank']) - 1, K-1)  # Convert rank to 0-based index
                y_true[rank_idx] = row['relevant']
                y_score[rank_idx] = row['predicted_watch_ratio']
        
        # Calculate NDCG
        user_ndcg = ndcg_score(np.array([y_true]), np.array([y_score]))
        ndcg_values.append(user_ndcg)

# Calculate average metrics
avg_precision = np.mean(precision_values) if precision_values else 0
avg_recall = np.mean(recall_values) if recall_values else 0
avg_ndcg = np.mean(ndcg_values) if ndcg_values else 0

print(f"NDCG@{K}: {avg_ndcg:.4f}")
print(f"Precision@{K}: {avg_precision:.4f}")
print(f"Recall@{K}: {avg_recall:.4f}")

# Save evaluation results
eval_results = {
    'MAE': model_mae,
    'Baseline_MAE': baseline_mae,
    f'NDCG@{K}': avg_ndcg,
    f'Precision@{K}': avg_precision,
    f'Recall@{K}': avg_recall
}

pd.DataFrame([eval_results]).to_csv('outputs/evaluation_results.csv', index=False)
print("Evaluation complete. Results saved to outputs/evaluation_results.csv")


NDCG@25: 0.9624
Precision@25: 0.8783
Recall@25: 0.0335
Evaluation complete. Results saved to outputs/evaluation_results.csv


In [28]:
# Generate final recommendations
print("Generating final recommendations...")

def generate_recommendations(num_users=100, k=50):
    """Generate recommendations for a subset of users using the Two-Tower model"""
    recommendations = []
    
    # Select a subset of users
    np.random.seed(42)  # For reproducibility
    all_user_indices = list(idx_to_user_id.keys())
    selected_users = np.random.choice(all_user_indices, min(num_users, len(all_user_indices)), replace=False)
    
    # Pre-compute all item embeddings once
    print("Computing item embeddings...")
    item_embeddings = item_tower.predict(item_features_test_reduced, verbose=0)
    
    # Generate recommendations for selected users
    for user_idx in tqdm(selected_users, desc="Generating Two-Tower recommendations"):
        if user_idx < len(user_features_test_reduced):
            # Get user features
            user_features = user_features_test_reduced[user_idx:user_idx+1]
            
            # Get user embedding
            user_embedding = user_tower.predict(user_features, verbose=0)
            
            # Calculate scores using dot product
            scores = np.dot(user_embedding, item_embeddings.T)[0]
            
            # Get top-k items
            top_indices = np.argsort(-scores)[:k]
            top_scores = scores[top_indices]
            
            for i, (item_idx, score) in enumerate(zip(top_indices, top_scores)):
                if user_idx in idx_to_user_id and item_idx < len(idx_to_video_id):
                    user_id = idx_to_user_id[user_idx]
                    video_id = idx_to_video_id[item_idx]
                    recommendations.append((user_id, video_id, float(score), i+1))
    
    # Create DataFrame
    recs_df = pd.DataFrame(recommendations, columns=['user_id', 'video_id', 'score', 'rank'])
    return recs_df

# Generate recommendations for 100 users
two_tower_recommendations = generate_recommendations(num_users=100, k=50)
two_tower_recommendations.to_csv('outputs/recommendations/two_tower_recommendations.csv', index=False)

print(f"Generated {len(two_tower_recommendations)} recommendations for 100 users")
print("Recommendation generation complete!")


Generating final recommendations...
Computing item embeddings...


Generating Two-Tower recommendations: 100%|██████████| 100/100 [00:12<00:00,  8.02it/s]

Generated 14 recommendations for 100 users
Recommendation generation complete!



