In [None]:
# Import libraries
import pandas as pd
import numpy as np

For this data analysis, we will focus on the interaction data, i.e., the data that contains the user-item interactions. If you make the choice to use the other data available, you should repeat the same kind of analysis on the other data as well.

In [3]:
rootpath="./data_final_project/KuaiRec 2.0/"

print("Loading small matrix...")
small_matrix = pd.read_csv(rootpath + "data/small_matrix.csv")

print("Loading big matrix...")
big_matrix = pd.read_csv(rootpath + "data/big_matrix.csv")

print("Loading item features...")
item_categories = pd.read_csv(rootpath + "data/item_categories.csv")
item_categories["feat"] = item_categories["feat"].map(eval)

print("Loading captions...") # captions contains chinese characters
captions = pd.read_csv(rootpath + "data/kuairec_caption_category.csv", engine='python')

print("Loading items' daily features...")
item_daily_features = pd.read_csv(rootpath + "data/item_daily_features.csv")

print("All data loaded.")

Loading small matrix...
Loading big matrix...
Loading item features...
Loading captions...
Loading items' daily features...
All data loaded.


## Implementation

Creation of the interactions_train, interactions_test and sample_submmission. I get rid of the columns that will not be usefull like play_duration and video_duration because it is used in the watch_ratio computation, so no need. I alsaw get rid of the time and date. 

Timestamps is a bit special, in the interactions_train matrix they are valuable because they capture the evolution of user preferences over time rather than treating them as static. They enable recency-based weighting where recent interactions influence recommendations more heavily, and support sequential pattern recognition for time-aware suggestions. Additionally, timestamps provide crucial context that helps differentiate between casual browsing and genuine interest.

In [None]:
from sklearn.model_selection import train_test_split

def create_interactions(big_matrix, small_matrix):
    """
    Create train/test sets including a warm-up phase for cold-start users.
    """
    interactions_train = big_matrix[['user_id', 'video_id', 'watch_ratio', 'timestamp']].copy()

    # interactions_test only needs user_id and video_id
    interactions_test = small_matrix[['user_id', 'video_id']].copy()

    return interactions_train, interactions_test


In [6]:
interactions_train, interactions_test = create_interactions(big_matrix, small_matrix)
print("Interactions train and test dataframes created.")
print(interactions_train.shape)
print(interactions_test.shape)
interactions_train.to_csv(rootpath + "data/interactions_train.csv", index=False)
interactions_test.to_csv(rootpath + "data/interactions_test.csv", index=False)

Interactions train and test dataframes created.
(12530806, 4)
(4676570, 2)


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def create_video_metadata_v2(item_categories, kuairec_caption_category, item_daily_features):
    """
    Create enhanced video metadata with improved negative feedback incorporation
    and additional features.
    """
    for df in [item_categories, kuairec_caption_category, item_daily_features]:
        if 'video_id' in df.columns:
            df['video_id'] = df['video_id'].astype(str)

    video_features = item_daily_features.drop_duplicates(subset=['video_id'])[
        ['video_id', 'author_id', 'video_duration']
    ]

    # Define engagement columns
    engagement_columns = [
        'show_cnt', 'play_cnt', 'play_duration', 'play_progress',
        'complete_play_cnt', 'valid_play_cnt', 'long_time_play_cnt', 'short_time_play_cnt',
        'like_cnt', 'cancel_like_cnt', 'comment_cnt', 'comment_stay_duration',
        'follow_cnt', 'cancel_follow_cnt', 'share_cnt', 'download_cnt',
        'report_cnt', 'reduce_similar_cnt', 'collect_cnt', 'cancel_collect_cnt'
    ]

    agg_metrics = item_daily_features.groupby('video_id')[engagement_columns].mean()

    derived_metrics = pd.DataFrame(index=agg_metrics.index)

    # Basic engagement metrics
    derived_metrics['completion_rate'] = agg_metrics['complete_play_cnt'] / agg_metrics['play_cnt'].replace(0, 1)
    derived_metrics['engagement_depth'] = agg_metrics['long_time_play_cnt'] / (
        agg_metrics['long_time_play_cnt'] + agg_metrics['short_time_play_cnt']).replace(0, 1)
    derived_metrics['validation_ratio'] = agg_metrics['valid_play_cnt'] / agg_metrics['play_cnt'].replace(0, 1)
    
    # Interaction metrics
    derived_metrics['like_ratio'] = agg_metrics['like_cnt'] / agg_metrics['show_cnt'].replace(0, 1)
    derived_metrics['like_cancel_ratio'] = agg_metrics['cancel_like_cnt'] / agg_metrics['like_cnt'].replace(0, 1)
    derived_metrics['comment_ratio'] = agg_metrics['comment_cnt'] / agg_metrics['show_cnt'].replace(0, 1)
    derived_metrics['share_ratio'] = agg_metrics['share_cnt'] / agg_metrics['show_cnt'].replace(0, 1)
    derived_metrics['collect_ratio'] = agg_metrics['collect_cnt'] / agg_metrics['show_cnt'].replace(0, 1)
    
    # Negative feedback metrics
    derived_metrics['report_ratio'] = agg_metrics['report_ratio'] = agg_metrics['report_cnt'] / agg_metrics['show_cnt'].replace(0, 1)
    derived_metrics['reduce_similar_ratio'] = agg_metrics['reduce_similar_cnt'] / agg_metrics['show_cnt'].replace(0, 1)

    # Comprehensive quality score
    derived_metrics['quality_score'] = (
        derived_metrics['completion_rate'] +
        derived_metrics['like_ratio'] * 2 +
        derived_metrics['share_ratio'] * 3 +
        derived_metrics['collect_ratio'] * 2 -
        derived_metrics['like_cancel_ratio'] * 2 -
        derived_metrics['report_ratio'] * 5 -
        derived_metrics['reduce_similar_ratio'] * 3
    )

    # Engagement score combining quality and quantity metrics
    derived_metrics['engagement_score'] = (
        agg_metrics['play_progress'] * 
        (derived_metrics['completion_rate'] + derived_metrics['engagement_depth'])
    )


    derived_metrics['follow_impact'] = (
        agg_metrics['follow_cnt'] - agg_metrics['cancel_follow_cnt']
    ) / agg_metrics['show_cnt'].replace(0, 1)


    derived_metrics['retention_impact'] = (
        (agg_metrics['play_duration'] / (video_features['video_duration'] * agg_metrics['play_cnt']).replace(0, 1)) + 
        (agg_metrics['comment_stay_duration'] / 60) / agg_metrics['play_cnt'].replace(0, 1)
    )

    # Merge video features with derived metrics
    video_metadata = video_features.merge(
        derived_metrics, left_on='video_id', right_index=True, how='left'
    )

    # Merge with categorical features
    video_metadata = video_metadata.merge(
        item_categories[['video_id', 'feat']], on='video_id', how='left'
    )

    video_metadata = video_metadata.merge(
        kuairec_caption_category[['video_id', 'topic_tag', 
                                'first_level_category_name', 'second_level_category_name', 
                                'third_level_category_name']], 
        on='video_id', how='left'
    )

    # Calculate trending score
    daily_metrics = item_daily_features.groupby(['video_id', 'date'])['play_cnt'].sum().reset_index()
    if len(daily_metrics['date'].unique()) >= 2:
        latest_date = daily_metrics['date'].max()
        prev_date = sorted(daily_metrics['date'].unique())[-2]
        trend_df = daily_metrics.pivot(index='video_id', columns='date', values='play_cnt').fillna(0)
        trend_df['trending_score'] = (trend_df[latest_date] - trend_df[prev_date]) / (trend_df[prev_date] + 1)
        video_metadata = video_metadata.merge(
            trend_df[['trending_score']], left_on='video_id', right_index=True, how='left'
        )
    else:
        video_metadata['trending_score'] = 0

    return video_metadata

In [5]:
video_metadata2 = create_video_metadata_v2(item_categories, captions, item_daily_features)
print("Video metadata features created.")
video_metadata2.to_csv(rootpath + "data/video_metadata2.csv", index=False)

Video metadata features created.


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import multiprocessing
from tqdm import tqdm
from functools import partial

def process_user_batch(user_batch, user_groups, video_vectors, feature_dimensions):
    """
    Process a batch of users to create user profiles.
    Now defined at module level to support multiprocessing.
    """
    batch_profiles = {}
    for user_id in user_batch:
        if user_id not in user_groups:
            continue
            
        group = user_groups[user_id]
        vids = group['video_id'].values
        weights = group['watch_ratio'].values
        timestamps = group['timestamp'].values

        if len(timestamps) > 1:
            max_ts = timestamps.max()
            min_ts = timestamps.min()
            age = (max_ts - timestamps) / (max_ts - min_ts + 1e-6)
            recency_weights = np.exp(-2 * age)
        else:
            recency_weights = np.ones_like(timestamps)
        
        # Combine watch_ratio and recency
        combined_weights = weights * recency_weights
        
        vectors = []
        for i, vid in enumerate(vids):
            if vid in video_vectors:
                w = combined_weights[i]
                # Use high watch ratio as positive signal
                if w > 0.6:
                    vectors.append(video_vectors[vid] * w)
                # Use low watch ratio as negative signal
                elif w < 0.2:
                    vectors.append(video_vectors[vid] * -0.5)
                else:
                    vectors.append(video_vectors[vid] * w)
        
        if vectors:
            batch_profiles[user_id] = np.mean(vectors, axis=0)
        else:
            batch_profiles[user_id] = np.zeros(feature_dimensions)
    
    return batch_profiles

def process_test_batch(test_batch, user_profiles, video_vectors, video_metadata_dict, 
                       min_quality, quality_range):
    """
    Process a batch of test interactions to generate recommendations.
    Now defined at module level to support multiprocessing.
    """
    batch_results = []
    for _, row in test_batch.iterrows():
        user_id = row['user_id']
        video_id = row['video_id']
        
        if user_id in user_profiles and video_id in video_vectors:
            user_vec = user_profiles[user_id].reshape(1, -1)
            video_vec = video_vectors[video_id].reshape(1, -1)
            
            # Compute similarity
            similarity = cosine_similarity(user_vec, video_vec)[0][0]
            similarity_norm = (similarity + 1) / 2
            
            # Get quality score
            quality_score, trending_score = video_metadata_dict.get(
                video_id, (0, 0)
            )
            
            quality_norm = (quality_score - min_quality) / quality_range if quality_range > 0 else 0
            
            # Apply quality boost
            quality_boost = 0.85 + 0.3 * quality_norm
            score = float(similarity_norm * quality_boost)
            
        elif video_id in video_vectors:
            # Cold-start: use quality and trending
            quality_score, trending_score = video_metadata_dict.get(
                video_id, (0, 0)
            )
            
            quality_norm = (quality_score - min_quality) / quality_range if quality_range > 0 else 0
            cold_start_weight = 0.2 if user_id not in user_profiles else 0.1
            score = float(cold_start_weight * (0.7 * quality_norm + 0.3 * trending_score))
        else:
            score = 0.0
        
        batch_results.append({
            'user_id': user_id, 
            'video_id': video_id, 
            'predicted_score': score
        })
        
    return batch_results

def fast_recommendation_system(interactions_train, interactions_test, video_metadata, 
                              svd_components=50, batch_size=1000, n_jobs=-1):
    """
    Fast implementation of recommendation system with fixed multiprocessing support
    """
    # Filter the training set to only include users who are in the test set
    # This optimization significantly reduces processing time. Had to do that because too long otherwise
    test_users = set(interactions_test['user_id'].unique())
    filtered_interactions_train = interactions_train[interactions_train['user_id'].isin(test_users)]
    print(f"Filtered training interactions from {len(interactions_train)} to {len(filtered_interactions_train)} entries")
    
    print("Step 1: Processing video features")
    
    # Process tag features with MultiLabelBinarizer 
    mlb = MultiLabelBinarizer(sparse_output=True)
    tag_features = mlb.fit_transform(video_metadata['feat'])
    
    # Process categorical features
    categorical_features = [
        'first_level_category_name', 'second_level_category_name', 
        'third_level_category_name', 'topic_tag'
    ]
    video_metadata[categorical_features] = video_metadata[categorical_features].fillna('')
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    encoded_cats = encoder.fit_transform(video_metadata[categorical_features])
    
    # Process numerical features
    numerical_features = [
        'video_duration', 'aspect_ratio',
        'completion_rate', 'engagement_depth', 'validation_ratio',
        'like_ratio', 'comment_ratio', 'share_ratio', 'collect_ratio',
        'quality_score', 'engagement_score', 'follow_impact',
        'retention_impact', 'trending_score'
    ]
    
    # Scale
    scaler = StandardScaler()
    numerical_data = scaler.fit_transform(video_metadata[numerical_features].fillna(0))
    
    # Process author features
    author_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    author_encoded = author_encoder.fit_transform(video_metadata[['author_id']])
    
    print("Step 2: Applying dimensionality reduction")

    svd_cat = TruncatedSVD(n_components=min(svd_components, encoded_cats.shape[1]-1), random_state=42)
    cat_reduced = svd_cat.fit_transform(encoded_cats)
    print(f"Category features explained variance: {sum(svd_cat.explained_variance_ratio_):.2f}")
    
    if tag_features.shape[1] > 0:
        svd_tag = TruncatedSVD(n_components=min(svd_components, tag_features.shape[1]-1), random_state=42)
        tag_reduced = svd_tag.fit_transform(tag_features)
        print(f"Tag features explained variance: {sum(svd_tag.explained_variance_ratio_):.2f}")
    else:
        tag_reduced = np.zeros((video_metadata.shape[0], 1))
    
    author_components = min(svd_components//2, author_encoded.shape[1]-1)
    if author_components > 0:
        svd_author = TruncatedSVD(n_components=author_components, random_state=42)
        author_reduced = svd_author.fit_transform(author_encoded) * 0.3  # Reduced weight for author
    else:
        author_reduced = np.zeros((video_metadata.shape[0], 1))
    
    # Combine all features
    video_feature_vectors = np.hstack([
        cat_reduced, 
        tag_reduced, 
        numerical_data,
        author_reduced
    ])
    
    video_id_to_index = {vid: idx for idx, vid in enumerate(video_metadata['video_id'])}
    
    video_metadata_dict = dict(zip(video_metadata['video_id'], zip(
        video_metadata['quality_score'],
        video_metadata['trending_score']
    )))
    
    video_vectors = {vid: video_feature_vectors[idx] for vid, idx in video_id_to_index.items()}
    
    print("Step 3: Building user profiles")

    user_profiles = {}
    user_groups = dict(list(filtered_interactions_train.groupby('user_id')))
    
    user_ids = list(user_groups.keys())
    num_cpus = multiprocessing.cpu_count() if n_jobs <= 0 else min(n_jobs, multiprocessing.cpu_count())
    batch_size = max(1, len(user_ids) // (num_cpus * 2))
    
    user_batches = [user_ids[i:i + batch_size] for i in range(0, len(user_ids), batch_size)]
    print(f"Processing {len(user_ids)} users in {len(user_batches)} batches using {num_cpus} processes")
    
    with multiprocessing.Pool(processes=num_cpus) as pool:
        feature_dimensions = video_feature_vectors.shape[1]
        results = list(tqdm(
            pool.starmap(
                process_user_batch, 
                [(batch, user_groups, video_vectors, feature_dimensions) for batch in user_batches]
            ),
            total=len(user_batches),
            desc="Building user profiles"
        ))
    
    for batch_result in results:
        user_profiles.update(batch_result)
    
    print(f"Built profiles for {len(user_profiles)} users")
    

    print("Step 4: Computing recommendations")
    
    min_quality = min(q for q, _ in video_metadata_dict.values())
    max_quality = max(q for q, _ in video_metadata_dict.values())
    quality_range = max_quality - min_quality
    
    test_batches = [interactions_test[i:i + batch_size] for i in range(0, len(interactions_test), batch_size)]
    
    with multiprocessing.Pool(processes=num_cpus) as pool:
        batch_results = list(tqdm(
            pool.starmap(
                process_test_batch,
                [(batch, user_profiles, video_vectors, video_metadata_dict, min_quality, quality_range) 
                 for batch in test_batches]
            ),
            total=len(test_batches),
            desc="Calculating recommendation scores"
        ))
    
    # Combine all results
    all_results = []
    for batch in batch_results:
        all_results.extend(batch)
    
    recommendations = pd.DataFrame(all_results)
    recommendations['predicted_score'] = np.clip(recommendations['predicted_score'], 0, 1)
    
    print("Recommendation generation complete!")
    return recommendations

In [1]:
import pandas as pd
import numpy as np

rootpath="./data_final_project/KuaiRec 2.0/"
video_metadata2 = pd.read_csv(rootpath + "data/video_metadata2.csv")
interactions_train = pd.read_csv(rootpath + "data/interactions_train.csv")
interactions_test = pd.read_csv(rootpath + "data/interactions_test.csv")

import ast
interactions_test['video_id'] = interactions_test['video_id'].astype(int)
interactions_train['video_id'] = interactions_train['video_id'].astype(int)

In [3]:
video_metadata2["video_id"] = video_metadata2["video_id"].astype(int)

In [4]:
filtered_interactions_train = interactions_train[interactions_train['user_id'].isin(interactions_test['user_id'])]
recommendations = fast_recommendation_system(interactions_train, interactions_test, video_metadata2)
print("Recommendations generated.")
print(recommendations.head())
recommendations.to_csv(rootpath + "data/recommendations3.csv", index=False)

Filtered training interactions from 12530806 to 571061 entries
Step 1: Processing video features
Step 2: Applying dimensionality reduction
Category features explained variance: 0.71
Step 3: Building user profiles
Processing 1411 users in 33 batches using 16 processes


Building user profiles: 100%|██████████| 33/33 [00:00<00:00, 203547.11it/s]


Built profiles for 1411 users
Step 4: Computing recommendations


Calculating recommendation scores: 100%|██████████| 106286/106286 [00:00<00:00, 3853764.72it/s]


Recommendation generation complete!
Recommendations generated.
   user_id  video_id  predicted_score
0       14       148         0.951569
1       14       183         1.000000
2       14      3649         0.978810
3       14      5262         1.000000
4       14      8234         0.826265


In [75]:
print(recommendations.head(10))

   user_id  video_id  predicted_score
0       14       148         0.966219
1       14       183         1.000000
2       14      3649         0.947657
3       14      5262         1.000000
4       14      8234         0.907958
5       14      6789         0.981326
6       14      1963         0.937952
7       14       175         0.722779
8       14      1973         1.000000
9       14       171         1.000000


In [None]:
import pandas as pd
import numpy as np
from io import StringIO

rootpath="./data_final_project/KuaiRec 2.0/"
recommendation_df = pd.read_csv(rootpath + 'data/recommendations3.csv')
truth_df = pd.read_csv(rootpath + 'data/small_matrix.csv')

recommendation_top10 = recommendation_df.groupby('user_id', group_keys=False).apply(
    lambda x: x.nlargest(50, 'predicted_score'))

truth_top10 = truth_df.groupby('user_id', group_keys=False).apply(
    lambda x: x.nlargest(50, 'watch_ratio'))

  recommendation_top10 = recommendation_df.groupby('user_id', group_keys=False).apply(
  truth_top10 = truth_df.groupby('user_id', group_keys=False).apply(


In [None]:
def precision_at_k(recommended, relevant, k):
    """Calculate precision@k (order-agnostic)"""
    recommended_set = set(recommended[:k])
    relevant_set = set(relevant)
    hits = len(recommended_set.intersection(relevant_set))
    return hits / k

def recall_at_k(recommended, relevant, k):
    """Calculate recall@k (order-agnostic)"""
    recommended_set = set(recommended[:k])
    relevant_set = set(relevant)
    hits = len(recommended_set.intersection(relevant_set))
    return hits / len(relevant_set) if relevant_set else 0

def f1_score_at_k(recommended, relevant, k):
    """Calculate F1 score@k (order-agnostic)"""
    prec = precision_at_k(recommended, relevant, k)
    rec = recall_at_k(recommended, relevant, k)
    return 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

# Additional set-based metrics
def jaccard_similarity(recommended, relevant, k):
    """Calculate Jaccard similarity (order-agnostic)"""
    recommended_set = set(recommended[:k])
    relevant_set = set(relevant)
    intersection = len(recommended_set.intersection(relevant_set))
    union = len(recommended_set.union(relevant_set))
    return intersection / union if union > 0 else 0

In [None]:
k = 50
user_ids = recommendation_top10['user_id'].unique()

precision_scores = []
recall_scores = []
f1_scores = []
jaccard_scores = []

for user in user_ids:
    rec_videos = recommendation_top10[recommendation_top10['user_id'] == user]['video_id'].tolist()
    true_videos = truth_top10[truth_top10['user_id'] == user]['video_id'].tolist()
    
    precision_scores.append(precision_at_k(rec_videos, true_videos, k))
    recall_scores.append(recall_at_k(rec_videos, true_videos, k))
    f1_scores.append(f1_score_at_k(rec_videos, true_videos, k))
    jaccard_scores.append(jaccard_similarity(rec_videos, true_videos, k))


average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)
average_jaccard = np.mean(jaccard_scores)

print(f"Average Precision@{k}: {average_precision:.4f}")
print(f"Average Recall@{k}: {average_recall:.4f}")
print(f"Average F1@{k}: {average_f1:.4f}")
print(f"Average Jaccard Similarity: {average_jaccard:.4f}")

Average Precision@50: 0.0479
Average Recall@50: 0.0479
Average F1@50: 0.0479
Average Jaccard Similarity: 0.0250
