In [83]:
import os
import sys

try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()

project_root = os.path.abspath(os.path.join(current_dir, ".."))  
if project_root not in sys.path:
    sys.path.append(project_root)


In [None]:

# import pysparnn.cluster_index as ci
# import scipy.sparse
# import os
import math
import numpy as np
import pandas as pd
import random
import scipy.sparse as sp
from utils import train_test_split, df_to_matrix ,matrix_to_df_2, threshold_interactions_df, matrix_to_df,set_intersection,get_0_and_p_index,set_diff, matrix_to_full_df, threshold_interactions_df_plus, train_test_split_csr

# !pip install surprise
from surprise import Reader, accuracy
from surprise import SVD
from surprise import Dataset
from collections import Counter
from collections import defaultdict

# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from pct.tree.heuristic.Heuristic import Heuristic5
from pct.tree.heuristic.semibi_NumericHeuristic import NumericHeuristic5 
from pct.tree.splitter.semibi_splitter import Splitter 
from pct.tree.semibi_tree import Tree

In [87]:
filtered_df = pd.read_csv("filtered_semi_binary.csv")
filtered_df

Unnamed: 0,user_id,item_id,rating,item_type,artist_id,genre_ids
0,9,238709,0.00,artist,238709,0
1,9,169510,0.00,artist,169510,0
2,9,208084,1.00,artist,208084,0
3,9,245398,0.00,artist,245398,0
4,9,153166,0.00,artist,153166,0
...,...,...,...,...,...,...
1524889,248947,83754,0.01,artist,83754,0
1524890,248947,141799,0.00,genre,0,[141799]
1524891,248947,141677,0.00,artist,141677,0
1524892,248947,262458,0.00,artist,262458,0


In [89]:
# item type map
item_type_map = filtered_df.drop_duplicates(subset='item_id')[['item_id', 'item_type']]
item_type_map = dict(zip(item_type_map['item_id'], item_type_map['item_type']))

In [91]:
def get_item_type(item_id):
    return item_type_map.get(item_id, 'unknown')  


In [93]:
print(get_item_type(172223))

artist


In [95]:
print(get_item_type(238709))  
print(get_item_type(141799))  


artist
genre


In [97]:
all_user_ids = sorted(filtered_df['user_id'].unique().tolist())

def split_users_by_ratio(all_user_ids, ratio):
    n = len(all_user_ids)
    split_point = int(n * ratio)
    warm_users = all_user_ids[:split_point]
    cold_users = all_user_ids[split_point:]
    return warm_users, cold_users

# Example ratios from 10% to 50%
# ratios = [0.1, 0.2, 0.3, 0.4, 0.5]
# splits = {r: split_users_by_ratio(all_user_ids, r) for r in ratios}


## 10% warm

In [99]:
warm_users_idx, cold_users_idx = split_users_by_ratio(all_user_ids, 0.1)

df_warm = filtered_df[filtered_df['user_id'].isin(warm_users_idx)].copy()
df_cold = filtered_df[filtered_df['user_id'].isin(cold_users_idx)].copy()

matrix_warm, rid_to_idx_warm, idx_to_rid_warm, cid_to_idx, idx_to_cid = df_to_matrix(
    df_warm, "user_id", "item_id", "rating")


matrix_cold, rid_to_idx_cold, idx_to_rid_cold, _, _ = df_to_matrix( 
    df_cold, "user_id", "item_id", "rating")



In [100]:
def split_and_combine(strategy="artist-only"):
    """Handles both approaches with proper matrix alignment"""
    # Get full cold matrix and mappings
    matrix_cold, rid_to_idx_cold, _, cid_to_idx, _ = df_to_matrix(
        df_cold, "user_id", "item_id", "rating"
    )
    matrix_cold = matrix_cold.tocsr()

    # Create boolean masks
    artist_mask = np.isin(
        np.arange(matrix_cold.shape[1]), 
        [cid_to_idx[iid] for iid in df_cold[df_cold['item_type'] == 'artist']['item_id']]
    )
    genre_mask = ~artist_mask

    # Create aligned matrices
    matrix_cold_artist = matrix_cold.multiply(artist_mask)
    matrix_cold_genre = matrix_cold.multiply(genre_mask)
    matrix_cold_artist = matrix_cold.multiply(artist_mask).tocsr()
    matrix_cold_genre = matrix_cold.multiply(genre_mask).tocsr()

    al_artist, test_cold, _ = train_test_split(
        matrix_cold_artist, 
        split_count=30,
        fraction=None
    )
    
    if strategy == "artist-only":
        X_cold, K_cold, _ = train_test_split_csr(al_artist, 1)  
        return K_cold, X_cold, test_cold
    
    elif strategy == "hybrid":
        X_cold, K_cold, _ = train_test_split_csr(al_artist, 1)
        X_cold = X_cold + matrix_cold_genre
        return K_cold, X_cold, test_cold


In [103]:
my_seed = 7
random.seed(my_seed)
np.random.seed(my_seed)

train_cold_K_artist, X_cold_artist, test_cold_artist  = split_and_combine("artist-only")
train_cold_K_hybrid, X_cold_hybrid, test_cold_hybrid = split_and_combine("hybrid")


In [104]:
print(f"Cold users in X: {len(np.unique(X_cold_hybrid.nonzero()[0]))}")
print(f"Cold items in X: {len(np.unique(X_cold_hybrid.nonzero()[1]))}")
print(f"test users in test: {len(np.unique(test_cold_hybrid.nonzero()[0]))}")
print(f"test items in test: {len(np.unique(test_cold_hybrid.nonzero()[1]))}")
print(f"train users in train: {len(np.unique(train_cold_K_hybrid.nonzero()[0]))}")
print(f"train items in train: {len(np.unique(train_cold_K_hybrid.nonzero()[1]))}")

print("-------------------")

# shape of test_cold
print(f"Shape of test_cold: {test_cold_hybrid.shape}")
print(f"Shape of train_cold: {train_cold_K_hybrid.shape}")
print(f"Shape of X_cold: {X_cold_hybrid.shape}")

Cold users in X: 1531
Cold items in X: 5011
test users in test: 1475
test items in test: 3980
train users in train: 689
train items in train: 587
-------------------
Shape of test_cold: (1531, 5011)
Shape of train_cold: (1531, 5011)
Shape of X_cold: (1531, 5011)


# X with artist + genre

In [79]:

from collections import defaultdict
from surprise import SVD, Dataset, Reader, accuracy
import pandas as pd

def elicitation_by_tree_path_retrain_skiped_warm_type(Tree, train, test, X, matrix_warm, idx_to_rid_cold, idx_to_rid_warm, idx_to_cid, iteration=5):
    """
    Method 3 (revised): Retrain tree at each round. For each user, traverse from root down the tree.
    At each level, if the item was already asked (stored in asked_items), go deeper.
    Ask the first item the user hasn't been asked before.

    Parameters:
    - Tree: class of the decision tree model
    - train, test, X: lil_matrix (known ratings, test set, and full matrix)
    - matrix_warm: lil_matrix of warm user ratings
    - idx_to_rid_*, idx_to_cid: mapping from indices to real user/item IDs
    - iteration: number of elicitation rounds

    Returns:
    - rmse_list, mae_list: performance at each round
    - item_type_stats: {round_i: {'artist': count, 'genre': count, ...}, ...}
    """

    num_users, num_items = train.shape
    train_copy = train.tolil().copy()
    X_copy = X.tolil().copy()
    rmse_list, mae_list = [], []
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}
    asked_items = {u: set() for u in range(num_users)}  
    item_type_stats = defaultdict(lambda: defaultdict(int))  # 🌟 item type statistics each iteration

    # Step 0: Baseline evaluation
    print("🔍 Evaluating baseline RMSE/MAE...")
    train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
    reader = Reader(rating_scale=(0, 1))
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    trainset = data_r.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    test_data = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
    testset = test_data.build_full_trainset().build_testset()
    predictions = algo.test(testset)
    rmse_list.append(accuracy.rmse(predictions, verbose=True))
    mae_list.append(accuracy.mae(predictions, verbose=True))
    print("✅ Baseline evaluation complete.")

    # Iterative elicitation
    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration} (skip asked items, walk from root)")
        # Retrain tree using current train_copy
        warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)
        coldK_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, coldK_df], ignore_index=False)

        pct = Tree(max_depth=i + 1, min_instances=5)
        pct.fit(x_df, x_df)
        print("🌳 Tree re-trained.")

        for u in range(num_users):
            node = pct.root

            # Traverse down the tree until we find an unasked item
            while node and not node.is_leaf and node.attribute_name:
                item = node.attribute_name
                if item not in cid_to_idx:
                    node = None
                    break

                item_idx = cid_to_idx[item]

                if item_idx in asked_items[u]:
                    # Already asked, go deeper based on user's rating
                    rating = train_copy[u, item_idx]
                    if rating > 0.01:
                        node = node.children[0]
                    elif rating > 0:
                        node = node.children[1]
                    else:
                        node = node.children[2]
                    continue

                # First unasked item: try to add it to training
                asked_items[u].add(item_idx)
                rating = X_copy[u, item_idx]
                if rating > 0:
                    train_copy[u, item_idx] = rating
                    X_copy[u, item_idx] = 0

                # 🌟 record item type
                item_type = get_item_type(item)  
                item_type_stats[i][item_type] += 1

                break  # only ask one item per user per iteration

        # Step 3: Evaluate with SVD
        print("📊 Evaluating after this iteration...")
        train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
        data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
        trainset = data_r.build_full_trainset()
        algo.fit(trainset)
        test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
        test_data = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
        testset = test_data.build_full_trainset().build_testset()
        predictions = algo.test(testset)
        rmse_list.append(accuracy.rmse(predictions, verbose=True))
        mae_list.append(accuracy.mae(predictions, verbose=True))
        print(f"✅ Iteration {i+1} complete.")

    return rmse_list, mae_list, item_type_stats



In [81]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# ignore runtime warnings

pct_hybrid_10_type = elicitation_by_tree_path_retrain_skiped_warm_type(
    Tree=Tree,
    train=train_cold_K_hybrid,
    test=test_cold_hybrid,
    X=X_cold_hybrid,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=4
)

🔍 Evaluating baseline RMSE/MAE...
RMSE: 0.5674
MAE:  0.3739
✅ Baseline evaluation complete.

🔁 Iteration 1/4 (skip asked items, walk from root)
Initializing Splitter...
✅ Calling build()...
✅ Tree built successfully!
🌳 Tree re-trained.
📊 Evaluating after this iteration...
RMSE: 0.5660
MAE:  0.3745
✅ Iteration 1 complete.

🔁 Iteration 2/4 (skip asked items, walk from root)
Initializing Splitter...
✅ Calling build()...
✅ Tree built successfully!
🌳 Tree re-trained.
📊 Evaluating after this iteration...
RMSE: 0.5528
MAE:  0.3736
✅ Iteration 2 complete.

🔁 Iteration 3/4 (skip asked items, walk from root)
Initializing Splitter...
✅ Calling build()...
✅ Tree built successfully!
🌳 Tree re-trained.
📊 Evaluating after this iteration...
RMSE: 0.5499
MAE:  0.3736
✅ Iteration 3 complete.

🔁 Iteration 4/4 (skip asked items, walk from root)
Initializing Splitter...
✅ Calling build()...
✅ Tree built successfully!
🌳 Tree re-trained.
📊 Evaluating after this iteration...
RMSE: 0.5486
MAE:  0.3736
✅ Iterat

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# ignore runtime warnings

pct_hybrid_10_type = elicitation_by_tree_path_retrain_skiped_warm_type(
    Tree=Tree,
    train=X_cold_hybrid,
    test=test_cold_hybrid,
    X=X_cold_hybrid,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=1
)

In [None]:
rmse_list, mae_list, item_type_stats = pct_hybrid_10_type
for round_i, type_counts in item_type_stats.items():
    total = sum(type_counts.values())
    ratios = {k: v / total for k, v in type_counts.items()}
    print(f"Round {round_i+1}: {ratios}")
item_type_stats

In [107]:
def flat_baseline_from_elicitation_np_warm(matrix_warm, train, X, test):
    """
    Train a single SVD model using full info: matrix_warm + train + X
    Return RMSE and MAE on test set (Surprise-based).
    """
    #  K + X
    full_train_cold = train.copy().tolil()
    X_copy = X.copy().tolil()
    for u in range(train.shape[0]):
        items = X_copy.getrow(u).nonzero()[1]
        for item in items:
            full_train_cold[u, item] = X[u, item]

    # combine warm and  cold 
    full_matrix = sp.vstack([matrix_warm.tocsr(), full_train_cold.tocsr()])

    #  convert dataframe
    train_df = matrix_to_df_2(full_train_cold, idx_to_rid_cold, idx_to_cid)
    test_df = matrix_to_df(test, idx_to_rid_cold, idx_to_cid)

    # train SVD 
    reader = Reader(rating_scale=(0, 1))
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    data_rr = data_r.build_full_trainset()
    algo = SVD()
    algo.fit(data_rr)

    # Predict on test
    test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
    test_rr = test_r.build_full_trainset()
    predictions = algo.test(test_rr.build_testset())
    
    rmse_val = accuracy.rmse(predictions)
    mae_val = accuracy.mae(predictions)

    print(f"[Flat Baseline] RMSE = {rmse_val:.4f}, MAE = {mae_val:.4f}")
    return rmse_val, mae_val

In [109]:
flat_hybrid10 = flat_baseline_from_elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K_hybrid,
    X=X_cold_hybrid,
    test=test_cold_hybrid
)

RMSE: 0.4078
MAE:  0.3356
[Flat Baseline] RMSE = 0.4078, MAE = 0.3356


In [113]:
def flat_baseline_from_elicitation_np_coldonly(train, X, test):
    """
    Train a single SVD model using full info from cold users only: train + X
    Return RMSE and MAE on test set.
    """
    # Merge train and X: cold users only
    full_train_cold = train.copy().tolil()
    X_copy = X.copy().tolil()
    for u in range(train.shape[0]):
        items = X_copy.getrow(u).nonzero()[1]
        for item in items:
            full_train_cold[u, item] = X[u, item]

    # convert cold-only matrix to df
    train_df = matrix_to_df_2(full_train_cold, idx_to_rid_cold, idx_to_cid)
    test_df = matrix_to_df(test, idx_to_rid_cold, idx_to_cid)

    # train SVD on cold users
    reader = Reader(rating_scale=(0, 1))
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    data_rr = data_r.build_full_trainset()
    algo = SVD()
    algo.fit(data_rr)

    # Predict on test
    test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
    test_rr = test_r.build_full_trainset()
    predictions = algo.test(test_rr.build_testset())
    
    rmse_val = accuracy.rmse(predictions)
    mae_val = accuracy.mae(predictions)

    print(f"[Flat Baseline - Cold Only] RMSE = {rmse_val:.4f}, MAE = {mae_val:.4f}")
    return rmse_val, mae_val


In [119]:
flat_hybrid10 = flat_baseline_from_elicitation_np_coldonly(
    train=X_cold_hybrid,
    X=X_cold_hybrid,
    test=test_cold_hybrid
)

RMSE: 0.4086
MAE:  0.3361
[Flat Baseline - Cold Only] RMSE = 0.4086, MAE = 0.3361


In [135]:
import scipy.sparse as sp
import numpy as np

def select_top_n_per_user(X, n=10):
    X = X.tocsr()
    selected = sp.lil_matrix(X.shape)
    for u in range(X.shape[0]):
        row = X.getrow(u)
        if row.nnz == 0:
            continue
        item_ids = row.indices
        ratings = row.data
        if np.allclose(ratings, ratings[0]):
            # all scores are equal (e.g. semi-binary) => random top-n
            selected_items = np.random.choice(item_ids, size=min(n, len(item_ids)), replace=False)
        else:
            top_n_idx = np.argsort(-ratings)[:n]
            selected_items = item_ids[top_n_idx]
        selected[u, selected_items] = X[u, selected_items]
    return selected.tocsr()



In [145]:
def run_topN_baselines(train_K, X, test, idx_to_rid_cold, idx_to_cid, N_list):
    results = []

    for N in N_list:
        print(f"\n=== Running Top-{N} per user baseline ===")

        # Step 1: Top-N selection
        X_selected = select_top_n_per_user(X, n=N)

        # Step 2: Merge K + selected X via assignment (not addition!)
        train_full = train_K.copy().tolil()
        X_sel = X_selected.tolil()
        for u in range(train_K.shape[0]):
            items = X_sel.rows[u]
            values = X_sel.data[u]
            for item, value in zip(items, values):
                train_full[u, item] = value
        train_full = train_full.tocsr()

        


        # Optional: check total number of ratings
        total_ratings = train_full.count_nonzero()
        print(f"Total training ratings (K + top-{N} X): {total_ratings}")

        # Step 3: Convert to DataFrame
        train_df = matrix_to_df(train_full, idx_to_rid_cold, idx_to_cid)
        test_df = matrix_to_df(test, idx_to_rid_cold, idx_to_cid)

        # Step 4: Train SVD
        reader = Reader(rating_scale=(0, 1))
        data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
        data_rr = data_r.build_full_trainset()

        algo = SVD()
        algo.fit(data_rr)

        # Step 5: Predict
        test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
        test_rr = test_r.build_full_trainset()
        predictions = algo.test(test_rr.build_testset())

        # Step 6: Evaluation
        rmse = accuracy.rmse(predictions)
        mae = accuracy.mae(predictions)

        results.append({
            "N_per_user": N,
            "k+X_ratings": total_ratings,
            "RMSE": rmse,
            "MAE": mae
        })

    return results



In [147]:
N_values = [25, 50, 100, 150, 200, 250,300,350, 400, 500, 550]

results_topN = run_topN_baselines(
    train_K=train_cold_K_hybrid,
    X=X_cold_hybrid,
    test=test_cold_hybrid,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_cid=idx_to_cid,
    N_list=N_values
)


df_results = pd.DataFrame(results_topN)
print("\n📊 Baseline Performance Summary:")
print(df_results)


=== Running Top-25 per user baseline ===
Total training ratings (K + top-25 X): 38776
RMSE: 0.7000
MAE:  0.5437

=== Running Top-50 per user baseline ===
Total training ratings (K + top-50 X): 75983
RMSE: 0.6735
MAE:  0.5180

=== Running Top-100 per user baseline ===
Total training ratings (K + top-100 X): 144932
RMSE: 0.6142
MAE:  0.4671

=== Running Top-150 per user baseline ===
Total training ratings (K + top-150 X): 206744
RMSE: 0.5665
MAE:  0.4269

=== Running Top-200 per user baseline ===
Total training ratings (K + top-200 X): 261628
RMSE: 0.5320
MAE:  0.4003

=== Running Top-250 per user baseline ===
Total training ratings (K + top-250 X): 309503
RMSE: 0.5013
MAE:  0.3808

=== Running Top-300 per user baseline ===
Total training ratings (K + top-300 X): 351627
RMSE: 0.4765
MAE:  0.3671

=== Running Top-350 per user baseline ===
Total training ratings (K + top-350 X): 388603
RMSE: 0.4574
MAE:  0.3577

=== Running Top-400 per user baseline ===
Total training ratings (K + top-400

In [149]:
def select_random_percent_per_user(X, percent=0.1, seed=42):
    """
    For each user, randomly keep `percent` of their rated items in X.
    """
    np.random.seed(seed)
    X = X.tocsr()
    selected = sp.lil_matrix(X.shape)
    for u in range(X.shape[0]):
        row = X.getrow(u)
        item_ids = row.indices
        ratings = row.data
        n_keep = int(len(item_ids) * percent)
        if n_keep == 0:
            continue
        selected_idx = np.random.choice(len(item_ids), size=n_keep, replace=False)
        selected_items = item_ids[selected_idx]
        selected[u, selected_items] = ratings[selected_idx]
    return selected.tocsr()


In [151]:
def run_random_percent_baselines(train_K, X, test, idx_to_rid_cold, idx_to_cid, percent_list):
    results = []

    for p in percent_list:
        print(f"\n=== Running Random {int(p*100)}% per user baseline ===")

        # Step 1: 随机保留X评分
        X_selected = select_random_percent_per_user(X, percent=p)

        # Step 2: 合并 train_K 和 X_selected（逐项覆盖）
        train_full = train_K.copy().tolil()
        X_sel = X_selected.tolil()
        for u in range(train_K.shape[0]):
            items = X_sel.rows[u]
            values = X_sel.data[u]
            for item, value in zip(items, values):
                train_full[u, item] = value
        train_full = train_full.tocsr()

        # Step 3: 转换 dataframe
        train_df = matrix_to_df(train_full, idx_to_rid_cold, idx_to_cid)
        test_df = matrix_to_df(test, idx_to_rid_cold, idx_to_cid)

        # Step 4: 训练 SVD
        reader = Reader(rating_scale=(0, 1))
        data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
        data_rr = data_r.build_full_trainset()

        algo = SVD()
        algo.fit(data_rr)

        # Step 5: 预测并评估
        test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
        test_rr = test_r.build_full_trainset()
        predictions = algo.test(test_rr.build_testset())

        rmse = accuracy.rmse(predictions)
        mae = accuracy.mae(predictions)
        total_ratings = train_full.count_nonzero()

        results.append({
            "percent": p,
            "k+X_ratings": total_ratings,
            "RMSE": rmse,
            "MAE": mae
        })

    return results


In [155]:
percent_values = [ 0.10, 0.15, 0.20, 0.30, 0.50,0.80,0.90,1]

results_random = run_random_percent_baselines(
    train_K=train_cold_K_hybrid,
    X=X_cold_hybrid,
    test=test_cold_hybrid,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_cid=idx_to_cid,
    percent_list=percent_values
)

df_random = pd.DataFrame(results_random)
print(df_random)



=== Running Random 10% per user baseline ===
RMSE: 0.4278
MAE:  0.3574

=== Running Random 15% per user baseline ===
RMSE: 0.4236
MAE:  0.3521

=== Running Random 20% per user baseline ===
RMSE: 0.4216
MAE:  0.3486

=== Running Random 30% per user baseline ===
RMSE: 0.4185
MAE:  0.3451

=== Running Random 50% per user baseline ===
RMSE: 0.4143
MAE:  0.3414

=== Running Random 80% per user baseline ===
RMSE: 0.4098
MAE:  0.3381

=== Running Random 90% per user baseline ===
RMSE: 0.4087
MAE:  0.3367

=== Running Random 100% per user baseline ===
RMSE: 0.4080
MAE:  0.3357
   percent  k+X_ratings      RMSE       MAE
0     0.10        57831  0.427806  0.357392
1     0.15        86737  0.423580  0.352123
2     0.20       115768  0.421574  0.348557
3     0.30       173540  0.418497  0.345112
4     0.50       289530  0.414329  0.341424
5     0.80       462820  0.409836  0.338109
6     0.90       520591  0.408695  0.336673
7     1.00       579131  0.407990  0.335724


In [157]:
def select_random_n_per_user(X, n=10, seed=42):
    """
    For each user, randomly select up to `n` rated items from X.
    """
    np.random.seed(seed)
    X = X.tocsr()
    selected = sp.lil_matrix(X.shape)
    for u in range(X.shape[0]):
        row = X.getrow(u)
        item_ids = row.indices
        ratings = row.data
        if len(item_ids) == 0:
            continue
        n_keep = min(n, len(item_ids))
        selected_idx = np.random.choice(len(item_ids), size=n_keep, replace=False)
        selected_items = item_ids[selected_idx]
        selected[u, selected_items] = ratings[selected_idx]
    return selected.tocsr()


In [159]:
def run_random_n_baselines(train_K, X, test, idx_to_rid_cold, idx_to_cid, N_list):
    results = []

    for N in N_list:
        print(f"\n=== Running Random-N={N} per user baseline ===")

        # Step 1: 随机选择每个用户的N个评分项
        X_selected = select_random_n_per_user(X, n=N)

        # Step 2: 合并 train_K 和 X_selected（逐项覆盖）
        train_full = train_K.copy().tolil()
        X_sel = X_selected.tolil()
        for u in range(train_K.shape[0]):
            items = X_sel.rows[u]
            values = X_sel.data[u]
            for item, value in zip(items, values):
                train_full[u, item] = value
        train_full = train_full.tocsr()

        # Step 3: 转换为 dataframe
        train_df = matrix_to_df(train_full, idx_to_rid_cold, idx_to_cid)
        test_df = matrix_to_df(test, idx_to_rid_cold, idx_to_cid)

        # Step 4: SVD 训练
        reader = Reader(rating_scale=(0, 1))
        data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
        data_rr = data_r.build_full_trainset()

        algo = SVD()
        algo.fit(data_rr)

        # Step 5: 预测并评估
        test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
        test_rr = test_r.build_full_trainset()
        predictions = algo.test(test_rr.build_testset())

        rmse = accuracy.rmse(predictions)
        mae = accuracy.mae(predictions)
        total_ratings = train_full.count_nonzero()

        results.append({
            "N_per_user": N,
            "k+X_ratings": total_ratings,
            "RMSE": rmse,
            "MAE": mae
        })

    return results


In [161]:
N_values = [5, 10, 25, 50, 100]

results_randomN = run_random_n_baselines(
    train_K=train_cold_K_hybrid,
    X=X_cold_hybrid,
    test=test_cold_hybrid,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_cid=idx_to_cid,
    N_list=N_values
)

df_randomN = pd.DataFrame(results_randomN)
print("\n📊 Random-N Baseline Summary:")
print(df_randomN)



=== Running Random-N=5 per user baseline ===
RMSE: 0.4607
MAE:  0.4306

=== Running Random-N=10 per user baseline ===
RMSE: 0.4462
MAE:  0.4020

=== Running Random-N=25 per user baseline ===
RMSE: 0.4321
MAE:  0.3662

=== Running Random-N=50 per user baseline ===
RMSE: 0.4245
MAE:  0.3526

=== Running Random-N=100 per user baseline ===
RMSE: 0.4199
MAE:  0.3466

📊 Random-N Baseline Summary:
   N_per_user  k+X_ratings      RMSE       MAE
0           5         8340  0.460717  0.430572
1          10        15976  0.446208  0.402029
2          25        38776  0.432144  0.366222
3          50        75983  0.424501  0.352581
4         100       144932  0.419924  0.346605
