In [1]:
from pct.tree.heuristic.Heuristic import Heuristic5
from pct.tree.heuristic.NumericHeuristicCopy1 import NumericHeuristic5
from pct.tree.splitter.splitterCopy2 import Splitter
from pct.tree.treeCopy2 import Tree

In [2]:
from surprise import Reader, accuracy
from surprise import SVD
from surprise import Dataset
from collections import Counter

In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import math
import random
from utils import train_test_split, df_to_matrix ,matrix_to_df_2, threshold_interactions_df,threshold_interactions_df_plus, matrix_to_df,set_intersection,get_0_and_p_index,set_diff, matrix_to_full_df

my_seed = 7
random.seed(my_seed)
np.random.seed(my_seed)

def load_user_item_matrix(filepath):
    data = []

    with open(filepath, 'r') as f:
        while True:
            user_line = f.readline()
            if not user_line:
                break

            user_line = user_line.strip()
            if '|' not in user_line:
                continue

            user_id, num_ratings = user_line.split('|')
            user_id = int(user_id)
            num_ratings = int(num_ratings)

            for _ in range(num_ratings):
                item_line = f.readline().strip()
                parts = item_line.split('\t')
                if len(parts) < 2:
                    continue
                item_id, score = parts[0], parts[1]
                data.append([user_id, int(item_id), int(score)])

    df = pd.DataFrame(data, columns=['user_id', 'item_id', 'rating'])
    return df


In [9]:

def safe_int(x):
    return int(x) if x != 'None' else 0


def load_track_data(filepath):
    track_data = []
    with open(filepath, 'r') as f:
        for line in f:
            parts = line.strip().split('|')
            track_id = safe_int(parts[0])
            album_id = safe_int(parts[1])
            artist_id = safe_int(parts[2])
            genre_ids = [safe_int(g) for g in parts[3:]] if len(parts) > 3 else []
            track_data.append([track_id, album_id, artist_id, genre_ids])
    return pd.DataFrame(track_data, columns=['track_id', 'album_id', 'artist_id', 'genre_ids'])

def load_album_data(filepath):
    album_data = []
    with open(filepath, 'r') as f:
        for line in f:
            parts = line.strip().split('|')
            album_id = safe_int(parts[0])
            artist_id = safe_int(parts[1])
            genre_ids = [safe_int(g) for g in parts[2:]] if len(parts) > 2 else []
            album_data.append([album_id, artist_id, genre_ids])
    return pd.DataFrame(album_data, columns=['album_id', 'artist_id', 'genre_ids'])


# read artistData2
def load_artist_data(filepath):
    with open(filepath, 'r') as f:
        artist_ids = [int(line.strip()) for line in f]
    return pd.DataFrame(artist_ids, columns=['artist_id'])

# read genreData2
def load_genre_data(filepath):
    with open(filepath, 'r') as f:
        genre_ids = [int(line.strip()) for line in f]
    return pd.DataFrame(genre_ids, columns=['genre_id'])


In [11]:
track_df = load_track_data('D:/dataset/ydata-ymusic-kddcup-2011-track2/ydata-ymusic-kddcup-2011-track2/trackData2.txt')
track_df

Unnamed: 0,track_id,album_id,artist_id,genre_ids
0,1,106710,281667,"[214765, 162234, 155788]"
1,2,280977,233685,"[131552, 173467, 48505]"
2,3,38422,219136,"[61215, 201738, 88853]"
3,4,119529,166863,"[17453, 35389]"
4,5,16742,294690,"[61215, 34486, 274088]"
...,...,...,...,...
224036,296100,166516,33011,"[274088, 199606, 88853]"
224037,296101,0,0,[]
224038,296102,153644,289056,"[158282, 139095, 242383]"
224039,296105,68336,6613,[82064]


In [13]:
album_df = load_album_data('D:/dataset/ydata-ymusic-kddcup-2011-track2/ydata-ymusic-kddcup-2011-track2/albumData2.txt')
album_df

Unnamed: 0,album_id,artist_id,genre_ids
0,0,0,[214765]
1,6,228091,"[158282, 81520, 242383]"
2,19,85028,[103715]
3,30,16832,[31567]
4,33,26330,"[149962, 209270]"
...,...,...,...
52824,296104,125866,[158282]
52825,296106,221637,"[116130, 9432]"
52826,296107,0,[61215]
52827,296108,93277,[61215]


In [15]:
artist_df = load_artist_data('D:/dataset/ydata-ymusic-kddcup-2011-track2/ydata-ymusic-kddcup-2011-track2/artistData2.txt')
artist_df

Unnamed: 0,artist_id
0,16
1,23
2,35
3,40
4,49
...,...
18669,295987
18670,296007
18671,296012
18672,296013


In [17]:
genre_df = load_genre_data('D:/dataset/ydata-ymusic-kddcup-2011-track2/ydata-ymusic-kddcup-2011-track2/genreData2.txt')
genre_df

Unnamed: 0,genre_id
0,208
1,315
2,642
3,1075
4,1271
...,...
562,292093
563,293670
564,293688
565,294138


In [19]:
train_path = 'D:/dataset/ydata-ymusic-kddcup-2011-track2/ydata-ymusic-kddcup-2011-track2/trainIdx2.txt'
my_seed = 7
random.seed(my_seed)
np.random.seed(my_seed)
train_df = load_user_item_matrix(train_path)


In [20]:
train_df

Unnamed: 0,user_id,item_id,rating
0,0,28341,90
1,0,51210,90
2,0,79500,90
3,0,82317,90
4,0,98399,90
...,...,...,...
61944401,249011,270557,90
61944402,249011,273574,90
61944403,249011,286938,90
61944404,249011,287681,80


In [23]:
duplicate_counts = train_df.groupby(['user_id', 'item_id']).size()
duplicates = duplicate_counts[duplicate_counts > 1]

print(f"🔁 Number of duplicated (user_id, item_id) pairs: {len(duplicates)}")
print(duplicates.head())


🔁 Number of duplicated (user_id, item_id) pairs: 0
Series([], dtype: int64)


In [23]:
print(train_df['item_id'].max())      # highest item_id
print(track_df['track_id'].max())     # highest track_id
print(train_df['item_id'].isin(track_df['track_id']).mean())  # proportion to match

print(train_df['item_id'].isin(album_df['album_id']).mean())

print(train_df['item_id'].isin(artist_df['artist_id']).mean())

print(train_df['item_id'].isin(genre_df['genre_id']).mean())



296110
296110
0.4385845107627636
0.1925648621120041
0.3114063600835885
0.05744426704164376


In [39]:
def build_item_metadata(track_df, album_df, artist_df, genre_df):
    rows = []

    # 1. Tracks
    for _, row in track_df.iterrows():
        rows.append({
            'item_id': row['track_id'],
            'item_type': 'track',
            'artist_id': row['artist_id'],
            'genre_ids': row['genre_ids']
        })

    # 2. Albums
    for _, row in album_df.iterrows():
        rows.append({
            'item_id': row['album_id'],
            'item_type': 'album',
            'artist_id': row['artist_id'],
            'genre_ids': row['genre_ids']
        })

    # 3. Artists
    for artist_id in artist_df['artist_id']:
        rows.append({
            'item_id': artist_id,
            'item_type': 'artist',
            'artist_id': artist_id,
            'genre_ids': 0
        })

    # 4. Genres
    for genre_id in genre_df['genre_id']:
        rows.append({
            'item_id': genre_id,
            'item_type': 'genre',
            'artist_id': 0,
            'genre_ids': [genre_id]
        })

    return pd.DataFrame(rows)

In [105]:
item_meta_df = build_item_metadata(track_df, album_df, artist_df, genre_df)
# only track
#valid_item_ids = set(track_df['track_id'])
#item_meta_df = item_meta_df[item_meta_df['item_id'].isin(valid_item_ids)]
#train_with_meta = train_df.merge(item_meta_df, on='item_id', how='inner')

train_with_meta = train_df.merge(item_meta_df, on='item_id', how='inner')

train_with_meta

Unnamed: 0,user_id,item_id,rating,item_type,artist_id,genre_ids
0,0,28341,90,artist,28341,0
1,9,28341,0,artist,28341,0
2,13,28341,90,artist,28341,0
3,15,28341,90,artist,28341,0
4,20,28341,90,artist,28341,0
...,...,...,...,...,...,...
61944401,225199,29944,83,track,127850,[274161]
61944402,238684,29944,50,track,127850,[274161]
61944403,243157,29944,70,track,127850,[274161]
61944404,243951,29944,30,track,127850,[274161]


In [135]:
sampled_users = np.random.choice(train_with_meta['user_id'].unique(), size=5000, replace=False)
user_filtered_df = train_with_meta[train_with_meta['user_id'].isin(sampled_users)]


top_items = (
    user_filtered_df['item_id']
    .value_counts()
    .head(10000)
    .index
)


sample_df = user_filtered_df[user_filtered_df['item_id'].isin(top_items)]

print(f"🎯 Sample shape: {sample_df.shape}")
print(f"👤 Users: {sample_df['user_id'].nunique()}, 🎵 Items: {sample_df['item_id'].nunique()}")

🎯 Sample shape: (688457, 6)
👤 Users: 5000, 🎵 Items: 10000


In [136]:
def threshold_interactions_df_plus(
    df,
    user_col='user_id',
    item_col='item_id',
    artist_col='artist_id',
    genre_col='genre_ids',  
    min_items_per_user=100,
    min_artists_per_user=20,
    min_genres_per_user=10,
    min_users_per_item=100,
    verbose=True
):
    """
    Filters a user-item interaction dataframe by enforcing minimum thresholds on:
    1. Number of items per user
    2. Number of users per item
    3. Number of distinct artists per user
    4. Number of distinct genres per user

    Parameters
    ----------
    df : pd.DataFrame
        The interaction dataframe containing user_id, item_id, artist_id, and genre_id(s).
    user_col : str
        Column name for user ID.
    item_col : str
        Column name for item ID.
    artist_col : str
        Column name for artist ID.
    genre_col : str
        Column name for genre ID (should be a list or allow explode).
    min_items_per_user : int
        Minimum number of items a user must have interacted with.
    min_artists_per_user : int
        Minimum number of unique artists a user must have interacted with.
    min_genres_per_user : int
        Minimum number of unique genres a user must have interacted with.
    min_users_per_item : int
        Minimum number of users that must have interacted with an item.
    verbose : bool
        Whether to print filtering and sparsity information.

    Returns
    -------
    pd.DataFrame
        The filtered interaction dataframe.
    """

    df_filtered = df.copy()

    # Initial sparsity
    n_users = df_filtered[user_col].nunique()
    n_items = df_filtered[item_col].nunique()
    sparsity = 100 * df_filtered.shape[0] / (n_users * n_items)
    if verbose:
        print(f"Initial: users = {n_users}, items = {n_items}, sparsity = {sparsity:.4f}%")

    # ---------- Step 1: Filter by items per user and users per item ----------
    done = False
    while not done:
        starting_shape = df_filtered.shape[0]

        user_counts = df_filtered.groupby(user_col)[item_col].count()
        df_filtered = df_filtered[df_filtered[user_col].isin(user_counts[user_counts >= min_items_per_user].index)]

        item_counts = df_filtered.groupby(item_col)[user_col].count()
        df_filtered = df_filtered[df_filtered[item_col].isin(item_counts[item_counts >= min_users_per_item].index)]

        if df_filtered.shape[0] == starting_shape:
            done = True

    n_users = df_filtered[user_col].nunique()
    n_items = df_filtered[item_col].nunique()
    sparsity = 100 * df_filtered.shape[0] / (n_users * n_items)
    if verbose:
        print(f"After item filtering: users = {n_users}, items = {n_items}, sparsity = {sparsity:.4f}%")

    # ---------- Step 2: Filter by number of unique artists per user ----------
    df_artist_valid = df_filtered[df_filtered[artist_col] != 0]
    artist_per_user = df_artist_valid.groupby(user_col)[artist_col].nunique()

    # ---------- Step 3: Filter by number of unique genres per user ----------
    df_genre_exploded = df_filtered.explode(genre_col)
    df_genre_exploded = df_genre_exploded[df_genre_exploded[genre_col] != 0]
    genre_per_user = df_genre_exploded.groupby(user_col)[genre_col].nunique()

    # ---------- Step 4: User intersection ----------
    valid_users = set(artist_per_user[artist_per_user > min_artists_per_user].index) & \
                  set(genre_per_user[genre_per_user > min_genres_per_user].index)

    final_df = df_filtered[df_filtered[user_col].isin(valid_users)]

    n_users = final_df[user_col].nunique()
    n_items = final_df[item_col].nunique()
    
    sparsity = 100 * final_df.shape[0] / (n_users * n_items)
    if verbose:
        n_artists = final_df[artist_col].nunique()
        n_genres = final_df.explode(genre_col)[genre_col].nunique()
        print(f"Final: users = {n_users}, items = {n_items}, sparsity = {sparsity:.4f}%")
        print(f"unique artists = {n_artists}, unique genres = {n_genres}")

    return final_df


In [137]:
# top 1000 item
# item_popularity = train_with_meta['item_id'].value_counts()
# top_items = item_popularity.head(1000).index

# popular_data = train_with_meta[train_with_meta['item_id'].isin(top_items)]

filtered_df = threshold_interactions_df_plus(sample_df, min_items_per_user=100, min_artists_per_user=20, min_genres_per_user=10,min_users_per_item=100)



Initial: users = 5000, items = 10000, sparsity = 1.3769%
After item filtering: users = 638, items = 518, sparsity = 31.0218%
Final: users = 611, items = 518, sparsity = 31.4523%
unique artists = 380, unique genres = 135


In [189]:
all_user_ids = sorted(filtered_df['user_id'].unique())
warm_users = all_user_ids[:120]
cold_users = all_user_ids[120:]


df_warm = filtered_df[filtered_df['user_id'].isin(warm_users)].copy()
df_cold = filtered_df[filtered_df['user_id'].isin(cold_users)].copy()


In [191]:
matrix_warm, rid_to_idx_warm, idx_to_rid_warm, cid_to_idx, idx_to_cid = df_to_matrix(
    df_warm, "user_id", "item_id", "rating"
)


matrix_cold, rid_to_idx_cold, idx_to_rid_cold, _, _ = df_to_matrix(
    df_cold, "user_id", "item_id", "rating"
)


In [193]:
al, train_cold_K, _ = train_test_split(matrix_cold, 1)


X_cold, test_cold, _ = train_test_split(al, 30)

In [195]:
warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)
warm_df 

Unnamed: 0,28341,79500,82317,98399,151565,152077,173467,176858,180487,211565,...,113606,120851,225319,114424,21593,49866,157933,240094,84211,70788
2556,70.0,90.0,50.0,0.0,0.0,0.0,80.0,0.0,0.0,90.0,...,0.0,0.0,70.0,0.0,50.0,90.0,0.0,0.0,0.0,0.0
2892,0.0,60.0,0.0,0.0,0.0,0.0,100.0,30.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0
3103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
5869,90.0,0.0,0.0,0.0,0.0,0.0,90.0,90.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90.0,90.0,90.0,90.0,0.0,90.0,0.0,90.0,50.0,90.0
21728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
4280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [197]:
train_copy = train_cold_K.tolil().copy()
cold_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
cold_df 

Unnamed: 0,28341,79500,82317,98399,151565,152077,173467,176858,180487,211565,...,113606,120851,225319,114424,21593,49866,157933,240094,84211,70788
58290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
x_df = pd.concat([warm_df, cold_df], ignore_index=False)
x_df

Unnamed: 0,28341,79500,82317,98399,151565,152077,173467,176858,180487,211565,...,113606,120851,225319,114424,21593,49866,157933,240094,84211,70788
2556,70.0,90.0,50.0,0.0,0.0,0.0,80.0,0.0,0.0,90.0,...,0.0,0.0,70.0,0.0,50.0,90.0,0.0,0.0,0.0,0.0
2892,0.0,60.0,0.0,0.0,0.0,0.0,100.0,30.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0
3103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
5869,90.0,0.0,0.0,0.0,0.0,0.0,90.0,90.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [201]:
def elicitation_by_tree_path_retrain_depth_warm(
    tree_model_class,
    train,              # lil_matrix: cold users' known ratings (1 item/user, updated over rounds)
    test,               # lil_matrix: cold users' test ratings (30 ratings/user), fixed
    X,                  # lil_matrix: cold users' "hidden" ratings, revealed 1 per round
    matrix_warm,        # lil_matrix: warm users' full ratings
    idx_to_rid_cold,    # cold user index → user_id
    idx_to_rid_warm,    # warm user index → user_id
    idx_to_cid,         # item index → item_id
    iteration=5
):
    """
    Method 2: Retrain tree each round using warm + cold user data.
    Walk i steps for each cold user to reach a node and elicit a new rating.

    Returns:
    - rmse_list: RMSE after each iteration
    - mae_list: MAE after each iteration
    """

    num_users, num_items = train.shape
    train_copy = train.tolil().copy()
    X_copy = X.tolil().copy()
    rmse_list, mae_list = [], []
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}

    # Step 0: Baseline SVD using only cold-start (1 rating per user)
    train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
    reader = Reader(rating_scale=(1, 5))
    algo = SVD()
    trainset = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset()
    algo.fit(trainset)

    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    testset = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset().build_testset()
    predictions = algo.test(testset)
    rmse_list.append(accuracy.rmse(predictions, verbose=True))
    mae_list.append(accuracy.mae(predictions, verbose=True))
    print("✅ Baseline evaluation complete.")

    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration} (Tree depth = {i+1})")

        # Step 1: Prepare tree training data (warm + updated cold)
        warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)
        cold_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, cold_df], ignore_index=False)

        tree_model = tree_model_class(max_depth=i+1, min_instances=5)
        tree_model.fit(x_df, x_df)
        print("🌳 Tree re-trained.")

        # Step 2: Walk tree and elicit a new rating for each cold user
        for u in range(num_users):
            node = tree_model.root
            depth = 0

            while node and not node.is_leaf and node.attribute_name and depth < i:
                item = node.attribute_name
                if item not in cid_to_idx:
                    node = None
                    break
                item_idx = cid_to_idx[item]
                rating = train_copy[u, item_idx]

                if rating >= 81:
                    node = node.children[0]  # Lovers
                elif rating > 0:
                    node = node.children[1]  # Haters
                else:
                    node = node.children[2]  # Unknowns

                depth += 1

            if node is None or node.attribute_name is None:
                continue

            item = node.attribute_name
            if item not in cid_to_idx:
                continue
            item_idx = cid_to_idx[item]
            rating = X_copy[u, item_idx]

            if rating > 0:
                train_copy[u, item_idx] = rating
                X_copy[u, item_idx] = 0

        # Step 3: Evaluate updated cold user SVD model
        train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
        algo = SVD()
        trainset = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset()
        algo.fit(trainset)

        test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
        testset = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset().build_testset()
        predictions = algo.test(testset)
        rmse_list.append(accuracy.rmse(predictions, verbose=True))
        mae_list.append(accuracy.mae(predictions, verbose=True))
        print(f"✅ Iteration {i+1} complete.")

    return rmse_list, mae_list


In [203]:
def elicitation_by_tree_path_retrain_depth_debug(
    tree_model_class,
    train,
    test,
    X,
    matrix_warm,
    idx_to_rid_cold,
    idx_to_cid,
    idx_to_rid_warm,
    iteration=5,
    trace_user_id=None
):
    """
    Method 2 (Debug Version): Retrain tree using warm + updated cold user ratings.
    Includes optional tracing of a specific cold user each round.

    Parameters:
    - tree_model_class: class of the tree model
    - train: lil_matrix of cold user initial ratings
    - test: lil_matrix of cold user test ratings
    - X: lil_matrix of cold user hidden ratings
    - matrix_warm: lil_matrix of warm user full matrix
    - idx_to_rid_cold: dict, index to raw user_id (cold only)
    - idx_to_rid_warm: dict, index to raw user_id (warm only)
    - idx_to_cid: dict, index to raw item_id
    - iteration: number of elicitation rounds
    - trace_user_id: optional int, trace one cold user's ratings over time

    Returns:
    - rmse_list, mae_list
    """

    num_users, num_items = train.shape
    train_copy = train.tolil().copy()
    X_copy = X.tolil().copy()

    rmse_list, mae_list = [], []
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}
    reader = Reader(rating_scale=(1, 5))

    # Prepare test set once
    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    test_items = set(test_df['item_id'].unique())
    testset = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset().build_testset()

    # Step 0: baseline
    train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
    algo = SVD()
    algo.fit(Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset())
    predictions = algo.test(testset)
    rmse_list.append(accuracy.rmse(predictions, verbose=True))
    mae_list.append(accuracy.mae(predictions, verbose=True))
    print("✅ Baseline evaluation complete.")

    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration} (Tree depth = {i+1})")

        # Step 1: retrain tree
        warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)
        cold_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, cold_df], ignore_index=True)

        tree_model = tree_model_class(max_depth=i+1, min_instances=5)
        tree_model.fit(x_df, x_df)
        print("🌳 Tree re-trained.")

        added, skipped = 0, 0

        for u in range(num_users):
            node = tree_model.root
            depth = 0

            while node and not node.is_leaf and node.attribute_name and depth < i:
                item = node.attribute_name
                if item not in cid_to_idx:
                    node = None
                    break
                item_idx = cid_to_idx[item]
                rating = train_copy[u, item_idx]

                if rating >= 81:
                    node = node.children[0]
                elif rating > 0:
                    node = node.children[1]
                else:
                    node = node.children[2]
                depth += 1

            if node is None or node.attribute_name is None:
                skipped += 1
                continue

            item = node.attribute_name
            if item not in cid_to_idx:
                skipped += 1
                continue

            item_idx = cid_to_idx[item]
            rating = X_copy[u, item_idx]

            if rating > 0:
                train_copy[u, item_idx] = rating
                X_copy[u, item_idx] = 0
                added += 1
            else:
                skipped += 1

        print(f"📬 New ratings added this iteration: {added}")
        print(f"🚫 Cold users skipped (no valid path/item): {skipped}")
        print(f"📊 SVD training set size (iteration {i+1}): {train_copy.nnz}")
        print(f"✖️ Train/Test item overlap after iteration {i+1}: "
              f"{len(set(test_df['item_id'].unique()) & set(train_df['item_id'].unique()))} / {len(set(test_df['item_id'].unique()))}")

        # Step 2: retrain SVD
        train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
        algo = SVD()
        algo.fit(Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader).build_full_trainset())
        predictions = algo.test(testset)

        rmse_list.append(accuracy.rmse(predictions, verbose=True))
        mae_list.append(accuracy.mae(predictions, verbose=True))
        print(f"✅ Iteration {i+1} complete.")

        # Step 3: trace a specific user (optional)
        if trace_user_id is not None:
            # Find corresponding internal row index
            target_idx = None
            for idx, uid in idx_to_rid_cold.items():
                if uid == trace_user_id:
                    target_idx = idx
                    break

            if target_idx is not None:
                user_ratings = []
                for item_idx in range(num_items):
                    r = train_copy[target_idx, item_idx]
                    if r > 0:
                        user_ratings.append((idx_to_cid[item_idx], r))
                user_df = pd.DataFrame(user_ratings, columns=['item_id', 'rating'])
                user_df.insert(0, 'user_id', trace_user_id)
                print(f"\n🔍 Ratings for cold user ID {trace_user_id} after iteration {i+1}:")
                display(user_df.sort_values('item_id').reset_index(drop=True))

    return rmse_list, mae_list



In [213]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

elicitation_by_tree_path_retrain_depth_warm(
    tree_model_class=Tree,
    train=train_cold_K,
    test=test_cold,
    X=X_cold,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=3
)


RMSE: 67.9580
MAE:  63.7605
✅ Baseline evaluation complete.

🔁 Iteration 1/3 (Tree depth = 1)
Initializing Splitter...
✅ Tree built successfully!
🌳 Tree re-trained.
RMSE: 67.9579
MAE:  63.7602
✅ Iteration 1 complete.

🔁 Iteration 2/3 (Tree depth = 2)
Initializing Splitter...
✅ Tree built successfully!
🌳 Tree re-trained.
RMSE: 67.9581
MAE:  63.7604
✅ Iteration 2 complete.

🔁 Iteration 3/3 (Tree depth = 3)
Initializing Splitter...
✅ Tree built successfully!
🌳 Tree re-trained.
RMSE: 67.9578
MAE:  63.7600
✅ Iteration 3 complete.


([67.95804708919017, 67.9578757684798, 67.95806342685421, 67.95779260771761],
 [63.76049990474328, 63.76017732445323, 63.76035242290749, 63.759960841899165])

In [205]:
def i_variance(matirx):
    c = matirx.copy()
    c_2 = c.power(2)
    E_2 = c_2.mean(0)
    E = c.mean(0)
    v = E_2 - np.sqrt(E)
    return (np.asarray(v)).flatten()

def i_random(matirx):
    c = matirx.copy()
    c_2 = c.power(2)
    E_2 = c_2.mean(0)
    E = c.mean(0)
    v = E_2 - np.sqrt(E)
    return (np.asarray(v)).flatten()
    
def i_entropy(matirx):
    entropy = []
    for item in range(0, matirx.shape[1]):
        ratings = matirx.getcol(item).data
        count_r = Counter(ratings)
        len_rating = len(ratings)
        ent = 0
        for c in count_r:
            ent -= (count_r[c]/len_rating)*math.log(count_r[c]/len_rating)
        entropy.append(ent)
    return (np.asarray(entropy))

def i_entropy_0(matirx):
    entropy = []
    c_all = matirx.shape[0]
    for item in range(0, matirx.shape[1]):
        ratings = matirx.getcol(item).data
        count_r = Counter(ratings)
        count_r[0] = c_all - matirx.getcol(item).count_nonzero()
        ent = 0
        for c in count_r:
            ent -= (count_r[c]/c_all)*math.log(count_r[c]/c_all)
        entropy.append(ent)
    return (np.asarray(entropy))

def i_pop(matrix):
    popularity = []
    for item in range(0, matrix.shape[1]):
        popularity.append(matrix.getcol(item).count_nonzero())
        
    return (np.asarray(popularity))

def log_pop_entropy(matrix):
    entropy = i_entropy(matrix)
    popularity = i_pop(matrix)
    scores = np.log(popularity) *  entropy 
    return (np.asarray(scores))

def log_pop_entropy0(matrix):
    entropy = i_entropy_0(matrix)
    popularity = i_pop(matrix)
    scores = np.log(popularity) *  entropy 
    return (np.asarray(scores))

def sqrt_pop_variance(matrix):
    variance = i_variance(matrix)
    popularity = i_pop(matrix)
    scores = np.sqrt(popularity) *  variance 
    return (np.asarray(scores))

def helf(matrix):
    log_U = np.log(matrix.shape[0])
    lf = np.log(i_pop(matrix))/log_U
    h = i_entropy(matrix)/np.log(5)
    helf = (2* lf*h)/(lf+h)
    return helf

def helf0(matrix):
    log_U = np.log(matrix.shape[0])
    lf = np.log(i_pop(matrix))/log_U
    h = i_entropy_0(matrix)/np.log(5)
    helf = (2* lf*h)/(lf+h)
    return helf

def i_co_rate(matrix):
    matrix_c = matrix.copy()
    matrix_c[matrix_c != 0] = 1
    c= matrix_c.T.dot(matrix_c)
    c.setdiag(0, k=0)
    
    return np.asarray(c.sum(axis=0))[0]

In [207]:
import scipy.sparse as sp


def elicitation_np_warm(matrix_warm, train,test,X,strategy,iteration,k,positive=False):
    rmse,mae = [],[]
    
    # Performance before elicitation
    train_df = matrix_to_df_2(train,idx_to_rid_cold,idx_to_cid)
    reader = Reader(rating_scale=(1, 5))
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    data_rr = data_r.build_full_trainset()
    algo = SVD()
    algo.fit(data_rr)
    test_df = matrix_to_df_2(test,idx_to_rid_cold,idx_to_cid)
    test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
    test_rr = test_r.build_full_trainset()
    predictions= algo.test(test_rr.build_testset())
    rmse.append(accuracy.rmse(predictions))
    mae.append(accuracy.mae(predictions))
    
    c_u_dict = {}
    all_items = list(range(0,train.shape[1]))
    X_copy = (X.tolil()).copy()
    warm_copy = matrix_warm.tolil().copy()
    train_copy = (train.tolil()).copy()
    

    #scores = strategy(train_copy)
    
    for count in range(0,iteration-1):
        x_matrix = sp.vstack([warm_copy, train_copy])
        scores = strategy(x_matrix)
        if positive:
            ranking = np.argsort(-scores)
        else:
            ranking = np.argsort(scores)
        for u in range(0, train.shape[0]):
            if count ==0:
                p_u = train_copy.getrow(u).nonzero()[1]
                c_u = set_diff(all_items,p_u)
                c_u_dict[u] = c_u.copy()
            else:
                c_u = c_u_dict[u].copy()                
          
            ranking_u = ranking[np.in1d(ranking,c_u)]
            topk_u = ranking_u[:k]
            c_u_dict[u] = set_diff(c_u,topk_u)
            px_u = X_copy.getrow(u).nonzero()[1]
            recom = set_intersection(topk_u,px_u)
            for item in recom:                  
                train_copy[u,item] = X[u,item]
                X_copy[u,item] = 0
        train_df = matrix_to_df_2(train_copy,idx_to_rid_cold,idx_to_cid)

        reader = Reader(rating_scale=(1, 5))
        data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
        data_rr = data_r.build_full_trainset()
        algo.fit(data_rr)
        test_df = matrix_to_df(test,idx_to_rid_cold,idx_to_cid)
        test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
        test_rr = test_r.build_full_trainset()
        predictions= algo.test(test_rr.build_testset())
        rmse.append(accuracy.rmse(predictions))
        mae.append(accuracy.mae(predictions))
        
        print(count+1)
    return rmse,mae

In [215]:
def flat_baseline_from_elicitation_np_warm(matrix_warm, train, X, test):
    """
    Train a single SVD model using full info: matrix_warm + train + X
    Return RMSE and MAE on test set (Surprise-based).
    """
    #  K + X
    full_train_cold = train.copy().tolil()
    X_copy = X.copy().tolil()
    for u in range(train.shape[0]):
        items = X_copy.getrow(u).nonzero()[1]
        for item in items:
            full_train_cold[u, item] = X[u, item]

    # combine warm and  cold 
    full_matrix = sp.vstack([matrix_warm.tocsr(), full_train_cold.tocsr()])

    #  convert dataframe
    train_df = matrix_to_df_2(full_train_cold, idx_to_rid_cold, idx_to_cid)
    test_df = matrix_to_df(test, idx_to_rid_cold, idx_to_cid)

    # train SVD 
    reader = Reader(rating_scale=(1, 5))
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    data_rr = data_r.build_full_trainset()
    algo = SVD()
    algo.fit(data_rr)

    # Predict on test
    test_r = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
    test_rr = test_r.build_full_trainset()
    predictions = algo.test(test_rr.build_testset())
    
    rmse_val = accuracy.rmse(predictions)
    mae_val = accuracy.mae(predictions)

    print(f"[Flat Baseline] RMSE = {rmse_val:.4f}, MAE = {mae_val:.4f}")
    return rmse_val, mae_val


In [217]:
flat = flat_baseline_from_elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K,
    X=X_cold,
    test=test_cold
)


RMSE: 67.9557
MAE:  63.7573
[Flat Baseline] RMSE = 67.9557, MAE = 63.7573


In [219]:
entropy0 = elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K,
    test=test_cold,
    X=X_cold,
    strategy=i_entropy_0,
    iteration=30,
    k=1,
    positive=False
)


RMSE: 67.9584
MAE:  63.7608
RMSE: 67.9579
MAE:  63.7604
1
RMSE: 67.9582
MAE:  63.7606
2
RMSE: 67.9578
MAE:  63.7600
3
RMSE: 67.9579
MAE:  63.7604
4
RMSE: 67.9580
MAE:  63.7603
5
RMSE: 67.9581
MAE:  63.7604
6
RMSE: 67.9584
MAE:  63.7606
7
RMSE: 67.9578
MAE:  63.7600
8
RMSE: 67.9579
MAE:  63.7604
9
RMSE: 67.9579
MAE:  63.7603
10
RMSE: 67.9584
MAE:  63.7611
11
RMSE: 67.9585
MAE:  63.7608
12
RMSE: 67.9578
MAE:  63.7600
13
RMSE: 67.9579
MAE:  63.7602
14
RMSE: 67.9585
MAE:  63.7606
15
RMSE: 67.9581
MAE:  63.7604
16
RMSE: 67.9578
MAE:  63.7600
17
RMSE: 67.9582
MAE:  63.7605
18
RMSE: 67.9579
MAE:  63.7604
19
RMSE: 67.9581
MAE:  63.7604
20
RMSE: 67.9578
MAE:  63.7600
21
RMSE: 67.9578
MAE:  63.7600
22
RMSE: 67.9578
MAE:  63.7600
23
RMSE: 67.9581
MAE:  63.7604
24
RMSE: 67.9579
MAE:  63.7601
25
RMSE: 67.9579
MAE:  63.7601
26
RMSE: 67.9578
MAE:  63.7600
27
RMSE: 67.9575
MAE:  63.7596
28
RMSE: 67.9575
MAE:  63.7596
29


In [211]:
variance = elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K,
    test=test_cold,
    X=X_cold,
    strategy=i_variance,
    iteration=9,
    k=1,
    positive=False
)

RMSE: 67.9582
MAE:  63.7606
RMSE: 67.9578
MAE:  63.7600
1
RMSE: 67.9584
MAE:  63.7608
2
RMSE: 67.9579
MAE:  63.7602
3
RMSE: 67.9578
MAE:  63.7600
4
RMSE: 67.9584
MAE:  63.7605
5
RMSE: 67.9581
MAE:  63.7603
6
RMSE: 67.9580
MAE:  63.7604
7
RMSE: 67.9575
MAE:  63.7596
8
