In [1]:
import os
import sys

# Locate the directory this code runs from. A notebook kernel defines no
# __file__, so in that case fall back to the current working directory.
if "__file__" in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Put the parent directory on the import path (once) so that modules living
# one level above this notebook can be imported.
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [None]:

# import pysparnn.cluster_index as ci
# import scipy.sparse
# import os
import math
import numpy as np
import pandas as pd
import random
import scipy.sparse as sp
from utils import train_test_split, df_to_matrix ,matrix_to_df_2, threshold_interactions_df, matrix_to_df,set_intersection,get_0_and_p_index,set_diff, matrix_to_full_df, threshold_interactions_df_plus, train_test_split_csr

# !pip install surprise
from surprise import Reader, accuracy
from surprise import SVD
from surprise import Dataset
from collections import Counter
from collections import defaultdict

# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from pct.tree.heuristic.Heuristic import Heuristic5
from pct.tree.heuristic.NumericHeuristic_pair import NumericHeuristic5
from pct.tree.splitter.splitter_pair import Splitter
from pct.tree.Yahoo_pairwise_tree import Tree

In [5]:
# Load the pre-filtered interaction table. The preview below shows one rating
# record per row with user_id, item_id, rating, item_type, artist_id and genre_ids.
filtered_df = pd.read_csv("filtered_semi_binary.csv")
filtered_df

Unnamed: 0,user_id,item_id,rating,item_type,artist_id,genre_ids
0,9,238709,0.00,artist,238709,0
1,9,169510,0.00,artist,169510,0
2,9,208084,1.00,artist,208084,0
3,9,245398,0.00,artist,245398,0
4,9,153166,0.00,artist,153166,0
...,...,...,...,...,...,...
1524889,248947,83754,0.01,artist,83754,0
1524890,248947,141799,0.00,genre,0,[141799]
1524891,248947,141677,0.00,artist,141677,0
1524892,248947,262458,0.00,artist,262458,0


In [6]:
# item type map: item_id -> item_type, taken from the first row seen for each item_id
item_type_map = dict(
    filtered_df
    .drop_duplicates(subset='item_id')[['item_id', 'item_type']]
    .itertuples(index=False, name=None)
)

In [7]:
def get_item_type(item_id):
    """Return the type stored for ``item_id`` in ``item_type_map``, or ``'unknown'`` if absent."""
    if item_id in item_type_map:
        return item_type_map[item_id]
    return 'unknown'


In [None]:
# Spot-check the lookup for a single item id.
print(get_item_type(172223))

In [None]:
# Spot-check two ids that are visible in the filtered_df preview.
print(get_item_type(238709))  # appears with item_type 'artist' in the preview
print(get_item_type(141799))  # appears with item_type 'genre' in the preview


In [8]:
# Distinct user ids in ascending order; the warm/cold split below is positional,
# so sorting makes it independent of the row order in filtered_df.
all_user_ids = sorted(filtered_df['user_id'].unique().tolist())

def split_users_by_ratio(all_user_ids, ratio):
    """Split an ordered user list into a leading "warm" part and a trailing "cold" part.

    Parameters:
    - all_user_ids: sequence of user ids. The split is positional, so the order
      of this sequence decides which users end up warm.
    - ratio: fraction in [0, 1] of users assigned to the warm part. The warm
      count is ``int(len(all_user_ids) * ratio)``, i.e. truncated toward zero.

    Returns:
    - (warm_users, cold_users): two slices of ``all_user_ids``.

    Raises:
    - ValueError: if ``ratio`` is outside [0, 1]. Without this check a negative
      ratio would silently slice from the end of the list and a ratio above 1
      would silently put every user in the warm part.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    split_point = int(len(all_user_ids) * ratio)
    return all_user_ids[:split_point], all_user_ids[split_point:]

# Example ratios from 10% to 50%
# ratios = [0.1, 0.2, 0.3, 0.4, 0.5]
# splits = {r: split_users_by_ratio(all_user_ids, r) for r in ratios}


## 10% warm users (remaining 90% cold)

In [9]:
# The first 10% of the sorted user ids become "warm" users, the rest "cold" users.
# Despite the *_idx suffix these lists hold user ids: they are matched against
# the user_id column right below.
warm_users_idx, cold_users_idx = split_users_by_ratio(all_user_ids, 0.1)

df_warm = filtered_df[filtered_df['user_id'].isin(warm_users_idx)].copy()
df_cold = filtered_df[filtered_df['user_id'].isin(cold_users_idx)].copy()

# Rating matrix plus row (user) and column (item) id<->index mappings for the warm users.
matrix_warm, rid_to_idx_warm, idx_to_rid_warm, cid_to_idx, idx_to_cid = df_to_matrix(
    df_warm, "user_id", "item_id", "rating")


# Same for the cold users; the column (item) mappings returned here are discarded.
# NOTE(review): later cells combine the warm idx_to_cid with cold matrices. That is
# only valid if df_to_matrix assigns identical column indices for df_warm and
# df_cold — confirm against the helper's implementation.
matrix_cold, rid_to_idx_cold, idx_to_rid_cold, _, _ = df_to_matrix( 
    df_cold, "user_id", "item_id", "rating")



In [10]:
def split_and_combine(strategy="artist-only"):
    """Build the known (K), elicitation-pool (X) and test matrices for the cold users.

    The cold-user rating matrix is rebuilt from the module-level ``df_cold`` and
    restricted to artist columns. That artist matrix is split twice: first by
    ``train_test_split`` (``split_count=30``) into an active-learning part and a
    test part, then by ``train_test_split_csr(..., 1)`` into ``X`` and ``K``.
    All returned matrices keep the full column layout of the cold matrix.

    Parameters:
    - strategy: ``"artist-only"`` returns the artist-only pool as ``X``;
      ``"hybrid"`` additionally adds every non-artist column of the cold matrix
      to ``X``.

    Returns:
    - (K_cold, X_cold, test_cold)

    Raises:
    - ValueError: if ``strategy`` is not one of the two supported names
      (previously the function fell through and returned ``None``).
    """
    if strategy not in ("artist-only", "hybrid"):
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'artist-only' or 'hybrid'"
        )

    # Full cold matrix and its item-id -> column-index mapping.
    matrix_cold, _, _, cid_to_idx, _ = df_to_matrix(
        df_cold, "user_id", "item_id", "rating"
    )
    matrix_cold = matrix_cold.tocsr()

    # Boolean column masks: True where the column belongs to an artist item.
    # Each artist id is looked up once instead of once per rating row.
    artist_item_ids = df_cold.loc[df_cold['item_type'] == 'artist', 'item_id'].unique()
    artist_mask = np.isin(
        np.arange(matrix_cold.shape[1]),
        [cid_to_idx[iid] for iid in artist_item_ids]
    )
    # Every column that is not an artist is treated as "genre".
    genre_mask = ~artist_mask

    # Zero out the non-artist columns while keeping the matrix shape.
    matrix_cold_artist = matrix_cold.multiply(artist_mask).tocsr()

    # The two splits run in this order for both strategies, so any randomness
    # inside the helpers is consumed identically regardless of strategy.
    al_artist, test_cold, _ = train_test_split(
        matrix_cold_artist,
        split_count=30,
        fraction=None
    )
    X_cold, K_cold, _ = train_test_split_csr(al_artist, 1)

    if strategy == "hybrid":
        # The genre part is only needed here, so it is only built here.
        X_cold = X_cold + matrix_cold.multiply(genre_mask).tocsr()

    return K_cold, X_cold, test_cold


In [11]:
# Seed Python's and NumPy's global RNGs once before building the splits.
# NOTE(review): the seed is not reset between the two calls below. If the split
# helpers draw from these global RNGs, the "artist-only" and "hybrid" runs get
# different held-out sets — confirm this is intended.
my_seed = 7
random.seed(my_seed)
np.random.seed(my_seed)

# Each call returns (K, X, test) for the cold users.
train_cold_K_artist, X_cold_artist, test_cold_artist  = split_and_combine("artist-only")
train_cold_K_hybrid, X_cold_hybrid, test_cold_hybrid = split_and_combine("hybrid")


In [None]:
# Number of distinct rows and distinct columns holding a nonzero entry,
# reported for each of the three hybrid matrices in turn.
for _row_label, _col_label, _matrix in (
    ("Cold users in X", "Cold items in X", X_cold_hybrid),
    ("test users in test", "test items in test", test_cold_hybrid),
    ("train users in train", "train items in train", train_cold_K_hybrid),
):
    _rows, _cols = _matrix.nonzero()
    print(f"{_row_label}: {len(np.unique(_rows))}")
    print(f"{_col_label}: {len(np.unique(_cols))}")

print("-------------------")

# Dimensions of the same three matrices.
print(f"Shape of test_cold: {test_cold_hybrid.shape}")
print(f"Shape of train_cold: {train_cold_K_hybrid.shape}")
print(f"Shape of X_cold: {X_cold_hybrid.shape}")

# Elicitation with X containing artist + genre ratings (hybrid split)

In [13]:
def _fit_svd_and_score(algo, reader, train_matrix, testset, idx_to_rid, idx_to_cid):
    """Refit ``algo`` on ``train_matrix`` and return ``(rmse, mae)`` measured on ``testset``."""
    train_df = matrix_to_df_2(train_matrix, idx_to_rid, idx_to_cid)
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    algo.fit(data_r.build_full_trainset())
    predictions = algo.test(testset)
    # Both metrics print themselves (verbose=True); RMSE is reported before MAE.
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)
    return rmse, mae


def elicitation_by_pairwise_tree_retrain_skiped(Tree, train, test, X, matrix_warm,
                                                 idx_to_rid_cold, idx_to_rid_warm, idx_to_cid,
                                                 iteration=5, strategy=1):
    """
    Pairwise tree-based elicitation (skip asked pairs). Retrain tree each round.
    At each iteration, every user walks the freshly trained tree from the root,
    following pairs that were already asked, and stops at the first unasked
    (itemA, itemB) pair; any positive ratings for those two items are moved
    from X to K. An SVD model is refit on K and scored on ``test`` before the
    first round (baseline) and after every round.

    Parameters:
    - Tree: tree class; instantiated as ``Tree(max_depth=i+1, min_instances=5,
      item_type_map=item_type_map)`` in round ``i`` (0-based).
    - train: known ratings K of the cold users (sparse, users x items). Not modified.
    - test: held-out ratings used for evaluation.
    - X: elicitation pool of hidden ratings. Not modified; a copy is consumed.
    - matrix_warm: warm-user rating matrix used together with K to train the tree.
    - idx_to_rid_cold, idx_to_rid_warm: row index -> user id for cold / warm matrices.
    - idx_to_cid: column index -> item id, applied to all matrices.
    - iteration: number of elicitation rounds.
    - strategy: 1 (top2), 2 (most similar), 3 (least similar); forwarded to ``Tree.fit``.

    Returns:
    - rmse_list, mae_list: performance metrics, baseline first, then one per iteration
    - item_type_stats: per-round asked item type counts (type of itemA only)
    """

    num_users = train.shape[0]
    train_copy = train.tolil().copy()
    X_copy = X.tolil().copy()
    rmse_list, mae_list = [], []
    asked_pairs = {u: set() for u in range(num_users)}
    item_type_stats = defaultdict(lambda: defaultdict(int))
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}

    reader = Reader(rating_scale=(0, 1))
    algo = SVD()

    # Step 0: Baseline evaluation
    print("🔍 Evaluating baseline RMSE/MAE...")
    # The test matrix never changes, so its surprise test set is built once
    # and reused for the baseline and for every round.
    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    test_data = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)
    testset = test_data.build_full_trainset().build_testset()

    rmse, mae = _fit_svd_and_score(algo, reader, train_copy, testset,
                                   idx_to_rid_cold, idx_to_cid)
    rmse_list.append(rmse)
    mae_list.append(mae)
    print("✅ Baseline evaluation complete.")

    # The warm matrix is never modified here; its DataFrame form is built
    # lazily in the first round and then reused by all later rounds.
    warm_df = None

    # Iterative elicitation
    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration} (skip asked pairs, walk from root)")
        # Step 1: Retrain tree on warm users + current known cold ratings
        if warm_df is None:
            warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)
        coldK_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, coldK_df], ignore_index=False)

        # The tree is allowed one more level of depth each round.
        pct = Tree(max_depth=i+1, min_instances=5, item_type_map=item_type_map)
        pct.fit(x_df, x_df, strategy=strategy)
        print("🌳 Tree re-trained.")

        for u in range(num_users):
            node = pct.root
            while node and not node.is_leaf and node.attribute_name:
                itemA, itemB = node.attribute_name
                # A pair that references an item without a column cannot be asked.
                if itemA not in cid_to_idx or itemB not in cid_to_idx:
                    node = None
                    break

                itemA_idx = cid_to_idx[itemA]
                itemB_idx = cid_to_idx[itemB]
                pair = frozenset([itemA_idx, itemB_idx])

                if pair in asked_pairs[u]:
                    # Already asked: descend using the ratings now stored in K.
                    # Child 0 = A preferred, child 1 = B preferred, child 2 = tie.
                    ratingA = train_copy[u, itemA_idx]
                    ratingB = train_copy[u, itemB_idx]
                    if ratingA > ratingB:
                        node = node.children[0]
                    elif ratingB > ratingA:
                        node = node.children[1]
                    else:
                        node = node.children[2]
                    continue

                # First unasked pair found
                ratingA = X_copy[u, itemA_idx]
                ratingB = X_copy[u, itemB_idx]

                # Only strictly positive pool entries are moved from X to K.
                # NOTE(review): the data preview contains ratings of 0.00; such a
                # rating is indistinguishable from "not in X" here — confirm intended.
                if ratingA > 0:
                    train_copy[u, itemA_idx] = ratingA
                    X_copy[u, itemA_idx] = 0
                if ratingB > 0:
                    train_copy[u, itemB_idx] = ratingB
                    X_copy[u, itemB_idx] = 0

                # The pair counts as asked even when neither rating was available.
                asked_pairs[u].add(pair)

                # 🌟 Record item type (based on itemA)
                item_type = get_item_type(itemA)
                item_type_stats[i][item_type] += 1

                break  # only one pair per user per iteration

        # Step 2: Evaluate
        print("📊 Evaluating after this iteration...")
        rmse, mae = _fit_svd_and_score(algo, reader, train_copy, testset,
                                       idx_to_rid_cold, idx_to_cid)
        rmse_list.append(rmse)
        mae_list.append(mae)
        print(f"✅ Iteration {i+1} complete.")

    return rmse_list, mae_list, item_type_stats


In [None]:
import warnings
# Suppress FutureWarning messages only; other warning categories are still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Run 20 elicitation rounds on the hybrid split with tree strategy 1.
# The result is the tuple (rmse_list, mae_list, item_type_stats).
pairwise_tree_1 = elicitation_by_pairwise_tree_retrain_skiped(
    Tree=Tree,
    train=train_cold_K_hybrid,
    test=test_cold_hybrid,
    X=X_cold_hybrid,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20, strategy=1
)

🔍 Evaluating baseline RMSE/MAE...
RMSE: 0.5735
MAE:  0.4082
✅ Baseline evaluation complete.

🔁 Iteration 1/20 (skip asked pairs, walk from root)
✅Initializing Splitter...
✅ Calling build()...
🔍 Pair found: 172223, 278591
✅ Tree built successfully!
🌳 Tree re-trained.
📊 Evaluating after this iteration...
RMSE: 0.5566
MAE:  0.4097
✅ Iteration 1 complete.

🔁 Iteration 2/20 (skip asked pairs, walk from root)
✅Initializing Splitter...
✅ Calling build()...
🔍 Pair found: 172223, 278591
🔍 Pair found: 189272, 132285
🔍 Pair found: 195412, 151565
🔍 Pair found: 83754, 278591
✅ Tree built successfully!
🌳 Tree re-trained.
📊 Evaluating after this iteration...
RMSE: 0.5460
MAE:  0.4095
✅ Iteration 2 complete.

🔁 Iteration 3/20 (skip asked pairs, walk from root)
✅Initializing Splitter...
✅ Calling build()...


In [None]:
# Unpack the strategy-1 run, then print for every round the share of asked
# pairs per item type (the type recorded is that of the pair's first item).
rmse_list, mae_list, item_type_stats = pairwise_tree_1
for round_i in item_type_stats:
    type_counts = item_type_stats[round_i]
    total = sum(type_counts.values())
    ratios = dict(zip(type_counts.keys(), (n / total for n in type_counts.values())))
    print(f"Round {round_i+1}: {ratios}")
item_type_stats

In [None]:
import warnings
# Suppress FutureWarning messages only; other warning categories are still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Run 20 elicitation rounds on the hybrid split with tree strategy 2.
# The result is the tuple (rmse_list, mae_list, item_type_stats).
pairwise_tree_2 = elicitation_by_pairwise_tree_retrain_skiped(
    Tree=Tree,
    train=train_cold_K_hybrid,
    test=test_cold_hybrid,
    X=X_cold_hybrid,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20, strategy=2
)

In [None]:
# Unpack the strategy-2 run, then print for every round the share of asked
# pairs per item type (the type recorded is that of the pair's first item).
rmse_list, mae_list, item_type_stats = pairwise_tree_2
for round_i in item_type_stats:
    type_counts = item_type_stats[round_i]
    total = sum(type_counts.values())
    ratios = dict(zip(type_counts.keys(), (n / total for n in type_counts.values())))
    print(f"Round {round_i+1}: {ratios}")
item_type_stats

In [None]:
import warnings
# Suppress FutureWarning messages only; other warning categories are still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Run 20 elicitation rounds on the hybrid split with tree strategy 3.
# The result is the tuple (rmse_list, mae_list, item_type_stats).
pairwise_tree_3 = elicitation_by_pairwise_tree_retrain_skiped(
    Tree=Tree,
    train=train_cold_K_hybrid,
    test=test_cold_hybrid,
    X=X_cold_hybrid,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20, strategy=3
)

In [None]:
# Unpack the strategy-3 run, then print for every round the share of asked
# pairs per item type (the type recorded is that of the pair's first item).
rmse_list, mae_list, item_type_stats = pairwise_tree_3
for round_i in item_type_stats:
    type_counts = item_type_stats[round_i]
    total = sum(type_counts.values())
    ratios = dict(zip(type_counts.keys(), (n / total for n in type_counts.values())))
    print(f"Round {round_i+1}: {ratios}")
item_type_stats