In [None]:
import os
import sys

# Directory this code runs from. A notebook kernel defines no ``__file__``,
# in which case the current working directory is used instead.
if "__file__" in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Put the parent directory on the import path (once) so that modules living
# one level above this notebook can be imported.
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [None]:

# import pysparnn.cluster_index as ci
# import scipy.sparse
# import os
import math
import numpy as np
import pandas as pd
import random
import scipy.sparse as sp
from utils_elicitation import train_test_split, df_to_matrix ,matrix_to_df_2, threshold_interactions_df, matrix_to_df,set_intersection,get_0_and_p_index,set_diff, matrix_to_full_df, threshold_interactions_df_plus, train_test_split_csr

!pip install surprise
from surprise import Reader, accuracy
from surprise import SVD
from surprise import Dataset
from collections import Counter
from collections import defaultdict

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from pct.tree.heuristic.Heuristic import Heuristic5
from pct.tree.heuristic.NumericHeuristic import NumericHeuristic5
from pct.tree.splitter.splitter import Splitter
from pct.tree.tree import Tree

In [None]:
# Load the pre-filtered ratings table. Later cells read its user_id, item_id,
# item_type and rating columns.
filtered_df = pd.read_csv("../Yahoodata/filtered_df.csv")
# Bare expression: the notebook renders the frame as this cell's output.
filtered_df

In [None]:
# Lookup table item_id -> item_type, taken from the first row in which each
# item_id appears.
item_type_map = (
    filtered_df
    .drop_duplicates(subset='item_id')
    .set_index('item_id')['item_type']
    .to_dict()
)


In [None]:
def get_item_type(item_id):
    """Return the type stored for ``item_id`` in ``item_type_map``, or ``'unknown'`` if the id is absent."""
    try:
        return item_type_map[item_id]
    except KeyError:
        return 'unknown'


In [None]:
# Spot-check the lookup on two item ids.
print(get_item_type(238709))  
print(get_item_type(141799))  


In [None]:
# Distinct user ids in ascending order. split_users_by_ratio slices this list,
# so the warm/cold split follows id order rather than being random.
all_user_ids = sorted(filtered_df['user_id'].unique().tolist())

def split_users_by_ratio(all_user_ids, ratio):
    """Split a sequence of user ids into a leading "warm" part and a trailing "cold" part.

    Parameters:
    - all_user_ids: sliceable sequence of user ids (order is preserved)
    - ratio: fraction of the sequence, between 0 and 1 inclusive, assigned to
      the warm part; the split point is ``int(len(all_user_ids) * ratio)``

    Returns:
    - (warm_users, cold_users): the first ``split_point`` ids and the rest

    Raises:
    - ValueError: if ``ratio`` is not within [0, 1]. A negative ratio would
      otherwise yield a negative split point and silently slice from the end.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")
    split_point = int(len(all_user_ids) * ratio)
    return all_user_ids[:split_point], all_user_ids[split_point:]

# Example ratios from 10% to 50%
# ratios = [0.1, 0.2, 0.3, 0.4, 0.5]
# splits = {r: split_users_by_ratio(all_user_ids, r) for r in ratios}


## 10% warm

In [None]:
# The first 10% of all_user_ids are treated as warm users, the remainder as cold users.
warm_users_idx, cold_users_idx = split_users_by_ratio(all_user_ids, 0.1)

df_warm = filtered_df.loc[filtered_df['user_id'].isin(warm_users_idx)].copy()
df_cold = filtered_df.loc[filtered_df['user_id'].isin(cold_users_idx)].copy()

# The item mappings (cid_to_idx / idx_to_cid) are kept from the warm call only.
# NOTE(review): the cold call's item mappings are discarded. If df_to_matrix
# derives column indices from the items present in the frame it is given, the
# warm mappings do not describe matrix_cold's columns - confirm both calls
# produce the same column layout.
(
    matrix_warm,
    rid_to_idx_warm,
    idx_to_rid_warm,
    cid_to_idx,
    idx_to_cid,
) = df_to_matrix(df_warm, "user_id", "item_id", "rating")

matrix_cold, rid_to_idx_cold, idx_to_rid_cold, _, _ = df_to_matrix(
    df_cold, "user_id", "item_id", "rating"
)



In [None]:
def split_and_combine(strategy="artist-only"):
    """Build the cold users' known (K), elicitable (X) and test matrices.

    The cold rating matrix is rebuilt from the module-level ``df_cold`` and
    restricted to artist columns. ``train_test_split`` (split_count=30) then
    separates a test matrix from the artist ratings, and
    ``train_test_split_csr(..., 1)`` separates the remainder into X and K.
    All returned matrices keep the full cold-matrix shape.

    Parameters:
    - strategy: ``"artist-only"`` returns X with artist ratings only;
      ``"hybrid"`` additionally adds every non-artist rating to X.

    Returns:
    - (K_cold, X_cold, test_cold)

    Raises:
    - ValueError: if ``strategy`` is not one of the two supported values
      (previously the function fell through and returned ``None``).
    """
    if strategy not in ("artist-only", "hybrid"):
        raise ValueError(
            f"unknown strategy {strategy!r}; expected 'artist-only' or 'hybrid'"
        )

    # Full cold matrix plus the item-id -> column mapping of this very matrix.
    matrix_cold, _, _, cid_to_idx, _ = df_to_matrix(
        df_cold, "user_id", "item_id", "rating"
    )
    matrix_cold = matrix_cold.tocsr()

    # Boolean column mask: True where the column belongs to an artist item.
    artist_item_ids = df_cold.loc[df_cold['item_type'] == 'artist', 'item_id'].unique().tolist()
    artist_columns = [cid_to_idx[iid] for iid in artist_item_ids]
    artist_mask = np.isin(np.arange(matrix_cold.shape[1]), artist_columns)

    # Multiplying by the mask zeroes the other columns but keeps the shape,
    # so the result stays aligned with the full cold matrix.
    matrix_cold_artist = matrix_cold.multiply(artist_mask).tocsr()

    al_artist, test_cold, _ = train_test_split(
        matrix_cold_artist,
        split_count=30,
        fraction=None
    )
    X_cold, K_cold, _ = train_test_split_csr(al_artist, 1)

    if strategy == "hybrid":
        # Non-artist ratings are never split off; they all become elicitable.
        matrix_cold_genre = matrix_cold.multiply(~artist_mask).tocsr()
        X_cold = X_cold + matrix_cold_genre

    return K_cold, X_cold, test_cold


In [None]:
# Seed both the stdlib and the NumPy global generators before splitting.
my_seed = 7
random.seed(my_seed)
np.random.seed(my_seed)

# Each call returns (K, X, test) for the cold users.
# NOTE(review): the generators are seeded once, before the first call only. If
# the split helpers draw from these generators, the hybrid call continues the
# random stream and its K/test split may differ from the artist-only one -
# confirm this is intended when comparing the two strategies.
train_cold_K_artist, X_cold_artist, test_cold_artist  = split_and_combine("artist-only")
train_cold_K_hybrid, X_cold_hybrid, test_cold_hybrid = split_and_combine("hybrid")


In [None]:
# Number of distinct users (rows) and items (columns) that hold at least one
# non-zero rating in each artist-only matrix.
x_users, x_items = X_cold_artist.nonzero()
print(f"Cold users in X: {np.unique(x_users).size}")
print(f"Cold items in X: {np.unique(x_items).size}")

test_users, test_items = test_cold_artist.nonzero()
print(f"test users in test: {np.unique(test_users).size}")
print(f"test items in test: {np.unique(test_items).size}")

train_users, train_items = train_cold_K_artist.nonzero()
print(f"train users in train: {np.unique(train_users).size}")
print(f"train items in train: {np.unique(train_items).size}")

print("-------------------")

# Matrix shapes of the three artist-only matrices.
print(f"Shape of test_cold: {test_cold_artist.shape}")
print(f"Shape of train_cold: {train_cold_K_artist.shape}")
print(f"Shape of X_cold: {X_cold_artist.shape}")

In [None]:
# Number of distinct users (rows) and items (columns) that hold at least one
# non-zero rating in each hybrid matrix.
x_users, x_items = X_cold_hybrid.nonzero()
print(f"Cold users in X: {np.unique(x_users).size}")
print(f"Cold items in X: {np.unique(x_items).size}")

test_users, test_items = test_cold_hybrid.nonzero()
print(f"test users in test: {np.unique(test_users).size}")
print(f"test items in test: {np.unique(test_items).size}")

train_users, train_items = train_cold_K_hybrid.nonzero()
print(f"train users in train: {np.unique(train_users).size}")
print(f"train items in train: {np.unique(train_items).size}")

print("-------------------")

# Matrix shapes of the three hybrid matrices.
print(f"Shape of test_cold: {test_cold_hybrid.shape}")
print(f"Shape of train_cold: {train_cold_K_hybrid.shape}")
print(f"Shape of X_cold: {X_cold_hybrid.shape}")

Cold users in X: 1531
Cold items in X: 4793
test users in test: 1478
test items in test: 3907
train users in train: 666
train items in train: 571
-------------------
Shape of test_cold: (1531, 5011)
Shape of train_cold: (1531, 5011)
Shape of X_cold: (1531, 5011)

In [None]:
def elicitation_by_tree_path_fixed_warm(
    tree_model_class,
    train, test, X,
    matrix_warm,
    idx_to_rid_cold, idx_to_rid_warm, idx_to_cid,
    iteration=5
):
    """
    Method 1 (revised): one fixed tree, one fixed path per cold user.

    The tree is fitted once on the warm users' ratings stacked with the cold
    users' initial ratings. In every round each cold user is asked about the
    item at their current node; if ``X`` holds a rating for it, that rating is
    moved into the training data. The user then descends one level, and the
    branch is chosen from the user's *initial* rating of that item (not from
    the rating just elicited). After every round a fresh SVD model is fitted
    on the cold users' training ratings and scored on ``test``.

    Parameters:
    - tree_model_class: tree class (not an instance); called as
      ``tree_model_class(max_depth=iteration, min_instances=5)``
    - train: sparse matrix with the cold users' initial ratings (not modified)
    - test: sparse matrix with the cold users' test ratings
    - X: sparse matrix with the cold users' elicitable ratings (not modified)
    - matrix_warm: sparse matrix with the warm users' ratings
    - idx_to_rid_cold, idx_to_rid_warm: row index -> user id mappings
    - idx_to_cid: column index -> item id mapping
    - iteration: number of elicitation rounds, also used as the tree's max depth

    Returns:
    - rmse_list, mae_list: ``iteration + 1`` values each; index 0 is the
      baseline measured before any elicitation.
    """
    from surprise import Dataset, Reader, SVD, accuracy

    rating_columns = ['user_id', 'item_id', 'rating']
    num_users = train.shape[0]
    train_static = train.tolil().copy()  # frozen initial ratings: tree input and branching
    train_copy = train.tolil().copy()    # grows as ratings are elicited
    X_copy = X.tolil().copy()
    rmse_list, mae_list = [], []
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}
    reader = Reader(rating_scale=(1, 100))

    # The test matrix never changes, so its Surprise test set is built once.
    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    testset = (
        Dataset.load_from_df(test_df[rating_columns], reader)
        .build_full_trainset()
        .build_testset()
    )

    def _evaluate():
        """Fit a fresh SVD on the current training ratings and record RMSE/MAE on the test set."""
        train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
        trainset = Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()
        algo = SVD()
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse_list.append(accuracy.rmse(predictions, verbose=True))
        mae_list.append(accuracy.mae(predictions, verbose=True))

    # Step 1: Baseline evaluation with cold users
    _evaluate()
    print("✅ Baseline evaluation complete.")

    # Step 2: Train fixed tree with warm + cold
    warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)
    cold_df = matrix_to_full_df(train_static, idx_to_rid_cold, idx_to_cid)
    x_df = pd.concat([warm_df, cold_df], ignore_index=False)

    tree_model = tree_model_class(max_depth=iteration, min_instances=5)
    tree_model.fit(x_df, x_df)
    print("🌳 Tree training complete.")

    # Step 3: Every cold user starts at the root
    user_paths = {u: tree_model.root for u in range(num_users)}

    # Step 4: Iterative elicitation
    for i in range(iteration):
        for u in range(num_users):
            node = user_paths[u]
            if node is None or node.is_leaf or node.attribute_name is None:
                continue

            item = node.attribute_name
            if item not in cid_to_idx:
                continue
            item_idx = cid_to_idx[item]

            rating_path = train_static[u, item_idx]  # from initial cold
            rating_real = X_copy[u, item_idx]        # from ground truth

            if rating_real > 0:
                train_copy[u, item_idx] = rating_real
                X_copy[u, item_idx] = 0

            # Update path: no initial rating -> child 2, rating >= 50 -> child 0,
            # any other rating -> child 1.
            if rating_path == 0:
                user_paths[u] = node.children[2]
            elif rating_path >= 50:
                user_paths[u] = node.children[0]
            else:
                user_paths[u] = node.children[1]

        # Step 5: Evaluate
        _evaluate()
        print(f"✅ Iteration {i+1} complete.")

    return rmse_list, mae_list


In [None]:
def elicitation_by_tree_path_retrain_depth_warm(
    tree_model_class,
    train,              # sparse matrix: cold users' known ratings (copied, then updated over rounds)
    test,               # sparse matrix: cold users' test ratings, fixed
    X,                  # sparse matrix: cold users' "hidden" ratings, revealed at most 1 per round
    matrix_warm,        # sparse matrix: warm users' ratings
    idx_to_rid_cold,    # cold user index → user_id
    idx_to_rid_warm,    # warm user index → user_id
    idx_to_cid,         # item index → item_id
    iteration=5
):
    """
    Method 2: Retrain the tree each round using warm + current cold ratings.

    In round ``i`` (0-based) a tree of max depth ``i + 1`` is fitted. Each
    cold user walks at most ``i`` levels down from the root, branching on the
    ratings currently known for them, and is asked about the item at the node
    reached. If ``X`` holds a rating for that item it is moved into the
    training data. After every round a fresh SVD model is fitted on the cold
    users' training ratings and scored on ``test``.

    Parameters:
    - tree_model_class: tree class (not an instance); called as
      ``tree_model_class(max_depth=i + 1, min_instances=5)``
    - iteration: number of elicitation rounds
    - remaining parameters: see the inline comments in the signature

    Returns:
    - rmse_list: RMSE values, index 0 being the baseline before elicitation
    - mae_list: MAE values, index 0 being the baseline before elicitation
    """
    rating_columns = ['user_id', 'item_id', 'rating']
    num_users = train.shape[0]
    train_copy = train.tolil().copy()
    X_copy = X.tolil().copy()
    rmse_list, mae_list = [], []
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}
    reader = Reader(rating_scale=(1, 100))

    # The test matrix never changes, so its Surprise test set is built once.
    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    testset = (
        Dataset.load_from_df(test_df[rating_columns], reader)
        .build_full_trainset()
        .build_testset()
    )

    def _evaluate():
        """Fit a fresh SVD on the current training ratings and record RMSE/MAE on the test set."""
        train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
        trainset = Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()
        algo = SVD()
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse_list.append(accuracy.rmse(predictions, verbose=True))
        mae_list.append(accuracy.mae(predictions, verbose=True))

    # Step 0: Baseline SVD using only the cold users' initial ratings
    _evaluate()
    print("✅ Baseline evaluation complete.")

    # matrix_warm is never modified in this function, so its dense frame is
    # built once and reused for every retraining.
    warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)

    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration} (Tree depth = {i+1})")

        # Step 1: Prepare tree training data (warm + updated cold)
        cold_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, cold_df], ignore_index=False)

        tree_model = tree_model_class(max_depth=i+1, min_instances=5)
        tree_model.fit(x_df, x_df)
        print("🌳 Tree re-trained.")

        # Step 2: Walk tree and elicit a new rating for each cold user
        for u in range(num_users):
            node = tree_model.root
            depth = 0

            # Descend at most i levels, following the ratings known so far.
            while node and not node.is_leaf and node.attribute_name and depth < i:
                item = node.attribute_name
                if item not in cid_to_idx:
                    node = None
                    break
                item_idx = cid_to_idx[item]
                rating = train_copy[u, item_idx]

                if rating >= 50:
                    node = node.children[0]  # Lovers
                elif rating > 0:
                    node = node.children[1]  # Haters
                else:
                    node = node.children[2]  # Unknowns

                depth += 1

            if node is None or node.attribute_name is None:
                continue

            # The walk can stop on the depth limit, so the node reached has
            # not been checked against the item mapping yet.
            item = node.attribute_name
            if item not in cid_to_idx:
                continue
            item_idx = cid_to_idx[item]
            rating = X_copy[u, item_idx]

            if rating > 0:
                train_copy[u, item_idx] = rating
                X_copy[u, item_idx] = 0

        # Step 3: Evaluate updated cold user SVD model
        _evaluate()
        print(f"✅ Iteration {i+1} complete.")

    return rmse_list, mae_list


In [1]:
from collections import defaultdict


def elicitation_by_tree_path_retrain_skiped_warm(Tree, train, test, X, matrix_warm, idx_to_rid_cold, idx_to_rid_warm, idx_to_cid, iteration=5):
    """
    Method 3 (revised): Retrain tree at each round. For each user, traverse from root down the tree.
    At each level, if the item was already asked (stored in asked_items), go deeper.
    Ask the first item the user hasn't been asked before.

    An item counts as asked as soon as it has been requested once, whether or
    not ``X`` held a rating for it. At most one item is asked per user per
    round. After every round the SVD model is refitted on the cold users'
    training ratings and scored on ``test``.

    Parameters:
    - Tree: class of the decision tree model (not an instance); called as
      ``Tree(max_depth=i + 1, min_instances=5)`` in round ``i`` (0-based)
    - train: sparse matrix with the cold users' known ratings (not modified)
    - test: sparse matrix with the cold users' test ratings
    - X: sparse matrix with the cold users' elicitable ratings (not modified)
    - matrix_warm: sparse matrix with the warm users' ratings
    - idx_to_rid_cold, idx_to_rid_warm: row index -> user id mappings
    - idx_to_cid: column index -> item id mapping
    - iteration: number of elicitation rounds

    Returns:
    - rmse_list, mae_list: ``iteration + 1`` values each; index 0 is the
      baseline measured before any elicitation.
    """
    rating_columns = ['user_id', 'item_id', 'rating']
    num_users = train.shape[0]
    train_copy = train.tolil().copy()
    X_copy = X.tolil().copy()
    rmse_list, mae_list = [], []
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}
    asked_items = {u: set() for u in range(num_users)}
    reader = Reader(rating_scale=(1, 100))
    algo = SVD()  # one instance, refitted from the current training data at every evaluation

    # The test matrix never changes, so its Surprise test set is built once.
    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    testset = (
        Dataset.load_from_df(test_df[rating_columns], reader)
        .build_full_trainset()
        .build_testset()
    )

    def _evaluate():
        """Refit the SVD on the current training ratings and record RMSE/MAE on the test set."""
        train_df = matrix_to_df_2(train_copy, idx_to_rid_cold, idx_to_cid)
        trainset = Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse_list.append(accuracy.rmse(predictions, verbose=True))
        mae_list.append(accuracy.mae(predictions, verbose=True))

    # Step 0: Baseline evaluation
    print("🔍 Evaluating baseline RMSE/MAE...")
    _evaluate()
    print("✅ Baseline evaluation complete.")

    # matrix_warm is never modified in this function, so its dense frame is
    # built once and reused for every retraining.
    warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)

    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration} (skip asked items, walk from root)")
        # Retrain tree using current train_copy
        coldK_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, coldK_df], ignore_index=False)

        pct = Tree(max_depth=i + 1, min_instances=5)
        pct.fit(x_df, x_df)
        print("🌳 Tree re-trained.")

        for u in range(num_users):
            node = pct.root

            # Traverse down the tree until we find an unasked item
            while node and not node.is_leaf and node.attribute_name:
                item = node.attribute_name
                if item not in cid_to_idx:
                    node = None
                    break

                item_idx = cid_to_idx[item]

                if item_idx in asked_items[u]:
                    # Already asked, go deeper based on user's rating
                    rating = train_copy[u, item_idx]
                    if rating >= 50:
                        node = node.children[0]
                    elif rating > 0:
                        node = node.children[1]
                    else:
                        node = node.children[2]
                    continue

                # First unasked item: try to add it to training
                asked_items[u].add(item_idx)
                rating = X_copy[u, item_idx]
                if rating > 0:
                    train_copy[u, item_idx] = rating
                    X_copy[u, item_idx] = 0
                break  # only ask one item per user per iteration

        # Step 3: Evaluate with SVD
        print("📊 Evaluating after this iteration...")
        _evaluate()
        print(f"✅ Iteration {i+1} complete.")

    return rmse_list, mae_list

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def elicitation_by_tree_path_retrain_depth_tree_only_warm(
    tree_model_class,
    train, test, X,
    matrix_warm,
    idx_to_rid_cold, idx_to_rid_warm, idx_to_cid,
    iteration=5
):
    """
    PCT 4 (enhanced version): Retrain a decision tree each round.
    The same tree is used for both elicitation and recommendation.
    Tree training uses cold users' known ratings + warm users' ratings.

    In round ``i`` (0-based) a tree of max depth ``i + 1`` is fitted, each
    cold user walks at most ``i`` levels down it and is asked about the item
    at the node reached, and the tree fitted in that round then predicts the
    test ratings. Ratings elicited in a round therefore only influence the
    tree of the following round.

    Parameters:
    - tree_model_class: tree class (not an instance)
    - train: sparse matrix with the cold users' known ratings (not modified)
    - test: sparse matrix with the cold users' test ratings
    - X: sparse matrix with the cold users' elicitable ratings (not modified)
    - matrix_warm: sparse matrix with the warm users' ratings
    - idx_to_rid_cold, idx_to_rid_warm: row index -> user id mappings
    - idx_to_cid: column index -> item id mapping
    - iteration: number of rounds

    Returns:
    - rmse_list: RMSE after each iteration
    - mae_list: MAE after each iteration
      Both hold ``iteration`` values: unlike the SVD-based variants there is
      no baseline entry at index 0.
    """
    num_users = train.shape[0]
    train_copy = train.tolil().copy()  # cold users' ratings, updated each iteration
    X_copy = X.tolil().copy()          # cold users' hidden ratings
    cid_to_idx = {v: k for k, v in idx_to_cid.items()}

    rmse_list = []
    mae_list = []

    test_df = matrix_to_df_2(test, idx_to_rid_cold, idx_to_cid)
    test_matrix = matrix_to_full_df(test, idx_to_rid_cold, idx_to_cid)

    # matrix_warm is never modified in this function, so its dense frame is
    # built once and reused for every retraining.
    warm_df = matrix_to_full_df(matrix_warm, idx_to_rid_warm, idx_to_cid)

    for i in range(iteration):
        print(f"\n🔁 Iteration {i+1}/{iteration}")

        # Step 1: Prepare tree training data (cold + warm users)
        cold_df = matrix_to_full_df(train_copy, idx_to_rid_cold, idx_to_cid)
        x_df = pd.concat([warm_df, cold_df], ignore_index=False)

        tree_model = tree_model_class(max_depth=i+1, min_instances=5)
        tree_model.fit(x_df, x_df)
        print("🌳 Tree trained.")

        # Step 2: Elicitation - each cold user walks the tree to elicit one new rating
        for u in range(num_users):
            node = tree_model.root
            depth = 0

            # Descend at most i levels, following the ratings known so far.
            while node and not node.is_leaf and node.attribute_name and depth < i:
                item = node.attribute_name
                if item not in cid_to_idx:
                    node = None
                    break
                item_idx = cid_to_idx[item]
                rating = train_copy[u, item_idx]

                if rating >= 50:
                    node = node.children[0]  # Lovers
                elif rating > 0:
                    node = node.children[1]  # Haters
                else:
                    node = node.children[2]  # Unknowns

                depth += 1

            if node is None or node.attribute_name is None:
                continue

            item = node.attribute_name
            if item not in cid_to_idx:
                continue
            item_idx = cid_to_idx[item]
            rating = X_copy[u, item_idx]

            if rating > 0:
                train_copy[u, item_idx] = rating
                X_copy[u, item_idx] = 0

        print("❓ Elicitation done. Now evaluating...")

        # Step 3: Recommendation - use the trained tree to predict cold users' test ratings
        # NOTE(review): the frame handed to predict() is built from the test
        # ratings themselves - confirm the tree does not read the values it is
        # scored against.
        pred_matrix = tree_model.predict(test_matrix)

        y_true, y_pred = [], []

        # presumably predict() returns a frame indexed by user id with item ids
        # as columns, which is what the .loc lookup below requires - verify.
        for row in test_df.itertuples():
            uid, iid = row.user_id, row.item_id
            true_rating = row.rating
            pred_rating = pred_matrix.loc[uid, iid]

            if not pd.isna(pred_rating):
                y_true.append(true_rating)
                y_pred.append(pred_rating)

        # Take the root explicitly: the ``squared`` keyword of
        # mean_squared_error is deprecated and absent from newer scikit-learn.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        rmse_list.append(rmse)
        mae_list.append(mae)

        print(f"✅ Iteration {i+1} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")

    return rmse_list, mae_list


# X with artist only

In [None]:
import warnings
# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Method 1 (fixed tree, fixed paths) on the artist-only split, 20 rounds.
# The result is the pair (rmse_list, mae_list).
pct1_artist_10 = elicitation_by_tree_path_fixed_warm(
    tree_model_class=Tree,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20
)

In [None]:
import warnings
# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Method 2 (tree retrained each round, depth-limited walk) on the artist-only
# split, 20 rounds. The result is the pair (rmse_list, mae_list).
pct2_artist_10 = elicitation_by_tree_path_retrain_depth_warm(
    tree_model_class=Tree,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20
)

In [None]:
import warnings
# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Method 3 (tree retrained each round, already-asked items skipped) on the
# artist-only split, 20 rounds. The result is the pair (rmse_list, mae_list).
# Note that this function names its tree-class parameter ``Tree``.
pct3_artist_10 = elicitation_by_tree_path_retrain_skiped_warm(
    Tree=Tree,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20
)

In [None]:
import warnings
# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# PCT 4 (the tree itself makes the predictions) on the artist-only split,
# 20 rounds. The result is the pair (rmse_list, mae_list); this variant
# appends one value per round and records no baseline before the first round.
pct4_artist_10 = elicitation_by_tree_path_retrain_depth_tree_only_warm(
    tree_model_class=Tree,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    matrix_warm=matrix_warm,
    idx_to_rid_cold=idx_to_rid_cold,
    idx_to_rid_warm=idx_to_rid_warm,
    idx_to_cid=idx_to_cid,
    iteration=20
)

# Alternative Method

In [None]:
def i_entropy(matirx):
    """Shannon entropy (natural log) of the stored rating values of every column.

    Only the values stored in each sparse column are counted; a column with
    no stored values gets entropy 0.

    Returns a NumPy array with one entry per column.
    """
    per_item = []
    for col in range(matirx.shape[1]):
        values = matirx.getcol(col).data
        total = len(values)
        ent = 0
        # One term per distinct rating value: -p * log(p)
        for occurrences in Counter(values).values():
            share = occurrences / total
            ent -= share * math.log(share)
        per_item.append(ent)
    return np.asarray(per_item)

def i_entropy_0(matirx):
    """Shannon entropy (natural log) of every column, with "not rated" as its own class.

    For each column the distinct non-zero rating values are counted, and the
    number of rows without a non-zero entry is counted under the value 0.
    Probabilities are taken over all rows of the matrix.

    Classes with a count of zero contribute nothing to the entropy. This
    matters when every row has rated an item: the "not rated" class is then
    empty, and evaluating ``log(0)`` for it would raise ``ValueError``.

    Returns a NumPy array with one entry per column.
    """
    entropy = []
    c_all = matirx.shape[0]
    for item in range(0, matirx.shape[1]):
        column = matirx.getcol(item)
        count_r = Counter(column.data)
        # Overwrites any explicitly stored zeros with the true number of
        # rows lacking a non-zero rating.
        count_r[0] = c_all - column.count_nonzero()
        ent = 0
        for occurrences in count_r.values():
            if occurrences > 0:
                share = occurrences / c_all
                ent -= share * math.log(share)
        entropy.append(ent)
    return (np.asarray(entropy))

def i_pop(matrix):
    """Popularity of every column: the number of non-zero entries it contains.

    Returns a NumPy array with one count per column.
    """
    counts = [
        matrix.getcol(col).count_nonzero()
        for col in range(matrix.shape[1])
    ]
    return np.asarray(counts)



def helf0(matrix, num_rating_levels=5):
    """HELF-style score: harmonic mean of normalised log-popularity and normalised entropy.

    For every column:
    - ``lf`` = log(popularity) / log(number of rows), popularity from ``i_pop``
    - ``h``  = ``i_entropy_0`` / log(num_rating_levels)
    - score  = 2 * lf * h / (lf + h)

    Parameters:
    - matrix: sparse rating matrix (rows = users, columns = items)
    - num_rating_levels: number of rating levels used to normalise the
      entropy. The default of 5 keeps the previous hard-coded behaviour;
      pass another value when the rating scale has a different number of levels.

    Returns a NumPy array with one score per column. Columns without any
    rating have log(0) popularity and end up as NaN, as does every column
    whose ``lf + h`` is zero; NumPy emits runtime warnings in those cases.
    """
    log_U = np.log(matrix.shape[0])
    lf = np.log(i_pop(matrix)) / log_U
    h = i_entropy_0(matrix) / np.log(num_rating_levels)
    helf = (2 * lf * h) / (lf + h)
    return helf

def i_variance(matirx):
    """Variance of every column, computed as E[x^2] - (E[x])^2.

    Both means are taken over all rows of the sparse matrix, so missing
    entries count as zeros.

    Returns a flat NumPy array with one variance per column.
    """
    mean_of_squares = matirx.power(2).mean(0)
    mean = matirx.mean(0)
    # np.square is element-wise for both ndarray and np.matrix results;
    # the previous code subtracted np.sqrt(mean), which is not a variance.
    variance = mean_of_squares - np.square(mean)
    return (np.asarray(variance)).flatten()

def i_random(matirx):
    """Random score for every column, for use as a random elicitation strategy.

    The previous body was a verbatim copy of ``i_variance`` and therefore
    produced no randomness at all. Scores are now drawn uniformly from
    [0, 1) using NumPy's global generator, so ``np.random.seed`` makes the
    result reproducible.

    Returns a flat NumPy array with one score per column.
    """
    return np.random.rand(matirx.shape[1])


In [None]:
import scipy.sparse as sp


def elicitation_np_warm(matrix_warm,train,test,X,strategy,iteration,k,positive=False):
    """Non-personalised elicitation driven by an item-scoring strategy.

    Each round the warm ratings and the cold users' current ratings are
    stacked, ``strategy`` scores every item, and each cold user is asked about
    the ``k`` best-ranked items they have neither rated initially nor been
    asked about before. Ratings found in ``X`` for those items are moved into
    the training data. After every round the SVD model is refitted on the cold
    users' training ratings and scored on ``test``.

    The user/item id mappings are read from the module-level names
    ``idx_to_rid_cold`` and ``idx_to_cid``.

    Parameters:
    - matrix_warm: sparse matrix with the warm users' ratings
    - train: sparse matrix with the cold users' known ratings (not modified)
    - test: sparse matrix with the cold users' test ratings
    - X: sparse matrix with the cold users' elicitable ratings (not modified)
    - strategy: callable taking the stacked sparse matrix and returning one
      score per item
    - iteration: the loop performs ``iteration - 1`` elicitation rounds
    - k: number of items asked per user per round
    - positive: if True items are ranked by descending score, otherwise by
      ascending score

    Returns:
    - rmse, mae: ``iteration`` values each (for ``iteration >= 1``); index 0
      is the baseline measured before any elicitation.
    """
    rmse,mae = [],[]
    rating_columns = ['user_id', 'item_id', 'rating']
    reader = Reader(rating_scale=(1, 100))
    algo = SVD()  # one instance, refitted from the current training data at every evaluation

    # The test matrix never changes, so its Surprise test set is built once,
    # with the same converter for the baseline and for every round (the
    # per-round frames used to come from matrix_to_df instead of
    # matrix_to_df_2).
    test_df = matrix_to_df_2(test,idx_to_rid_cold,idx_to_cid)
    testset = (
        Dataset.load_from_df(test_df[rating_columns], reader)
        .build_full_trainset()
        .build_testset()
    )

    def _evaluate(current_train):
        """Refit the SVD on ``current_train`` and record RMSE/MAE on the test set."""
        train_df = matrix_to_df_2(current_train,idx_to_rid_cold,idx_to_cid)
        trainset = Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()
        algo.fit(trainset)
        predictions = algo.test(testset)
        rmse.append(accuracy.rmse(predictions))
        mae.append(accuracy.mae(predictions))

    # Performance before elicitation
    _evaluate(train)

    c_u_dict = {}  # per user: items that may still be asked
    all_items = list(range(0,train.shape[1]))
    X_copy = (X.tolil()).copy()
    warm_copy = matrix_warm.tolil().copy()
    train_copy = (train.tolil()).copy()

    for count in range(0,iteration-1):
        # Scores are recomputed every round because train_copy grows.
        x_matrix = sp.vstack([warm_copy, train_copy])
        scores = strategy(x_matrix)
        if positive:
            ranking = np.argsort(-scores)
        else:
            ranking = np.argsort(scores)
        for u in range(0, train.shape[0]):
            if count == 0:
                # Candidates start as every item the user has not rated yet.
                p_u = train_copy.getrow(u).nonzero()[1]
                c_u = set_diff(all_items,p_u)
            else:
                c_u = c_u_dict[u].copy()

            # np.isin replaces the deprecated np.in1d; ranking is 1-D, so the
            # resulting mask is identical.
            ranking_u = ranking[np.isin(ranking,c_u)]
            topk_u = ranking_u[:k]
            # Asked items leave the candidate pool whether or not X holds a rating.
            c_u_dict[u] = set_diff(c_u,topk_u)
            px_u = X_copy.getrow(u).nonzero()[1]
            recom = set_intersection(topk_u,px_u)
            for item in recom:
                train_copy[u,item] = X[u,item]
                X_copy[u,item] = 0

        _evaluate(train_copy)

        print(count+1)
    return rmse,mae

In [None]:
# Entropy strategy with "not rated" as its own class, on the artist-only split.
# elicitation_np_warm loops over range(0, iteration-1), so iteration=20 gives
# 19 elicitation rounds after the baseline. positive=False ranks items by
# ascending score.
# NOTE(review): ascending order asks the lowest-entropy items first - confirm
# this is the intended direction.
entropy0_artist_10 = elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    strategy=i_entropy_0,
    iteration=20,
    k=1,
    positive=False
)

In [None]:
import warnings
# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)


# helf0 strategy on the artist-only split. elicitation_np_warm loops over
# range(0, iteration-1), so iteration=20 gives 19 elicitation rounds after the
# baseline. positive=False ranks items by ascending score.
# NOTE(review): ascending order asks the lowest-scoring items first - confirm
# this is the intended direction.
helf0_artist_10 = elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    strategy=helf0,
    iteration=20,
    k=1,
    positive=False
)

In [None]:
# i_variance strategy on the artist-only split. elicitation_np_warm loops over
# range(0, iteration-1), so iteration=20 gives 19 elicitation rounds after the
# baseline. positive=False ranks items by ascending score.
# NOTE(review): ascending order asks the lowest-scoring items first - confirm
# this is the intended direction.
variance_artist_10 = elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    strategy=i_variance,
    iteration=20,
    k=1,
    positive=False
)

In [None]:
# Popularity strategy on the artist-only split. elicitation_np_warm loops over
# range(0, iteration-1), so iteration=20 gives 19 elicitation rounds after the
# baseline. positive=False ranks items by ascending score.
# NOTE(review): ascending order asks the least popular items first - confirm
# this is the intended direction.
pop_artist_10 = elicitation_np_warm(
    matrix_warm=matrix_warm,
    train=train_cold_K_artist,
    test=test_cold_artist,
    X=X_cold_artist,
    strategy=i_pop,
    iteration=20,
    k=1,
    positive=False
)

# Max performance

In [None]:
# RMSE curves of the four tree-based methods against a fixed reference line.
fig, ax1 = plt.subplots(1)
baseline_artist = 25.6939

for curve_label, curve_result in (
    ('pct1_artist_10', pct1_artist_10),
    ('pct2_artist_10', pct2_artist_10),
    ('pct3_artist_10', pct3_artist_10),
    ('pct4_artist_10', pct4_artist_10),
):
    curve_rmse = curve_result[0]  # element 0 of each result is its RMSE list
    ax1.plot(np.arange(len(curve_rmse)), curve_rmse, label=curve_label)
ax1.axhline(y=baseline_artist, color='gray', linestyle='--', linewidth=2, label='Max performance artist')

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.xticks(range(1, 21))

plt.show()

In [None]:
# RMSE curves of tree method 3 and the four score-based strategies against a
# fixed reference line.
fig, ax1 = plt.subplots(1)
baseline_artist = 25.6939

for curve_label, curve_result in (
    ('pct3_artist_10', pct3_artist_10),
    ('entropy0_artist_10', entropy0_artist_10),
    ('helf0_artist_10', helf0_artist_10),
    ('variance_artist_10', variance_artist_10),
    ('pop_artist_10', pop_artist_10),
):
    curve_rmse = curve_result[0]  # element 0 of each result is its RMSE list
    ax1.plot(np.arange(len(curve_rmse)), curve_rmse, label=curve_label)
ax1.axhline(y=baseline_artist, color='gray', linestyle='--', linewidth=2, label='Max performance artist')

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.xticks(range(1, 21))

plt.show()

In [None]:
# RMSE curves of all eight runs (four tree methods, four score-based
# strategies) against a fixed reference line.
fig, ax1 = plt.subplots(1)
baseline_artist = 25.6939

for curve_label, curve_result in (
    ('pct1_artist_10', pct1_artist_10),
    ('pct2_artist_10', pct2_artist_10),
    ('pct3_artist_10', pct3_artist_10),
    ('pct4_artist_10', pct4_artist_10),
    ('entropy0_artist_10', entropy0_artist_10),
    ('helf0_artist_10', helf0_artist_10),
    ('variance_artist_10', variance_artist_10),
    ('pop_artist_10', pop_artist_10),
):
    curve_rmse = curve_result[0]  # element 0 of each result is its RMSE list
    ax1.plot(np.arange(len(curve_rmse)), curve_rmse, label=curve_label)
ax1.axhline(y=baseline_artist, color='gray', linestyle='--', linewidth=2, label='Max performance artist')

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.xticks(range(1, 21))

plt.show()
