<a href="https://colab.research.google.com/github/andrejdaskalov/rec-sys-evaluation-paper/blob/main/PaperLLMRecommenders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Mount Google Drive at /gdrive (Colab only); DATASET_DIR below lives under this mount point.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Root folder (on the mounted drive) that holds every dataset file; keep the trailing slash.
DATASET_DIR = "/gdrive/MyDrive/movielens_small/ml-latest-small/"


def get_dataset_path(filename: str) -> str:
    """Return the path of ``filename`` inside ``DATASET_DIR``.

    The name is appended verbatim (plain string concatenation, no path
    normalisation), so sub-paths such as ``"results/run/"`` work as well.
    """
    return "".join((DATASET_DIR, filename))

The function below standardizes the names and paths of the directories where model-run results are stored.

In [None]:
import datetime
import re
RESULT_PATH = "results/"
def get_results_path(model_name: str, dataset_type: str, extra_identifier:str = "") -> str:
    """Build the directory path for one model run's results.

    The directory name is ``<model_name>_<dataset_type>_<extra_identifier>_<timestamp>``
    placed under ``RESULT_PATH`` inside the dataset directory. The returned
    path ends with a slash; the directory itself is not created here.

    Args:
        model_name: Name of the evaluated model (used verbatim).
        dataset_type: Dataset variant label (used verbatim).
        extra_identifier: Optional free-form tag; any ``/`` is replaced by ``_``.

    Returns:
        The run directory path as a string.
    """
    safe_identifier = extra_identifier.replace("/", "_")
    # isoformat() produces the same text as str(now) with the space replaced by "T".
    timestamp = datetime.datetime.now().isoformat()
    run_name = "_".join((model_name, dataset_type, safe_identifier, timestamp))
    return get_dataset_path(RESULT_PATH + run_name + "/")

# Load and import

In [None]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import os

In [None]:
# na_filter=False turns off pandas' NA detection, so every text cell stays a plain string
# (the later .split("|") calls rely on that).
movie_data = pd.read_csv(get_dataset_path("enriched.csv"), na_filter=False)
movie_data.head()

Unnamed: 0,movieId,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year
0,1,Toy Story,adventure|animation|children|comedy|fantasy,pixar|pixar|fun,114709,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter|Pete Docter|Andrew Stanton|Joe R...,Bonnie Arnold|Ed Catmull|Ralph Guggenheim|Stev...,81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995
1,2,Jumanji,adventure|children|fantasy,fantasy|magic board game|Robin Williams|game,113497,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh|Greg Taylor|Jim Strain|Greg...,Robert W. Cort|Ted Field|Larry Franco|Scott Kr...,104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995
2,3,Grumpier Old Men,comedy|romance,moldy|old,113228,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson|Mark Steven Johnson,Richard C. Berman|John Davis|George Folsey Jr....,101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995
3,4,Waiting to Exhale,comedy|drama|romance,,114885,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan|Terry McMillan|Ron Bass,Ron Bass|Caron K|Terry McMillan|Deborah Schind...,124.0,16000000.0,This story based on the best selling novel by ...,1995
4,5,Father of the Bride Part II,comedy,pregnancy|remake,113041,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett|Frances Goodrich|Nancy Meyers|C...,Carol Baum|Bruce A. Block|Julie B. Crane|Jim C...,106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995


In [None]:
# Column names shared by the interaction and movie frames.
USER_COLUMN_NAME = 'userId'
CONTENT_COLUMN_NAME = 'movieId'
RATING_COLUMN_NAME = 'rating'

In [None]:
# Expand the pipe-separated text columns into Python lists.
for list_column in ('genres', 'tag', 'cast', 'writers', 'producers'):
    # Only raw strings are split, so re-running this cell on already-split
    # lists is a no-op instead of an AttributeError.
    movie_data[list_column] = movie_data[list_column].apply(
        lambda value: value.split("|") if isinstance(value, str) else value
    )
movie_data.head()

Unnamed: 0,movieId,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]","[pixar, pixar, fun]",114709,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Pete Docter, Andrew Stanton, J...","[Bonnie Arnold, Ed Catmull, Ralph Guggenheim, ...",81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995
1,2,Jumanji,"[adventure, children, fantasy]","[fantasy, magic board game, Robin Williams, game]",113497,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Jonathan Hensleigh, Greg Taylor, Jim Strain, ...","[Robert W. Cort, Ted Field, Larry Franco, Scot...",104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995
2,3,Grumpier Old Men,"[comedy, romance]","[moldy, old]",113228,"[Walter Matthau, Jack Lemmon, Sophia Loren, An...","[Mark Steven Johnson, Mark Steven Johnson]","[Richard C. Berman, John Davis, George Folsey ...",101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995
3,4,Waiting to Exhale,"[comedy, drama, romance]",[N/A],114885,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Terry McMillan, Terry McMillan, Ron Bass]","[Ron Bass, Caron K, Terry McMillan, Deborah Sc...",124.0,16000000.0,This story based on the best selling novel by ...,1995
4,5,Father of the Bride Part II,[comedy],"[pregnancy, remake]",113041,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Albert Hackett, Frances Goodrich, Nancy Meyer...","[Carol Baum, Bruce A. Block, Julie B. Crane, J...",106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995


In [None]:
# Same movie table keyed by movieId, for label-based lookups with .loc[movie_id].
movie_data_indexed = movie_data.set_index(CONTENT_COLUMN_NAME)

# Datasets
> NOTE: run only one of the dataset preparation groups below. Each group assigns the same variables, so running more than one makes them overwrite each other.

# Prepare Normal Interactions dataset

In [None]:
# Label of the dataset variant prepared in this group.
DATASET_TYPE = "full"

In [None]:
# Raw user-movie ratings (columns: userId, movieId, rating, timestamp).
movie_ratings = pd.read_csv(get_dataset_path("ratings.csv"))
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
len(movie_ratings)

100836

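Split each user's ratings chronologically: the earliest 80% form the training set and the most recent 20% are held out for testing.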

In [None]:
# Fraction of each user's ratings (the most recent ones) held out for testing.
INTERACTIONS_TEST_SIZE = 0.2
from math import ceil, floor


def _heldout_count(n_ratings: int) -> int:
    """Return how many of a user's ``n_ratings`` ratings go to the test set."""
    return floor(INTERACTIONS_TEST_SIZE * n_ratings)


# Oldest ratings first, so per user head() is the earliest part and tail() the latest.
movie_ratings = movie_ratings.sort_values('timestamp')
ratings_by_user = movie_ratings.groupby('userId', group_keys=False)
# The train size is derived from the test size (len - n_test) instead of an
# independent ceil((1 - p) * len): the two float roundings are then guaranteed
# to partition every user's rows with no overlap and no dropped row.
interactions_train_df = ratings_by_user.apply(lambda x: x.head(len(x) - _heldout_count(len(x))))
interactions_test_df = ratings_by_user.apply(lambda x: x.tail(_heldout_count(len(x))))
assert len(interactions_train_df) + len(interactions_test_df) == len(movie_ratings)
interactions_train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
43,1,804,4.0,964980499
73,1,1210,5.0,964980499
171,1,2628,4.0,964980523
120,1,2018,5.0,964980523
183,1,2826,4.0,964980523


In [None]:
# userId-indexed views of the train / test / full interaction frames, for .loc[user_id] lookups.
interactions_train_df_indexed = interactions_train_df.set_index('userId')
interactions_test_df_indexed = interactions_test_df.set_index('userId')
movie_ratings_indexed = movie_ratings.set_index('userId')

# Prepare Reduced interactions dataset

In [None]:
# Label of the dataset variant prepared in this group.
DATASET_TYPE = "reduced"

In [None]:
# Load the pre-built "reduced" train and held-out splits, indexed by userId.
interactions_train_df_indexed = pd.read_csv(get_dataset_path("reduced_interactions.csv"), index_col="userId")
interactions_test_df_indexed = pd.read_csv(get_dataset_path("reduced_interactions_heldout.csv"), index_col="userId")
# The full interaction set is the training rows followed by the held-out rows.
movie_ratings_indexed = pd.concat([interactions_train_df_indexed, interactions_test_df_indexed])
movie_ratings_indexed.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
429,595,5.0,828124615
429,588,5.0,828124615
429,590,5.0,828124615
107,140,4.0,829322340
107,105,4.0,829322340


In [None]:
# Flat copies (userId back as a regular column) of the three frames.
interactions_train_df = interactions_train_df_indexed.reset_index()
interactions_test_df = interactions_test_df_indexed.reset_index()
movie_ratings = movie_ratings_indexed.reset_index()

# Prepare Mixed Interactions dataset

In [None]:
# Label of the dataset variant prepared in this group.
DATASET_TYPE = "mixed"

In [None]:
# Load the pre-built "mixed" train and held-out splits, indexed by userId.
interactions_train_df_indexed = pd.read_csv(get_dataset_path("mixed_interactions.csv"), index_col="userId")
interactions_test_df_indexed = pd.read_csv(get_dataset_path("mixed_interactions_heldout.csv"), index_col="userId")
# The full interaction set is the training rows followed by the held-out rows.
movie_ratings_indexed = pd.concat([interactions_train_df_indexed, interactions_test_df_indexed])
movie_ratings_indexed.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
448,69640,3.0,1289145810
352,90866,4.5,1493674691
599,1623,2.5,1498516912
474,2583,4.0,1081177421
465,2278,4.0,959896203


In [None]:
# Flat copies (userId back as a regular column) of the three frames.
interactions_train_df = interactions_train_df_indexed.reset_index()
interactions_test_df = interactions_test_df_indexed.reset_index()
movie_ratings = movie_ratings_indexed.reset_index()

# Evaluation

In [None]:
def get_items_interacted(person_id, interactions_df):
    """Return the set of item ids that ``person_id`` has interacted with.

    Args:
        person_id: Label looked up in the index of ``interactions_df``.
        interactions_df: Interactions frame whose index holds the user ids
            and which has a ``CONTENT_COLUMN_NAME`` column.

    Returns:
        A set with the user's item ids.

    Raises:
        KeyError: If ``person_id`` is not present in the index.
    """
    interacted_items = interactions_df.loc[person_id][CONTENT_COLUMN_NAME]
    # .loc yields a Series when the user has several rows, but a bare scalar
    # when exactly one row matches; normalise both cases to a set.
    if isinstance(interacted_items, pd.Series):
        return set(interacted_items)
    return {interacted_items}

In [None]:
#Top-N accuracy metrics consts
# Number of non-interacted items mixed with each test item when computing recall.
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:
    """Evaluate a recommender with sampled Recall@N, Precision@N and MRR.

    A model passed to the ``evaluate_*`` methods must expose
    ``get_model_name()`` and
    ``recommend_items(person_id, items_to_ignore=..., topn=..., verbose=...)``.
    The frame returned by ``recommend_items`` is read through its
    ``CONTENT_COLUMN_NAME`` column, and its row order is treated as the ranking
    (first row = best recommendation).
    """

    def __init__(self, interactions_df_full, interactions_df_train, interactions_df_test, content_df):
        """Store the data used during evaluation.

        Args:
            interactions_df_full: All interactions, indexed by user id.
            interactions_df_train: Training interactions, indexed by user id.
            interactions_df_test: Held-out interactions, indexed by user id.
            content_df: Item metadata frame (stored; not read by the methods of this class).
        """
        self.interactions_df_full = interactions_df_full
        self.interactions_df_train = interactions_df_train
        self.interactions_df_test = interactions_df_test
        self.content_df = content_df

    def _non_interacted_pool(self, person_id):
        """List every item of the full dataset the user never interacted with."""
        interacted_items = get_items_interacted(person_id, self.interactions_df_full)
        all_items = set(self.interactions_df_full[CONTENT_COLUMN_NAME])
        return list(all_items - interacted_items)

    @staticmethod
    def _seeded_sample(pool, sample_size, seed):
        """Draw a reproducible sample of at most ``sample_size`` items from ``pool``."""
        # NOTE: this reseeds the module-level RNG, exactly as the evaluation always did.
        random.seed(seed)
        # Cap the size so a pool smaller than the request does not raise ValueError.
        return set(random.sample(pool, min(sample_size, len(pool))))

    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        """Return a seeded random sample of items the user has not interacted with.

        Args:
            person_id: User id (index label of the full interactions frame).
            sample_size: Number of items wanted; capped at the pool size.
            seed: Seed passed to ``random.seed`` before sampling.

        Returns:
            A set of item ids.
        """
        return self._seeded_sample(self._non_interacted_pool(person_id), sample_size, seed)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        """Check whether ``item_id`` is among the first ``topn`` recommendations.

        Returns:
            ``(hit, index)`` where ``hit`` is 1 or 0 and ``index`` is the
            position of the first match in ``recommended_items`` (-1 if absent).
        """
        # A default for next() replaces the former bare ``except:``, which also
        # hid unrelated errors raised while comparing items.
        index = next((i for i, c in enumerate(recommended_items) if c == item_id), -1)
        hit = int(0 <= index < topn)
        return hit, index

    def _calculate_recall(self, person_interacted_items_testset, person_recs_df, person_id):
        """Compute sampled recall@5 and recall@10 for one user.

        Each test item is ranked against a seeded random sample of
        ``EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS`` non-interacted items; it
        counts as a hit when it lands in the top N of that reduced ranking.

        Returns:
            Dict with ``hits@5_count``, ``hits@10_count``, ``interacted_count``,
            ``recall@5`` and ``recall@10``.
        """
        hits_at_5_count = 0
        hits_at_10_count = 0

        interacted_items_count_testset = len(person_interacted_items_testset)
        # The pool depends only on the user, so build it once instead of once per test item.
        non_interacted_pool = self._non_interacted_pool(person_id)

        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self._seeded_sample(
                non_interacted_pool,
                sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df[CONTENT_COLUMN_NAME].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df[CONTENT_COLUMN_NAME].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items,
        #when mixed with a set of non-relevant items
        if interacted_items_count_testset:
            recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
            recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)
        else:
            # An empty test set has no hits to find; avoid a ZeroDivisionError.
            recall_at_5 = recall_at_10 = 0.0

        recall_metrics = {'hits@5_count':hits_at_5_count,
                          'hits@10_count':hits_at_10_count,
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return recall_metrics

    def _calculate_precision(self, person_interacted_items_testset, person_recs_df : pd.DataFrame, person_id):
        """Compute precision@5 and precision@10 for one user.

        A hit is a row among the first 5 / 10 recommendations whose item is in
        the user's test set. The denominators are fixed at 5 and 10.

        Returns:
            Dict with ``hits@5_count``, ``hits@10_count``, ``precision@5``
            and ``precision@10``.
        """
        recommended_ids = person_recs_df[CONTENT_COLUMN_NAME]
        hits_at_5_count = int(recommended_ids.head(5).isin(person_interacted_items_testset).sum())
        hits_at_10_count = int(recommended_ids.head(10).isin(person_interacted_items_testset).sum())

        precision_at_5 = hits_at_5_count / 5.0
        precision_at_10 = hits_at_10_count / 10.0

        precision_metrics = {'hits@5_count':hits_at_5_count,
                          'hits@10_count':hits_at_10_count,
                          'precision@5': precision_at_5,
                          'precision@10': precision_at_10}
        return precision_metrics

    # calculate the mean reciprocal rank (MRR)
    # Mean Reciprocal Rank (MRR) at K evaluates how quickly a ranking system can show the first relevant item in the top-K results.
    # MRR = 1/U sum_(u=1, U)(1/ rank_i)
    # where U is the total number of users and i is the position of the first relevant item for user u in top K results
    # in this method, I calculate the reciprocal rank for each user to later calculate the MRR for all users
    def _calculate_rr_user(self, person_interacted_items_testset: set[int], person_recs_df: pd.DataFrame, person_id: int):
        """Compute the reciprocal rank of the first relevant recommendation.

        Returns:
            ``{"rr": 1 / rank}`` with a 1-based rank, or ``{"rr": -1}`` when no
            recommended item is in the user's test set.
        """
        relevant_mask = person_recs_df[CONTENT_COLUMN_NAME] \
            .isin(person_interacted_items_testset).to_numpy()

        if not relevant_mask.any():
            return {"rr": -1 }

        # Rank by row position, like recall and precision do. The index label
        # is only a valid rank when the recommendations carry a fresh 0..n-1
        # index, which a filtered or re-sorted frame does not.
        first_relevant_position = int(relevant_mask.argmax())

        rr = 1 / (first_relevant_position + 1)
        return {
            "rr": rr
        }

    def evaluate_model_for_user(self, model, person_id):
        """Compute recall, precision and reciprocal-rank metrics for one user.

        Items from the user's training interactions are passed to the model as
        ``items_to_ignore``; up to 1000 recommendations are requested.

        Returns:
            Dict with keys ``'recall'``, ``'precision'`` and ``'rr'``, each
            holding the per-user metric dict.
        """
        #Getting the items in test set
        interacted_values_testset = self.interactions_df_test.loc[person_id]
        test_items = interacted_values_testset[CONTENT_COLUMN_NAME]
        if isinstance(test_items, pd.Series):
            person_interacted_items_testset = set(test_items)
        else:
            # A user with a single test row yields a scalar rather than a Series.
            person_interacted_items_testset = set([int(test_items)])

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id,
                                               items_to_ignore=get_items_interacted(person_id,
                                                                                    self.interactions_df_train),
                                               topn=1000, verbose=True)

        recall_metrics = self._calculate_recall(person_interacted_items_testset, person_recs_df, person_id)
        precision_metrics = self._calculate_precision(person_interacted_items_testset, person_recs_df, person_id)
        rr_metric = self._calculate_rr_user(person_interacted_items_testset, person_recs_df, person_id)

        person_metrics = {
            'recall': recall_metrics,
            'precision': precision_metrics,
            'rr': rr_metric,
        }

        return person_metrics

    def evaluate_model(self, model) -> tuple[pd.DataFrame, dict]:
        """Evaluate ``model`` for every user present in the test interactions.

        Returns:
            ``(global_metrics_df, dataframes)``: a one-row frame with
            ``modelName``, ``recall@5``, ``recall@10``, ``precision@5``,
            ``precision@10`` and ``mrr``, plus a dict of per-user detail frames
            keyed ``'recall'``, ``'precision'`` and ``'rr'``.
        """
        print(f'Evaluating {model.get_model_name()} recommendation model...')

        person_ids = list(self.interactions_df_test.index.unique().values)

        people_metrics_recall = []
        people_metrics_precision = []
        people_metrics_rr = []
        for person_id in person_ids:

            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics_recall = person_metrics['recall']
            person_metrics_precision = person_metrics['precision']
            person_metrics_rr = person_metrics['rr']

            person_metrics_recall['_person_id'] = person_id
            people_metrics_recall.append(person_metrics_recall)

            person_metrics_precision['_person_id'] = person_id
            people_metrics_precision.append(person_metrics_precision)

            person_metrics_rr['_person_id'] = person_id
            people_metrics_rr.append(person_metrics_rr)
        # Report the number of users, not the last loop index (which was one short).
        print('%d users processed' % len(person_ids))

        detailed_results_recall_df = pd.DataFrame(people_metrics_recall) \
                            .sort_values('interacted_count', ascending=False)

        detailed_results_precision_df = pd.DataFrame(people_metrics_precision)

        detailed_results_rr_df = pd.DataFrame(people_metrics_rr)

        global_recall_at_5 = detailed_results_recall_df['hits@5_count'].sum() / float(detailed_results_recall_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_recall_df['hits@10_count'].sum() / float(detailed_results_recall_df['interacted_count'].sum())

        global_precision_at_5 = detailed_results_precision_df['hits@5_count'].sum() / float(len(detailed_results_precision_df) * 5)
        global_precision_at_10 = detailed_results_precision_df['hits@10_count'].sum() / float(len(detailed_results_precision_df) * 10)

        # Users without any relevant recommendation carry the -1 sentinel: they
        # add nothing to the sum but still count in the denominator.
        sum_rr = detailed_results_rr_df[detailed_results_rr_df['rr'] != -1]['rr'].sum()
        num_users = len(person_ids)
        mean_reciprocal_rank = (1 / int(num_users) ) * sum_rr

        global_metrics = {'modelName': [model.get_model_name()],
                          'recall@5': [global_recall_at_5],
                          'recall@10': [global_recall_at_10],
                          'precision@5': [global_precision_at_5],
                          'precision@10': [global_precision_at_10],
                          'mrr': [mean_reciprocal_rank]
                          }
        global_metrics_df = pd.DataFrame(global_metrics)
        dataframes = {'recall': detailed_results_recall_df,
                      'precision': detailed_results_precision_df,
                      'rr': detailed_results_rr_df
        }
        return global_metrics_df, dataframes

    def print_results(self, global_metrics, dataframes):
        """Print the global metrics and the first 50 rows of each detail frame."""
        print('\nGlobal metrics:\n%s' % global_metrics)
        for _, df in dataframes.items():
            print(df.head(50))

    def save_results(self, global_metrics: pd.DataFrame, dataframes: dict[str, pd.DataFrame], model_name, dataset_type, extra_identifier: str = "FullIMDB"):
        """Write the global metrics and every detail frame as CSV files.

        Files go into a fresh run directory obtained from ``get_results_path``:
        ``global_metrics.csv`` plus one ``<key>.csv`` per entry of ``dataframes``.

        Args:
            global_metrics: One-row frame of global metrics.
            dataframes: Per-user detail frames keyed by metric name.
            model_name: Model name used in the directory name.
            dataset_type: Dataset variant used in the directory name.
            extra_identifier: Extra tag for the directory name; defaults to the
                previously hard-coded ``"FullIMDB"``.
        """
        file_path = get_results_path(model_name, dataset_type, extra_identifier)
        os.makedirs(file_path, exist_ok=True)
        global_metrics.to_csv(file_path + "global_metrics.csv")
        for key, df in dataframes.items():
            df.to_csv(file_path+key+".csv")

# Shared evaluator built from the full / train / test interaction frames (all indexed by userId).
model_evaluator = ModelEvaluator(movie_ratings_indexed, interactions_train_df_indexed, interactions_test_df_indexed, movie_data)

# LLM content-based model

In [None]:
!pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
# Embedding model passed to SentenceTransformer; uncomment one alternative to switch models.
LLM_MODEL_NAME="BAAI/bge-m3"
# LLM_MODEL_NAME="dunzhang/stella_en_1.5B_v5"
# LLM_MODEL_NAME="dunzhang/stella_en_400M_v5"
# LLM_MODEL_NAME="Lajavaness/bilingual-embedding-large"


In [None]:
# !pip install xformers

In [None]:
# !pip install flash_attn

In [None]:
import torch
# Forwarded as ``model_kwargs`` when the SentenceTransformer is created: load the weights in float16.
kwargs = {"torch_dtype": torch.float16}
# kwargs={}

In [None]:
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Load the model, optionally in float16 precision for faster inference
# model = SentenceTransformer("BAAI/bge-m3", model_kwargs={"torch_dtype": torch.float16})
# trust_remote_code=True lets the checkpoint run its own Python code; .cuda() requires a GPU runtime.
# `model` is read as a global by the embedding-function classes defined later.
model = SentenceTransformer(LLM_MODEL_NAME, trust_remote_code=True, model_kwargs=kwargs).cuda()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# without imdb
def stringify_movie(movies_df, movie_id) -> str:
    """Render one movie as a single newline-terminated text line.

    Format: ``<title>; Genres: <g1, g2, ...>; Tags: <t1, t2, ...>``.

    Args:
        movies_df: Movie table indexed by movie id, with ``title``,
            ``genres`` (list of str) and ``tag`` (list of str) columns.
        movie_id: Index label of the movie to render.

    Returns:
        The formatted document string.
    """
    movie_row = movies_df.loc[movie_id]
    genres_text = ", ".join(movie_row['genres'])
    tags_text = ', '.join(movie_row['tag'])
    return "{}; Genres: {}; Tags: {}\n".format(movie_row['title'], genres_text, tags_text)

In [None]:
# with imdb
def stringify_movie(movies_df, movie_id) -> str:
    """Render one movie, including its IMDb fields, as a single text line.

    The line lists title, genres, tags, cast, writers, producers, box office
    budget and year, and ends with a newline.

    Args:
        movies_df: Movie table indexed by movie id. ``genres``, ``tag``,
            ``cast``, ``writers`` and ``producers`` hold lists of strings.
        movie_id: Index label of the movie to render.

    Returns:
        The formatted document string.
    """
    movie_row = movies_df.loc[movie_id]

    def joined(column):
        # Comma-separated rendering of a list-valued column.
        return ", ".join(movie_row[column])

    parts = [
        "{}".format(movie_row['title']),
        "Genres: " + joined('genres'),
        "Tags: " + joined('tag'),
        "Cast: " + joined('cast'),
        "Writers: " + joined('writers'),
        "Producers: " + joined('producers'),
        "Box Office Budget: {}".format(movie_row['boxOfficeBudget']),
        "Year: {}".format(movie_row['year']),
    ]
    # movie_doc = f"{title}; Genres: {genres_str}; Tags: {tags_str}; Cast: {cast}\n"
    return "; ".join(parts) + ";\n"

### chroma

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.33.1-py3-none-any.whl.metadata (1.6 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.33.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)


In [None]:
from chromadb.api.types import (
    Documents,
    EmbeddingFunction,
    Embeddings
)


class NormalLLMEmbed(EmbeddingFunction[Documents]):
    """Chroma embedding function that encodes documents with the global ``model``, without a prompt."""

    def __init__(
            self
    ):
        """Initialize the embedding function."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed the input documents."""
        vectors = []
        # Documents are encoded one at a time; each vector becomes a plain list.
        for document in input:
            vectors.append(model.encode(document).tolist())
        return vectors

In [None]:
class PromptedLLMEmbed(EmbeddingFunction[Documents]):
    """Chroma embedding function that encodes documents with the global ``model`` and a fixed prompt."""

    def __init__(
            self,
            prompt: str
    ):
        """Initialize the embedding function."""
        self.prompt = prompt

    def __call__(self, input: Documents) -> Embeddings:
        """Embed the input documents."""
        vectors = []
        # Documents are encoded one at a time, each with the stored prompt.
        for document in input:
            vectors.append(model.encode(document, prompt=self.prompt).tolist())
        return vectors

In [None]:
import chromadb
from chromadb.utils import embedding_functions
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
# movie_emb = client.create_collection("movie_embeddings", embedding_function= embedding_functions.SentenceTransformerEmbeddingFunction(model_name=LLM_MODEL_NAME, trust_remote_code=True, device="cuda", model_kwargs=kwargs))
# Movie documents are embedded without a prompt (NormalLLMEmbed).
movie_emb = client.create_collection("movie_embeddings", embedding_function= NormalLLMEmbed())

In [None]:
# client.delete_collection("movie_embeddings")

In [None]:
# One text document per movie, rendered by whichever stringify_movie definition was run last.
documents = movie_data[CONTENT_COLUMN_NAME].apply(lambda x: stringify_movie(movie_data_indexed, x))
documents

Unnamed: 0,movieId
0,"Toy Story; Genres: adventure, animation, child..."
1,"Jumanji; Genres: adventure, children, fantasy;..."
2,"Grumpier Old Men; Genres: comedy, romance; Tag..."
3,"Waiting to Exhale; Genres: comedy, drama, roma..."
4,Father of the Bride Part II; Genres: comedy; T...
...,...
9737,Black Butler: Book of the Atlantic; Genres: ac...
9738,"No Game No Life: Zero; Genres: animation, come..."
9739,Flint; Genres: drama; Tags: N/A; Cast: Kaelen ...
9740,"Bungo Stray Dogs: Dead Apple; Genres: action, ..."


In [None]:
# Movie ids rendered as strings; they are used as the `ids` of the Chroma upsert.
document_ids = movie_data[CONTENT_COLUMN_NAME].apply(str)
document_ids.tolist()

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '34',
 '36',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '52',
 '53',
 '54',
 '55',
 '57',
 '58',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '85',
 '86',
 '87',
 '88',
 '89',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '110',
 '111',
 '112',
 '113',
 '116',
 '117',
 '118',
 '119',
 '121',
 '122',
 '123',
 '125',
 '126',
 '128',
 '129',
 '132',
 '135',
 '137',
 '140',
 '141',
 '144',
 '145',
 '146',
 '147',
 '148',
 '149',
 '150',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '158',
 '159',
 '160',
 '161',
 '162',
 '163',
 

In [None]:
len(document_ids)

9742

In [None]:
# Number of documents sent to Chroma per upsert call.
# NOTE(review): 5460 presumably keeps each call under Chroma's maximum batch size — confirm.
UPSERT_BATCH_SIZE = 5460

# list(...) accepts both a Series and an existing list, so re-running this cell
# does not fail the way a second .tolist() call would.
document_ids = list(document_ids)
documents = list(documents)
for batch_start in range(0, len(document_ids), UPSERT_BATCH_SIZE):
    batch_end = batch_start + UPSERT_BATCH_SIZE
    # The collection's embedding function computes the embeddings for each batch.
    movie_emb.upsert(
        ids=document_ids[batch_start:batch_end],
        documents=documents[batch_start:batch_end],
    )


In [None]:
# client.delete_collection("user_embeddings")

In [None]:
# Instruction prepended to every user-profile text before encoding; the commented lines are alternative wordings.
encoding_instruction = "Represent this user's watched movies for finding relevant recommendations:"
# encoding_instruction = "For given movies that user has already watched, separated by newline, given each movie's title, genres, tags and user rating, find relevant movies."
# encoding_instruction = "This is a user profile. It is given in a format where each movie the user has interacted with is on a new line. The movie is in the following format: Title:<TITLE>; Genres: <GENRE1, GENRE2 ...>; Tags: <TAG1, TAG2 ...>; Rating: <RATING\n. Find movies relevant to the ones provided: "
# encoding_instruction = "Encode each movie by focusing on its title, genres, tags, and user rating. Prioritize the extraction of thematic elements and genre characteristics, alongside user-specific preferences as reflected by their ratings. This encoding will inform a content-based recommendation system by capturing both the intrinsic attributes of the movies and the personalized preferences indicated by user ratings."
# encoding_instruction = "For a given user profile containing the user's preferred genres, tags and cast, find movies relevant to user."
# Wrap the instruction in <instruct>/<query> markers; the encoded text follows the prompt.
prompt = f'<instruct>{encoding_instruction}\n<query>'


In [None]:
# User profiles are embedded with the instruction prompt (PromptedLLMEmbed), unlike the movie collection.
user_emb = client.create_collection("user_embeddings", embedding_function=PromptedLLMEmbed(prompt))

### prompt variant

In [None]:
# Attach movie metadata to every training interaction (default inner join on movieId).
movie_interactions_with_data = interactions_train_df.merge(movie_data, on=CONTENT_COLUMN_NAME)
movie_interactions_with_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year
0,1,804,4.0,964980499,She's the One,"[comedy, romance]",[N/A],117628,"[John Mahoney, Edward Burns, Michael McGlone, ...",[Edward Burns],"[Alysse Bezahler, Edward Burns, Ted Hope, Mich...",96.0,3500000.0,How do siblings deal with each other in their ...,1996
1,1,1210,5.0,964980499,Star Wars: Episode VI - Return of the Jedi,"[action, adventure, sci-fi]","[darth vader, luke skywalker, space opera]",86190,"[Mark Hamill, Harrison Ford, Carrie Fisher, Bi...","[Lawrence Kasdan, George Lucas, George Lucas]","[Jim Bloom, Howard G. Kazanjian, George Lucas,...",131.0,32500000.0,Luke Skywalker (Mark Hamill) battles horrible ...,1983
2,1,2628,4.0,964980523,Star Wars: Episode I - The Phantom Menace,"[action, adventure, sci-fi]","[prequel, the Force]",120915,"[Liam Neeson, Ewan McGregor, Natalie Portman, ...",[George Lucas],"[George Lucas, Rick McCallum]",136.0,115000000.0,When the Trade Federation organize a blockade ...,1999
3,1,2018,5.0,964980523,Bambi,"[animation, children, drama]",[N/A],34492,"[Hardie Albright, Stan Alexander, Bobette Audr...","[Felix Salten, Perce Pearce, Larry Morey, Vern...",[Walt Disney],69.0,858000.0,"It's spring, and all the animals of the forest...",1942
4,1,2826,4.0,964980523,"13th Warrior, The","[action, adventure, fantasy]",[N/A],120657,"[Antonio Banderas, Diane Venora, Dennis Storhø...","[Michael Crichton, William Wisher, Warren Lewis]","[Lou Arkoff, Michael Crichton, Ned Dowd, Ethan...",102.0,160000000.0,A cultured diplomat joins a band of savage war...,1999


In [None]:
# Prefix every field with its label so the concatenated profile line is self-describing.
# NOTE: this rewrites the columns in place, so the cell must run exactly once per merge.
movie_interactions_with_data['title'] = movie_interactions_with_data['title'].map(lambda text: 'Title: ' + text)

# List-valued columns: comma-join the items after the label.
list_field_prefixes = {
    'genres': 'Genres: ',
    'tag': 'Tags: ',
    'cast': 'Cast: ',
    'writers': 'Writers: ',
    'producers': 'Producers: ',
}
for field_name, field_prefix in list_field_prefixes.items():
    movie_interactions_with_data[field_name] = movie_interactions_with_data[field_name].map(
        lambda items, field_prefix=field_prefix: field_prefix + ', '.join(items)
    )

# Scalar columns: append the value's string form to the label.
scalar_field_prefixes = {
    'boxOfficeBudget': 'Box Office Budget: ',
    'year': 'Year: ',
    'rating': 'Rating: ',
}
for field_name, field_prefix in scalar_field_prefixes.items():
    movie_interactions_with_data[field_name] = movie_interactions_with_data[field_name].map(
        lambda value, field_prefix=field_prefix: field_prefix + str(value)
    )

movie_interactions_with_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year
0,1,804,Rating: 4.0,964980499,Title: She's the One,"Genres: comedy, romance",Tags: N/A,117628,"Cast: John Mahoney, Edward Burns, Michael McGl...",Writers: Edward Burns,"Producers: Alysse Bezahler, Edward Burns, Ted ...",96.0,Box Office Budget: 3500000.0,How do siblings deal with each other in their ...,Year: 1996
1,1,1210,Rating: 5.0,964980499,Title: Star Wars: Episode VI - Return of the Jedi,"Genres: action, adventure, sci-fi","Tags: darth vader, luke skywalker, space opera",86190,"Cast: Mark Hamill, Harrison Ford, Carrie Fishe...","Writers: Lawrence Kasdan, George Lucas, George...","Producers: Jim Bloom, Howard G. Kazanjian, Geo...",131.0,Box Office Budget: 32500000.0,Luke Skywalker (Mark Hamill) battles horrible ...,Year: 1983
2,1,2628,Rating: 4.0,964980523,Title: Star Wars: Episode I - The Phantom Menace,"Genres: action, adventure, sci-fi","Tags: prequel, the Force",120915,"Cast: Liam Neeson, Ewan McGregor, Natalie Port...",Writers: George Lucas,"Producers: George Lucas, Rick McCallum",136.0,Box Office Budget: 115000000.0,When the Trade Federation organize a blockade ...,Year: 1999
3,1,2018,Rating: 5.0,964980523,Title: Bambi,"Genres: animation, children, drama",Tags: N/A,34492,"Cast: Hardie Albright, Stan Alexander, Bobette...","Writers: Felix Salten, Perce Pearce, Larry Mor...",Producers: Walt Disney,69.0,Box Office Budget: 858000.0,"It's spring, and all the animals of the forest...",Year: 1942
4,1,2826,Rating: 4.0,964980523,"Title: 13th Warrior, The","Genres: action, adventure, fantasy",Tags: N/A,120657,"Cast: Antonio Banderas, Diane Venora, Dennis S...","Writers: Michael Crichton, William Wisher, War...","Producers: Lou Arkoff, Michael Crichton, Ned D...",102.0,Box Office Budget: 160000000.0,A cultured diplomat joins a band of savage war...,Year: 1999


In [None]:
# Labelled columns concatenated (with "; ") into one text line per interaction.
# Smaller feature sets tried earlier:
#   ['title', 'genres', 'tag', 'cast', 'rating']
#   ['title', 'genres', 'tag', 'rating']
movie_str_columns = [
    'title', 'genres', 'tag', 'cast', 'writers',
    'producers', 'boxOfficeBudget', 'year', 'rating',
]
movie_interactions_with_data['movie_str'] = (
    movie_interactions_with_data[movie_str_columns].agg('; '.join, axis=1)
)
movie_interactions_with_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year,movie_str
0,1,804,Rating: 4.0,964980499,Title: She's the One,"Genres: comedy, romance",Tags: N/A,117628,"Cast: John Mahoney, Edward Burns, Michael McGl...",Writers: Edward Burns,"Producers: Alysse Bezahler, Edward Burns, Ted ...",96.0,Box Office Budget: 3500000.0,How do siblings deal with each other in their ...,Year: 1996,"Title: She's the One; Genres: comedy, romance;..."
1,1,1210,Rating: 5.0,964980499,Title: Star Wars: Episode VI - Return of the Jedi,"Genres: action, adventure, sci-fi","Tags: darth vader, luke skywalker, space opera",86190,"Cast: Mark Hamill, Harrison Ford, Carrie Fishe...","Writers: Lawrence Kasdan, George Lucas, George...","Producers: Jim Bloom, Howard G. Kazanjian, Geo...",131.0,Box Office Budget: 32500000.0,Luke Skywalker (Mark Hamill) battles horrible ...,Year: 1983,Title: Star Wars: Episode VI - Return of the J...
2,1,2628,Rating: 4.0,964980523,Title: Star Wars: Episode I - The Phantom Menace,"Genres: action, adventure, sci-fi","Tags: prequel, the Force",120915,"Cast: Liam Neeson, Ewan McGregor, Natalie Port...",Writers: George Lucas,"Producers: George Lucas, Rick McCallum",136.0,Box Office Budget: 115000000.0,When the Trade Federation organize a blockade ...,Year: 1999,Title: Star Wars: Episode I - The Phantom Mena...
3,1,2018,Rating: 5.0,964980523,Title: Bambi,"Genres: animation, children, drama",Tags: N/A,34492,"Cast: Hardie Albright, Stan Alexander, Bobette...","Writers: Felix Salten, Perce Pearce, Larry Mor...",Producers: Walt Disney,69.0,Box Office Budget: 858000.0,"It's spring, and all the animals of the forest...",Year: 1942,"Title: Bambi; Genres: animation, children, dra..."
4,1,2826,Rating: 4.0,964980523,"Title: 13th Warrior, The","Genres: action, adventure, fantasy",Tags: N/A,120657,"Cast: Antonio Banderas, Diane Venora, Dennis S...","Writers: Michael Crichton, William Wisher, War...","Producers: Lou Arkoff, Michael Crichton, Ned D...",102.0,Box Office Budget: 160000000.0,A cultured diplomat joins a band of savage war...,Year: 1999,"Title: 13th Warrior, The; Genres: action, adve..."


In [None]:
# Maximum number of most recent interactions (by timestamp) kept per user when
# the per-user text is assembled in the next cell.
num_sampled = 50
# num_sampled = 5

In [None]:
# Keep each user's `num_sampled` newest interactions (newest first) ...
latest_interactions = (
    movie_interactions_with_data
    .sort_values(by='timestamp', ascending=False)
    .groupby(by=USER_COLUMN_NAME)
    .head(num_sampled)
)
# ... and glue their movie descriptions into one newline-separated text per user.
users_stringified = (
    latest_interactions
    .groupby(by=USER_COLUMN_NAME)
    .agg({'movie_str': '\n'.join})
)
users_stringified.head()

Unnamed: 0_level_0,movie_str
userId,Unnamed: 1_level_1
1,"Title: Fight Club; Genres: action, crime, dram..."
2,Title: Inside Job; Genres: documentary; Tags: ...
3,"Title: Green Card; Genres: comedy, drama, roma..."
4,Title: Erin Brockovich; Genres: drama; Tags: s...
5,"Title: Dead Man Walking; Genres: crime, drama;..."


In [None]:
# User ids as strings, in the same order as the rows of users_stringified
# (they are used as record ids when the user texts are upserted).
user_id_list = [str(user_id) for user_id in users_stringified.index]

### Prompted LLM recommender

In [None]:
# Store one text document per user under the user's string id.
# Only documents are passed here, so the collection is presumably responsible
# for embedding them -- confirm against the cell where `user_emb` is created.
user_emb.upsert(
    ids=user_id_list,
    documents=users_stringified['movie_str'].tolist()
)

In [None]:
class PromptedLLMRecommender:
    """Recommend movies by similarity between a stored user embedding and movie embeddings.

    The class relies on notebook-level objects defined outside of it:
    ``user_emb`` and ``movie_emb`` (used through ``get``; presumably Chroma
    collections -- confirm where they are created), ``model`` (must provide
    ``similarity``) and ``CONTENT_COLUMN_NAME``.
    """

    MODEL_NAME = 'Prompted LLM recommender'

    def __init__(self, items_df=None):
        # Item metadata; only needed when recommend_items is called with verbose=True.
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this recommender."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` movies most similar to the user's stored embedding.

        Args:
            user_id: Id of the user; looked up in ``user_emb`` as ``str(user_id)``.
            items_to_ignore: Iterable of integer movie ids to exclude from the
                candidates. ``None`` (the default) excludes nothing.
            topn: Maximum number of recommendations. Values <= 0 give an empty result.
            verbose: If True, merge the recommendations with ``items_df``.

        Returns:
            DataFrame indexed by ``CONTENT_COLUMN_NAME`` with a ``recStrength``
            column, ordered from most to least similar. In verbose mode, the
            result of left-merging that frame with ``items_df``.

        Raises:
            ValueError: If no embedding is stored for ``user_id``.
            Exception: If ``verbose`` is True but no ``items_df`` was supplied.
        """
        # A set gives constant-time membership tests while filtering every movie id.
        ignored = set(items_to_ignore) if items_to_ignore is not None else set()

        user_embedding = user_emb.get([str(user_id)], include=['embeddings'])['embeddings']
        if user_embedding is None or len(user_embedding) == 0:
            raise ValueError(f'No embedding stored for user {user_id!r}')

        # All stored movie ids except the ones the caller asked to skip.
        candidate_ids = [movie for movie in movie_emb.get()['ids'] if int(movie) not in ignored]

        similar_movies = []
        if candidate_ids and topn > 0:
            fetched = movie_emb.get(candidate_ids, include=['embeddings'])
            # Use the ids returned together with the embeddings: the store is not
            # guaranteed to answer in the order the ids were requested, and row i
            # of the similarity matrix must be labelled with the matching id.
            movie_ids = fetched['ids']

            similarities = model.similarity(user_embedding, fetched['embeddings'])
            # Sort ascending, reverse, then cut: unlike [-topn:], this is also
            # correct for topn == 0 (which would otherwise select everything).
            ranked_indices = similarities.argsort().flatten().tolist()[::-1][:topn]

            similar_movies = [(int(movie_ids[i]), float(similarities[0][i])) for i in ranked_indices]

        recommendations_df = pd.DataFrame(
            similar_movies, columns=[CONTENT_COLUMN_NAME, 'recStrength']
        ).set_index(CONTENT_COLUMN_NAME)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = CONTENT_COLUMN_NAME,
                                                          right_on = CONTENT_COLUMN_NAME)

        return recommendations_df

# movie_data is stored as items_df, i.e. the metadata merged into verbose recommendations.
prompted_llm_model = PromptedLLMRecommender(movie_data)

In [None]:
# Evaluate the prompted recommender, show the metrics, then persist them
# under the model's name and the current dataset type.
prompted_llm_global_metrics, prompted_llm_results_dataframes = (
    model_evaluator.evaluate_model(prompted_llm_model)
)
model_evaluator.print_results(
    prompted_llm_global_metrics,
    prompted_llm_results_dataframes,
)
model_evaluator.save_results(
    prompted_llm_global_metrics,
    prompted_llm_results_dataframes,
    prompted_llm_model.MODEL_NAME,
    DATASET_TYPE,
)

Evaluating Prompted LLM recommender recommendation model...
609 users processed

Global metrics:
                  modelName  recall@5  recall@10  precision@5  precision@10  \
0  Prompted LLM recommender  0.145486   0.233902     0.028852      0.023934   

        mrr  
0  0.095684  
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
413           102            166               539  0.189239   0.307978   
598            28             51               495  0.056566   0.103030   
473            21             38               421  0.049881   0.090261   
447            86            135               372  0.231183   0.362903   
273            39             73               269  0.144981   0.271375   
609            54             90               260  0.207692   0.346154   
67             14             38               252  0.055556   0.150794   
379            51             71               243  0.209877   0.292181   
605            16             33         

### Variant with mean vectors (user embedding = rating-weighted mean of movie embeddings)

In [None]:
# Number of most recent training interactions per user that feed the mean-vector user embedding.
latest_n = 50

In [None]:
# For every user, collect the movie ids and ratings of their `latest_n` most
# recent training interactions (newest first) as two parallel lists.
# Rows are truncated per user *before* the lists are built, and both columns
# are addressed consistently (CONTENT_COLUMN_NAME instead of a literal name).
user_movie_list = (
    interactions_train_df
    .sort_values(by='timestamp', ascending=False)
    .groupby(by=USER_COLUMN_NAME)
    .head(latest_n)  # groupby keeps the sorted row order inside each group
    .groupby(by=USER_COLUMN_NAME)
    .agg({CONTENT_COLUMN_NAME: list, 'rating': list})
)
user_movie_list.head()

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[2959, 2329, 1208, 954, 2387, 3247, 2389, 2616...","[5.0, 5.0, 4.0, 5.0, 5.0, 3.0, 2.0, 4.0, 4.0, ..."
2,"[80906, 86345, 68157, 109487, 58559, 48516, 33...","[5.0, 4.0, 4.5, 3.0, 4.5, 4.0, 4.0, 4.0, 5.0, ..."
3,"[6238, 1302, 2018, 72378, 3024, 7899, 5764, 15...","[0.5, 0.5, 0.5, 0.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
4,"[3408, 4029, 3538, 4027, 4021, 3851, 3967, 391...","[4.0, 3.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 4.0, ..."
5,"[36, 232, 410, 253, 531, 608, 589, 594, 58, 21...","[4.0, 4.0, 3.0, 3.0, 4.0, 3.0, 3.0, 5.0, 5.0, ..."


In [None]:
from sklearn.preprocessing import normalize


In [None]:
def ids_to_embedding(ids: list, ratings: list):
    """Build a user embedding as the rating-weighted mean of movie embeddings.

    Each movie embedding is L2-normalized, multiplied by the user's rating for
    that movie, and the weighted vectors are averaged. The result itself is
    NOT re-normalized.

    Args:
        ids: Movie ids; looked up in ``movie_emb`` by their string form.
        ratings: Ratings parallel to ``ids`` (``ratings[i]`` belongs to ``ids[i]``).

    Returns:
        1-D numpy array with the user's embedding.

    Raises:
        ValueError: If none of ``ids`` has an embedding in ``movie_emb``.
    """
    ids_str = [str(movie_id) for movie_id in ids]
    movies = movie_emb.get(ids_str, include=['embeddings'])

    # Pair each rating with its embedding through the *returned* ids: the store
    # is not guaranteed to answer in request order, and ids it does not hold
    # are simply absent from the result (presumably Chroma semantics -- confirm).
    embedding_by_id = dict(zip(movies['ids'], movies['embeddings']))
    matched = [
        (embedding_by_id[movie_id], rating)
        for movie_id, rating in zip(ids_str, ratings)
        if movie_id in embedding_by_id
    ]
    if not matched:
        raise ValueError('None of the given movie ids has a stored embedding')

    embeddings = np.asarray([embedding for embedding, _ in matched])
    weights = np.array([rating for _, rating in matched])

    normalized_movie_embeddings = normalize(embeddings, axis=1)  # L2 normalize each row
    mean_user_embedding = np.mean(normalized_movie_embeddings * weights[:, np.newaxis], axis=0)
    return mean_user_embedding

In [None]:
# One mean-vector embedding per user, in the row order of user_movie_list.
embedding_list = [
    ids_to_embedding(movie_ids, ratings)
    for movie_ids, ratings in zip(
        user_movie_list[CONTENT_COLUMN_NAME], user_movie_list['rating']
    )
]
embedding_list[:10]

[array([-0.0892448 , -0.03785968, -0.11918981, ...,  0.01310284,
        -0.01995867,  0.08290184]),
 array([-0.07730942, -0.0183097 , -0.08218147, ..., -0.0097377 ,
        -0.03953537,  0.08713152]),
 array([-0.04602071, -0.00923923, -0.09624504, ..., -0.02029213,
        -0.0041821 ,  0.06943411]),
 array([-0.05650862,  0.02177764, -0.07702394, ...,  0.01646464,
        -0.03546139,  0.07604209]),
 array([-0.07042354, -0.00378118, -0.08353898, ...,  0.00520239,
        -0.00953311,  0.07316439]),
 array([-0.06291936,  0.01501406, -0.09462119, ...,  0.02270139,
        -0.01844378,  0.0582808 ]),
 array([-0.0806746 ,  0.01212784, -0.06930887, ...,  0.00670705,
        -0.02743997,  0.06676526]),
 array([-0.08730331, -0.00021733, -0.09133369, ..., -0.01484901,
        -0.01410202,  0.0757796 ]),
 array([-0.08855075,  0.00388515, -0.0924563 , ...,  0.01184104,
        -0.03378991,  0.07392111]),
 array([-0.08854051,  0.00169658, -0.06135795, ...,  0.0039512 ,
        -0.03573912,  0.05

In [None]:
# String user ids, aligned row by row with embedding_list.
user_id_list = list(map(str, user_movie_list.index.tolist()))
user_id_list[:10]

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

### Mean vector recommender

In [None]:
# Store the precomputed mean-vector embedding of each user under the user's string id.
# NOTE(review): these are the same string user ids as in the earlier document
# upsert, so this presumably replaces the embeddings used by the prompted
# recommender -- re-run that upsert before re-evaluating it. Confirm.
user_emb.upsert(
    ids=user_id_list,
    embeddings=embedding_list,
)

In [None]:
class MeanVector_LLMRecommender:
    """Recommend movies by similarity between a stored user embedding and movie embeddings.

    The user embedding is whatever is currently stored in ``user_emb`` for the
    user (this notebook upserts rating-weighted mean vectors before evaluating
    this model). The class relies on notebook-level objects defined outside of
    it: ``user_emb`` and ``movie_emb`` (used through ``get``; presumably Chroma
    collections -- confirm where they are created), ``model`` (must provide
    ``similarity``) and ``CONTENT_COLUMN_NAME``.
    """

    MODEL_NAME = 'Mean Vector LLM recommender'

    def __init__(self, items_df=None):
        # Item metadata; only needed when recommend_items is called with verbose=True.
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this recommender."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` movies most similar to the user's stored embedding.

        Args:
            user_id: Id of the user; looked up in ``user_emb`` as ``str(user_id)``.
            items_to_ignore: Iterable of integer movie ids to exclude from the
                candidates. ``None`` (the default) excludes nothing.
            topn: Maximum number of recommendations. Values <= 0 give an empty result.
            verbose: If True, merge the recommendations with ``items_df``.

        Returns:
            DataFrame indexed by ``CONTENT_COLUMN_NAME`` with a ``recStrength``
            column, ordered from most to least similar. In verbose mode, the
            result of left-merging that frame with ``items_df``.

        Raises:
            ValueError: If no embedding is stored for ``user_id``.
            Exception: If ``verbose`` is True but no ``items_df`` was supplied.
        """
        # A set gives constant-time membership tests while filtering every movie id.
        ignored = set(items_to_ignore) if items_to_ignore is not None else set()

        user_embedding = user_emb.get([str(user_id)], include=['embeddings'])['embeddings']
        if user_embedding is None or len(user_embedding) == 0:
            raise ValueError(f'No embedding stored for user {user_id!r}')

        # All stored movie ids except the ones the caller asked to skip.
        candidate_ids = [movie for movie in movie_emb.get()['ids'] if int(movie) not in ignored]

        similar_movies = []
        if candidate_ids and topn > 0:
            fetched = movie_emb.get(candidate_ids, include=['embeddings'])
            # Use the ids returned together with the embeddings: the store is not
            # guaranteed to answer in the order the ids were requested, and row i
            # of the similarity matrix must be labelled with the matching id.
            movie_ids = fetched['ids']

            similarities = model.similarity(user_embedding, fetched['embeddings'])
            # Sort ascending, reverse, then cut: unlike [-topn:], this is also
            # correct for topn == 0 (which would otherwise select everything).
            ranked_indices = similarities.argsort().flatten().tolist()[::-1][:topn]

            similar_movies = [(int(movie_ids[i]), float(similarities[0][i])) for i in ranked_indices]

        recommendations_df = pd.DataFrame(
            similar_movies, columns=[CONTENT_COLUMN_NAME, 'recStrength']
        ).set_index(CONTENT_COLUMN_NAME)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = CONTENT_COLUMN_NAME,
                                                          right_on = CONTENT_COLUMN_NAME)

        return recommendations_df

# movie_data is stored as items_df, i.e. the metadata merged into verbose recommendations.
meanvector_llm_model = MeanVector_LLMRecommender(movie_data)

In [None]:
# Evaluate the mean-vector recommender, show the metrics, then persist them
# under the model's name and the current dataset type.
meanvector_llm_global_metrics, meanvector_llm_results_dataframes = (
    model_evaluator.evaluate_model(meanvector_llm_model)
)
model_evaluator.print_results(
    meanvector_llm_global_metrics,
    meanvector_llm_results_dataframes,
)
model_evaluator.save_results(
    meanvector_llm_global_metrics,
    meanvector_llm_results_dataframes,
    meanvector_llm_model.MODEL_NAME,
    DATASET_TYPE,
)

Evaluating Mean Vector LLM recommender recommendation model...
609 users processed

Global metrics:
                     modelName  recall@5  recall@10  precision@5  \
0  Mean Vector LLM recommender  0.100401   0.168154     0.015082   

   precision@10       mrr  
0      0.013607  0.047039  
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
413            61            111               539  0.113173   0.205937   
598            19             44               495  0.038384   0.088889   
473            14             30               421  0.033254   0.071259   
447            36             75               372  0.096774   0.201613   
273            35             52               269  0.130112   0.193309   
609            62             90               260  0.238462   0.346154   
67             15             31               252  0.059524   0.123016   
379            36             66               243  0.148148   0.271605   
605            11             19

# End instance (release the Colab runtime)

In [None]:
# Last cell of the notebook: hand the Colab runtime back once everything has run.
# NOTE(review): presumably this disconnects the session and discards in-memory
# state, so it should stay the final cell -- confirm against the Colab docs.
from google.colab import runtime
runtime.unassign()