# Example of using LLM-based information retrieval for a recommendation task

Run in Google Colaboratory.

In [1]:
from google.colab import drive
# Mount Google Drive: the repository used below lives under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
import os

# Location of the llm-for-rec checkout on the mounted Google Drive.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'

# Make the repository's `llm4rec` package importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate sys.path entries.
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [3]:
# Work from the repository root: later cells use relative paths
# ('./llm4rec/configs/...', './examples/develop/inf_retr.csv').
%cd '{repo_path}'

/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec


## Install requirements

In [4]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

## Get dataset and config

In [5]:
from llm4rec.tasks import RetrievalRecommender
from llm4rec.evaluation.evaluate import evaluate_pipeline
from copy import deepcopy
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from llm4rec.utils import EmbeddingsFromFile

In [6]:
from llm4rec.pipelines import RecBolePipelineRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.evaluation.trainer import PipelineTrainer
from llm4rec.utils.dataset_utils import ml100k_preprocess
from recbole.data.utils import data_preparation
from recbole.config import Config
import os
import torch


# Model class and dataset name handed to the RecBole Config.
model_cls = RecBolePipelineRecommender
dataset_name = 'ml-100k'

# Config files are given relative to the repository root (the current working directory).
config_files = [
    './llm4rec/configs/dataset_ml100k.yaml',
    './llm4rec/configs/overall.yaml',
]
config = Config(model=model_cls, dataset=dataset_name, config_file_list=config_files)

# Build the dataset once. data_preparation receives a deep copy, so `dataset`
# itself is left as constructed; the middle element of the returned split is discarded.
dataset = RecboleSeqDataset(config, preprocess_text_fn=ml100k_preprocess)
train_data, _, test_data = data_preparation(config, deepcopy(dataset))



In [7]:
# Cutoffs k for the reported metrics (the saved outputs show recall@k and ndcg@k
# for exactly these values). The largest one is also used as the retrieval depth below.
topk_cutoffs = [10, 20, 50, 100, 300]
config.final_config_dict['topk'] = topk_cutoffs

## Case 1: Open-source information retrieval using a HuggingFace Sentence Transformers model

Create retrieval from dataset meta information

In [17]:
# Item file of the configured dataset and the retrieval depth (largest metric cutoff).
items_file = os.path.join(config['data_path'], f"{config['dataset']}.item")
largest_k = max(config['topk'])

# embeddings=None: presumably selects the recommender's default open-source
# embedding model (per the section heading) — TODO confirm in RetrievalRecommender.
retrieval = RetrievalRecommender(
    embeddings=None,
    item2text=dataset.item_token2text,
    items_info_path=items_file,
    search_kwargs={'k': largest_k},
)

# Evaluate on a copy of the dataset; element [2] of the returned value carries
# the 'test_result' metrics displayed as the cell output.
results_opensource_meta = evaluate_pipeline(config, deepcopy(dataset), tasks=[retrieval])
results_opensource_meta[2]['test_result']

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [02:31<00:00,  1.28s/it][0m


OrderedDict([('recall@10', 0.0286),
             ('recall@20', 0.0456),
             ('recall@50', 0.0912),
             ('recall@100', 0.1495),
             ('recall@300', 0.3351),
             ('ndcg@10', 0.0132),
             ('ndcg@20', 0.0174),
             ('ndcg@50', 0.0262),
             ('ndcg@100', 0.0357),
             ('ndcg@300', 0.0602)])

Create retrieval from ItemMemory data

In [8]:
from llm4rec.memory import ItemMemory

# Derive the file location from repo_path instead of repeating the absolute Drive
# path; the resulting string is the same, and it now follows repo_path if that changes.
item_filepath = os.path.join(repo_path, "examples", "develop", "item_memory_summ.json")

# item_id_token[1:] skips the first token — presumably RecBole's padding entry; confirm.
item_memory = ItemMemory(item_ids=dataset.item_id_token[1:],
                         title_col='movie_title',
                         dataset_info_map=dataset.item_token2attr,
                         load_filename=item_filepath)

100%|██████████| 1682/1682 [00:00<00:00, 680113.69it/s]


In [None]:
# Same retriever as the meta-information variant, but item texts come from ItemMemory.
retriever_kwargs = dict(
    embeddings=None,
    item2text=lambda x: item_memory.retrieve(x, retr_type=''),
    item_memory=item_memory,
    load_from_file=False,
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    search_kwargs={'k': max(config['topk'])},
)
retrieval = RetrievalRecommender(**retriever_kwargs)

# Evaluate on a copy of the dataset and display the test metrics.
results_opensource_memory = evaluate_pipeline(config, deepcopy(dataset), tasks=[retrieval])
results_opensource_memory[2]['test_result']

Retrieval with user profile

In [None]:
from llm4rec.memory import UserMemory
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
from llm4rec.tasks import UserAugmentation
import torch

# Derive the location from repo_path instead of repeating the absolute Drive path;
# the resulting string is unchanged.
user_filepath = os.path.join(repo_path, "examples", "develop", "user")

# Embedding model for the user memory; embedding_size is passed to UserMemory as
# emb_size and is expected to match this model's output dimension — confirm if the model changes.
embedding_size = 384
device = "cuda:0" if torch.cuda.is_available() else "cpu"
embedding_fn = HuggingFaceEmbeddings(
                model_name="all-MiniLM-L6-v2", model_kwargs={"device": device})

# Load environment variables from the repository's env file before creating the
# LLM client (presumably this is where the Groq credentials come from — confirm).
path_to_env = os.path.join(repo_path, "api_keys.env")
load_dotenv(path_to_env)

llm = ChatGroq(model_name="llama3-70b-8192", temperature=0)
user_memory = UserMemory(train_dataset=train_data.dataset,
                         load_filename=user_filepath,
                         user_attributes=dataset.user_token2text,
                         short_term_limit=20, llm=llm,
                         embeddings=embedding_fn,
                         item_memory=item_memory,
                         emb_size=embedding_size)
retrieval = RetrievalRecommender(
                embeddings=None,
                item2text=dataset.item_token2text,
                items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
                search_kwargs={'k':max(config['topk'])})
augmentation = UserAugmentation(user_memory)

# The augmentation task runs before retrieval in the pipeline.
tasks = [augmentation, retrieval]
results_opensource_profile = evaluate_pipeline(config, deepcopy(dataset), tasks=tasks)
results_opensource_profile[2]['test_result']



## Case 2: OpenAI information retrieval using OpenAIEmbeddings

In [29]:
# Load environment variables from the repository's env file, then read the key
# stored under API_KEY (None if the variable is not set).
path_to_env = os.path.join(repo_path, "api_keys.env")
load_dotenv(path_to_env)

openai_api_key = os.getenv("API_KEY")
embeddings_model = OpenAIEmbeddings(openai_api_key=openai_api_key,
                                    model="text-embedding-ada-002")

Embeddings from dataset item meta information

In [30]:
# Item file of the configured dataset and the retrieval depth (largest metric cutoff).
items_file = os.path.join(config['data_path'], f"{config['dataset']}.item")
largest_k = max(config['topk'])

# Same item texts as in Case 1, embedded with the OpenAI model configured above.
retrieval = RetrievalRecommender(
    embeddings=embeddings_model,
    item2text=dataset.item_token2text,
    items_info_path=items_file,
    search_kwargs={'k': largest_k},
)

# Evaluate on a copy of the dataset and display the test metrics.
results_openai_meta = evaluate_pipeline(config, deepcopy(dataset), tasks=[retrieval])
results_openai_meta[2]['test_result']

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [02:57<00:00,  1.50s/it][0m


OrderedDict([('recall@10', 0.0223),
             ('recall@20', 0.0467),
             ('recall@50', 0.0986),
             ('recall@100', 0.1569),
             ('recall@300', 0.3478),
             ('ndcg@10', 0.009),
             ('ndcg@20', 0.0151),
             ('ndcg@50', 0.0254),
             ('ndcg@100', 0.0348),
             ('ndcg@300', 0.0606)])

Embeddings from item memory

In [None]:
# OpenAI embeddings over the texts retrieved from ItemMemory.
retriever_kwargs = dict(
    embeddings=embeddings_model,
    item2text=lambda x: item_memory.retrieve(x, retr_type=''),
    item_memory=item_memory,
    load_from_file=False,
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    search_kwargs={'k': max(config['topk'])},
)
retrieval = RetrievalRecommender(**retriever_kwargs)

# Evaluate on a copy of the dataset and display the test metrics.
results_openai_memory = evaluate_pipeline(config, deepcopy(dataset), tasks=[retrieval])
results_openai_memory[2]['test_result']

## Case 3: Information retrieval using ALS embeddings

In [32]:
# Embeddings read from an .npz file stored in the repository (ALS embeddings, per the section heading).
als_emb_file = os.path.join(repo_path, 'examples', 'develop', 'als_emb_ml100k.npz')
emb = EmbeddingsFromFile(emb_file_path=als_emb_file)

In [33]:
# How the tab-separated item file is read: which column identifies an item and
# which columns are passed as metadata_columns.
item_csv_loader_args = {
    'csv_args': {'delimiter': '\t'},
    'source_column': 'item_id:token',
    'metadata_columns': ['movie_title:token_seq', 'release_year:token', 'class:token_seq'],
}

# item2text is the identity here and the query template is just the user profile.
retrieval = RetrievalRecommender(
    embeddings=emb,
    csv_loader_args=item_csv_loader_args,
    item2text=lambda x: x,
    load_from_file=True,
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    search_kwargs={'k': max(config['topk'])},
    query='{user_profile}',
)

# Evaluate on a copy of the dataset and display the test metrics.
results_als = evaluate_pipeline(config, deepcopy(dataset), tasks=[retrieval])
results_als[2]['test_result']

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [00:14<00:00,  8.29it/s][0m


OrderedDict([('recall@10', 0.0445),
             ('recall@20', 0.0615),
             ('recall@50', 0.0891),
             ('recall@100', 0.1135),
             ('recall@300', 0.1262),
             ('ndcg@10', 0.0231),
             ('ndcg@20', 0.0273),
             ('ndcg@50', 0.0327),
             ('ndcg@100', 0.0367),
             ('ndcg@300', 0.0384)])

In [34]:
import pandas as pd

# One row of test metrics per retrieval variant, keyed by the row label.
# Disabled rows (their evaluation cells have no saved execution in this notebook):
#   'opensource_memory': results_opensource_memory[2]['test_result'],
#   'openai_memory': results_openai_memory[2]['test_result'],
test_results_by_method = {
    'opensource_metainfo': results_opensource_meta[2]['test_result'],
    'openai_metainfo': results_openai_meta[2]['test_result'],
    'als': results_als[2]['test_result'],
}

df_results = pd.DataFrame(list(test_results_by_method.values()),
                          index=list(test_results_by_method.keys()))
# Relative path: resolved against the repository root set as the working directory earlier.
df_results.to_csv('./examples/develop/inf_retr.csv')
df_results

Unnamed: 0,recall@10,recall@20,recall@50,recall@100,recall@300,ndcg@10,ndcg@20,ndcg@50,ndcg@100,ndcg@300
opensource_metainfo,0.0286,0.0456,0.0912,0.1495,0.3351,0.0132,0.0174,0.0262,0.0357,0.0602
openai_metainfo,0.0223,0.0467,0.0986,0.1569,0.3478,0.009,0.0151,0.0254,0.0348,0.0606
als,0.0445,0.0615,0.0891,0.1135,0.1262,0.0231,0.0273,0.0327,0.0367,0.0384
