# Example of using LLM-based information retrieval for a recommendation task

Run this notebook in Google Colaboratory (Colab).

In [None]:
# Mount Google Drive at /content/drive; the repository path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import os

# Location of the llm-for-rec checkout on the mounted Google Drive.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'

# Make the repository importable. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [None]:
# Switch the working directory to the repository so that relative paths
# (e.g. the ./llm4rec/configs/*.yaml files) resolve.
%cd '{repo_path}'

/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec


## Install requirements

In [None]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.6/973.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.4/310.4 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.1/324.1 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

## Get dataset and config

In [None]:
from llm4rec.pipelines import RecBolePipelineRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.evaluation.trainer import PipelineTrainer
from llm4rec.utils.dataset_utils import ml100k_preprocess
from recbole.data.utils import data_preparation
from recbole.config import Config
import os
import torch


# Recommender class and dataset name handed to the RecBole Config.
model_cls = RecBolePipelineRecommender
dataset_name = 'ml-100k'

# The YAML paths are relative to the current working directory.
config = Config(
    model=model_cls,
    dataset=dataset_name,
    config_file_list=[
        './llm4rec/configs/dataset_ml100k.yaml',
        './llm4rec/configs/overall.yaml',
    ],
)

# Sequential dataset built from the config; ml100k_preprocess is supplied as
# the text preprocessing function.
dataset = RecboleSeqDataset(
    config,
    preprocess_text_fn=ml100k_preprocess,
)



In [None]:
# Override the metric cutoffs; the largest value is later reused as the
# retrieval `k` via max(config['topk']).
config.final_config_dict.update(topk=[10, 20, 50, 100, 300])

In [None]:
from llm4rec.tasks import RetrievalRecommender
from llm4rec.evaluation.evaluate import evaluate_pipeline
from copy import deepcopy
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from llm4rec.utils import EmbeddingsFromFile

## Case 1: Open-source information retrieval using a HuggingFace Sentence Transformers model

Create the retrieval task from the dataset's item meta information

In [None]:
# Retrieval task over item texts from the dataset; no embeddings object is
# passed explicitly (embeddings=None).
retrieval = RetrievalRecommender(
    embeddings=None,
    item2text=dataset.item_token2text,
    items_info_path=os.path.join(
        config['data_path'],
        f"{config['dataset']}.item",
    ),
    # k is set to the largest cutoff listed in config['topk'].
    search_kwargs={'k': max(config['topk'])},
)

# A deep copy of the dataset is handed to the evaluation.
results_opensource_meta = evaluate_pipeline(
    config,
    deepcopy(dataset),
    tasks=[retrieval],
)
# Show the 'test_result' entry of the element at index 2.
results_opensource_meta[2]['test_result']

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [00:10<00:00, 11.70it/s][0m


OrderedDict([('recall@1', 0.0021),
             ('recall@5', 0.0127),
             ('recall@10', 0.0212),
             ('recall@20', 0.0467),
             ('ndcg@1', 0.0021),
             ('ndcg@5', 0.007),
             ('ndcg@10', 0.0096),
             ('ndcg@20', 0.0161)])

Create the retrieval task from ItemMemory data

In [None]:
from llm4rec.memory import ItemMemory

# Build the path from repo_path (as the other file paths in this notebook do)
# instead of repeating the absolute Drive location.
item_filepath = os.path.join(repo_path, "examples", "develop", "item_memory_summ.json")

# Item memory loaded from the JSON file. The first entry of
# dataset.item_id_token is skipped ([1:]).
item_memory = ItemMemory(item_ids=dataset.item_id_token[1:],
                         title_col='movie_title',
                         dataset_info_map=dataset.item_token2attr,
                         load_filename=item_filepath)

100%|██████████| 1682/1682 [00:00<00:00, 669528.26it/s]


In [None]:
# Retrieval task whose item texts come from the item memory rather than from
# the dataset; no embeddings object is passed explicitly (embeddings=None).
retrieval = RetrievalRecommender(
    embeddings=None,
    item_memory=item_memory,
    item2text=lambda x: item_memory.retrieve(x, retr_type=''),
    load_from_file=False,
    items_info_path=os.path.join(
        config['data_path'],
        f"{config['dataset']}.item",
    ),
    # k is set to the largest cutoff listed in config['topk'].
    search_kwargs={'k': max(config['topk'])},
)

# A deep copy of the dataset is handed to the evaluation.
results_opensource_memory = evaluate_pipeline(
    config,
    deepcopy(dataset),
    tasks=[retrieval],
)
# Show the 'test_result' entry of the element at index 2.
results_opensource_memory[2]['test_result']

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [00:19<00:00,  6.04it/s][0m


OrderedDict([('recall@1', 0.0),
             ('recall@5', 0.0064),
             ('recall@10', 0.0106),
             ('recall@20', 0.0201),
             ('ndcg@1', 0.0),
             ('ndcg@5', 0.0031),
             ('ndcg@10', 0.0045),
             ('ndcg@20', 0.0069)])



## Case 2: OpenAI information retrieval using OpenAIEmbeddings

In [None]:
# Load variables from the repository's api_keys.env file.
path_to_env = os.path.join(repo_path, "api_keys.env")
load_dotenv(path_to_env)

# The key is read from the API_KEY environment variable (None when unset).
openai_api_key = os.getenv("API_KEY")
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=openai_api_key,
)

Embeddings from dataset item meta information

In [None]:
# Retrieval task over item texts from the dataset, embedded with the OpenAI
# embeddings model.
retrieval = RetrievalRecommender(
    embeddings=embeddings_model,
    item2text=dataset.item_token2text,
    items_info_path=os.path.join(
        config['data_path'],
        f"{config['dataset']}.item",
    ),
    # k is set to the largest cutoff listed in config['topk'].
    search_kwargs={'k': max(config['topk'])},
)

# A deep copy of the dataset is handed to the evaluation.
results_openai_meta = evaluate_pipeline(
    config,
    deepcopy(dataset),
    tasks=[retrieval],
)
# Show the 'test_result' entry of the element at index 2.
results_openai_meta[2]['test_result']

[1;35mEvaluate   [0m: 100%|██████████████████████████████████████████████████| 25/25 [01:30<00:00,  3.63s/it][0m


Embeddings from item memory

In [None]:
# Retrieval task whose item texts come from the item memory, embedded with the
# OpenAI embeddings model.
retrieval = RetrievalRecommender(
                embeddings=embeddings_model,
                item2text=lambda x: item_memory.retrieve(x, retr_type=''),
                item_memory=item_memory,
                load_from_file=False,
                items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
                search_kwargs={'k':max(config['topk'])})

# Stored under its own name so the item-memory run does not overwrite
# results_openai_meta (the dataset-metadata run).
results_openai_memory = evaluate_pipeline(config, deepcopy(dataset), tasks=[retrieval])
results_openai_memory[2]['test_result']

## Case 3: Information retrieval using ALS embeddings

In [None]:
# Embeddings read from the als_emb_ml100k.npz file under examples/develop.
emb = EmbeddingsFromFile(
    emb_file_path=os.path.join(
        repo_path,
        'examples',
        'develop',
        'als_emb_ml100k.npz',
    ),
)

In [None]:
# Retrieval task backed by the file-based embeddings. item2text is the
# identity function and the query template is just '{user_profile}'.
retrieval = RetrievalRecommender(
    embeddings=emb,
    item2text=lambda x: x,
    query='{user_profile}',
    items_info_path=os.path.join(
        config['data_path'],
        f"{config['dataset']}.item",
    ),
    csv_loader_args=dict(
        csv_args=config['csv_args'],
        source_column=config['source_column'],
        metadata_columns=[
            'movie_title:token_seq',
            'release_year:token',
            'class:token_seq',
        ],
    ),
    # k is set to the largest cutoff listed in config['topk'].
    search_kwargs={'k': max(config['topk'])},
)

# A deep copy of the dataset is handed to the evaluation.
results_als = evaluate_pipeline(
    config,
    deepcopy(dataset),
    tasks=[retrieval],
)
# Show the 'test_result' entry of the element at index 2.
results_als[2]['test_result']

In [None]:
# `test_result` is not assigned anywhere earlier in the notebook, so a fresh
# "Restart & Run All" would raise NameError here. Bind it explicitly from the
# ALS evaluation results before displaying it.
test_result = results_als[2]['test_result']
test_result

OrderedDict([('recall@1', 0.0074),
             ('recall@20', 0.0615),
             ('recall@50', 0.0891),
             ('recall@100', 0.1135),
             ('recall@200', 0.123),
             ('ndcg@1', 0.0074),
             ('ndcg@20', 0.0273),
             ('ndcg@50', 0.0327),
             ('ndcg@100', 0.0367),
             ('ndcg@200', 0.0381)])