# Example of using LLM-based information retrieval for a recommendation task

Run this notebook in Google Colaboratory.

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so files stored on Drive
# can be reached through ordinary paths.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import sys
import os

# Location of the llm-for-rec repository on the mounted Google Drive.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'

# Make the repository importable. The membership check keeps this cell
# idempotent: re-running it does not add duplicate entries to sys.path.
if repo_path not in sys.path:
    sys.path.append(repo_path)

## Install requirements

In [None]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.8/812.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.9/266.9 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

## Add config

In [None]:
import os

# Cut-offs at which Recall/NDCG are reported.
TOPK = [1, 20, 50, 100, 200]

config_dict = {
    # Item-file parsing options; forwarded to the retriever's CSV loader
    # by the experiment cells below.
    "csv_args": {"delimiter": "\t"},
    "source_column": "item_id:token",
    # Retrieval depth is derived from TOPK so the two values cannot drift
    # apart (k must be >= the largest evaluation cut-off).
    "search_kwargs": {"k": max(TOPK)},
    # Dataset location and the columns to load from it.
    "data_path": os.path.join(repo_path, "datasets"),
    "load_col": {
        "inter": ["user_id", "item_id", "rating", "timestamp"],
        "item": ["item_id", "movie_title"],
    },
    # NOTE(review): text_col names release_year and class, but load_col["item"]
    # only loads item_id and movie_title — confirm these columns are available
    # wherever text_col is consumed.
    "text_col": ["movie_title", "release_year", "class"],
    "MAX_ITEM_LIST_LENGTH": 10,
    # Evaluation / training settings passed through to the RecBole Config.
    "eval_args": {"split": {"LS": "valid_and_test"}, "order": "TO", "mode": "full"},
    "repeatable": True,
    "loss_type": "CE",
    "train_batch_size": 100,
    "eval_batch_size": 8,
    "valid_metric": "NDCG@10",
    "metrics": ["Recall", "NDCG"],
    "topk": TOPK,
    "train_neg_sample_args": None,
}

In [None]:
# Sanity check: the retrieval depth k must cover the largest evaluation
# cut-off listed in topk. The message reports both values on failure.
assert config_dict['search_kwargs']['k'] >= max(config_dict['topk']), (
    f"search_kwargs['k'] ({config_dict['search_kwargs']['k']}) must be >= "
    f"max(topk) ({max(config_dict['topk'])})"
)

## Get dataset and config

In [None]:
from llm4rec.pipelines import RecBolePipelineRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.trainer import PipelineTrainer
from recbole.data.utils import data_preparation
from recbole.config import Config
from recbole.model.abstract_recommender import AbstractRecommender
import os
import torch

# Pipeline model class and the name of the dataset to evaluate on.
model_cls = RecBolePipelineRecommender
dataset_name = 'ml-100k'

# Build the RecBole configuration, applying the overrides from config_dict.
config = Config(
    model=model_cls,
    dataset=dataset_name,
    config_dict=config_dict,
)

dataset = RecboleSeqDataset(config)

# data_preparation returns three data objects; only the last one is kept
# and used as the evaluation data in the cells below.
_, _, test_data = data_preparation(config, dataset)



## Case 1: Open-source information retrieval using a HuggingFace Sentence Transformers model

In [None]:
from llm4rec.tasks import RetrievalRecommender

# Item file of the configured dataset: <data_path>/<dataset>.item
items_info_path = os.path.join(config['data_path'], f"{config['dataset']}.item")

# Run the embedding model on the first GPU when available, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# No embeddings object is passed here; the embedding model is identified by
# emb_model_name instead (presumably instantiated inside RetrievalRecommender
# — confirm against its implementation).
retrieval = RetrievalRecommender(
    embeddings=None,
    items_info_path=items_info_path,
    csv_loader_args={
        "csv_args": config['csv_args'],
        "source_column": config['source_column'],
    },
    text_splitter_args={"chunk_size": 1000, "chunk_overlap": 0},
    search_type="similarity",
    search_kwargs=config['search_kwargs'],
    emb_model_name="all-MiniLM-L6-v2",
    emb_model_kwargs={"device": device},
)

# Wrap the retrieval task in the pipeline recommender.
model = model_cls(
    config=config,
    dataset=dataset,
    tasks=[retrieval],
    token2text=test_data.dataset.token2text,
    verbose=False,
)

# Evaluate on the test data; results are displayed in the next cell.
trainer = PipelineTrainer(config, model)
test_result = trainer.evaluate(test_data, show_progress=config['show_progress'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [00:13<00:00,  8.49it/s][0m


In [None]:
# Display the metrics returned by trainer.evaluate for the Sentence Transformers retriever.
test_result

OrderedDict([('recall@1', 0.0032),
             ('recall@20', 0.0594),
             ('recall@50', 0.1124),
             ('recall@100', 0.1729),
             ('recall@200', 0.2545),
             ('ndcg@1', 0.0032),
             ('ndcg@20', 0.0209),
             ('ndcg@50', 0.0312),
             ('ndcg@100', 0.0409),
             ('ndcg@200', 0.0524)])



## Case 2: OpenAI information retrieval using OpenAIEmbeddings

In [None]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from the env file kept in the repository root.
path_to_openai_env = os.path.join(repo_path, "openai.env")
load_dotenv(path_to_openai_env)

# Fail fast with an actionable message when no key is configured, instead of
# passing None on and hitting an opaque error inside the client library.
# OPENAI_API_KEY is honoured as a fallback so existing setups keep working.
openai_api_key = os.environ.get("API_KEY") or os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        f"No OpenAI API key found: set API_KEY in {path_to_openai_env} "
        "or export OPENAI_API_KEY before running this cell."
    )

embeddings_model = OpenAIEmbeddings(
    openai_api_key=openai_api_key, model="text-embedding-ada-002"
)

# Same retrieval setup as the other cases, but with an explicit embeddings
# object instead of an embedding model name.
retrieval = RetrievalRecommender(
    embeddings=embeddings_model,
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    csv_loader_args=dict(
        csv_args=config['csv_args'],
        source_column=config['source_column'],
    ),
    text_splitter_args=dict(chunk_size=1000, chunk_overlap=0),
    search_type="similarity",
    search_kwargs=config['search_kwargs'],
)

# Wrap the retrieval task in the pipeline recommender.
model = model_cls(
    config=config,
    dataset=dataset,
    tasks=[retrieval],
    token2text=test_data.dataset.token2text,
    verbose=False,
)

# Evaluate on the test data; results are displayed in the next cell.
trainer = PipelineTrainer(config, model)
test_result = trainer.evaluate(test_data, show_progress=config['show_progress'])

[1;35mEvaluate   [0m: 100%|██████████████████████████████████████████████████| 25/25 [01:30<00:00,  3.63s/it][0m


In [None]:
# Display the metrics returned by trainer.evaluate for the OpenAI-embeddings retriever.
# NOTE(review): the saved output below reports recall/ndcg at @5 and @10 and
# the progress bar above shows 25 batches. Neither matches the `topk` list in
# config_dict or the 118 batches of the other cases, so this output looks like
# it came from a different configuration — re-run the cell to refresh it.
test_result

OrderedDict([('recall@1', 0.035),
             ('recall@5', 0.07),
             ('recall@10', 0.085),
             ('recall@20', 0.125),
             ('ndcg@1', 0.035),
             ('ndcg@5', 0.0529),
             ('ndcg@10', 0.0578),
             ('ndcg@20', 0.0683)])

## Case 3: Information retrieval using ALS embeddings

In [None]:
from llm4rec.utils import EmbeddingsFromFile

# Embeddings are read from an .npz file shipped in the repo's examples folder.
als_emb_path = os.path.join(repo_path, 'examples', 'als_emb_ml100k.npz')
emb = EmbeddingsFromFile(emb_file_path=als_emb_path)

# Item file of the configured dataset: <data_path>/<dataset>.item
items_info_path = os.path.join(config['data_path'], f"{config['dataset']}.item")

# Unlike the other cases, this retriever also receives metadata columns and
# an explicit query template.
retrieval = RetrievalRecommender(
    items_info_path=items_info_path,
    embeddings=emb,
    csv_loader_args={
        "csv_args": config['csv_args'],
        "source_column": config['source_column'],
        "metadata_columns": ['movie_title:token_seq', 'release_year:token', 'class:token_seq'],
    },
    text_splitter_args={"chunk_size": 1000, "chunk_overlap": 0},
    search_type="similarity",
    search_kwargs=config['search_kwargs'],
    query='{user_profile}',
)

# Wrap the retrieval task in the pipeline recommender.
model = model_cls(
    config=config,
    dataset=dataset,
    tasks=[retrieval],
    token2text=test_data.dataset.token2text,
    verbose=False,
)

# Evaluate on the test data; results are displayed in the next cell.
trainer = PipelineTrainer(config, model)
test_result = trainer.evaluate(test_data, show_progress=config['show_progress'])

[1;35mEvaluate   [0m: 100%|████████████████████████████████████████████████| 118/118 [00:09<00:00, 12.26it/s][0m


In [None]:
# Display the metrics returned by trainer.evaluate for the ALS-embeddings retriever.
test_result

OrderedDict([('recall@1', 0.0074),
             ('recall@20', 0.0615),
             ('recall@50', 0.0891),
             ('recall@100', 0.1135),
             ('recall@200', 0.123),
             ('ndcg@1', 0.0074),
             ('ndcg@20', 0.0273),
             ('ndcg@50', 0.0327),
             ('ndcg@100', 0.0367),
             ('ndcg@200', 0.0381)])