# Example of using LLM-based information retrieval for a recommendation task

This notebook is intended to be run in Google Colaboratory.

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read from
# this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import sys
import os

# Location of the llm-for-rec checkout on the mounted Google Drive.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'

# Make the repository importable. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [13]:
# Also expose the tasks/information_retrieval directory of the repository for
# imports; skip the append when the entry is already present so re-running the
# cell does not add duplicates.
ir_task_path = os.path.join(repo_path, "tasks", "information_retrieval")
if ir_task_path not in sys.path:
    sys.path.append(ir_task_path)

## Install requirements

In [7]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

## Add config

In [8]:
import os

# Settings passed to the RecBole `Config` through its `config_dict` argument.
config_dict = {
    # Item-file parsing and retrieval options.
    'csv_args': {'delimiter': '\t'},
    'source_column': 'item_id:token',
    'search_kwargs': {'k': 20, 'score_threshold': 0.5},

    # Dataset location and the columns to load from it.
    'data_path': os.path.join(repo_path, "datasets"),
    'load_col': {
        'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
        'item': ['item_id', 'title'],
    },
    'text_col': 'title',
    'MAX_ITEM_LIST_LENGTH': 10,

    # Evaluation protocol.
    'eval_args': {
        'split': {'LS': 'valid_and_test'},
        'order': 'TO',
        'mode': 'full',
    },
    'repeatable': True,

    # Training / evaluation loop parameters and reported metrics.
    'loss_type': 'CE',
    'train_batch_size': 100,
    'eval_batch_size': 8,
    'valid_metric': 'NDCG@10',
    'metrics': ['Recall', 'NDCG'],
    'topk': [1, 5, 10, 20],
    'train_neg_sample_args': None,
}

## Get dataset and config

In [None]:
from llm4rec.tasks import RecBoleRetrievalRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.trainer import PipelineTrainer
from recbole.data.utils import data_preparation
from recbole.config import Config
from recbole.model.abstract_recommender import AbstractRecommender
import os
import torch

# Recommender class under evaluation and the name of the dataset to load.
model_cls = RecBoleRetrievalRecommender
dataset_name = 'amazon-books'

# Build the RecBole configuration from the dictionary defined earlier.
config = Config(
    model=model_cls,
    dataset=dataset_name,
    config_dict=config_dict,
)

# Create the dataset and keep only the third object returned by
# data_preparation (bound to test_data); the first two are discarded.
dataset = RecboleSeqDataset(config)
_, _, test_data = data_preparation(config, dataset)

## Case 1: open-source information retrieval using a HuggingFace Sentence Transformers model

In [None]:
# Pick the device for the embedding model. torch names its GPU backend "cuda"
# ("gpu" is not a valid torch device string), and the availability check is
# spelled torch.cuda.is_available().
emb_device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the retrieval recommender from an embedding model name.
# NOTE(review): emb_model_kwargs is presumably forwarded to the Sentence
# Transformers model constructor — confirm in RecBoleRetrievalRecommender.
model = model_cls(
    config=config,
    embeddings=None,  # no embeddings object supplied; emb_model_name is given instead
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    source_column=config['source_column'],
    csv_args=config['csv_args'],
    text_splitter_args=dict(chunk_size=500, chunk_overlap=0),
    search_type="similarity",
    search_kwargs=config['search_kwargs'],
    emb_model_name="all-MiniLM-L6-v2",
    emb_model_kwargs={"device": emb_device},
    dataset=dataset,
)

# Evaluate the recommender on the test split.
trainer = PipelineTrainer(config, model)
test_result = trainer.evaluate(test_data, show_progress=config['show_progress'])

In [None]:
# Display the evaluation result returned by trainer.evaluate for Case 1.
test_result

## Case 2: OpenAI information retrieval using OpenAIEmbeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from the openai.env file in the repository folder
# and read the API key from the API_KEY variable.
path_to_openai_env = os.path.join(repo_path, "openai.env")
load_dotenv(path_to_openai_env)
openai_api_key = os.environ.get("API_KEY")

# Embeddings object backed by the OpenAI embedding model.
embeddings_model = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
    model="text-embedding-ada-002",
)

# Constructor arguments for the recommender: an explicit embeddings object is
# passed, so the embedding model name is empty and its kwargs are None.
retriever_kwargs = dict(
    config=config,
    embeddings=embeddings_model,
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    source_column=config['source_column'],
    csv_args=config['csv_args'],
    text_splitter_args=dict(chunk_size=500, chunk_overlap=0),
    search_type="similarity",
    search_kwargs=config['search_kwargs'],
    emb_model_name="",
    emb_model_kwargs=None,
    dataset=dataset,
)
model = model_cls(**retriever_kwargs)

# Evaluate the recommender on the test split.
trainer = PipelineTrainer(config, model)
test_result = trainer.evaluate(test_data, show_progress=config['show_progress'])

In [72]:
# Display the evaluation result returned by trainer.evaluate for Case 2.
# NOTE(review): the saved output of this cell reports 0.0 for every metric —
# confirm the retrieval run actually matched items before relying on these numbers.
test_result

OrderedDict([('recall@1', 0.0),
             ('recall@5', 0.0),
             ('recall@10', 0.0),
             ('recall@20', 0.0),
             ('ndcg@1', 0.0),
             ('ndcg@5', 0.0),
             ('ndcg@10', 0.0),
             ('ndcg@20', 0.0)])