# Example: using an LLM-based pipeline with augmentation for a recommendation task


Run in Google Colaboratory.

In [None]:
# Mount Google Drive at /content/drive so the paths used in later cells
# (which all start with /content/drive) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
import os

# Root of the llm-for-rec checkout on the mounted Drive; later cells derive
# the dataset directory and the API-key file location from it.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'
# Guarded append: re-running this cell must not stack duplicate entries on
# sys.path.
if repo_path not in sys.path:
    sys.path.append(repo_path)

## Install requirements

In [None]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.5/308.5 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

## Add config

In [None]:
import os

# Settings handed to `Config(..., config_dict=config_dict)` further down.
config_dict = {
    "csv_args": {"delimiter": "\t"},
    "source_column": "item_id:token",
    # Checked in the next cell: k must be >= max(topk).
    "search_kwargs": {"k": 200},
    "data_path": os.path.join(repo_path, "datasets"),
    "load_col": {
        "inter": ["user_id", "item_id", "rating", "timestamp"],
        "item": ["item_id", "movie_title"],
    },
    # NOTE(review): "release_year" and "class" are listed here but not in
    # load_col["item"] — confirm the dataset class loads them some other way.
    "text_col": ["movie_title", "release_year", "class"],
    # Also used by the evaluation cell to cap the interaction history length.
    "MAX_ITEM_LIST_LENGTH": 10,
    "eval_args": {"split": {"LS": "valid_and_test"}, "order": "TO", "mode": "full"},
    "repeatable": True,
    "loss_type": "CE",
    "train_batch_size": 100,
    "eval_batch_size": 8,
    "valid_metric": "NDCG@10",
    "metrics": ["Recall", "NDCG"],
    # 10 is included so that valid_metric ("NDCG@10") is one of the metrics
    # that actually gets computed; the maximum cutoff is still 20.
    "topk": [1, 5, 10, 20],
    "train_neg_sample_args": None,
}

In [None]:
# search_kwargs["k"] must be at least the largest top-k cutoff; fail with a
# message that shows both numbers instead of a bare AssertionError.
assert config_dict['search_kwargs']['k'] >= max(config_dict['topk']), (
    f"search_kwargs['k']={config_dict['search_kwargs']['k']} must be >= "
    f"max(topk)={max(config_dict['topk'])}"
)

## Get dataset and config

In [None]:
from llm4rec.pipelines import RecBolePipelineRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.trainer import PipelineTrainer
from recbole.data.utils import data_preparation
from recbole.config import Config
from recbole.model.abstract_recommender import AbstractRecommender
import os
import torch

# Recommender class and dataset name that the configuration is built for.
dataset_name = 'ml-100k'
model_cls = RecBolePipelineRecommender

# Combine the model class, the dataset name and the overrides from config_dict.
config = Config(
    model=model_cls,
    dataset=dataset_name,
    config_dict=config_dict,
)

# Build the dataset and split it; the middle element of the returned triple
# is discarded and the last one is kept as `eval_data`.
dataset = RecboleSeqDataset(config)
train_data, _, eval_data = data_preparation(config, dataset)



## Create LLM and embedding instances for the pipeline

In [None]:
from dotenv import load_dotenv

# Load environment variables from the api_keys.env file kept in the
# repository root.
path_to_openai_env = os.path.join(repo_path, "api_keys.env")
load_dotenv(dotenv_path=path_to_openai_env)

In [None]:
# HuggingFaceHub is imported from langchain_community (like the embeddings
# class below); importing it from the top-level `langchain` package is deprecated.
from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch

# LLM handle shared by the memories and the ranker defined in later cells.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    model_kwargs={
        "temperature": 0.1,
        "max_new_tokens": 512,
        "return_full_text": False,
    },
)

# Passed to UserMemory as emb_size; keep it in sync with the embedding model
# chosen below (all-MiniLM-L6-v2 embeddings are 384-dimensional).
embedding_size = 384
embedding_fn = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    # Use the first GPU when CUDA is available, otherwise fall back to the CPU.
    model_kwargs={"device": "cuda:0" if torch.cuda.is_available() else "cpu"},
)

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Create item and user memory (for item-info augmentation and for generating user profiles from the training data)

In [None]:
from llm4rec.memory import ItemMemory, UserMemory
from functools import partial
from langchain_community.document_loaders import WikipediaLoader

# WikipediaLoader with load_max_docs fixed to 1; handed to ItemMemory as its
# augmentation loader.
wiki_loader = partial(WikipediaLoader, load_max_docs=1)

# The memory files live inside the repository checkout, so derive their
# locations from repo_path instead of repeating the absolute Drive path.
memory_dir = os.path.join(repo_path, "examples", "develop")
item_filepath = os.path.join(memory_dir, "item_memory_summ.json")
user_filepath = os.path.join(memory_dir, "user")

# Item memory is initialised from the file at item_filepath.
# NOTE(review): item_id_token[1:] presumably skips a padding token at
# index 0 — confirm against the dataset class.
item_memory = ItemMemory(item_ids=dataset.item_id_token[1:],
                         title_col='movie_title',
                         dataset_info_map=dataset.item_token2attr,
                         load_filename=item_filepath,
                         summary_llm=llm,
                         augmentation_loader=wiki_loader)

# User memory is given the training split and no file to load
# (load_filename=None).
user_memory = UserMemory(user_attributes=dataset.user_token2text,
                         short_term_limit=20, llm=llm,
                         embeddings=embedding_fn,
                         item_memory=item_memory,
                         emb_size=embedding_size,
                         load_filename=None,
                         train_dataset=train_data.dataset)

In [None]:
# Save both memories to the file paths defined in the previous cell.
item_memory.save(item_filepath)
user_memory.save(user_filepath)

## Define tasks in pipeline

In [None]:
from llm4rec.tasks import ItemAugmentation, UserAugmentation, RetrievalRecommender, RankerRecommender

# Augmentation tasks, one per memory.
item_augm = ItemAugmentation(item_memory)
user_augm = UserAugmentation(user_memory)

# Retrieval returns as many candidates as the largest evaluation cutoff.
# NOTE(review): config["search_kwargs"] (k=200) is not used here — confirm
# that retrieving only max(topk) items is intended.
retrieval = RetrievalRecommender(
    embeddings=embedding_fn,
    item2text=lambda x: item_memory.retrieve(x, retr_type=''),
    load_from_file=False,
    item_memory=item_memory,
    search_kwargs=dict(k=max(config['topk'])),
)

# The ranker uses the same LLM instance as the memories.
ranker = RankerRecommender(llm=llm)



## Create and evaluate pipeline

In [None]:
# Assemble the pipeline from the four tasks, passed in this order.
pipeline_tasks = [item_augm, user_augm, retrieval, ranker]
model = RecBolePipelineRecommender(
    config=config,
    dataset=dataset,
    tasks=pipeline_tasks,
    verbose=True,
)

Evaluate for one user

In [None]:
# Walk the evaluation loader, but stop after the first user of the first
# batch (both loops end with `break`).
for batched_data in eval_data:
    interaction, history_index, positive_u, positive_i = batched_data
    batch_size = len(interaction["user_id"])

    for inter_idx in range(batch_size):
        row = interaction[inter_idx]
        eval_dataset = eval_data.dataset

        user_id = row["user_id"]
        history_ids = row["item_id_list"]
        # Cap the history at MAX_ITEM_LIST_LENGTH entries.
        history_length = min(config["MAX_ITEM_LIST_LENGTH"], row["item_length"])
        recent_ids = history_ids[:history_length]

        # NOTE(review): history_names is not used in the rest of this cell.
        history_names = [eval_dataset.item_id2text(hist_id) for hist_id in recent_ids.tolist()]
        history_item_ids = eval_dataset.id2token("item_id", recent_ids)
        user_id_token = eval_dataset.id2token('user_id', user_id)

        reco = model.run(
            user_profile=user_id_token,
            prev_interactions=history_item_ids,
            top_k=5,
        )

        # Feed the last history item back into the user memory.
        # NOTE(review): confirm that row['rating'][-1] is the rating that
        # belongs to history_item_ids[-1].
        last_seen = {'item_id': history_item_ids[-1], 'rating': row['rating'][-1]}
        user_memory.update(user_id_token, last_seen)
        print(reco)
        break
    break

Task 2 outputs:  User 259 attributes: age: 21; gender: M; occupation: student; zip_code: 48823
User recent preferences: 1. The user does not seem to enjoy comedies, as they have given low ratings to Monty Python and the Holy Grail, Kids, and What's Eating Gilbert Grape. 2. The user has shown no interest in science fiction or fantasy films, as they have given low ratings to Star Wars: Return of the Jedi and The Maltese Falcon. 3. The user appears to enjoy action and adventure films, as they have given a high rating to Donnie Brasco. 4. The user has shown no interest in musical scores or soundtracks, as they have given low ratings to Dances with Wolves and Clockwork Orange.
User long-term preferences: 1. The user does not seem to enjoy comedies, as they have given low ratings to Monty Python and the Holy Grail, Kids, and What's Eating Gilbert Grape. 2. The user has shown no interest in science fiction or fantasy films, as they have given low ratings to Star Wars: Return of the Jedi and T