# Example of creating and using tools

This notebook is intended to be run in Google Colaboratory (Colab).

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the repository path
# used in the following cells points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

# Root of the llm-for-rec checkout on the mounted Drive. Later cells build the
# dataset, API-key file and checkpoint paths from it.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'
# Put the checkout on the import path so the `llm4rec` imports in later cells
# can resolve (the package presumably lives in this checkout). The membership
# check keeps the cell idempotent: re-running it no longer appends a duplicate
# entry to sys.path each time.
if repo_path not in sys.path:
    sys.path.append(repo_path)

## Install requirements

In [None]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

## Add config

In [None]:
import os

# Settings passed to the RecBole `Config` built later in this notebook; the
# retrieval tool reads several of these keys back from that config.
config_dict = {
    # Passed, together with "source_column", as the CSV-loader arguments of
    # the retrieval recommender.
    "csv_args": {"delimiter": "\t"},
    "source_column": "item_id:token",
    # Search arguments handed to the retrieval recommender; a later cell
    # asserts that k is at least max(topk).
    "search_kwargs": {"k": 20},
    # Directory with the dataset files; the retrieval recommender looks for
    # "<dataset>.item" inside it.
    "data_path": os.path.join(repo_path, "datasets"),
    "load_col": {
        "inter": ["user_id", "item_id", "rating", "timestamp"],
        "item": ["item_id", "movie_title"],
    },
    "title_col":"movie_title",
    # NOTE(review): "release_year" and "class" are not listed in
    # load_col["item"] above — confirm the dataset class loads them itself.
    "text_col": ["movie_title", "release_year", "class"],
    # Upper bound applied to the history length when the example interaction
    # is extracted from the evaluation data.
    "MAX_ITEM_LIST_LENGTH": 10,
    "eval_args": {"split": {"LS": "valid_and_test"}, "order": "TO", "mode": "full"},
    "repeatable": True,
    "loss_type": "CE",
    "train_batch_size": 100,
    "eval_batch_size": 8,
    # NOTE(review): the validation metric uses cutoff 10, but 10 is not one of
    # the "topk" values below — confirm this is intended.
    "valid_metric": "NDCG@10",
    "metrics": ["Recall", "NDCG"],
    "topk": [1, 5, 20],
    "train_neg_sample_args": None,
}

In [None]:
# Sanity check on the config: the retrieval depth must cover the largest
# cutoff in "topk" (presumably so metrics at that cutoff can be computed on a
# full candidate list). The message reports both values when the check fails.
assert config_dict['search_kwargs']['k'] >= max(config_dict['topk']), (
    f"search_kwargs['k']={config_dict['search_kwargs']['k']} must be >= "
    f"max(topk)={max(config_dict['topk'])}"
)

## Get dataset and config

In [None]:
# preprocessing for ml-100k
def ml100k_preprocess(text: str) -> str:
    """Move a trailing English article of a title back to the front.

    Titles written as "<title>, <article>" (e.g. "Usual Suspects, The") are
    rewritten as "<article> <title>". The articles "The", "A" and "An" are
    handled; any other text is returned unchanged.

    Args:
        text: Raw title string.

    Returns:
        The title with its trailing article moved to the front, or ``text``
        itself when it does not end with one of the handled suffixes.
    """
    # The suffixes are mutually exclusive (", An" does not end with ", A"),
    # so the order of the checks does not affect the result.
    for article in ("The", "An", "A"):
        suffix = ", " + article
        if text.endswith(suffix):
            return article + " " + text[: -len(suffix)]
    return text

In [None]:
from llm4rec.pipelines import RecBolePipelineRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.evaluation.trainer import PipelineTrainer
from recbole.data.utils import data_preparation
from recbole.config import Config
import os
import torch

# The pipeline recommender class is what gets passed to Config as the "model".
model_cls = RecBolePipelineRecommender
dataset_name = 'ml-100k'

# Build the RecBole configuration from the notebook-level config_dict.
config = Config(model=model_cls, dataset=dataset_name,
            config_dict=config_dict)

# ml100k_preprocess is handed over as the text preprocessing hook
# (presumably applied to item text fields — confirm in RecboleSeqDataset).
dataset = RecboleSeqDataset(config, preprocess_text_fn=ml100k_preprocess)
# Only the first and third returned splits are kept; the second is discarded.
train_data, _, eval_data = data_preparation(config, dataset)



In [None]:
# Pull one example user out of the evaluation data for the tool demos below.
# The outer loop stops after the first batch (see the `break`), and the inner
# loop overwrites its variables on every iteration, so the values left behind
# for later cells belong to the LAST interaction of that first batch.
for batched_data in eval_data:
    interaction, history_index, positive_u, positive_i = batched_data
    batch_size = len(interaction["user_id"])

    for inter_idx in range(batch_size):
        user_id = interaction[inter_idx]["user_id"]
        history_ids = interaction[inter_idx]["item_id_list"]
        # Cap the history at MAX_ITEM_LIST_LENGTH items.
        history_length = min(
            config["MAX_ITEM_LIST_LENGTH"],
            interaction[inter_idx]["item_length"],
        )
        history_names = [eval_data.dataset.item_id2text(int(item_id)) for item_id in history_ids[:history_length]]
        # Map the ids of the (capped) history through id2token; the resulting
        # list is what the tools below receive as `prev_interactions`.
        history_item_ids = eval_data.dataset.id2token("item_id", history_ids[:history_length])
        prev_interactions = list(history_item_ids)
        user_token_id = eval_data.dataset.id2token("user_id", user_id)
        # No textual user profile is used in this example.
        user_profile = ""
    # Only the first batch is needed.
    break

## Create tools

Create LLM retrieval tool

In [None]:
from llm4rec.memory import ItemMemory

# Item memory file stored inside the repository checkout. The path is derived
# from repo_path (as the dataset, API-key and checkpoint paths in this
# notebook are) instead of repeating the absolute Drive location, so moving
# the checkout only requires changing repo_path.
item_filepath = os.path.join(repo_path, "examples", "develop", "item_memory_summ.json")
# item_id_token[1:] drops the first entry of the token array (presumably
# RecBole's padding token — confirm).
item_memory = ItemMemory(item_ids=dataset.item_id_token[1:],
                         title_col='movie_title',
                         dataset_info_map=dataset.item_token2attr,
                         load_filename=item_filepath)

100%|██████████| 1682/1682 [00:00<00:00, 801866.26it/s]


In [None]:
from llm4rec.tasks import RetrievalRecommender
from llm4rec.agents.tools.retrieval import create_retrieval_tool

# Retrieval recommender over the item file of the configured dataset, using
# similarity search with the "all-MiniLM-L6-v2" embedding model. The CSV
# loader and search settings come from the config.
retrieval = RetrievalRecommender(
    embeddings=None,
    load_from_file=True,
    item_memory=item_memory,
    item2text=item_memory.retrieve,
    items_info_path=os.path.join(config['data_path'], f"{config['dataset']}.item"),
    csv_loader_args={
        "csv_args": config['csv_args'],
        "source_column": config['source_column'],
    },
    text_splitter_args={"chunk_size": 1000, "chunk_overlap": 0},
    search_type="similarity",
    search_kwargs=config['search_kwargs'],
    emb_model_name="all-MiniLM-L6-v2",
    # Embed on the first GPU when one is available, otherwise on the CPU.
    emb_model_kwargs={"device": "cuda:0" if torch.cuda.is_available() else "cpu"},
)

# Expose the recommender as a tool.
retr_tool = create_retrieval_tool(retrieval, infer_schema=False)

In [None]:
# Show the name, description and argument schema the retrieval tool exposes.
retr_tool.name, retr_tool.description, retr_tool.args

In [None]:
# Query the LLM retrieval tool for the example user extracted earlier.
retrieval_results = retr_tool.invoke(
    {
        "user_profile": user_profile,
        "prev_interactions": prev_interactions,
        "top_k": 20,
    }
)
retrieval_results

Create traditional recsys retrieval tool

In [None]:
from recbole.model.general_recommender import ItemKNN
from llm4rec.tasks import GeneralRecBoleModelWrapper

# ItemKNN wrapped for use as a retrieval recommender; built from the training
# split's dataset.
knn_model = GeneralRecBoleModelWrapper(
    ItemKNN,
    config.final_config_dict,
    dataset=train_data.dataset,
    top_k=20,
    k=20,
)

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field

# Argument schema for the KNN retrieval tool: the only input is the user's
# token id. The docstring and Field description below are part of the
# pydantic schema (runtime text), so they are left as they are.
class RetrievalBaseInput(BaseModel):
    """Input for tool"""
    user_token_id: str = Field(description='User id')

# NOTE(review): only args_schema is overridden here. The description shown for
# this tool in the next cell's output still talks about a list of previous
# items, a user profile and top_k, which does not match this schema — confirm
# whether create_retrieval_tool allows overriding the description.
knn_retrieval_tool = create_retrieval_tool(knn_model, args_schema=RetrievalBaseInput)

In [None]:
# Show the name, description and argument schema of the KNN retrieval tool.
knn_retrieval_tool.name, knn_retrieval_tool.description, knn_retrieval_tool.args

('retrieval_recommender',
 "The tool can find similar items for specific list of previous items.         Never use this tool if you don't want to find some items similar with provided items.         There is a similarity score threshold in the tool, only {item}s with similarity above the threshold would be kept.         Besides, the tool could be used to retrieve the items similar to previous items for ranking tool to refine.         The input of the tool should be a list of previous item titles/names, which should be a Python list of strings, the user_profile information in type of string and top_k which is number of items to retrieve         Do not fake any item names.",
 {'user_token_id': {'title': 'User Token Id',
   'description': 'User id',
   'type': 'string'}})

In [None]:
# The KNN tool only needs the user's token id.
knn_retrieval_results = knn_retrieval_tool.invoke(
    {"user_token_id": user_token_id}
)
knn_retrieval_results

['174',
 '172',
 '210',
 '195',
 '96',
 '82',
 '56',
 '50',
 '79',
 '385',
 '204',
 '228',
 '161',
 '176',
 '181',
 '183',
 '568',
 '403',
 '550',
 '121']

Create LLM ranker tool

In [None]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from llm4rec.tasks import RankerRecommender
import os

# Load environment variables from the api_keys.env file in the repository
# root (presumably this supplies the Groq API key that ChatGroq needs —
# confirm). load_dotenv's return value is not checked here.
path_to_env = os.path.join(repo_path, "api_keys.env")
load_dotenv(path_to_env)

# Chat model used by the LLM ranker, with temperature set to 0.
llm = ChatGroq(model_name="llama3-70b-8192", temperature=0)
# The ranker receives the dataset's item_token2text as its item-to-text lookup.
ranker = RankerRecommender(llm=llm, item2text=dataset.item_token2text)

In [None]:
from llm4rec.tools import create_ranking_tool

# Wrap the LLM ranker as a tool, then show its name, description and
# argument schema.
rank_tool = create_ranking_tool(ranker)
rank_tool.name, rank_tool.description, rank_tool.args

('ranker_recommender',
 'The tool is useful to refine items order (for better experiences) or remove unwanted items from the top.         The input of the tool should be previous interaction data (item ids and their text data and candidate recommendation items (item ids and ther text data).         The candidates depend on previous tool using. Only when there is a list of candidate items to recommend         this tool could be used.',
 {'prev_interactions': {'title': 'Prev Interactions',
   'description': 'Item ids of previous interactions',
   'type': 'array',
   'items': {'type': 'string'}},
  'candidates': {'title': 'Candidates',
   'description': 'Item ids of candidate items for recommendation from previous step',
   'type': 'array',
   'items': {'type': 'string'}}})

In [None]:
# Re-rank the candidates returned by the LLM retrieval tool.
rank_tool.invoke(
    {
        "prev_interactions": prev_interactions,
        "candidates": retrieval_results,
    }
)

['125',
 '1085',
 '1137',
 '344',
 '1115',
 '5',
 '43',
 '741',
 '10',
 '807',
 '1101',
 '643',
 '196',
 '212',
 '1378',
 '1649',
 '1611',
 '129',
 '852',
 '197']

Create traditional recsys ranker tool

In [None]:
from llm4rec.tasks import SequentialRecBoleModelWrapper
from recbole.model.sequential_recommender import SASRec

# SASRec wrapped for use as a ranker, restored from a checkpoint stored in the
# repository's examples directory.
# NOTE(review): the checkpoint is named "SASRec-ml-1m.pth" while this notebook
# works with the 'ml-100k' dataset — confirm the file matches the dataset.
sas_model = SequentialRecBoleModelWrapper(
    SASRec,
    config.final_config_dict,
    train_data.dataset,
    n_layers=2,
    n_heads=2,
    embedding_size=64,
    hidden_size=64,
    inner_size=256,
    hidden_dropout_prob=0.5,
    attn_dropout_prob=0.5,
    hidden_act='gelu',
    layer_norm_eps=1e-12,
    initializer_range=0.02,
    loss_type='CE',
    pretrained_file=os.path.join(repo_path, 'examples', 'SASRec-ml-1m.pth'),
    top_k=20,
)

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
import typing as tp

# Argument schema for the SASRec ranking tool: the user's previous item ids
# and the candidate item ids to re-order. The docstring and Field descriptions
# are part of the pydantic schema (runtime text), so they are left as they are.
class RankerInput(BaseModel):
    """Input for tool"""
    prev_interactions: tp.List[str] = Field(description='Item ids of previous interactions of the user')
    candidates: tp.List[str] = Field(
        description="Item ids of candidate items for recommendation from previous step"
    )

# NOTE(review): per the displayed outputs, this tool and the LLM ranking tool
# created earlier both report the name 'ranker_recommender' — confirm whether
# the names must differ if both are given to the same agent.
sasrec_ranker_tool = create_ranking_tool(sas_model, args_schema=RankerInput)
sasrec_ranker_tool.name, sasrec_ranker_tool.description, sasrec_ranker_tool.args

('ranker_recommender',
 'The tool is useful to refine items order (for better experiences) or remove unwanted items from the top.         The input of the tool should be previous interaction data (item ids and their text data and candidate recommendation items (item ids and ther text data).         The candidates depend on previous tool using. Only when there is a list of candidate items to recommend         this tool could be used.',
 {'prev_interactions': {'title': 'Prev Interactions',
   'description': 'Item ids of previous interactions of the user',
   'type': 'array',
   'items': {'type': 'string'}},
  'candidates': {'title': 'Candidates',
   'description': 'Item ids of candidate items for recommendation from previous step',
   'type': 'array',
   'items': {'type': 'string'}}})

In [None]:
# Re-rank the KNN candidates with SASRec; the candidates are copied into a
# plain list before being passed to the tool.
sasrec_ranker_tool.invoke(
    {
        "prev_interactions": prev_interactions,
        "candidates": list(knn_retrieval_results),
    }
)

['568',
 '228',
 '79',
 '161',
 '172',
 '181',
 '403',
 '121',
 '195',
 '56',
 '210',
 '174',
 '96',
 '183',
 '550',
 '385',
 '176',
 '50',
 '204',
 '82']