In [None]:
# Mount Google Drive so the repository stored there is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

# Root of the project checkout on the mounted Drive; later cells build paths from it.
repo_path = '/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec'

# Make the project packages importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [None]:
!pip install -q -r '{repo_path}/requirements/requirements.txt'

In [None]:
# Switch the working directory to the repository root: the config files loaded
# later are referenced with paths relative to it ('./llm4rec/configs/...').
%cd '{repo_path}'

/content/drive/MyDrive/Colab Notebooks/thesis_work/llm-for-rec


In [None]:
from llm4rec.pipelines import RecBolePipelineRecommender
from llm4rec.dataset import RecboleSeqDataset
from llm4rec.evaluation.trainer import PipelineTrainer
from llm4rec.utils.dataset_utils import ml100k_preprocess
from recbole.data.utils import data_preparation
from recbole.config import Config
import os
import torch


# Recommender class and dataset name handed to the RecBole configuration.
model_cls = RecBolePipelineRecommender
dataset_name = 'ml-100k'

# Config files are given relative to the current working directory
# (the repository root after the `%cd` cell).
config_files = [
    './llm4rec/configs/dataset_ml100k.yaml',
    './llm4rec/configs/overall.yaml',
]
config = Config(
    model=model_cls,
    dataset=dataset_name,
    config_file_list=config_files,
)

# Build the dataset with the ml-100k text preprocessing, then split it.
# Only the first and last elements of the returned triple are kept.
dataset = RecboleSeqDataset(config, preprocess_text_fn=ml100k_preprocess)
train_data, _, test_data = data_preparation(config, dataset)



In [None]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
import torch

# Embedding dimension passed to the user memory alongside the embedding model;
# the two have to agree.
embedding_size = 384

# Run the embedding model on the first GPU when one is available, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
embedding_fn = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)

# Load environment variables from the env file kept in the repository root
# (presumably the API key the chat model needs - confirm).
path_to_env = os.path.join(repo_path, "api_keys.env")
load_dotenv(path_to_env)

# Chat model used by the memories; temperature 0 for reproducible generations.
llm = ChatGroq(model_name="llama3-70b-8192", temperature=0)
# Alternative backend, kept for reference:
# llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2",
#                      model_kwargs={"temperature": 0.1, "max_new_tokens": 512,
#                                    "return_full_text": False})

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Init memory

In [None]:
from llm4rec.memory import ItemMemory
from functools import partial
from langchain_community.document_loaders import WikipediaLoader

# Loader factory used to augment items: WikipediaLoader pre-configured with load_max_docs=1.
wiki_loader = partial(WikipediaLoader, load_max_docs=1)

# Saved item memory lives inside the repository; derive the path from repo_path
# (same resulting string as before) instead of repeating the absolute Drive path.
item_filepath = os.path.join(repo_path, "examples", "develop", "item_memory_summ.json")

# item_id_token[1:] skips the first token (presumably RecBole's padding entry - confirm).
item_memory = ItemMemory(
    item_ids=dataset.item_id_token[1:],
    title_col='movie_title',
    dataset_info_map=dataset.item_token2attr,
    load_filename=item_filepath,
    summary_llm=llm,
    augmentation_loader=wiki_loader,
)

100%|██████████| 1682/1682 [00:00<00:00, 592891.78it/s]


In [50]:
from llm4rec.memory.user_long_term_memory import UserLongTermMemory
from llm4rec.memory.user_short_term_memory import UserShortTermMemory
from llm4rec.memory.base_memory import BaseMemory
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.language_models.llms import BaseLLM
from langchain_core.embeddings import Embeddings
from recbole.data.dataset import Dataset
from tqdm import tqdm

import numpy as np
import typing as tp
import os


class UserMemory:
    """
    Stores and manipulates the historical data about user preferences and interactions.
    Can create user profile based on preferences.

    Every interaction is written to the short-term memory; every
    ``update_long_term_every`` updates of a user, a reflection over the
    short-term memory is written to that user's long-term memory.
    """

    # File names used inside the folder handled by save() / load().
    SHORT_TERM_FILENAME = "short_term_mem.json"
    LONG_TERM_FILENAME = "long_term_mem.json"

    # Number of users processed by default when building memory from a dataset
    # (the cap this notebook was developed with; pass max_users=None for all users).
    DEFAULT_MAX_USERS = 349

    def __init__(
        self,
        user_attributes: tp.Callable,
        short_term_limit: int,
        llm: tp.Union[BaseLLM, BaseChatModel],
        embeddings: Embeddings,
        emb_size: int,
        item_memory: BaseMemory,
        train_dataset: Dataset,
        min_rating: int = 1,
        max_rating: int = 5,
        num_to_retrieve: int = 3,
        update_long_term_every: tp.Optional[int] = None,
        load_filename: tp.Optional[str] = None,
        max_users: tp.Optional[int] = DEFAULT_MAX_USERS,
    ):
        """
        Initialize UserMemory.

        Args:
            user_attributes (Callable): A mapping function to get text user information from dataset.
            short_term_limit (int): Limit value for short-term memory.
            llm (BaseLLM, BaseChatModel): LLM model for reflecting on recent user preferences.
            embeddings (Embeddings): Embeddings model for retrieving of user long-term memory.
            emb_size (int): The dimension number of embedding vectors.
            item_memory (BaseMemory): Storage of item text information.
            train_dataset (Dataset): Dataset instance to initialize memory with dataset values.
            min_rating, max_rating: Values for scaling normalized rating values. Default range is 1-5.
            num_to_retrieve (int): Number of retrieved relevant chunks from user long-term memory.
            update_long_term_every (int): How often to update long-term memory.
                Falls back to ``short_term_limit`` when None (or 0).
            load_filename (str): Path to the folder with previously saved memory values.
            max_users (int): Maximum number of users whose history is loaded from
                ``train_dataset``; None processes every user.

        Raises:
            FileNotFoundError: If ``load_filename`` is given but the saved files are missing.
        """
        # global memory
        self.user_attributes = user_attributes

        # personalized memory
        self.short_term_memory = UserShortTermMemory(
            llm=llm, item_memory=item_memory, memory_limit=short_term_limit
        )
        self.long_term_memory = UserLongTermMemory(
            embeddings=embeddings, emb_size=emb_size, k=num_to_retrieve
        )
        self.short_term_limit = short_term_limit
        self.update_long_term_every = (
            update_long_term_every if update_long_term_every else short_term_limit
        )

        self.llm = llm

        # Restore saved state first: users already present in the short-term
        # store are skipped by _construct_memory below.
        if load_filename is not None:
            self.load(load_filename)
        self._construct_memory(train_dataset, min_rating, max_rating, max_users)

    def _construct_memory(
        self,
        train_dataset: Dataset,
        min_rating: int = 1,
        max_rating: int = 5,
        max_users: tp.Optional[int] = DEFAULT_MAX_USERS,
    ) -> None:
        """
        Construct memory from values of train_dataset.

        For every processed user that is not yet in the short-term store, each
        item of the user's history is replayed through ``update`` together with
        its rating, rescaled from the normalized value to ``[min_rating, max_rating]``.

        Args:
            train_dataset (Dataset): Source of the history matrix, the rating
                interaction matrix and the id -> token mappings.
            min_rating, max_rating: Bounds used to rescale normalized ratings.
            max_users (int): Upper bound on the number of users processed
                (clamped to the number of users in the dataset); None means all.
        """
        history_matrix, _, history_lens = train_dataset.history_item_matrix()
        inter_matrix = train_dataset.inter_matrix("csr", value_field="rating")

        # Internal id 0 is never processed (presumably the padding entry - confirm),
        # so the number of real users is one less than the number of rows.
        num_users = len(history_matrix) - 1
        if max_users is not None:
            num_users = min(num_users, max_users)

        for user_id in tqdm(range(1, num_users + 1)):
            user_id_token = train_dataset.id2token("user_id", user_id)
            if user_id_token in self.short_term_memory.memory_store:
                continue

            user_history = history_matrix[user_id][: history_lens[user_id]].tolist()
            if not user_history:
                continue
            item_id_tokens = train_dataset.id2token("item_id", user_history)

            # The dense row is indexed by internal item id, so the ratings of
            # the history items must be looked up by those ids; pairing the
            # history with the first entries of the row assigns wrong ratings.
            rating_row = inter_matrix[user_id, :].toarray()[0]
            scaled = rating_row[user_history] * (max_rating - min_rating) + min_rating
            # Round before casting: plain truncation turns e.g. 2.9999 into 2.
            ratings = np.rint(scaled).astype(int)

            for item, rating in zip(item_id_tokens, ratings):
                self.update(
                    user_id_token, {"rating": int(rating), "item_id": str(item)}
                )

    def update(self, id: str, data: tp.Any) -> None:
        """
        Update user memory.
        Long-term memory is updated every update_long_term_every times for user
        based on the number of performed updates.

        Args:
            id (str): User token.
            data: Interaction record stored in the short-term memory.
        """
        self.short_term_memory.update(id, data)
        update_counts = self.short_term_memory.get_update_counts(id)

        if update_counts % self.update_long_term_every == 0:
            self.long_term_memory.update(id, self.short_term_memory.reflect(id))

    def retrieve(self, id: str, query: str, memory_type: str = "all") -> tp.Any:
        """
        Retrieve values from memory based on memory_type.

        Args:
            id (str): User token.
            query (str): Query passed to the underlying memories.
            memory_type (str): "long", "short" or "all".

        Returns:
            The result of the selected memory, or for "all" a dict with the
            keys "long_term" and "short_term".

        Raises:
            ValueError: If ``memory_type`` is not one of the supported values.
        """
        if memory_type == "long":
            return self.long_term_memory.retrieve(id, query)
        if memory_type == "short":
            return self.short_term_memory.retrieve(id, query)
        if memory_type == "all":
            return {
                "long_term": self.long_term_memory.retrieve(id, query),
                "short_term": self.short_term_memory.retrieve(id, query),
            }
        raise ValueError(
            f"Unknown memory_type {memory_type!r}; expected 'long', 'short' or 'all'"
        )

    def get_short_term_memory(self, id: str) -> tp.Any:
        """Return the short-term memory entry of user ``id``."""
        return self.short_term_memory[id]

    def get_long_term_memory(self, id: str) -> tp.Any:
        """Return the long-term memory entry of user ``id``."""
        return self.long_term_memory[id]

    def construct_user_profile(self, id: str, use_short_term: bool=False, use_long_term: bool=False) -> str:
        """
        Create user profile information by concatenating available information
        about user from dataset, short-term memory reflection and long-term memory retrieved values.

        Args:
            id (str): User token.
            use_short_term (bool): Append the reflection over recent preferences.
            use_long_term (bool): Append the values retrieved from long-term memory,
                using the short-term reflection as the query.

        Returns:
            str: The textual user profile.
        """
        # The reflection and the long-term retrieval are only computed when
        # their result can end up in the profile.
        short_term_pref = None
        long_term_pref = None
        if use_short_term or use_long_term:
            short_term_pref = self.short_term_memory.reflect(id)
        if use_long_term:
            long_term_pref = self.retrieve(id, short_term_pref, memory_type="long")

        attributes = self.user_attributes(id)
        profile = f"User {id}"
        if attributes != "":
            profile += f" attributes: {attributes}\n"
        else:
            profile += "\n"

        if use_short_term:
            update_counts = self.short_term_memory.get_update_counts(id)
            # NOTE(review): recent preferences are omitted when (count - 1) is a
            # multiple of the long-term update period - confirm the intended offset.
            if (update_counts - 1) % self.update_long_term_every != 0:
                profile += f"User recent preferences: {short_term_pref}\n"
        if use_long_term:
            profile += f"User long-term preferences: {long_term_pref}."

        return profile

    def save(self, folder_path: str) -> None:
        """
        Save short-term memory and long-term memory to folder.
        The folder is created when it does not exist yet.
        """
        os.makedirs(folder_path, exist_ok=True)
        self.short_term_memory.save(
            os.path.join(folder_path, self.SHORT_TERM_FILENAME)
        )
        self.long_term_memory.save(
            os.path.join(folder_path, self.LONG_TERM_FILENAME)
        )

    def load(self, folder_path: str) -> None:
        """
        Load short-term memory and long-term memory from folder.

        Raises:
            FileNotFoundError: If either of the two memory files is missing.
        """
        short_term_path = os.path.join(folder_path, self.SHORT_TERM_FILENAME)
        long_term_path = os.path.join(folder_path, self.LONG_TERM_FILENAME)

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        for path in (short_term_path, long_term_path):
            if not os.path.exists(path):
                raise FileNotFoundError(f"Saved user memory file not found: {path}")

        self.short_term_memory.load(short_term_path)
        self.long_term_memory.load(long_term_path)

In [51]:
#from llm4rec.memory import UserMemory

# Folder with the saved user memory, derived from repo_path (same resulting
# string as before) instead of repeating the absolute Drive path.
user_filepath = os.path.join(repo_path, "examples", "develop", "user")

# Only ask UserMemory to load saved state when both memory files exist, so the
# cell also works on a first run, before anything has been saved.
saved_memory_files = ("short_term_mem.json", "long_term_mem.json")
has_saved_memory = all(
    os.path.exists(os.path.join(user_filepath, name)) for name in saved_memory_files
)

user_memory = UserMemory(
    train_dataset=train_data.dataset,
    load_filename=user_filepath if has_saved_memory else None,
    user_attributes=dataset.user_token2text,
    short_term_limit=20,
    llm=llm,
    embeddings=embedding_fn,
    item_memory=item_memory,
    emb_size=embedding_size,
)

100%|██████████| 349/349 [12:43<00:00,  2.19s/it]


In [None]:
# Inspect the raw short-term store (the construction loop looks users up in it by user token).
user_memory.short_term_memory.memory_store

In [None]:
# Inspect the raw long-term store built from the short-term reflections.
user_memory.long_term_memory.memory_store

In [53]:
# Persist both memories so the next session can load them instead of rebuilding.
# The target folder is created first in case it does not exist yet.
os.makedirs(user_filepath, exist_ok=True)
user_memory.save(user_filepath)