In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import sys
sys.path.append("../")

# Import libs and modules

In [None]:
import os
import re
import typing as tp
from pathlib import Path

import pandas as pd
from langchain.prompts import PromptTemplate

from recallmate.columns import Columns
from recallmate.data.memory import ItemMemory, UserMemory
from recallmate.tasks.information_retrieval import RetrievalRecommenderSimple
from recallmate.llm import load_model_openai
from recallmate.agents.tools import create_retrieval_tool

# Load Data

In [None]:
%%time
!wget -q https://files.grouplens.org/datasets/movielens/ml-100k.zip -O ml-100k.zip
!unzip -o ml-100k.zip
!rm ml-100k.zip

In [None]:
BASE_PATH_DATA = Path("./ml-100k")

Interaction data, user and product information must contain mandatory attributes.

For interactions: 
    
    - "user_id": Columns.User
    - "item_id": Columns.Item

For user information:
    
    - "user_id": Columns.User

For item information:
    
    - "item_id": Columns.Item

In [None]:
# Names assigned, in file order, to the columns of the tab-separated `ua.base` file.
inter_cols = [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]
# Read and sort in one expression rather than mutating in place, so the name
# `df_interactions` is only ever bound to the fully prepared frame.
# Rows are ordered per user by interaction time; the index is rebuilt after sorting.
df_interactions = pd.read_csv(
    BASE_PATH_DATA / "ua.base", sep='\t', names=inter_cols, encoding='latin-1'
).sort_values([Columns.User, Columns.Datetime], ignore_index=True)
df_interactions.head()

In [None]:
# Names assigned, in file order, to the columns of the pipe-separated `u.user` file.
# Only the id column takes its name from `Columns`; the others are plain strings.
u_cols =  [Columns.User, 'age', 'sex', 'occupation', 'zip_code']
df_users = pd.read_csv(BASE_PATH_DATA / "u.user", sep='|', names=u_cols, encoding='latin-1')
df_users.head()

In [None]:
def extract_year(movie_title) -> int:
    """Return the four-digit year embedded in a movie title.

    The first ``(YYYY)`` group found anywhere in ``movie_title`` is used.

    Args:
        movie_title: Raw title, e.g. ``"Toy Story (1995)"``.

    Returns:
        The year as an ``int``, or ``-1`` when the title contains no ``(YYYY)``
        group or is not a string (e.g. a missing value).
    """
    # A missing title (NaN/None) cannot be searched; treat it like "no year found".
    if not isinstance(movie_title, str):
        return -1
    match = re.search(r'\((\d{4})\)', movie_title)
    return int(match.group(1)) if match else -1


def extract_title(movie_title) -> str:
    """Return a movie title with its trailing ``(YYYY)`` year removed.

    Args:
        movie_title: Raw title, e.g. ``"Toy Story (1995)"``.

    Returns:
        The text preceding the first ``(YYYY)`` group, stripped of surrounding
        whitespace. A title without a year is returned stripped but otherwise
        unchanged, so it is not lost. A non-string value (e.g. a missing
        title) yields ``""``.
    """
    if not isinstance(movie_title, str):
        return ""
    match = re.search(r'^(.*?)\s*\(\d{4}\)', movie_title)
    # Without a year suffix there is nothing to cut off: keep the whole title.
    title = match.group(1) if match else movie_title
    return title.strip()
    

# Load movie genres
genre_cols = ["genre_name", "genre_code"]
df_genres = pd.read_csv(BASE_PATH_DATA / "u.genre", sep="|", names=genre_cols, encoding="latin-1")
unique_genres = df_genres["genre_name"].tolist()

# Load movie info. The column named "" is read only so that the remaining names
# line up with the file; it is dropped again below.
i_cols =  [Columns.Item, "title", "release_date", "", "link"] + unique_genres
df_items_raw = pd.read_csv(BASE_PATH_DATA / "u.item", sep="|", names=i_cols, encoding="latin-1")

# Transform genre from One-Hot to string: take all flags as one boolean matrix
# and join, per movie, the names of the flagged genres (in `unique_genres` order).
genre_flags = df_items_raw[unique_genres].astype(bool).to_numpy()
all_movie_genres = [
    ", ".join(genre for genre, flagged in zip(unique_genres, movie_flags) if flagged)
    for movie_flags in genre_flags
]

# Remove the year from the title and keep it as a separate column, then drop
# every column that is not used to describe an item. Built as a new frame so the
# raw data stays untouched and the cell can be re-read top to bottom.
raw_titles = df_items_raw["title"]
df_items = df_items_raw.assign(
    genres=all_movie_genres,
    year=raw_titles.map(extract_year).astype(int),
    title=raw_titles.map(extract_title),
).drop(columns=["", "release_date", "link"] + unique_genres)

df_items.head()

# Create `ItemMemory`

Since the data can be varied and not all attributes of users and items can be used, we leave the option to create an input prompt and a function to process it on the user side.

The function and the prompt are optional: if they are omitted, the module creates them itself, using all attributes except the unique identifiers:

```python
item_memory = ItemMemory(data=df_items)
```

Function for translating attributes into text data:

The function must have the following signature:
```python
def create_prompt(prompt: PromptTemplate, data: tp.Union[tp.Dict, pd.Series]) -> str:
    ...
```

In [None]:
def create_prompt(prompt: PromptTemplate, data: pd.Series) -> str:
    """Render the textual description of a single item.

    Args:
        prompt: Template exposing the ``title``, ``genres`` and ``year`` placeholders.
        data: One item record providing those three fields by key.

    Returns:
        The template formatted with the item's title, genres and year.
    """
    fields = {name: data[name] for name in ("title", "genres", "year")}
    return prompt.format(**fields)

Prompt to initialize item information (must include attributes that are processed in the `create_prompt` function):

In [None]:
item_prompt_init = PromptTemplate.from_template(
    "The title is '{title}'; The genres are {genres}; The year is '{year}'"
)

Creating instance ItemMemory:

In [None]:
# Parameters:
#     data (pd.DataFrame): The dataframe containing the item data.
#     col_id (str): The column name of the item id. Defaults to "item_id".
#     prompt_init (PromptTemplate): The prompt template used to describe an item.
#     create_prompt (Callable): The function for translating information into a string.

item_memory = ItemMemory(
    data=df_items, 
    prompt_init=item_prompt_init, 
    create_prompt=create_prompt
)
item_memory

# Create UserMemory

Function for translating attributes into text data:

Signature of these functions is the same

In [None]:
def create_prompt_short_memory(prompt: PromptTemplate, data: tp.Dict) -> str:
    """Render the short-term memory prompt from a mapping of values.

    Only the keys listed in ``prompt.input_variables`` are read from ``data``;
    any other entries are ignored.

    Args:
        prompt: Template whose ``input_variables`` name the required keys.
        data: Mapping that provides a value for each of those keys.

    Returns:
        The formatted prompt text.
    """
    values = {}
    for name in prompt.input_variables:
        values[name] = data[name]
    return prompt.format(**values)

def create_user_overview_from_features(prompt: PromptTemplate, data: pd.Series) -> str:
    """Render a user's self-description from their profile record.

    Args:
        prompt: Template exposing the ``sex``, ``age`` and ``occupation`` placeholders.
        data: One user record providing the ``sex``, ``age`` and ``occupation`` fields.

    Returns:
        The formatted overview. A ``sex`` value of ``"M"`` is spelled out as
        ``"male"``; every other value is rendered as ``"female"``.
    """
    if data["sex"] == "M":
        sex_label = "male"
    else:
        sex_label = "female"
    return prompt.format(sex=sex_label, age=data["age"], occupation=data["occupation"])

Prompts to initialize user information (must include the attributes that are processed in the corresponding functions above):

In [None]:
# Self-description of the user; its placeholders (sex, age, occupation) are the ones
# filled in by `create_user_overview_from_features`.
user_prompt_init = PromptTemplate.from_template(
    "I am a {sex}, I am {age} years old, my profession is a {occupation}."
)
# Long-term memory text; `{history}` is its only placeholder.
# NOTE(review): the wording states that every listed film was rated above 3.5, but no
# rating filter is applied to `df_interactions` anywhere in this notebook — confirm that
# UserMemory filters on the rating column, otherwise filter the interactions or soften
# the wording.
user_long_memory_prompt_init = PromptTemplate.from_template(
    "I watched and gave a rating above 3.5 (minimum rating 1, maximum - 5) " 
    "to the following films (in historical order): {history}"
)
# Short-term memory text; `create_prompt_short_memory` fills `{genres}` from the
# template's own `input_variables`.
user_short_memory_prompt_init = PromptTemplate.from_template(
    "I enjoy watching movies in the following genres: {genres}"
)

Creating instance UserMemory:

In [None]:
# Parameters:
#     data (pd.DataFrame): The dataframe containing the user data.
#     interactions (pd.DataFrame): The dataframe containing the interactions of the users.
#     item_memory (ItemMemory): The item memory.
#     use_prompt_init (bool): Whether to use the user's information. Defaults to True.
#     col_id (str): The column name of the user id. Defaults to "user_id".
#     prompt_init (PromptTemplate): The prompt template for user's information. Defaults to None.
#     create_prompt (Callable): Function for creating prompt with user's information.
#     user_long_memory_prompt_init (PromptTemplate): The prompt template for the user long memory per item.
#         Must include one variable `history`. Defaults to PromptTemplate.from_template(
#         "I interected with contents to the following films (in historical order): {history}"
#     )
#     user_short_memory_prompt_init (PromptTemplate): The prompt template for the user short memory. Defaults to None.
#     short_memory_create_prompt (Callable): Function for creating short-term user's prompt. Defaults to None.
#     long_memory_n (int): The number of items to keep in the long term memory. Defaults to 20.
#     short_memory_n (int): The number of items to keep in the short term memory. Defaults to 5.

user_memory = UserMemory(
    data=df_users,
    interactions=df_interactions,
    item_memory=item_memory,
    use_prompt_init=True,
    prompt_init=user_prompt_init,
    create_prompt=create_user_overview_from_features,
    user_long_memory_prompt_init=user_long_memory_prompt_init,
    user_short_memory_prompt_init=user_short_memory_prompt_init,
    short_memory_create_prompt=create_prompt_short_memory,
    long_memory_n=10,
    short_memory_n=5,
)
user_memory

# Load model

Load the API token for the OpenAI model

In [None]:
from dotenv import load_dotenv

# Load the variables defined in ../openai.env so the key never has to appear in the notebook.
load_dotenv("../openai.env")
# `.get` returns None when API_KEY is not set, so a missing key is not reported at this point.
OPEN_AI_API_KEY = os.environ.get("API_KEY")

In [None]:
# Parameters:
#     model_name (str): The name of the model to load.
#     openai_api_key (str): The API key for accessing the OpenAI service.
#     mode (str): Type of the model for loading. Default is `generation`.
#     embeddings_model_args (Optional[Dict[str, Any]]): Additional arguments for the embeddings model. Default is None.

# Returns:
#     OpenAIEmbeddings: An instance of the OpenAIEmbeddings model loaded with the specified parameters.

# NOTE: despite its name, `llm` is requested with mode="embedding" and is passed on below
# as the retrieval `embeddings_model`, not used as a text-generation model.
llm = load_model_openai(
    model_name="text-embedding-ada-002",
    openai_api_key=OPEN_AI_API_KEY,
    mode="embedding",
)
llm

# Create Information Retrieval

Create an instance of the class for information search, with the help of which we will make recommendations

The information search component takes as input the memory about items, an embedding model and other auxiliary parameters.

In [None]:
# Parameters:
#     item_memory (ItemMemory): Memory of the items.
#     embeddings_model (OpenAIEmbeddings | HuggingFaceEmbedding | OpenSourceLLMEmbeddings): Embedding model.
#     col_item_id (str): Column name with item id. Default "item_id"
#     text_splitter_args (Dict): Keyword arguments for splitting text. Default {'chunk_size': 1000, 'chunk_overlap': 0}
#     log (bool): Logging running.

retrieval_recommender = RetrievalRecommenderSimple(
    item_memory=item_memory,
    embeddings_model=llm,
    col_item_id=Columns.Item,
    text_splitter_args={'chunk_size': 1000, 'chunk_overlap': 0},
    log=True,
)
retrieval_recommender

# Recommend

To get the recommendations, you need to call the `run` method of the information retrieval instance

In [None]:
# Set the number of recommendations we want to get
TOP_K = 100 

In [None]:
# Parameters:
#     user_memory (BaseMemory): The memory of the users.
#     instruct (str, optional): Additional instructions for the recommendation process. Defaults to None.
#     users_for_recommend (tp.Sequence[tp.Any], optional): Specific users for whom recommendations should be generated. Defaults to None.
#     use_user_memory_short (bool): Flag to indicate whether to use short-term memory in the recommendation process. Defaults to True.
#     use_user_memory_long (bool): Flag to indicate whether to use long-term memory in the recommendation process. Defaults to True.
#     search_type (str): The type of search to be performed. Defaults to "similarity".
#     search_kwargs (tp.Dict[str, tp.Any]): Keyword arguments for the search process. Defaults to {"k": 10}.
#     add_rank (bool): Flag to add ranking to the recommendations. Defaults to True.

# Returns:
#     pd.DataFrame: A DataFrame containing user recommendations with user IDs, item IDs, and ranks (optional).

reco_openai = retrieval_recommender.run(
    user_memory=user_memory,
    # Keep the demo small: only the first 10 distinct user ids from df_users.
    users_for_recommend=df_users[Columns.User].unique().tolist()[:10],
    # Only the long-term memory is switched on for this run.
    use_user_memory_short=False,
    use_user_memory_long=True,
    search_type="similarity",
    # Ask for TOP_K results per user.
    search_kwargs={"k": TOP_K},
    add_rank=True,
)

In [None]:
reco_openai.head()

# Create IR Tool for Agent

The agent needs to know what each tool does in order to select the right one. Therefore, a description must be created for every tool.

Creating description for IR Tool:

In [None]:
# Description text for the retrieval tool. This is a plain (non-f) string that is not
# formatted here, so the doubled braces stay in the text exactly as written.
# NOTE(review): the requested format includes an {{overview}} for every movie, but the
# item data prepared in this notebook only carries title, genres and year — confirm that
# an overview is actually available to the agent.
description_tool = (
    "Search content that is most similar to content from previous interactions with the recommender system.\n"
    "If you have any questions about searching related content, you should use this tool!\n"
    "Give your answer in the following format:\n"
    "```\n"
    "Candidate movies to recommend for the user:\n"
    "1. Movie ID: {{id}} Title: {{title}}. Release year: {{year}}. Genres: {{genres}}. Overview: {{overview}}\n"
    "...\n"
    "{{n}}. Movie ID: {{id}} Title: {{title}}. Release year: {{year}}. Genres: {{genres}}. Overview: {{overview}}\n"
    "```\n"
    "where {{n}} is number of recommended movies for the user, {{id}} is movie ID, {{year}} is release year of movie, "
    "{{genres}} are movie genres, {{overview}} is movie overview.\n"
    "You must strictly follow the given output format and not write anything outside the given format!"
)


Creating IR Tool:

In [None]:
# Parameters:
#     retrieval (Recommender): The retrieval to use for the retrieval
#     name (str): The name for the tool. This will be passed to the language model,
#         so should be unique and somewhat descriptive. Default  "retrieval_recommender".
#     description (str): The description for the tool. This will be passed to the language
#         model, so should be descriptive. Default (
#            "Search content that is most similar to content from previous interactions with the recommender system.\n"
#            "If you have any questions about searching related content, you should use this tool!\n"
#        )
#     args_schema (BaseModel): The schema of the tool's input arguments. Default RetrieveInput.
#     return_direct (bool): Whether to return the result directly or as a callback.
#     infer_schema (bool): Whether to infer the schema from the function's signature.

# Returns:
#     (Tool): Tool class to pass to an agent
                                          
retrieval_recommender_tool = create_retrieval_tool(
    retrieval=retrieval_recommender,
    description=description_tool,
)

# Recommend via IR Tool

The parameters are the same as for the run method in information retrieval, BUT wrapped in a dictionary

In [None]:
# Same keyword arguments as for `retrieval_recommender.run`, packed into one mapping.
# Unlike the direct call above, the short-term memory is switched on here as well.
tool_arguments = {
    "user_memory": user_memory,
    "users_for_recommend": df_users[Columns.User].unique().tolist()[:10],
    "use_user_memory_short": True,
    "use_user_memory_long": True,
    "search_type": "similarity",
    "search_kwargs": {"k": TOP_K},
    "add_rank": True,
}
reco_openai_from_tool = retrieval_recommender_tool.invoke(tool_arguments)

In [None]:
reco_openai_from_tool.head()