In [1]:
# Import libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

import json
import asyncio
import os
import httpx
import time

In [2]:
df = pd.read_csv('sanrio_products.csv')
df.head()

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration
0,30070,"Badtz-maru 7"" Plush (I Love Me Series)",https://www.sanrio.com/cdn/shop/files/zz-25043...,In honor of the 40th Anniversary Sanrio Charac...,Badtz-maru,Badtz-maru,Plush,"plush, 7'' plush, embroidered, black",44.0,I Love Me,Single,0.0,
1,486183,Badtz-maru Mascot Badge Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25044...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, badge, keychain, embroidered, black",5.99,Sanrio Character Award,Single,0.0,
2,175064,Badtz-maru Mini Mascot Keychain (Sanrio Charac...,https://www.sanrio.com/cdn/shop/files/original...,I’ll adore you with all my heart! Clip this ad...,Badtz-maru,Badtz-maru,Accessory,"accessory, plush keychain, keychain, black",14.99,Sanrio Character Award,Single,0.0,
3,619744,Badtz-maru Customizable Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25046...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, keychain, stickers, customizable, r...",10.99,Sanrio Character Award,Multiple,4.0,
4,CNS0132,Hello Kitty and Friends BLDR Building Set (Bad...,https://www.sanrio.com/cdn/shop/files/CNS01321...,Get ready to unleash your competitive spirit a...,"Badtz-maru, Hello Kitty, Melody, Keroppi, Choc...",,Toys&Games,"toys, building set, playset, set, bldr bricks",49.99,Badtz-maru’s Bowling Alley,Multiple,0.0,


In [3]:
# Replace NaN values with empty string for specified columns
df[['Collaboration', 'Character-centric', 'Series']] = df[['Collaboration', 'Character-centric', 'Series']].fillna("")

# Handle potential NaN in 'Discount' before converting to string in create_similarity_matrix
df['Discount'] = df['Discount'].fillna(0)


display(df.describe())
display(df.info())

Unnamed: 0,Price ($),Discount
count,195.0,195.0
mean,31.607846,1.638564
std,33.002663,4.60949
min,5.99,0.0
25%,14.0,0.0
50%,24.0,0.0
75%,36.0,0.0
max,229.0,32.99


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item #             195 non-null    object 
 1   Title              195 non-null    object 
 2   Image URL          195 non-null    object 
 3   Description        195 non-null    object 
 4   Characters         195 non-null    object 
 5   Character-centric  195 non-null    object 
 6   Type               195 non-null    object 
 7   Tags               195 non-null    object 
 8   Price ($)          195 non-null    float64
 9   Series             195 non-null    object 
 10  Number of Items    195 non-null    object 
 11  Discount           195 non-null    float64
 12  Collaboration      195 non-null    object 
dtypes: float64(2), object(11)
memory usage: 19.9+ KB


None

# Functions

In [None]:
# Get structured JSON of user preferences from dataframe
async def get_llm_profile(rated_items_df, api_key):
    # Format user ratings into a string for the prompt
    ratings_summary = []
    for _, item in rated_items_df.iterrows():
        series = item['Series']
        if series == "":
            ratings_summary.append(
                f"- Rating: {item['rating']} stars\n"
                f"  Tags: {item['Character-centric']} {item['Tags']}\n"
            )
        else:
            ratings_summary.append(
                f"- Rating: {item['rating']} stars"
                f" | Tags: {item['Character-centric']} {item['Series']} {item['Tags']}\n"
            )
    ratings_text = "\n".join(ratings_summary)
    print("Ratings text: ", ratings_text)

    # Define the system prompt and query
    system_prompt = """
    You are an expert recommender system analyst. Your job is to analyze a user's item
    ratings and return a structured JSON object of their preferences.

    The user provides ratings (1-5 stars) and some attributes about the product.
    - 4-5 stars = loves
    - 1-2 stars = hates
    
    You must identify the key features (characters, types, series, tags) and
    assign weights based on the user's ratings.

    Token weights should be:
    - 6 for high-priority tokens (e.g., a specific character in a 5-star rating)
    - 3 for medium-priority tokens (e.g., a product type in a 5-star rating)
    - 2 for high-priority negative tokens (e.g., a specific character in a 1-star rating)
    - 1 for all other tokens.
      
    The input format is:
    - Rating: [1-5] stars | Tags: [tag1, tag2, tag3, ...]

    RULES:
    1.  Analyze the 'Tags' field for preferences.
    2.  "Loves" (4-5 stars) go in the "loves" array.
    3.  "Hates" (1-2 stars) go in the "hates" array.
    4.  Ignore 3-star ratings.
    5.  The JSON MUST follow this schema:
        {
          "loves": [{"token": "string", "weight": int}, ...],
          "hates": [{"token": "string", "weight": int}, ...]
        }
    6.  Do not include 3-star ratings in the output.
    7.  If there are no loves or hates, return an empty array for that key.
    """
    
    user_query = f"""
    Here are the user's ratings:

    {ratings_text}

    Return the structured JSON object.
    """

    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={api_key}"

    payload = {
        "contents": [{"parts": [{"text": user_query}]}],
        "systemInstruction": {
            "parts": [{"text": system_prompt}]
        },
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": {
                "type": "OBJECT",
                "properties": {
                    "loves": {
                        "type": "ARRAY",
                        "items": {
                            "type": "OBJECT",
                            "properties": {
                                "token": {"type": "STRING"},
                                "weight": {"type": "NUMBER"}
                            }
                        }
                    },
                    "hates": {
                        "type": "ARRAY",
                        "items": {
                            "type": "OBJECT",
                            "properties": {
                                "token": {"type": "STRING"},
                                "weight": {"type": "NUMBER"}
                            }
                        }
                    }
                }
            }
        }
    }

    # Make the API Call
    print("Calling Gemini API...")
    
    max_retries = 3
    delay = 1.0 # Initial delay in seconds
    
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(max_retries):
            try:
                start_time = time.time()
                response = await client.post(
                    api_url,
                    headers={'Content-Type': 'application/json'},
                    json=payload
                )
                end_time = time.time()

                duration = end_time - start_time
                
                response.raise_for_status()
                
                result = response.json()
                json_text = result.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text', '{}')
                
                if not json_text:
                    raise ValueError("API returned empty response.")
                
                print(f"LLM JSON Response:\n{json_text}")
                return json.loads(json_text), duration

            except (httpx.RequestError, httpx.HTTPStatusError, json.JSONDecodeError, ValueError, IndexError, KeyError) as e:
                print(f"Error calling LLM (Attempt {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {delay} seconds...")
                    await asyncio.sleep(delay)
                    delay *= 2 # Exponential backoff
                else:
                    print("Max retries reached. Failing.")
                    # Fallback to an empty profile
                    return {"loves": [], "hates": []}

    # Fallback in case of unexpected exit
    return {"loves": [], "hates": []}

In [5]:
def create_tfidf_matrix(df):
    # Columns to use for content features
    phrase_cols = ['Type', 'Series', 'Collaboration', 'Character-centric']

    # Handle normal phrase columns
    for col in phrase_cols:
        df[col] = df[col].fillna('').astype(str).str.lower().str.replace(r'[\s,]+', '_', regex=True)
        # Remove any trailing underscores that might result from ", "
        df[col] = df[col].str.replace(r'_+', '_', regex=True).str.strip('_')
        # Handle 'none' string from fillna/conversion
        df[col] = df[col].replace('none', '')

    # Processing columns set up as lists
    def process_list_column(tag_string):
        if not isinstance(tag_string, str):
            return ""
        
        tags = tag_string.split(',')
        processed_tags = []
        for tag in tags:
            clean_tag = tag.strip()
            if clean_tag:
                # Replace internal spaces with underscores
                tokenized_tag = clean_tag.replace(' ', '_')
                processed_tags.append(tokenized_tag)
        
        # Join processed tags with a space
        return ' '.join(processed_tags)

    # Handle tags
    list_cols = ['Characters', 'Tags']
    for col in list_cols:
        if col in df.columns:
            # Ensure it's a string, lowercase
            df[col] = df[col].fillna('').astype(str).str.lower()
            df[col] = df[col].apply(process_list_column)
            # Clean up any double underscores
            df[col] = df[col].str.replace(r'_+', '_', regex=True)
            
    # Clean up any double underscores
    df[col] = df[col].str.replace(r'_+', '_', regex=True)

    # Build the weighted content soup string by repeating the data
    df['content_soup'] = (
        df['Character-centric'] + ' ' +
        df['Type'] + ' ' +
        df['Series'] + ' ' +
        df['Characters'] + ' ' +
        df['Tags'] + ' ' +
        df['Collaboration']
    )

    # Clean up soup: remove extra spaces
    df['content_soup'] = df['content_soup'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Create a mapping from 'Item #' to its index
    item_id_to_index = pd.Series(df.index, index=df['Item #']).to_dict()

    # Compute TF-IDF feature matrix from context soup
    tfidf = TfidfVectorizer(stop_words='english', token_pattern=r'\S+')
    tfidf_matrix = tfidf.fit_transform(df['content_soup'])

    return tfidf_matrix, tfidf, item_id_to_index

In [6]:
# Get recs based on profile from LLM
def get_llm_recommendations(profile_json, df, tfidf_matrix, tfidf_vectorizer, rated_item_ids, top_n=10):  
    try:
        vocab = tfidf_vectorizer.vocabulary_
        num_features = len(vocab)
        user_profile = np.zeros(num_features)

        # Failsafes for incorrect formatting
        for preference_list in [profile_json.get("loves", []), profile_json.get("hates", [])]:
            for item in preference_list:
                token = item.get("token", "")

                # Sometimes LLM gets the series wrong by adding "_series" to the end
                if token.endswith("_series"):
                    # Remove the "_series" suffix
                    corrected_token = token[:-7] 
                    # Check if the corrected token is in the vocab
                    if corrected_token in vocab:
                        item["token"] = corrected_token

                # If it gets badtz-maru wrong
                if token == "badtz_maru":
                    if "badtz-maru" in vocab:
                        item["token"] = "badtz-maru"
        
        # Add "loves" to the profile (Positive weights)
        for item in profile_json.get("loves", []):
            term = item.get("token")
            weight = item.get("weight", 1.0)
            
            if term in vocab:
                term_index = vocab[term]
                # Get pre-calculated IDF score for this term
                idf_score = tfidf_vectorizer.idf_[term_index]
                # Add the weighted score (TF * IDF)
                user_profile[term_index] += weight * idf_score
            else:
                print(f"LLM 'love' token not in vocab: {term}")

        # Subtract "hates" from the profile (Negative weights)
        for item in profile_json.get("hates", []):
            term = item.get("token")
            weight = item.get("weight", 1.0)
            
            if term in vocab:
                term_index = vocab[term]
                idf_score = tfidf_vectorizer.idf_[term_index]
                # Subtract the weighted score
                user_profile[term_index] -= weight * idf_score
            else:
                print(f"LLM 'hate' token not in vocab: {term}")
        
        # Calculate Similarity
        user_profile_sparse = csr_matrix(user_profile)
        cos_sim_scores = cosine_similarity(user_profile_sparse, tfidf_matrix).flatten()

        # Format and return results
        df_scores = pd.DataFrame({
            'Item #': df['Item #'],
            'Title': df['Title'],
            'Type': df['Type'],
            'Character-centric': df['Character-centric'],
            'Similarity': cos_sim_scores
        })

        # Filter out items that were already rated
        df_recommendations = df_scores[~df_scores['Item #'].isin(rated_item_ids)]
        
        # Filter out negative scores
        df_recommendations = df_recommendations[df_recommendations['Similarity'] > 0]
        df_recommendations = df_recommendations.sort_values(by='Similarity', ascending=False)

        return df_recommendations.head(top_n)

    except Exception as e:
        print(f"Error from get_llm_recommendations: {e}")
        return pd.DataFrame()

# Recommender System

In [7]:
# Number of products to grab
num_products = 5

# Get API key
local_api_key = os.environ.get("GEMINI_API_KEY")

# Create copy
cleaned_df = df.copy()

def generate_random_ratings(df, num_products):
  my_ratings = df.sample(n=num_products)
  my_ratings['rating'] = [random.randint(1,5) for _ in range(num_products)]

  return my_ratings

# Create tfidf_matrix from the dataframe
tfidf_matrix, tfidf, item_id_ind = create_tfidf_matrix(cleaned_df)

simulated_ratings_df = generate_random_ratings(cleaned_df, num_products)
rated_item_ids = simulated_ratings_df['Item #'].to_list()

simulated_ratings_df

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration,content_soup,rating
160,504602,"Chococat 12"" Plush (Winter Puffer Series)",https://www.sanrio.com/cdn/shop/files/504602-Z...,Bundle up and let’s stay cozy with your favori...,chococat,chococat,plush,plush 12''_plush winter quilted embroidered bl...,45.0,winter_puffer,Single,25.01,,chococat plush winter_puffer chococat plush 12...,4
118,642444,Pompompurin Gingham Pencil Pouch,https://www.sanrio.com/cdn/shop/files/imgi_12_...,Add a pop of happiness to your day with this g...,pompompurin,pompompurin,stationery,stationery pouch pencil_pouch pvc cotton gingh...,13.99,,Single,0.0,,pompompurin stationery pompompurin stationery ...,2
16,299286,Badtz-maru 2-pc Zipper Pouch Set (My Time Series),https://www.sanrio.com/cdn/shop/files/45506242...,"Take a peek into the silly life of Badtz-maru,...",badtz-maru,badtz-maru,bags,bag zipper_pouch pouch set poly mesh_fabric black,28.0,my_time,Multiple,0.0,,badtz-maru bags my_time badtz-maru bag zipper_...,1
54,883247,Hello Kitty Lovestruck Mascot Clip (Many Moods...,https://www.sanrio.com/cdn/shop/files/883247-Z...,Hello Kitty is a sweet friend to all and as sh...,hello_kitty,hello_kitty,accessory,accessory clip plush_clip lovestruck embroider...,18.0,many_moods,Single,0.0,,hello_kitty accessory many_moods hello_kitty a...,3
61,908258,Hello Kitty x Stoney Clover Lane Clear Front S...,https://www.sanrio.com/cdn/shop/files/SCL-HKF2...,Bring on the playful spirit with Hello Kitty a...,hello_kitty,hello_kitty,bags,bag pouch clear_pouch small_pouch hello_kitty_...,98.0,,Single,0.0,stoney_clover_lane,hello_kitty bags hello_kitty bag pouch clear_p...,4


In [8]:
# Make LLM profile
async def run_async_test():
    llm_profile, duration = await get_llm_profile(simulated_ratings_df, local_api_key)
    
    if not llm_profile.get("loves") and not llm_profile.get("hates"):
        print("Profile is empty; cannot generate recommendations.")
        return

    # Get recs
    print("\nGetting recommendations...")
    recommendations = get_llm_recommendations(
        llm_profile,
        cleaned_df,
        tfidf_matrix,
        tfidf,
        rated_item_ids
    )

    # Print results
    print(f'Processing Time: {duration:.2f}s')
    print("\n--- Top Recommendations ---")
    if recommendations.empty:
        print("No recommendations found.")
    else:
        print(recommendations)

In [10]:
await run_async_test()

Ratings text:  - Rating: 4 stars | Tags: chococat winter_puffer plush 12''_plush winter quilted embroidered black blue

- Rating: 2 stars
  Tags: pompompurin stationery pouch pencil_pouch pvc cotton gingham yellow blue

- Rating: 1 stars | Tags: badtz-maru my_time bag zipper_pouch pouch set poly mesh_fabric black

- Rating: 3 stars | Tags: hello_kitty many_moods accessory clip plush_clip lovestruck embroidered white red black

- Rating: 4 stars
  Tags: hello_kitty bag pouch clear_pouch small_pouch hello_kitty_bows nylon vinyl red clear

Calling Gemini API...
LLM JSON Response:
{
  "loves": [
    {
      "token": "chococat",
      "weight": 3
    },
    {
      "token": "winter_puffer",
      "weight": 3
    },
    {
      "token": "plush",
      "weight": 3
    },
    {
      "token": "12''_plush",
      "weight": 3
    },
    {
      "token": "winter",
      "weight": 3
    },
    {
      "token": "quilted",
      "weight": 3
    },
    {
      "token": "embroidered",
      "weight": 