In [1]:
# Import libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

import json
import asyncio
import os
import httpx

In [2]:
# Load the Sanrio product catalogue from CSV and preview the first rows
df = pd.read_csv(filepath_or_buffer='sanrio_products.csv')
df.head(n=5)

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration
0,30070,"Badtz-maru 7"" Plush (I Love Me Series)",https://www.sanrio.com/cdn/shop/files/zz-25043...,In honor of the 40th Anniversary Sanrio Charac...,Badtz-maru,Badtz-maru,Plush,"plush, 7'' plush, embroidered, black",44.0,I Love Me,Single,0.0,
1,486183,Badtz-maru Mascot Badge Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25044...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, badge, keychain, embroidered, black",5.99,Sanrio Character Award,Single,0.0,
2,175064,Badtz-maru Mini Mascot Keychain (Sanrio Charac...,https://www.sanrio.com/cdn/shop/files/original...,I’ll adore you with all my heart! Clip this ad...,Badtz-maru,Badtz-maru,Accessory,"accessory, plush keychain, keychain, black",14.99,Sanrio Character Award,Single,0.0,
3,619744,Badtz-maru Customizable Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25046...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, keychain, stickers, customizable, r...",10.99,Sanrio Character Award,Multiple,4.0,
4,CNS0132,Hello Kitty and Friends BLDR Building Set (Bad...,https://www.sanrio.com/cdn/shop/files/CNS01321...,Get ready to unleash your competitive spirit a...,"Badtz-maru, Hello Kitty, Melody, Keroppi, Choc...",,Toys&Games,"toys, building set, playset, set, bldr bricks",49.99,Badtz-maru’s Bowling Alley,Multiple,0.0,


In [3]:
# Replace NaN values with empty strings in the optional text columns
df[['Collaboration', 'Character-centric', 'Series']] = df[['Collaboration', 'Character-centric', 'Series']].fillna("")

# Treat a missing discount as 0
df['Discount'] = df['Discount'].fillna(0)


display(df.describe())
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
df.info()

Unnamed: 0,Price ($),Discount
count,147.0,147.0
mean,30.19483,1.070952
std,33.709168,3.8599
min,5.99,0.0
25%,12.99,0.0
50%,21.0,0.0
75%,32.495,0.0
max,229.0,32.99


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item #             147 non-null    object 
 1   Title              147 non-null    object 
 2   Image URL          147 non-null    object 
 3   Description        147 non-null    object 
 4   Characters         147 non-null    object 
 5   Character-centric  147 non-null    object 
 6   Type               147 non-null    object 
 7   Tags               147 non-null    object 
 8   Price ($)          147 non-null    float64
 9   Series             147 non-null    object 
 10  Number of Items    147 non-null    object 
 11  Discount           147 non-null    float64
 12  Collaboration      147 non-null    object 
dtypes: float64(2), object(11)
memory usage: 15.1+ KB


None

# Functions

In [None]:
# Get structured JSON of user preferences from dataframe
async def get_llm_profile(rated_items_df, api_key, model="gemini-2.5-flash-preview-05-20"):
    """Ask the Gemini API to turn a user's ratings into a weighted token profile.

    Args:
        rated_items_df: DataFrame with one row per rated product. The columns
            'rating', 'Title', 'Type', 'Character-centric' and 'Tags' are
            required; 'Characters' and 'Series' are included when present.
        api_key: Gemini API key. It is sent in the 'x-goog-api-key' header
            rather than the URL so it cannot end up in printed error messages.
        model: Gemini model identifier used in the request URL.

    Returns:
        dict: the parsed JSON profile, normally of the form
        {"loves": [{"token": str, "weight": float}, ...], "hates": [...]}.
        An empty profile ({"loves": [], "hates": []}) is returned when no API
        key is supplied or when every attempt fails.
    """
    # Without a key every request would be rejected; skip the network entirely.
    if not api_key:
        print("No API key provided; returning an empty profile.")
        return {"loves": [], "hates": []}

    # Format user ratings into a string for the prompt
    ratings_summary = []
    for _, item in rated_items_df.iterrows():
        ratings_summary.append(
            f"- Rating: {item['rating']} stars\n"
            f"  Title: {item['Title']}\n"
            f"  Type: {item['Type']}\n"
            f"  Main Character: {item['Character-centric']}\n"
            # The system prompt asks the model to reason about these two
            # columns as well, so they have to be part of the ratings text.
            f"  Characters: {item.get('Characters', '')}\n"
            f"  Series: {item.get('Series', '')}\n"
            f"  Tags: {item['Tags']}\n"
        )
    ratings_text = "\n".join(ratings_summary)

    # Define the system prompt and query
    system_prompt = """
    You are an expert recommender system analyst. Your job is to analyze a user's item
    ratings and return a structured JSON object of their preferences.

    The user provides ratings (1-5 stars) for Sanrio products.
    - 5 stars = loves
    - 1 star = hates
    
    Your first step is to analyze the user's ratings to INFER their priorities.
    Do they care more about the specific character ('Character-centric'), 
    the type of item ('Type'), or the series ('Series')?
    Look for patterns: for example, if they consistently rate 'plush' items highly
    regardless of the character, 'type' is their main priority.

    Your second step is to create a dynamic weighting system based on this inference.
    Assign a high weight (e..g, 6.0) to tokens from their top priority category,
    a medium weight (e.g., 3.0) to the next, and a baseline weight (e.g., 1.0) to all others.

    Your third step is to apply these dynamic weights to the tokens you
    extract from the user's 'loves' and 'hates' ratings.

    - Use 'main_focus_character' for the 'Character-centric' column.
    - Use 'character' for the 'Characters' column.
    - Use 'type' for the 'Type' column.
    - Use 'series' for the 'Series' column.
    
    Tokenization Rules:
    - All tokens must be lowercase.
    - Replace spaces with underscores (e.g., 'Hello Kitty' -> 'hello_kitty', '7'' plush' -> '7''_plush').
    - For the 'Series' column, do NOT append the word '_series' to the token. 
      (e.g., 'I Love Me Series' should be tokenized as 'i_love_me').
      
    Return ONLY a valid JSON object in the following format:
    {
      "loves": [
        {"token": "token_name", "weight": 6.0},
        {"token": "other_token", "weight": 1.0}
      ],
      "hates": [
        {"token": "hated_token", "weight": 6.0}
      ]
    }
    """
    
    user_query = f"""
    Here are the user's ratings:
    {ratings_text}

    Analyze these ratings and provide the structured JSON output.
    Remember to tokenize keywords (e.g., 'Hello Kitty' -> 'hello_kitty', '7'' plush' -> '7''_plush').
    """

    # The key is deliberately NOT part of the URL: httpx includes the request
    # URL in its exception messages, which are printed below.
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

    # Both preference lists share the same element schema.
    preference_list_schema = {
        "type": "ARRAY",
        "items": {
            "type": "OBJECT",
            "properties": {
                "token": {"type": "STRING"},
                "weight": {"type": "NUMBER"}
            }
        }
    }

    payload = {
        "contents": [{"parts": [{"text": user_query}]}],
        "systemInstruction": {
            "parts": [{"text": system_prompt}]
        },
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": {
                "type": "OBJECT",
                "properties": {
                    "loves": preference_list_schema,
                    "hates": preference_list_schema
                }
            }
        }
    }

    # Make the API Call
    print("Calling Gemini API...")
    
    max_retries = 3
    delay = 1.0 # Initial delay in seconds
    
    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(max_retries):
            try:
                response = await client.post(
                    api_url,
                    headers={
                        'Content-Type': 'application/json',
                        'x-goog-api-key': api_key,
                    },
                    json=payload
                )
                
                response.raise_for_status()
                
                result = response.json()
                json_text = result.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text', '{}')
                
                if not json_text:
                    raise ValueError("API returned empty response.")
                
                print(f"LLM JSON Response:\n{json_text}")
                profile = json.loads(json_text)
                # Callers use dict methods on the profile; treat any other
                # JSON value as a failed attempt.
                if not isinstance(profile, dict):
                    raise ValueError("API returned JSON that is not an object.")
                return profile

            except (httpx.RequestError, httpx.HTTPStatusError, json.JSONDecodeError,
                    ValueError, IndexError, KeyError, AttributeError, TypeError) as e:
                print(f"Error calling LLM (Attempt {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {delay} seconds...")
                    await asyncio.sleep(delay)
                    delay *= 2 # Exponential backoff
                else:
                    print("Max retries reached. Failing.")
                    # Fallback to an empty profile
                    return {"loves": [], "hates": []}

    # Fallback in case of unexpected exit
    return {"loves": [], "hates": []}

In [None]:
def create_tfidf_matrix(df):
    """Tokenise the catalogue's categorical columns and fit a TF-IDF model on them.

    NOTE: `df` is modified in place. 'Type', 'Series', 'Collaboration',
    'Character-centric', 'Characters' and 'Tags' are overwritten with their
    tokenised form and a 'content_soup' column is added. The list columns are
    re-joined with spaces, so running this twice on the same frame would fuse
    their tokens together; pass a fresh copy of the catalogue each time.

    Args:
        df: product DataFrame containing 'Item #' and the columns listed above.

    Returns:
        tuple: (tfidf_matrix, tfidf, item_id_to_index) where `tfidf_matrix` is
        the fitted document-term matrix (one row per product), `tfidf` is the
        fitted TfidfVectorizer and `item_id_to_index` maps each 'Item #' to
        its index label in `df`.
    """
    # Columns to use for content features
    phrase_cols = ['Type', 'Series', 'Collaboration', 'Character-centric']

    # Handle normal phrase columns: each value becomes one underscore-joined token
    for col in phrase_cols:
        df[col] = df[col].fillna('').astype(str).str.lower().str.replace(r'[\s,]+', '_', regex=True)
        # Remove any trailing underscores that might result from ", "
        df[col] = df[col].str.replace(r'_+', '_', regex=True).str.strip('_')
        # Handle 'none' string from fillna/conversion
        df[col] = df[col].replace('none', '')

    # Processing columns set up as lists
    def process_list_column(tag_string):
        """Turn 'a b, c' into 'a_b c': one underscore-joined token per list entry."""
        if not isinstance(tag_string, str):
            return ""
        
        tags = tag_string.split(',')
        processed_tags = []
        for tag in tags:
            clean_tag = tag.strip()
            if clean_tag:
                # Replace internal spaces with underscores
                tokenized_tag = clean_tag.replace(' ', '_')
                processed_tags.append(tokenized_tag)
        
        # Join processed tags with a space
        return ' '.join(processed_tags)

    # Handle comma-separated list columns
    list_cols = ['Characters', 'Tags']
    for col in list_cols:
        if col in df.columns:
            # Ensure it's a string, lowercase
            df[col] = df[col].fillna('').astype(str).str.lower()
            df[col] = df[col].apply(process_list_column)
            # Clean up any double underscores
            df[col] = df[col].str.replace(r'_+', '_', regex=True)

    # Build the content soup: every feature column contributes its tokens once
    df['content_soup'] = (
        df['Character-centric'] + ' ' +
        df['Type'] + ' ' +
        df['Series'] + ' ' +
        df['Characters'] + ' ' +
        df['Tags'] + ' ' +
        df['Collaboration']
    )

    # Clean up soup: remove extra spaces
    df['content_soup'] = df['content_soup'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Create a mapping from 'Item #' to its index
    item_id_to_index = pd.Series(df.index, index=df['Item #']).to_dict()

    # Compute TF-IDF feature matrix from the content soup.
    # The soup is already tokenised (tokens are separated by whitespace), so
    # split on whitespace only. The default token pattern would break tokens
    # at '-', "'" and '&', e.g. 'badtz-maru' -> 'badtz' + 'maru', which makes
    # such tokens impossible to look up in the vocabulary as a whole.
    tfidf = TfidfVectorizer(stop_words='english', token_pattern=r'\S+')
    tfidf_matrix = tfidf.fit_transform(df['content_soup'])

    return tfidf_matrix, tfidf, item_id_to_index

In [6]:
# Get recs based on profile from LLM
def get_llm_recommendations(profile_json, df, tfidf_matrix, tfidf_vectorizer, rated_item_ids, top_n=10):  
    """Rank catalogue items against an LLM-generated preference profile.

    A user vector is built in the TF-IDF vocabulary space: each "loves" token
    adds weight * idf at its vocabulary index and each "hates" token subtracts
    it. Items are then ranked by cosine similarity to that vector.

    Args:
        profile_json: dict with optional "loves" / "hates" lists of
            {"token": str, "weight": number} entries. It is not modified.
        df: catalogue DataFrame whose rows line up with `tfidf_matrix`; must
            contain 'Item #', 'Title', 'Type' and 'Character-centric'.
        tfidf_matrix: document-term matrix of the catalogue.
        tfidf_vectorizer: fitted vectorizer providing `vocabulary_` and `idf_`.
        rated_item_ids: 'Item #' values to exclude from the result.
        top_n: maximum number of rows to return.

    Returns:
        DataFrame with columns 'Item #', 'Title', 'Type', 'Character-centric'
        and 'Similarity', sorted by descending similarity and restricted to
        unrated items with a positive score. An empty DataFrame is returned
        if anything goes wrong (the error is printed).
    """
    try:
        vocab = tfidf_vectorizer.vocabulary_
        idf = tfidf_vectorizer.idf_
        user_profile = np.zeros(len(vocab))

        # The LLM is told to join words with underscores, so it tends to write
        # 'badtz_maru' for a vocabulary term spelled 'badtz-maru'. Map the
        # underscore-only spelling of every hyphenated term back to the term.
        hyphen_variants = {}
        for vocab_term in vocab:
            if '-' in vocab_term:
                hyphen_variants.setdefault(vocab_term.replace('-', '_'), vocab_term)

        def resolve_token(token):
            """Return the vocabulary term matching an LLM token, or None."""
            if not isinstance(token, str):
                return None
            candidate = token.strip().lower().replace(' ', '_')
            candidates = [candidate]
            # Sometimes the LLM appends "_series" to a series name; prefer the
            # version without the suffix when it exists in the vocabulary.
            if candidate.endswith('_series'):
                candidates.insert(0, candidate[:-len('_series')])
            for option in candidates:
                if option in vocab:
                    return option
                folded = option.replace('-', '_')
                if folded in hyphen_variants:
                    return hyphen_variants[folded]
            return None

        # "loves" push the profile towards a term, "hates" push it away.
        for key, sign, label in (("loves", 1.0, "love"), ("hates", -1.0, "hate")):
            for item in (profile_json.get(key) or []):
                if not isinstance(item, dict):
                    continue
                token = item.get("token")
                weight = item.get("weight", 1.0)
                # A missing or non-numeric weight falls back to the baseline
                if isinstance(weight, bool) or not isinstance(weight, (int, float)):
                    weight = 1.0

                term = resolve_token(token)
                if term is None:
                    print(f"LLM '{label}' token not in vocab: {token}")
                    continue

                term_index = vocab[term]
                # Weighted score using the pre-calculated IDF for this term
                user_profile[term_index] += sign * weight * idf[term_index]
        
        # Calculate Similarity
        user_profile_sparse = csr_matrix(user_profile)
        cos_sim_scores = cosine_similarity(user_profile_sparse, tfidf_matrix).flatten()

        # Format and return results
        df_scores = pd.DataFrame({
            'Item #': df['Item #'],
            'Title': df['Title'],
            'Type': df['Type'],
            'Character-centric': df['Character-centric'],
            'Similarity': cos_sim_scores
        })

        # Filter out items that were already rated
        df_recommendations = df_scores[~df_scores['Item #'].isin(rated_item_ids)]
        
        # Keep only items with a positive similarity score
        df_recommendations = df_recommendations[df_recommendations['Similarity'] > 0]
        df_recommendations = df_recommendations.sort_values(by='Similarity', ascending=False)

        return df_recommendations.head(top_n)

    except Exception as e:
        # Deliberate best-effort: report the problem and return no recommendations
        print(f"Error from get_llm_recommendations: {e}")
        return pd.DataFrame()

# Recommender System

In [None]:
# How many catalogue items the simulated user rates
num_products = 5

# Read the Gemini API key from the environment (None when the variable is unset)
local_api_key = os.getenv("GEMINI_API_KEY")

# Work on an independent copy of the loaded catalogue
cleaned_df = df.copy(deep=True)

def generate_random_ratings(df, num_products, seed=None):
    """Simulate a user by sampling products and giving each a random 1-5 rating.

    Args:
        df: catalogue DataFrame to sample from (left unmodified).
        num_products: number of rows to sample; must not exceed len(df).
        seed: optional integer seed. When given, both the sampled rows and the
            ratings are reproducible. When None, the global NumPy and `random`
            states are used.

    Returns:
        A new DataFrame holding the sampled rows plus an integer 'rating' column.
    """
    sampled = df.sample(n=num_products, random_state=seed)
    # Use a private generator only when a seed is requested, so unseeded calls
    # still draw from the global `random` state.
    rng = random if seed is None else random.Random(seed)
    ratings = [rng.randint(1, 5) for _ in range(num_products)]
    return sampled.assign(rating=ratings)

# Draw the simulated ratings first, then remember which items were rated
simulated_ratings_df = generate_random_ratings(df=cleaned_df, num_products=num_products)
rated_item_ids = simulated_ratings_df['Item #'].tolist()

# Fit the TF-IDF features on the catalogue copy
tfidf_matrix, tfidf, item_id_ind = create_tfidf_matrix(df=cleaned_df)

simulated_ratings_df

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration,rating
88,892530,Hello Kitty and Friends Ceramic Mushroom Teapot,https://www.sanrio.com/cdn/shop/files/892238-Z...,Carefully crafted and decorated with the iconi...,"Hello Kitty, Cinnamonroll, Melody, Keroppi",Hello Kitty,Home Goods,"home goods, ceramic teapot, teapot, mushroom t...",76.0,,Single,0.0,,4
86,132144,Hello Kitty Plush Mascot Keychain (Summer Fest...,https://www.sanrio.com/cdn/shop/files/imgi_107...,This fun design series captures the essence of...,Hello Kitty,Hello Kitty,Accessory,"accessory, keychaIn, plush keychain, embroider...",26.0,Summer Festival,Single,0.0,,1
136,613029,Pompompurin 2-Piece Pouch Set (Friendship Time...,https://www.sanrio.com/cdn/shop/files/45506246...,Delight in this adorable friendship series fea...,Pompompurin,Pompompurin,Bags,"bag, pouch, zipper pouch, set, muffin, besties...",24.0,Friendship Time,Single,0.0,,1
112,124508,Pitatto Friends Plush Pompompurin Hoodie (Medium),https://www.sanrio.com/cdn/shop/files/zz-25061...,Enter the world of play and make-believe with ...,Pompompurin,Pompompurin,Plush,"plush, medium display plush, display accessory...",21.0,,Single,0.0,,4
91,892734,Hello Kitty Let's Ride Ceramic Mug Gift Set (S...,https://www.sanrio.com/cdn/shop/files/892734-Z...,Ride into the sunset with this western-inspire...,Hello Kitty,Hello Kitty,Home Goods,"home goods, mug, ceramic mug, gift set, set, c...",20.0,Western,Multiple,0.0,,1


In [8]:
# Make LLM profile
async def run_async_test():
    """Build an LLM profile for the simulated ratings and print the resulting recommendations.

    Reads the notebook-level `simulated_ratings_df`, `local_api_key`,
    `cleaned_df`, `tfidf_matrix`, `tfidf` and `rated_item_ids`. Returns None;
    all results are printed.
    """
    profile = await get_llm_profile(simulated_ratings_df, local_api_key)

    # Nothing to rank against when the profile has neither loves nor hates
    has_preferences = bool(profile.get("loves") or profile.get("hates"))
    if not has_preferences:
        print("Profile is empty; cannot generate recommendations.")
        return

    # Score the catalogue against the profile
    print("\nGetting recommendations...")
    recs = get_llm_recommendations(
        profile_json=profile,
        df=cleaned_df,
        tfidf_matrix=tfidf_matrix,
        tfidf_vectorizer=tfidf,
        rated_item_ids=rated_item_ids,
    )

    # Report the outcome
    print("\n--- Top Recommendations ---")
    print("No recommendations found." if recs.empty else recs)

In [9]:
# Run the end-to-end demo. NOTE(review): top-level `await` is not valid in a
# plain Python script; this presumably relies on the notebook kernel running
# cells inside an event loop — confirm before moving this code to a module.
await run_async_test()

Calling Gemini API...


LLM JSON Response:
{"loves": [{"token": "hello_kitty", "weight": 6.0}, {"token": "home_goods", "weight": 3.0}, {"token": "ceramic_teapot", "weight": 1.0}, {"token": "teapot", "weight": 1.0}, {"token": "mushroom_teapot", "weight": 1.0}, {"token": "mushrooms", "weight": 1.0}, {"token": "ceramic", "weight": 1.0}, {"token": "red", "weight": 1.0}, {"token": "brown", "weight": 1.0}, {"token": "white", "weight": 1.0}, {"token": "pompompurin", "weight": 6.0}, {"token": "plush", "weight": 3.0}, {"token": "medium_display_plush", "weight": 1.0}, {"token": "display_accessory", "weight": 1.0}, {"token": "display_hoodie", "weight": 1.0}, {"token": "yellow", "weight": 1.0}
  ]}

Getting recommendations...

--- Top Recommendations ---
     Item #                                              Title        Type  \
75   448681             Hello Kitty Gingham Imabari Bath Towel  home_goods   
89   892432             Hello Kitty Cowgirl Ceramic Wind Chime  home_goods   
87   892238                   Hello K