In [1]:
# Import libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

import json
import asyncio
import os
import httpx

In [2]:
# Load the Sanrio product catalogue and preview its first five rows.
df = pd.read_csv('sanrio_products.csv')
df.head(5)

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration
0,30070,"Badtz-maru 7"" Plush (I Love Me Series)",https://www.sanrio.com/cdn/shop/files/zz-25043...,In honor of the 40th Anniversary Sanrio Charac...,Badtz-maru,Badtz-maru,Plush,"plush, 7'' plush, embroidered, black",44.0,I Love Me,Single,0.0,
1,486183,Badtz-maru Mascot Badge Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25044...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, badge, keychain, embroidered, black",5.99,Sanrio Character Award,Single,0.0,
2,175064,Badtz-maru Mini Mascot Keychain (Sanrio Charac...,https://www.sanrio.com/cdn/shop/files/original...,I’ll adore you with all my heart! Clip this ad...,Badtz-maru,Badtz-maru,Accessory,"accessory, plush keychain, keychain, black",14.99,Sanrio Character Award,Single,0.0,
3,619744,Badtz-maru Customizable Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25046...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, keychain, stickers, customizable, r...",10.99,Sanrio Character Award,Multiple,4.0,
4,CNS0132,Hello Kitty and Friends BLDR Building Set (Bad...,https://www.sanrio.com/cdn/shop/files/CNS01321...,Get ready to unleash your competitive spirit a...,"Badtz-maru, Hello Kitty, Melody, Keroppi, Choc...",,Toys&Games,"toys, building set, playset, set, bldr bricks",49.99,Badtz-maru’s Bowling Alley,Multiple,0.0,


In [3]:
# Fill missing values in a single pass:
#   - optional text columns become empty strings so later string operations
#     (lower-casing, tokenising) never see NaN;
#   - a missing 'Discount' is treated as "no discount" (0).
df = df.fillna({
    'Collaboration': "",
    'Character-centric': "",
    'Series': "",
    'Discount': 0,
})

display(df.describe())
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" below the report.
df.info()

Unnamed: 0,Price ($),Discount
count,147.0,147.0
mean,30.19483,1.070952
std,33.709168,3.8599
min,5.99,0.0
25%,12.99,0.0
50%,21.0,0.0
75%,32.495,0.0
max,229.0,32.99


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item #             147 non-null    object 
 1   Title              147 non-null    object 
 2   Image URL          147 non-null    object 
 3   Description        147 non-null    object 
 4   Characters         147 non-null    object 
 5   Character-centric  147 non-null    object 
 6   Type               147 non-null    object 
 7   Tags               147 non-null    object 
 8   Price ($)          147 non-null    float64
 9   Series             147 non-null    object 
 10  Number of Items    147 non-null    object 
 11  Discount           147 non-null    float64
 12  Collaboration      147 non-null    object 
dtypes: float64(2), object(11)
memory usage: 15.1+ KB


None

# Functions

In [4]:
# Get structured JSON of user preferences from dataframe
async def get_llm_profile(rated_items_df, api_key, model="gemini-2.5-flash-preview-05-20", max_retries=3):
    """Ask the Gemini API to turn a user's item ratings into a weighted token profile.

    Parameters
    ----------
    rated_items_df : pandas.DataFrame
        One row per rated item. The columns 'rating', 'Title', 'Type',
        'Character-centric' and 'Tags' are required; 'Characters' and 'Series'
        are included in the prompt when present.
    api_key : str or None
        Gemini API key. When it is missing or empty no request is made and the
        empty fallback profile is returned.
    model : str, optional
        Model name placed in the request URL.
    max_retries : int, optional
        Maximum number of attempts; the wait between attempts doubles each time,
        starting at one second.

    Returns
    -------
    dict
        The JSON object produced by the model (expected keys: "loves" and
        "hates", each a list of {"token", "weight"} objects), or
        {"loves": [], "hates": []} when every attempt fails.
    """
    # Without a key every attempt is guaranteed to fail, so skip the network entirely.
    if not api_key:
        print("No API key provided; returning an empty profile.")
        return {"loves": [], "hates": []}

    # Format user ratings into a string for the prompt.
    # 'Characters' and 'Series' are sent as well because the weighting rules in
    # the system prompt refer to both columns.
    ratings_summary = []
    for _, item in rated_items_df.iterrows():
        ratings_summary.append(
            f"- Rating: {item['rating']} stars\n"
            f"  Title: {item['Title']}\n"
            f"  Type: {item['Type']}\n"
            f"  Main Character: {item['Character-centric']}\n"
            f"  Characters: {item.get('Characters', '')}\n"
            f"  Series: {item.get('Series', '')}\n"
            f"  Tags: {item['Tags']}\n"
        )
    ratings_text = "\n".join(ratings_summary)

    # Define the system prompt and query
    system_prompt = """
    You are an expert recommender system analyst. Your job is to analyze a user's item
    ratings and return a structured JSON object of their preferences.

    The user provides ratings (1-5 stars) for Sanrio products.
    - 5 stars = loves
    - 1 star = hates
    
    You must identify the key features (characters, types, series, tags) and
    assign weights based on the user's ratings.
    
    Tokenization Rules:
    1.  All tokens must be lowercase.
    2.  Replace all spaces and commas with a single underscore (e.g., 'Hello Kitty' -> 'hello_kitty'). The exception is badtz-maru, dash (-) not underscore (_).
    3.  For the 'Series' column: Tokenize the series name *exactly* as it appears (lowercased, with underscores). 
        **Do NOT append the word '_series' to the token.** (e.g., 'I Love Me' -> 'i_love_me', NOT 'i_love_me_series').
    4.  For 'Tags': Tokenize each individual tag. (e.g., "7'' plush" -> "7''_plush").

    Weighting Rules:
    - 'main_focus_character' (from 'Character-centric' column): weight 6.0
    - 'type' (from 'Type' column): weight 3.0
    - 'series' (from 'Series' column): weight 2.0
    - 'character' (from 'Characters' column): weight 1.0
    - all other 'tags': weight 1.0
      
    Return ONLY a valid JSON object in the following format:
    {
      "loves": [
        {"token": "token_name", "weight": 6.0},
        {"token": "other_token", "weight": 1.0}
      ],
      "hates": [
        {"token": "hated_token", "weight": 6.0}
      ]
    }
    """
    
    user_query = f"""
    Here are the user's ratings:
    {ratings_text}

    Analyze these ratings and provide the structured JSON output.
    Remember to tokenize keywords (e.g., 'Hello Kitty' -> 'hello_kitty', '7'' plush' -> '7''_plush').
    """

    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

    # The key travels in a header rather than the URL: httpx error messages
    # include the request URL and are printed below, so a key in the query
    # string could end up in the saved notebook output.
    request_headers = {
        'Content-Type': 'application/json',
        'x-goog-api-key': api_key,
    }

    # Both lists share the same item schema.
    weighted_token_list = {
        "type": "ARRAY",
        "items": {
            "type": "OBJECT",
            "properties": {
                "token": {"type": "STRING"},
                "weight": {"type": "NUMBER"}
            }
        }
    }

    payload = {
        "contents": [{"parts": [{"text": user_query}]}],
        "systemInstruction": {
            "parts": [{"text": system_prompt}]
        },
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": {
                "type": "OBJECT",
                "properties": {
                    "loves": weighted_token_list,
                    "hates": weighted_token_list
                }
            }
        }
    }

    # Make the API Call
    print("Calling Gemini API...")

    delay = 1.0  # Initial delay in seconds

    async with httpx.AsyncClient(timeout=60.0) as client:
        for attempt in range(max_retries):
            try:
                response = await client.post(api_url, headers=request_headers, json=payload)
                response.raise_for_status()

                result = response.json()
                candidates = result.get('candidates') or [{}]
                parts = candidates[0].get('content', {}).get('parts') or [{}]
                # Default to '' (not '{}') so a response without text reaches
                # the empty-response check and is retried.
                json_text = parts[0].get('text', '')

                if not json_text:
                    raise ValueError("API returned empty response.")

                print(f"LLM JSON Response:\n{json_text}")
                profile = json.loads(json_text)
                # Callers use profile.get(...), so anything but an object is a failure.
                if not isinstance(profile, dict):
                    raise ValueError("API returned JSON that is not an object.")
                return profile

            except (httpx.RequestError, httpx.HTTPStatusError, json.JSONDecodeError,
                    ValueError, IndexError, KeyError, TypeError, AttributeError) as e:
                # Some httpx errors (e.g. timeouts) have an empty message, so the
                # exception type is printed as well.
                print(f"Error calling LLM (Attempt {attempt + 1}/{max_retries}): {type(e).__name__}: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {delay} seconds...")
                    await asyncio.sleep(delay)
                    delay *= 2  # Exponential backoff

    # Every attempt failed (or max_retries was 0): fall back to an empty profile.
    print("Max retries reached. Failing.")
    return {"loves": [], "hates": []}

In [5]:
def create_tfidf_matrix(df, weights=None):
    """Build a weighted TF-IDF matrix over the catalogue's categorical text columns.

    Each phrase column value is collapsed into a single lowercase token
    (spaces and commas become underscores), each comma-separated tag becomes
    its own token, and the tokens are repeated according to ``weights`` to form
    one "content soup" string per row, which is then vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Characters', 'Type', 'Tags', 'Series', 'Collaboration',
        'Character-centric' and 'Item #'. NOTE: the frame is modified in place:
        the six text columns are overwritten with their tokenised form and a
        'content_soup' column is added.
    weights : dict, optional
        Repetition counts keyed by 'star_char', 'type', 'series' and 'baseline'
        (defaults 6, 3, 2, 1). Values are truncated to int because they are
        used as string repetition counts.

    Returns
    -------
    tuple
        (tfidf_matrix, fitted TfidfVectorizer, dict mapping 'Item #' -> row index).
    """
    phrase_cols = ['Characters', 'Type', 'Series', 'Collaboration', 'Character-centric']

    # Phrase columns: lowercase, then turn runs of whitespace/commas into one underscore
    for col in phrase_cols:
        df[col] = df[col].fillna('').astype(str).str.lower().str.replace(r'[\s,]+', '_', regex=True)
        # Collapse repeated underscores and drop leading/trailing ones
        df[col] = df[col].str.replace(r'_+', '_', regex=True).str.strip('_')
        # Handle 'none' string from fillna/conversion
        df[col] = df[col].replace('none', '')

    # Tags: each comma-separated tag becomes one token, internal spaces -> underscores
    col = 'Tags'
    df[col] = df[col].fillna('').astype(str).str.lower()
    df[col] = df[col].apply(
        lambda raw: ' '.join(
            tag.strip().replace(' ', '_') for tag in raw.split(',') if tag.strip()
        )
    )
    # Clean up any double underscores
    df[col] = df[col].str.replace(r'_+', '_', regex=True)

    # Fill in any weight the caller did not supply
    default_weights = {'star_char': 6, 'type': 3, 'series': 2, 'baseline': 1}
    merged_weights = {**default_weights, **(weights or {})}

    def repeated(column, weight_key):
        # String repetition needs an int count; a float weight would raise.
        return (df[column] + ' ') * int(merged_weights[weight_key])

    # Build the weighted content soup string by repeating the data
    df['content_soup'] = (
        repeated('Character-centric', 'star_char') +
        repeated('Type', 'type') +
        repeated('Series', 'series') +
        repeated('Characters', 'baseline') +
        repeated('Tags', 'baseline') +
        repeated('Collaboration', 'baseline')
    )

    # Clean up soup: remove extra spaces
    df['content_soup'] = df['content_soup'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Create a mapping from 'Item #' to its index
    item_id_to_index = pd.Series(df.index, index=df['Item #']).to_dict()

    # Tokens are already whitespace-separated, so split on whitespace only.
    # The vectoriser's default pattern keeps only runs of 2+ word characters,
    # which would break 'badtz-maru' into 'badtz'/'maru' and mangle tokens such
    # as "7''_plush" or 'toys&games', so profile tokens written in the form
    # produced above could never be found in the vocabulary.
    tfidf = TfidfVectorizer(stop_words='english', token_pattern=r'(?u)\S+')
    tfidf_matrix = tfidf.fit_transform(df['content_soup'])

    return tfidf_matrix, tfidf, item_id_to_index

In [None]:
# Get recs based on profile from LLM
def get_llm_recommendations(profile_json, df, tfidf_matrix, tfidf_vectorizer, rated_item_ids, top_n=10):
    """Rank catalogue items by cosine similarity to an LLM-generated preference profile.

    Parameters
    ----------
    profile_json : dict
        Expected keys "loves" and "hates", each a list of
        {"token": str, "weight": number} objects. Missing keys are treated as
        empty lists; malformed entries are skipped.
    df : pandas.DataFrame
        Catalogue with 'Item #', 'Title', 'Type' and 'Character-centric'
        columns, row-aligned with ``tfidf_matrix``.
    tfidf_matrix : sparse matrix
        Item feature matrix produced by ``tfidf_vectorizer``.
    tfidf_vectorizer : fitted TfidfVectorizer
        Supplies ``vocabulary_`` and ``idf_``.
    rated_item_ids : iterable or None
        'Item #' values to exclude from the result.
    top_n : int, optional
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` unrated items with positive similarity, best first.
        An empty DataFrame is returned if anything goes wrong (the error is printed).
    """
    try:
        vocab = tfidf_vectorizer.vocabulary_
        idf = tfidf_vectorizer.idf_
        user_profile = np.zeros(len(vocab))

        # "loves" push the profile towards a term, "hates" push it away.
        _add_profile_terms(user_profile, profile_json.get("loves") or [], 1.0, "love", vocab, idf)
        _add_profile_terms(user_profile, profile_json.get("hates") or [], -1.0, "hate", vocab, idf)

        # Calculate Similarity
        user_profile_sparse = csr_matrix(user_profile)
        cos_sim_scores = cosine_similarity(user_profile_sparse, tfidf_matrix).flatten()

        # Format and return results
        df_scores = pd.DataFrame({
            'Item #': df['Item #'],
            'Title': df['Title'],
            'Type': df['Type'],
            'Character-centric': df['Character-centric'],
            'Similarity': cos_sim_scores
        })

        # Filter out items that were already rated
        already_rated = [] if rated_item_ids is None else rated_item_ids
        df_recommendations = df_scores[~df_scores['Item #'].isin(already_rated)]

        # Keep only items the profile is positively aligned with
        df_recommendations = df_recommendations[df_recommendations['Similarity'] > 0]
        df_recommendations = df_recommendations.sort_values(by='Similarity', ascending=False)

        return df_recommendations.head(top_n)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty frame.
        print(f"Error from get_llm_recommendations: {e}")
        return pd.DataFrame()


def _add_profile_terms(user_profile, entries, sign, label, vocab, idf):
    """Add ``sign * weight * idf`` to ``user_profile`` (in place) for every entry whose token is in ``vocab``."""
    for entry in entries:
        # One bad entry should not discard the whole profile.
        if not isinstance(entry, dict):
            print(f"Skipping malformed '{label}' entry: {entry!r}")
            continue

        term = entry.get("token")
        if isinstance(term, str):
            # The vectoriser lowercases its vocabulary, so normalise before lookup.
            term = term.strip().lower()

        try:
            weight = float(entry.get("weight", 1.0))
        except (TypeError, ValueError):
            # null / non-numeric weight: fall back to the default weight
            weight = 1.0

        if isinstance(term, str) and term in vocab:
            term_index = vocab[term]
            # Weighted score (TF * IDF), using the pre-computed IDF for the term
            user_profile[term_index] += sign * weight * idf[term_index]
        else:
            print(f"LLM '{label}' token not in vocab: {term}")

# Recommender System

In [7]:
# How many catalogue items the simulated user rates
num_products = 5

# The API key is read from the environment (None when the variable is unset)
local_api_key = os.getenv("GEMINI_API_KEY")

# Work on a copy so the frame loaded earlier is left untouched
cleaned_df = df.copy(deep=True)

def generate_random_ratings(df, num_products, seed=None):
    """Simulate a user by giving random 1-5 star ratings to randomly sampled rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue to sample from; it is not modified.
    num_products : int
        Number of rows to rate. Capped at ``len(df)`` so a small catalogue
        does not raise.
    seed : int, optional
        When given, both the row sample and the ratings are reproducible.
        When None, pandas' default sampling and the global ``random`` state
        are used.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with an added integer 'rating' column.
    """
    n_rows = min(num_products, len(df))
    rng = random if seed is None else random.Random(seed)

    sampled = df.sample(n=n_rows, random_state=seed)
    return sampled.assign(rating=[rng.randint(1, 5) for _ in range(n_rows)])

# Sample the simulated ratings before vectorising: create_tfidf_matrix rewrites
# cleaned_df's text columns in place, while the sampled rows keep the original wording.
simulated_ratings_df = generate_random_ratings(df=cleaned_df, num_products=num_products)
rated_item_ids = simulated_ratings_df['Item #'].tolist()

# Fit the TF-IDF features on the catalogue copy
tfidf_matrix, tfidf, item_id_ind = create_tfidf_matrix(df=cleaned_df)

simulated_ratings_df

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration,rating
76,679411,Hello Kitty 2-pc Travel Pouch Set,https://www.sanrio.com/cdn/shop/files/imgi_13_...,Start your trip with a smile using this deligh...,Hello Kitty,Hello Kitty,Bags,"bag, pouch, travel pouch, set, poly, black, re...",28.0,,Multiple,0.0,,5
143,295647,Pompompurin Mini Pouch Charm (Spring Bunny Ser...,https://www.sanrio.com/cdn/shop/files/zz-25022...,"The flowers are blooming, and the time is chan...",Pompompurin,Pompompurin,Accessory,"accessory, charm, pouch charm, mini pouch char...",12.99,Spring Bunny,Single,8.0,,4
142,822914,Pompompurin Plush Measuring Tape,https://www.sanrio.com/cdn/shop/files/original...,How do you measure cute? With your favorite Sa...,Pompompurin,Pompompurin,Home Goods,"home goods, measuring tape, plush measuring ta...",24.0,,Single,0.0,,1
115,642142,Pompompurin 2-Color Gingham Retractable Pen,https://www.sanrio.com/cdn/shop/files/zz-25056...,These 2-color gingham retractable pens glide e...,Pompompurin,Pompompurin,Stationery,"stationery, pen, retractable pen, two-color pe...",11.99,,Single,0.0,,3
78,612898,Hello Kitty Plush Mascot Keychain (Retro Quilt...,https://www.sanrio.com/cdn/shop/files/zz-25046...,A nostalgic feel of quilted goods featuring yo...,Hello Kitty,Hello Kitty,Accessory,"accessory, keychain, plush keychain, embroider...",18.0,Retro Quilt,Single,0.0,,5


In [None]:
# Make LLM profile
async def run_async_test():
    """Build an LLM profile from the simulated ratings and print the resulting recommendations.

    Reads the notebook-level ``simulated_ratings_df``, ``local_api_key``,
    ``cleaned_df``, ``tfidf_matrix``, ``tfidf`` and ``rated_item_ids``.
    Prints its results and returns None.
    """
    profile = await get_llm_profile(simulated_ratings_df, local_api_key)

    # Nothing to score against when the profile has neither loves nor hates.
    has_preferences = bool(profile.get("loves")) or bool(profile.get("hates"))
    if not has_preferences:
        print("Profile is empty; cannot generate recommendations.")
        return

    print("\nGetting recommendations...")
    recs = get_llm_recommendations(profile, cleaned_df, tfidf_matrix, tfidf, rated_item_ids)

    print("\n--- Top Recommendations ---")
    print("No recommendations found." if recs.empty else recs)

In [10]:
# Top-level await: run the end-to-end test (LLM profile -> recommendations) in this notebook cell.
await run_async_test()

Calling Gemini API...
Error calling LLM (Attempt 1/3): 
Retrying in 1.0 seconds...
LLM JSON Response:
{"hates": [{"token": "pompompurin", "weight": 6.0}, {"token": "home_goods", "weight": 3.0}, {"token": "home_goods", "weight": 1.0}, {"token": "measuring_tape", "weight": 1.0}, {"token": "plush_measuring_tape", "weight": 1.0}, {"token": "embroidered", "weight": 1.0}, {"token": "yellow", "weight": 1.0}], "loves": [{"token": "hello_kitty", "weight": 6.0}, {"token": "bags", "weight": 3.0}, {"token": "bag", "weight": 1.0}, {"token": "pouch", "weight": 1.0}, {"token": "travel_pouch", "weight": 1.0}, {"token": "set", "weight": 1.0}, {"token": "poly", "weight": 1.0}, {"token": "black", "weight": 1.0}, {"token": "red", "weight": 1.0}, {"token": "white", "weight": 1.0}, {"token": "hello_kitty", "weight": 6.0}, {"token": "accessory", "weight": 3.0}, {"token": "retro_quilt", "weight": 2.0}, {"token": "accessory", "weight": 1.0}, {"token": "keychain", "weight": 1.0}, {"token": "plush_keychain", "we