In [1]:
# Import libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
# Load the product catalogue. 'Item #' mixes purely numeric ids (e.g. 30070)
# with alphanumeric ones (e.g. CNS0132), so pin it to str: the recommender
# looks ids up in string form and must not depend on pandas' dtype inference.
df = pd.read_csv('sanrio_products.csv', dtype={'Item #': str})
df.head()

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration
0,30070,"Badtz-maru 7"" Plush (I Love Me Series)",https://www.sanrio.com/cdn/shop/files/zz-25043...,In honor of the 40th Anniversary Sanrio Charac...,Badtz-maru,Badtz-maru,Plush,"plush, 7'' plush, embroidered, black",44.0,I Love Me,Single,0.0,
1,486183,Badtz-maru Mascot Badge Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25044...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, badge, keychain, embroidered, black",5.99,Sanrio Character Award,Single,0.0,
2,175064,Badtz-maru Mini Mascot Keychain (Sanrio Charac...,https://www.sanrio.com/cdn/shop/files/original...,I’ll adore you with all my heart! Clip this ad...,Badtz-maru,Badtz-maru,Accessory,"accessory, plush keychain, keychain, black",14.99,Sanrio Character Award,Single,0.0,
3,619744,Badtz-maru Customizable Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25046...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, keychain, stickers, customizable, r...",10.99,Sanrio Character Award,Multiple,4.0,
4,CNS0132,Hello Kitty and Friends BLDR Building Set (Bad...,https://www.sanrio.com/cdn/shop/files/CNS01321...,Get ready to unleash your competitive spirit a...,"Badtz-maru, Hello Kitty, Melody, Keroppi, Choc...",,Toys&Games,"toys, building set, playset, set, bldr bricks",49.99,Badtz-maru’s Bowling Alley,Multiple,0.0,


In [3]:
# Replace NaN values with empty string for the optional text columns
optional_text_cols = ['Collaboration', 'Character-centric', 'Series']
df[optional_text_cols] = df[optional_text_cols].fillna("")

# A missing discount is treated as "no discount"
df['Discount'] = df['Discount'].fillna(0)

display(df.describe())
# DataFrame.info() prints its report and returns None, so call it directly;
# wrapping it in display() rendered a stray "None" below the summary.
df.info()

Unnamed: 0,Price ($),Discount
count,147.0,147.0
mean,30.19483,1.070952
std,33.709168,3.8599
min,5.99,0.0
25%,12.99,0.0
50%,21.0,0.0
75%,32.495,0.0
max,229.0,32.99


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item #             147 non-null    object 
 1   Title              147 non-null    object 
 2   Image URL          147 non-null    object 
 3   Description        147 non-null    object 
 4   Characters         147 non-null    object 
 5   Character-centric  147 non-null    object 
 6   Type               147 non-null    object 
 7   Tags               147 non-null    object 
 8   Price ($)          147 non-null    float64
 9   Series             147 non-null    object 
 10  Number of Items    147 non-null    object 
 11  Discount           147 non-null    float64
 12  Collaboration      147 non-null    object 
dtypes: float64(2), object(11)
memory usage: 15.1+ KB


None

# Functions

In [None]:
def create_tfidf_matrix(df, weights=None):
    """Build a weighted "content soup" for every row of ``df`` and TF-IDF vectorize it.

    The text columns are normalised so that a multi-word value becomes one
    underscore-joined token, then concatenated into a single ``content_soup``
    string per row in which more important fields are repeated more often.

    Note:
        ``df`` is modified in place: 'Characters', 'Type', 'Series',
        'Collaboration', 'Character-centric' and 'Tags' are overwritten with
        their normalised form and a 'content_soup' column is added. Pass a copy
        if the original values are still needed. Calling this twice on the same
        frame is not idempotent: the already space-joined 'Tags' would be
        collapsed into a single token on the second pass.

    Args:
        df: DataFrame with the columns 'Item #', 'Characters', 'Type', 'Tags',
            'Series', 'Collaboration' and 'Character-centric'.
        weights: Optional dict of integer repeat counts with the keys
            'star_char' (applied to 'Character-centric'), 'type', 'series' and
            'baseline' (applied to 'Characters', 'Tags' and 'Collaboration').
            Missing keys fall back to 6, 3, 2 and 1 respectively.

    Returns:
        Tuple ``(tfidf_matrix, tfidf, item_id_to_index)``: the result of
        ``TfidfVectorizer.fit_transform`` on the content soup, the fitted
        vectorizer, and a dict mapping ``str(Item #)`` to the row position of
        that item in ``df`` (and therefore in ``tfidf_matrix``).
    """
    phrase_cols = ['Characters', 'Type', 'Series', 'Collaboration', 'Character-centric']

    # Turn each phrase into one token: lowercase, whitespace/commas -> '_'.
    # NOTE(review): commas are folded into underscores too, so a multi-valued
    # 'Characters' cell such as "Badtz-maru, Hello Kitty" becomes the single
    # token badtz-maru_hello_kitty rather than one token per character.
    # Confirm this is intended before relying on character overlap.
    for col in phrase_cols:
        df[col] = df[col].fillna('').astype(str).str.lower().str.replace(r'[\s,]+', '_', regex=True)
        # Collapse repeated underscores and drop leading/trailing ones
        df[col] = df[col].str.replace(r'_+', '_', regex=True).str.strip('_')
        # Handle 'none' string from fillna/conversion
        df[col] = df[col].replace('none', '')

    # Tags are a comma-separated list: each tag becomes its own token, with the
    # spaces inside a tag replaced by underscores.
    col = 'Tags'
    df[col] = df[col].fillna('').astype(str).str.lower()
    df[col] = df[col].apply(
        lambda x: ' '.join(
            tag.strip().replace(' ', '_') for tag in x.split(',') if tag.strip()
        )
    )
    # Clean up any double underscores
    df[col] = df[col].str.replace(r'_+', '_', regex=True)

    # Caller-supplied weights override the defaults key by key.
    default_weights = {'star_char': 6, 'type': 3, 'series': 2, 'baseline': 1}
    weights = {**default_weights, **(weights or {})}

    # Build the weighted content soup string by repeating the data
    df['content_soup'] = (
        (df['Character-centric'] + ' ') * weights['star_char'] +
        (df['Type'] + ' ') * weights['type'] +
        (df['Series'] + ' ') * weights['series'] +
        (df['Characters'] + ' ') * weights['baseline'] +
        (df['Tags'] + ' ') * weights['baseline'] +
        (df['Collaboration'] + ' ') * weights['baseline']
    )

    # Clean up soup: remove extra spaces
    df['content_soup'] = df['content_soup'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Map each item id to its row POSITION, not its index label: the matrix is
    # built from the rows of df in order, so positions stay valid even when df
    # carries a non-default index (e.g. after filtering). Keys are stored as
    # str because this notebook looks ids up via str(item_id).
    item_id_to_index = dict(zip(df['Item #'].astype(str), range(len(df))))

    # Compute TF-IDF feature matrix from content soup
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['content_soup'])

    return tfidf_matrix, tfidf, item_id_to_index

In [5]:
def get_recommendations(user_ratings, df, tfidf_matrix, item_id_to_index, top_n=10):
    """Recommend unrated items whose content is most similar to the user's taste.

    Each rating is turned into a weight ``rating - 3.0`` (so 3 is neutral,
    higher is positive, lower is negative). The user profile is the weighted
    sum of the rated items' rows of ``tfidf_matrix``; every item is then scored
    by cosine similarity against that profile.

    Args:
        user_ratings: Dict mapping an item id to a numeric rating. Ids are
            converted with ``str()`` before lookup.
        df: Product DataFrame with the columns 'Item #', 'Title', 'Type' and
            'Character-centric', row-aligned with ``tfidf_matrix``.
        tfidf_matrix: Sparse feature matrix with one row per row of ``df``.
        item_id_to_index: Dict mapping the string item id to its row in
            ``tfidf_matrix``.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the columns 'Item #', 'Title', 'Type',
        'Character-centric' and 'Similarity', sorted by descending similarity
        and excluding the items in ``user_ratings``. It is empty (same columns)
        when no rating is both known and non-neutral.
    """
    result_columns = ['Item #', 'Title', 'Type', 'Character-centric', 'Similarity']

    # Start from an empty user profile vector
    user_profile = np.zeros(tfidf_matrix.shape[1])
    valid_ratings_found = False

    # Build the user's profile based on their ratings
    for item_id, rating in user_ratings.items():
        item_id = str(item_id)
        if item_id not in item_id_to_index:
            print(f"Warning: Item # '{item_id}' not found in the database.")
            continue

        # Convert rating to a weight centred on the neutral rating 3
        weight = rating - 3.0
        if weight == 0:
            # A neutral rating carries no preference signal
            continue

        valid_ratings_found = True
        item_vector = tfidf_matrix[item_id_to_index[item_id]].toarray().flatten()
        user_profile += item_vector * weight

    # If no valid ratings (all neutral or no items found), return empty
    if not valid_ratings_found:
        print("No recommendations to generate. Please provide non-neutral ratings (1, 2, 4, or 5).")
        return pd.DataFrame(columns=result_columns)

    # cosine_similarity expects 2D input; csr_matrix turns the 1D profile into one row
    user_profile_sparse = csr_matrix(user_profile)

    # Calculate similarity between the user's profile and all items
    cos_sim_scores = cosine_similarity(user_profile_sparse, tfidf_matrix).flatten()

    # Create a DataFrame of items and their similarity scores
    df_scores = pd.DataFrame({
        'Item #': df['Item #'],
        'Title': df['Title'],
        'Type': df['Type'],
        'Character-centric': df['Character-centric'],
        'Similarity': cos_sim_scores
    })

    # Filter out items the user has already rated. Both sides are compared as
    # strings so that numeric ids are excluded as well.
    rated_item_ids = [str(k) for k in user_ratings.keys()]
    already_rated = df_scores['Item #'].astype(str).isin(rated_item_ids)
    df_recommendations = df_scores[~already_rated]

    # Sort by similarity to get the top recommendations
    df_recommendations = df_recommendations.sort_values(by='Similarity', ascending=False)

    return df_recommendations.head(top_n)

# Recommender System

In [11]:
# Seed the simulation so Restart & Run All reproduces the same picks and
# ratings; set SEED = None to draw a different simulation on every run.
SEED = 42
random.seed(SEED)

# Random number of products to simulate
num_products = random.randint(3, 9)

# Work on a copy: create_tfidf_matrix normalises the text columns in place and
# adds a 'content_soup' column.
cleaned_df = df.copy()

# Create tfidf_matrix from the dataframe
tfidf_matrix, tfidf, item_id_ind = create_tfidf_matrix(cleaned_df)

print("Content soup:")
for item in cleaned_df['content_soup'].head():
    print(item)

def generate_random_ratings(df, num_products):
    """Pick ``num_products`` distinct rows of ``df`` at random and rate each 1-5.

    Every pick is printed together with its randomly drawn rating.

    Args:
        df: Product DataFrame with the columns 'Item #', 'Title' and 'Type'.
        num_products: How many distinct products to sample.

    Returns:
        Dict mapping each sampled 'Item #' to its integer rating in [1, 5].
    """
    # Sample row positions without replacement
    chosen_positions = random.sample(range(len(df)), num_products)
    sampled_products = df.iloc[chosen_positions]

    print("Randomly picked products and their ratings:")
    ratings_by_item = {}
    for position in range(len(sampled_products)):
        product = sampled_products.iloc[position]
        item_id, title, item_type = product['Item #'], product['Title'], product['Type']
        score = random.randint(1, 5)
        ratings_by_item[item_id] = score
        print(f"  - (Item #{item_id}, {item_type}) {title}: {score}")
    return ratings_by_item

# Simulate the products the user has "bought" and rated
print(f"\nSimulating ratings for {num_products} random products.\n")

my_ratings = generate_random_ratings(cleaned_df, num_products)

print(f"\nGenerating recommendations based on {len(my_ratings)} ratings...\n")

# Get recommendations using the random ratings
recommendations = get_recommendations(my_ratings, cleaned_df, tfidf_matrix, item_id_ind)

if not recommendations.empty:
  print("--- Top 10 Recommendations For You ---")
  recommendations = recommendations.rename(columns={'Similarity': '% Match', 'Character-centric': 'Featured Character'})
  # Scale to percent BEFORE rounding: rounding first and multiplying afterwards
  # reintroduces float noise (e.g. 0.57 * 100 == 56.99999999999999).
  recommendations['% Match'] = (recommendations['% Match'] * 100).round()
  print(recommendations.to_string(index=False))
else:
  print("No recommendations generated based on the random ratings.")

Content soup:
badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru plush plush plush i_love_me i_love_me badtz-maru plush 7''_plush embroidered black
badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru accessory accessory accessory sanrio_character_award sanrio_character_award badtz-maru accessory badge keychain embroidered black
badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru accessory accessory accessory sanrio_character_award sanrio_character_award badtz-maru accessory plush_keychain keychain black
badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru badtz-maru accessory accessory accessory sanrio_character_award sanrio_character_award badtz-maru accessory keychain stickers customizable resin pvc black blue yellow
toys&games toys&games toys&games badtz-maru’s_bowling_alley badtz-maru’s_bowling_alley badtz-maru_hello_kitty_melody_keroppi_chococat toys building_set playset set bldr_bricks

Simulating ratings for 6 random products.

Random