In [1]:
# Import libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
# Load the Sanrio product catalogue; the path is relative, so the CSV must sit
# in the notebook's working directory
df = pd.read_csv('sanrio_products.csv')
# Preview the first rows (a bare last expression is rendered as the cell output)
df.head()

Unnamed: 0,Item #,Title,Image URL,Description,Characters,Character-centric,Type,Tags,Price ($),Series,Number of Items,Discount,Collaboration
0,30070,"Badtz-maru 7"" Plush (I Love Me Series)",https://www.sanrio.com/cdn/shop/files/zz-25043...,In honor of the 40th Anniversary Sanrio Charac...,Badtz-maru,Badtz-maru,Plush,"plush, 7'' plush, embroidered, black",44.0,I Love Me,Single,0.0,
1,486183,Badtz-maru Mascot Badge Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25044...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, badge, keychain, embroidered, black",5.99,Sanrio Character Award,Single,0.0,
2,175064,Badtz-maru Mini Mascot Keychain (Sanrio Charac...,https://www.sanrio.com/cdn/shop/files/original...,I’ll adore you with all my heart! Clip this ad...,Badtz-maru,Badtz-maru,Accessory,"accessory, plush keychain, keychain, black",14.99,Sanrio Character Award,Single,0.0,
3,619744,Badtz-maru Customizable Keychain (Sanrio Chara...,https://www.sanrio.com/cdn/shop/files/zz-25046...,Celebrating 40 years of this fan-favorite cont...,Badtz-maru,Badtz-maru,Accessory,"accessory, keychain, stickers, customizable, r...",10.99,Sanrio Character Award,Multiple,4.0,
4,CNS0132,Hello Kitty and Friends BLDR Building Set (Bad...,https://www.sanrio.com/cdn/shop/files/CNS01321...,Get ready to unleash your competitive spirit a...,"Badtz-maru, Hello Kitty, Melody, Keroppi, Choc...",,Toys&Games,"toys, building set, playset, set, bldr bricks",49.99,Badtz-maru’s Bowling Alley,Multiple,0.0,


In [3]:
# Replace NaN values with empty strings in the optional text columns so that
# later string operations on them never see a float NaN
df[['Collaboration', 'Character-centric', 'Series']] = df[['Collaboration', 'Character-centric', 'Series']].fillna("")

# Treat a missing discount as "no discount" so the column stays fully numeric
df['Discount'] = df['Discount'].fillna(0)


# Summary statistics of the numeric columns
display(df.describe())
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() -- that only renders a stray "None" below the report
df.info()

Unnamed: 0,Price ($),Discount
count,195.0,195.0
mean,31.607846,1.638564
std,33.002663,4.60949
min,5.99,0.0
25%,14.0,0.0
50%,24.0,0.0
75%,36.0,0.0
max,229.0,32.99


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item #             195 non-null    object 
 1   Title              195 non-null    object 
 2   Image URL          195 non-null    object 
 3   Description        195 non-null    object 
 4   Characters         194 non-null    object 
 5   Character-centric  195 non-null    object 
 6   Type               195 non-null    object 
 7   Tags               195 non-null    object 
 8   Price ($)          195 non-null    float64
 9   Series             195 non-null    object 
 10  Number of Items    195 non-null    object 
 11  Discount           195 non-null    float64
 12  Collaboration      195 non-null    object 
dtypes: float64(2), object(11)
memory usage: 19.9+ KB


None

# Functions

In [None]:
def _normalize_phrase_column(column):
    """Collapse every cell of a single-phrase column into one lowercase token.

    Whitespace and commas become underscores ("Hello Kitty" -> "hello_kitty"),
    runs of underscores are merged, leading/trailing underscores are stripped,
    and a cell that ends up as the literal string "none" is blanked.
    """
    tokens = column.fillna('').astype(str).str.lower().str.replace(r'[\s,]+', '_', regex=True)
    tokens = tokens.str.replace(r'_+', '_', regex=True).str.strip('_')
    return tokens.replace('none', '')


def _tokenize_list_cell(cell):
    """Turn one comma-separated cell into space-separated underscore tokens."""
    if not isinstance(cell, str):
        return ""
    entries = (entry.strip() for entry in cell.split(','))
    # Spaces inside an entry become underscores so each entry is one token
    return ' '.join(entry.replace(' ', '_') for entry in entries if entry)


def _normalize_list_column(column):
    """Lowercase a comma-separated list column and tokenize each cell."""
    lowered = column.fillna('').astype(str).str.lower()
    return lowered.apply(_tokenize_list_cell).str.replace(r'_+', '_', regex=True)


def _print_tfidf_previews(df, tfidf, tfidf_matrix, preview_count):
    """Print the non-zero TF-IDF scores of `preview_count` random products."""
    row_total = len(df)
    if preview_count <= 0 or row_total == 0:
        return

    # NOTE: these are process-wide pandas display settings and stay in effect
    # after the call; kept because later cells may rely on them. They are set
    # once instead of on every loop iteration.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)

    feature_names = tfidf.get_feature_names_out()
    for _ in range(preview_count):
        # random.randint includes BOTH endpoints, so the upper bound has to be
        # the last valid row position (len(df) itself would overrun .iloc)
        position = random.randint(0, row_total - 1)
        print(df['Title'].iloc[position])

        # Densify only the sampled row and select it by position; looking it
        # up by its soup text breaks when two products share the same soup
        scores = pd.Series(
            tfidf_matrix[position].toarray().ravel(),
            index=feature_names,
            name=df['content_soup'].iloc[position],
        )
        print("Number of words: ", len(scores))

        # Keep only the terms that actually occur in this product
        print(scores[scores > 0].sort_values(ascending=False))
        print()


def create_tfidf_matrix(df, weights=None, preview_count=5):
    """Build a weighted "content soup" per product and vectorize it with TF-IDF.

    `df` is modified IN PLACE: 'Type', 'Series', 'Collaboration' and
    'Character-centric' are collapsed to single underscore-joined tokens,
    'Characters' and 'Tags' (when present) are turned into space-separated
    tokens, and a 'content_soup' column is added. The list-column step is not
    idempotent (a second pass would fuse the space-separated tokens into one),
    so pass a fresh copy of the raw data on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Product table with the columns 'Item #', 'Title', 'Type', 'Series',
        'Collaboration' and 'Character-centric'; 'Characters' and 'Tags' are
        used when present and treated as empty otherwise.
    weights : dict, optional
        How many times each field is repeated in the soup. Recognized keys and
        their defaults: 'star_char' (6, Character-centric), 'type' (3),
        'series' (2) and 'baseline' (1, for Characters, Tags, Collaboration).
        Missing keys fall back to the defaults.
    preview_count : int, optional
        Number of randomly chosen products whose TF-IDF scores are printed
        (default 5); 0 disables the preview.

    Returns
    -------
    tuple
        (tfidf_matrix, tfidf, item_id_to_index): the matrix returned by
        TfidfVectorizer.fit_transform, the fitted vectorizer, and a dict that
        maps each 'Item #' to its row position in the matrix (for duplicated
        ids the last row wins).
    """
    default_weights = {'star_char': 6, 'type': 3, 'series': 2, 'baseline': 1}
    weights = {**default_weights, **(weights or {})}

    # Single-phrase columns: one token per cell
    for col in ('Type', 'Series', 'Collaboration', 'Character-centric'):
        df[col] = _normalize_phrase_column(df[col])

    # Comma-separated list columns: one token per list entry
    for col in ('Characters', 'Tags'):
        if col in df.columns:
            df[col] = _normalize_list_column(df[col])

    blank = pd.Series('', index=df.index)
    characters = df['Characters'] if 'Characters' in df.columns else blank
    tags = df['Tags'] if 'Tags' in df.columns else blank

    # Repeating a field is what gives it more weight in the term frequencies
    weighted_fields = (
        (df['Character-centric'], weights['star_char']),
        (df['Type'], weights['type']),
        (df['Series'], weights['series']),
        (characters, weights['baseline']),
        (tags, weights['baseline']),
        (df['Collaboration'], weights['baseline']),
    )
    soup = blank
    for field, repeat in weighted_fields:
        soup = soup + (field + ' ') * repeat

    # Clean up soup: collapse the extra spaces left behind by empty fields
    df['content_soup'] = soup.str.replace(r'\s+', ' ', regex=True).str.strip()

    # Map 'Item #' to the row POSITION, because the TF-IDF matrix is indexed
    # positionally; for a default RangeIndex this equals the index label
    item_id_to_index = {item_id: position for position, item_id in enumerate(df['Item #'])}

    # Compute TF-IDF feature matrix from the content soup
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['content_soup'])

    _print_tfidf_previews(df, tfidf, tfidf_matrix, preview_count)

    return tfidf_matrix, tfidf, item_id_to_index

In [5]:
def get_recommendations(user_ratings, df, tfidf_matrix, item_id_to_index, top_n=10):
    """Rank unrated products by cosine similarity to a rating-weighted user profile.

    Each rated product contributes its TF-IDF vector scaled by `rating - 3`,
    so ratings above 3 pull the profile towards that product, ratings below 3
    push it away, and a rating of exactly 3 is ignored.

    Parameters
    ----------
    user_ratings : dict
        Maps an 'Item #' (compared as a string) to a numeric rating.
    df : pandas.DataFrame
        Product table whose rows line up with the rows of `tfidf_matrix`; must
        contain 'Item #', 'Title', 'Type' and 'Character-centric'.
    tfidf_matrix : sparse matrix
        One TF-IDF row per product.
    item_id_to_index : dict
        Maps an 'Item #' string to its row position in `tfidf_matrix`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns 'Item #', 'Title', 'Type', 'Character-centric', 'Similarity',
        sorted by descending similarity, without the products the user rated.
        Empty (same columns) when no usable rating was supplied; unknown ids
        and that situation are reported with a printed message.
    """
    # Single source of truth so the empty result has the same schema as the
    # populated one
    result_columns = ['Item #', 'Title', 'Type', 'Character-centric', 'Similarity']
    neutral_rating = 3.0

    # Accumulate the user's profile in TF-IDF feature space
    user_profile = np.zeros(tfidf_matrix.shape[1])
    valid_ratings_found = False

    for item_id, rating in user_ratings.items():
        item_id = str(item_id)
        if item_id not in item_id_to_index:
            print(f"Warning: Item # '{item_id}' not found in the database.")
            continue

        # Convert rating to a weight (-2 to +2 for ratings 1..5)
        weight = rating - neutral_rating
        if weight == 0:
            continue

        valid_ratings_found = True
        item_vector = tfidf_matrix[item_id_to_index[item_id]].toarray().flatten()
        user_profile += item_vector * weight

    # If no valid ratings (all neutral or no items found), return empty
    if not valid_ratings_found:
        print("No recommendations to generate. Please provide non-neutral ratings (1, 2, 4, or 5).")
        return pd.DataFrame(columns=result_columns)

    # cosine_similarity expects 2-D input, so wrap the profile as a 1 x n matrix
    user_profile_sparse = csr_matrix(user_profile)
    cos_sim_scores = cosine_similarity(user_profile_sparse, tfidf_matrix).flatten()

    df_scores = pd.DataFrame({
        'Item #': df['Item #'],
        'Title': df['Title'],
        'Type': df['Type'],
        'Character-centric': df['Character-centric'],
        'Similarity': cos_sim_scores,
    })[result_columns]

    # Drop what the user already rated; ids are compared as strings, matching
    # the str() lookup used while building the profile
    rated_item_ids = {str(item_id) for item_id in user_ratings}
    already_rated = df_scores['Item #'].astype(str).isin(rated_item_ids)
    df_recommendations = df_scores[~already_rated]

    # Sort by similarity to get the top recommendations
    df_recommendations = df_recommendations.sort_values(by='Similarity', ascending=False)

    return df_recommendations.head(top_n)

# Recommender System

In [13]:
# Random number of products to simulate
num_products = random.randint(3,9)

cleaned_df = df.copy()

# Create tfidf_matrix from the dataframe
tfidf_matrix, tfidf, item_id_ind = create_tfidf_matrix(cleaned_df)

print("Content soup:")
for item in cleaned_df['content_soup'].head():
    print(item)

def generate_random_ratings(df, num_products):
    """Simulate a shopper by rating randomly chosen products.

    Draws `num_products` distinct row positions from `df`, gives each drawn
    product a random integer rating from 1 to 5, prints a short summary of
    every pick, and returns the ratings as a dict keyed by 'Item #'.
    """
    picked_positions = random.sample(range(len(df)), num_products)
    detail_columns = ('Item #', 'Title', 'Type', 'Character-centric',
                      'Series', 'Collaboration', 'Tags', 'Characters')

    ratings_by_item = {}
    print("Randomly picked products and their ratings:")
    for _, product in df.iloc[picked_positions].iterrows():
        # Read every field first, then draw the rating for this product
        (item_id, title, item_type, featured_char,
         series, collaboration, tags, characters) = (product[name] for name in detail_columns)
        stars = random.randint(1, 5)
        ratings_by_item[item_id] = stars

        summary_lines = [
            f"  - (Item #{item_id}, {item_type}) {title}: {stars}",
            f"      Character focus: {featured_char}",
            f"      Characters: {characters}",
            f"      Series: {series}",
            f"      Collaboration: {collaboration}",
            f"      Tags: {tags}",
        ]
        print("\n".join(summary_lines))

    return ratings_by_item

# Get a random number of products the user has "bought" using the new function
print(f"\nSimulating ratings for {num_products} random products.\n")

my_ratings = generate_random_ratings(cleaned_df, num_products)

print(f"\nGenerating recommendations based on {len(my_ratings)} ratings...\n")

# Get recommendations using the random ratings
recommendations = get_recommendations(my_ratings, cleaned_df, tfidf_matrix, item_id_ind)

if not recommendations.empty:
  print("--- Top 10 Recommendations For You ---")
  # Friendlier column names for the printed table
  recommendations = recommendations.rename(columns={'Similarity': '% Match', 'Character-centric': 'Featured Character'})
  # Scale to percent BEFORE rounding: rounding first and multiplying afterwards
  # reintroduces floating-point noise (0.57 * 100 == 56.99999999999999)
  recommendations['% Match'] = (recommendations['% Match'] * 100).round()
  print(recommendations.to_string(index=False))
else:
  print("No recommendations generated based on the random ratings.")

Hello Kitty Card Case (Retro Quilt Series)
Number of words:  346
hello_kitty    0.735328
retro_quilt    0.391156
accessory      0.385536
card_case      0.252134
quilted        0.184232
pink           0.145981
cotton         0.143881
poly           0.134635
Name: hello_kitty hello_kitty hello_kitty hello_kitty hello_kitty hello_kitty accessory accessory accessory retro_quilt retro_quilt hello_kitty accessory card_case poly cotton quilted pink, dtype: float64

Hello Kitty Cowgirl Ceramic Bank
Number of words:  346
hello_kitty     0.680236
home_goods      0.498193
western         0.303954
bank            0.233244
ceramic_bank    0.233244
ceramic         0.180925
pink            0.135044
blue            0.114921
red             0.112534
white           0.105113
Name: hello_kitty hello_kitty hello_kitty hello_kitty hello_kitty hello_kitty home_goods home_goods home_goods western western hello_kitty home_goods bank ceramic_bank ceramic white red blue pink, dtype: float64

Chococat 10" Season