In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

class ContentBasedRecommender:
    def __init__(self, metadata_df, kmeans_model, tfidf_vectorizer, top_n=5):
        self.metadata_df = metadata_df
        self.kmeans_model = kmeans_model
        self.tfidf_vectorizer = tfidf_vectorizer
        self.top_n = top_n
        
        # Store the TF-IDF matrix and cluster assignments
        self.tfidf_matrix = self.tfidf_vectorizer.transform(self.metadata_df['text_features'])
        self.clusters = self.kmeans_model.predict(self.tfidf_matrix)
        self.centroids = self.kmeans_model.cluster_centers_
        
        # Add cluster assignments to metadata
        self.metadata_df['cluster'] = self.clusters
    
    def get_recommendations(self, product_id):
        # Find the game index
        game_idx = self.metadata_df[self.metadata_df['parent_asin'] == product_id].index[0]
        
        # Get the cluster of the game
        game_cluster = self.clusters[game_idx]
        
        # Get all games in the same cluster
        cluster_mask = (self.clusters == game_cluster)
        cluster_indices = np.where(cluster_mask)[0]
        
        # Get the centroid vector for this cluster
        centroid_vector = self.centroids[game_cluster].reshape(1, -1)
        
        # Get TF-IDF vectors for all games in the cluster
        cluster_vectors = self.tfidf_matrix[cluster_indices]
        
        # Calculate Euclidean distances to centroid
        distances = euclidean_distances(cluster_vectors, centroid_vector).flatten()
        
        # Create a DataFrame with distances
        distance_scores = pd.DataFrame({
            'index': cluster_indices,
            'distance': distances
        })
        
        # Remove the query game itself
        distance_scores = distance_scores[distance_scores['index'] != game_idx]
        
        # Sort by distance (lower is better)
        distance_scores = distance_scores.sort_values('distance')
        
        # Get top N similar games
        top_indices = distance_scores.head(self.top_n)['index'].tolist()
        
        # Return the top N games (by parent_asin)
        return self.metadata_df.iloc[top_indices]['parent_asin'].tolist()

def evaluate_cosine_similarity(recommender, recommended_asins, true_asins, metadata_df):
    """Just calculates cosine similarity between recommended and true items"""
    # Get TF-IDF vectors for recommended items
    rec_indices = [metadata_df[metadata_df['parent_asin'] == asin].index[0] 
                  for asin in recommended_asins]
    rec_vectors = recommender.tfidf_matrix[rec_indices]
    
    # Get TF-IDF vectors for true items
    true_indices = [metadata_df[metadata_df['parent_asin'] == asin].index[0] 
                   for asin in true_asins]
    true_vectors = recommender.tfidf_matrix[true_indices]
    
    # Calculate and return average cosine similarity
    similarity_matrix = cosine_similarity(rec_vectors, true_vectors)
    return np.mean(similarity_matrix)


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load data using provided script
import sys
import os

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(PROJECT_ROOT)
from utils.data_processing import get_filtered_review_data, get_metadata


In [3]:
# Load video game data
X_train, y_train, X_val, y_val, X_test, y_test = get_filtered_review_data(
    'Video_Games', 
    include_columns=['user_id', 'product_id', 'text', 'title']
)
metadata = get_metadata('Video_Games')

# 1. Prepare text features for TF-IDF
# Combine relevant text fields from metadata
metadata['text_features'] = metadata['title'] + ' ' + metadata['description'] + ' ' + metadata['features']
metadata['text_features'] = metadata['text_features'].fillna('')

Loading preprocessed data from data/Video_Games_min5_testfrac0.20_cols['user_id', 'product_id', 'text', 'title'].pkl
Loading metadata from data/Video_Games_metadata.pkl


In [23]:
# 2. Create TF-IDF vectors
tfidf = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    stop_words='english',     # Remove English stop words
    ngram_range=(1, 2)        # Use unigrams and bigrams
)
tfidf_matrix = tfidf.fit_transform(metadata['text_features'])
print(tfidf_matrix.shape)

(137269, 428464)


In [24]:
# 3. Apply K-means clustering (k=12)
kmeans = KMeans(n_clusters=12, random_state=42)
kmeans.fit(tfidf_matrix)

In [30]:
# 4. Create the recommender
recommender = ContentBasedRecommender(metadata, kmeans, tfidf, top_n=5)

In [43]:
# Code to run a single test
test_idx = np.random.choice(X_test.index)
test_case = X_test.loc[test_idx]
test_rating = y_test[test_idx]

# Get all items for this user in test set
user_test = X_test[X_test['user_id'] == test_case['user_id']]
user_test_ratings = y_test[user_test.index]

# Print user's ratings to see what we're working with
print("User's ratings in test set:")
for idx, rating in user_test_ratings.items():
    print(f"Rating: {rating:.2f}")

# Maybe lower the threshold if 0.7 is too high
rating_threshold = 0.5 # lowered from 0.7
print(f"\nUsing rating threshold: {rating_threshold}")

# Get the true items (items with ratings above threshold)
true_product_ids = user_test[user_test_ratings > rating_threshold]['product_id'].tolist()

# Create mapping
product_id_to_asin = dict(zip(X_test['product_id'].unique(), 
                             metadata['parent_asin'].unique()))

# Map product_ids to ASINs using our mapping
true_asins = [product_id_to_asin[pid] for pid in true_product_ids]

# Get recommendations for the test item
test_asin = product_id_to_asin[test_case['product_id']]

print("\nQuery item:")
print(metadata[metadata['parent_asin'] == test_asin]['title'].iloc[0])

print("\nRecommended items:")
recommendations = recommender.get_recommendations(test_asin)
for asin in recommendations:
    print(metadata[metadata['parent_asin'] == asin]['title'].iloc[0])

print("\nTrue items (items the user actually liked):")
for asin in true_asins:
    print(metadata[metadata['parent_asin'] == asin]['title'].iloc[0] + "\n")

# Only calculate similarity if we have true items
if true_asins:
    print(len(recommendations))
    print(len(true_asins))
    similarity_score = evaluate_cosine_similarity(recommender, recommendations, true_asins, metadata)
    print(f"\nCosine Similarity Score: {similarity_score:.4f}")
else:
    print("\nNo true items found (user didn't rate any items above threshold)")

User's ratings in test set:
Rating: 1.00

Using rating threshold: 0.5

Query item:
JINHOABF Wireless Controller for PS3 Controller,Built-in Dual Vibration Gamepad Compatible for Playstation 3,with Charger Cable (Blue+Red)

Recommended items:
Controller
controller
Nonbliep Controller
PS4 Controller Charger, PS4 Charging Station for Playstation 4, DualShock 4 PS4 Controller Charger, USB Dobe PS4 Controller Charging Dock Station for Playstation 4/ PS4 Slim / PS4 Pro/PS4 Controller
PS4 Controller Charging Station Dock for Playstation 4 Controller, PS4 Controller Charger for DualShock 4 Remote, Playstation 4 Charger Station for Sony Playstation 4/PS4/PS4 Slim/PS4 Pro Controller…

True items (items the user actually liked):
JINHOABF Wireless Controller for PS3 Controller,Built-in Dual Vibration Gamepad Compatible for Playstation 3,with Charger Cable (Blue+Red)

5
1

Cosine Similarity Score: 0.1781


In [41]:

print("Test case information:")
print(f"User ID: {test_case['user_id']}")
print(f"Product ID: {test_case['product_id']}")
print(f"Rating: {test_rating}")

print("\nAll ratings for this user in test set:")
print("Number of items rated by this user:", len(user_test))
print("\nRatings:")
for idx, row in user_test.iterrows():
    rating = y_test[idx]
    print(f"Product ID: {row['product_id']}, Rating: {rating:.2f}")

# Print the true items calculation
print("\nTrue items calculation:")
print(f"Number of ratings > 0: {sum(user_test_ratings > 0)}")
true_product_ids = user_test[user_test_ratings > 0]['product_id'].tolist()
print(f"Number of true_product_ids: {len(true_product_ids)}")

Test case information:
User ID: 61459
Product ID: 5594
Rating: 1.0

All ratings for this user in test set:
Number of items rated by this user: 1

Ratings:
Product ID: 5594, Rating: 1.00

True items calculation:
Number of ratings > 0: 1
Number of true_product_ids: 1


In [4]:
# First filter the test set outside the function
high_rating_mask = y_test > 0.5
X_test_filtered = X_test[high_rating_mask]
y_test_filtered = y_test[high_rating_mask]

print(f"Total test cases: {len(X_test)}")
print(f"Test cases with rating > 0.5: {len(X_test_filtered)}")

def evaluate_tfidf_params(X_test_filtered, y_test_filtered, metadata, n_percent, min_df, max_df, max_features=1000, random_state=42):
    """
    Evaluate TF-IDF parameters on a random subset of filtered test data
    
    Parameters:
    -----------
    X_test_filtered: filtered test features DataFrame (already threshold filtered)
    y_test_filtered: filtered test ratings (already threshold filtered)
    metadata: metadata DataFrame
    n_percent: percentage of filtered test data to use (0-100)
    min_df: minimum document frequency for TF-IDF
    max_df: maximum document frequency for TF-IDF
    max_features: maximum number of features for TF-IDF
    random_state: random seed for reproducibility
    
    Returns:
    --------
    dict: evaluation results
    """
    # Set random seed
    np.random.seed(random_state)
    
    # Calculate number of samples to use
    n_samples = int(len(X_test_filtered) * n_percent / 100)
    
    # Randomly sample test cases
    random_indices = np.random.choice(X_test_filtered.index, size=n_samples, replace=False)
    X_test_sample = X_test_filtered.loc[random_indices]
    y_test_sample = y_test_filtered[random_indices]
    
    # Create new TF-IDF vectorizer with given parameters
    tfidf = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        stop_words='english',
        ngram_range=(1, 2)
    )
    
    # Prepare text features
    metadata['text_features'] = metadata['title'] + ' ' + metadata['description'] + ' ' + metadata['features']
    metadata['text_features'] = metadata['text_features'].fillna('')
    
    # Fit TF-IDF
    tfidf_matrix = tfidf.fit_transform(metadata['text_features'])
    
    # Create and fit KMeans
    kmeans = KMeans(n_clusters=12, random_state=42)
    kmeans.fit(tfidf_matrix)
    
    # Create recommender
    recommender = ContentBasedRecommender(metadata, kmeans, tfidf)
    
    # Create mapping
    product_id_to_asin = dict(zip(X_test_filtered['product_id'].unique(), 
                                 metadata['parent_asin'].unique()))
    
    # Store similarity scores
    similarity_scores = []
    count = 0
    # For each sampled test case
    for idx in X_test_sample.index:
        test_case = X_test_sample.loc[idx]
        
        try:
            # Get recommendations
            test_asin = product_id_to_asin[test_case['product_id']]
            recommendations = recommender.get_recommendations(test_asin)
            
            # Calculate similarity
            similarity_score = evaluate_cosine_similarity(recommender, recommendations, [test_asin], metadata)
            similarity_scores.append(similarity_score)
            print(f"Test Case {count}/740 : {similarity_score}")
            count += 1
            
        except Exception as e:
            continue
    
    # Calculate metrics
    results = {
        'avg_similarity': np.mean(similarity_scores),
        'std_similarity': np.std(similarity_scores),
        'n_evaluated': len(similarity_scores),
        'params': {
            'min_df': min_df,
            'max_df': max_df,
            'max_features': max_features
        }
    }
    
    return results

# Example usage:
param_combinations = [
    {'min_df': 2, 'max_df': 0.95},
    {'min_df': 5, 'max_df': 0.9},
    {'min_df': 10, 'max_df': 0.8},
    {'min_df': 20, 'max_df': 0.85},
    {'min_df': 50, 'max_df': 0.9},
    {'min_df': 0.005, 'max_df': 0.95},   
    {'min_df': 0.01,  'max_df': 0.9},    
    {'min_df': 0.02,  'max_df': 0.85},   
    {'min_df': 0.01,  'max_df': 0.75},
    {'min_df': 0.005, 'max_df': 0.8}
]
results = []
for params in param_combinations:
    print(f"\nTesting parameters: {params}")
    result = evaluate_tfidf_params(
        X_test_filtered=X_test_filtered,
        y_test_filtered=y_test_filtered,
        metadata=metadata,
        n_percent=10,  # Use 10% of filtered test data
        min_df=params['min_df'],
        max_df=params['max_df']
    )
    results.append(result)
    print(f"Average similarity: {result['avg_similarity']:.4f}")
    print(f"Number of cases evaluated: {result['n_evaluated']}")

# Find best parameters
best_result = max(results, key=lambda x: x['avg_similarity'])
print("\nBest parameters:")
print(f"min_df: {best_result['params']['min_df']}")
print(f"max_df: {best_result['params']['max_df']}")
print(f"Average similarity: {best_result['avg_similarity']:.4f}")

Total test cases: 94762
Test cases with rating > 0.5: 74030

Testing parameters: {'min_df': 2, 'max_df': 0.95}
Test Case 0/740 : 0.16078537357354925
Test Case 1/740 : 0.045034012479084756
Test Case 2/740 : 0.0
Test Case 3/740 : 0.5873311589678979
Test Case 4/740 : 0.0
Test Case 5/740 : 0.07978824408895957
Test Case 6/740 : 0.17285153225044034
Test Case 7/740 : 0.0
Test Case 8/740 : 0.0
Test Case 9/740 : 0.0
Test Case 10/740 : 0.0
Test Case 11/740 : 0.08405595486525709
Test Case 12/740 : 0.11823526364458888
Test Case 13/740 : 0.102143874366698
Test Case 14/740 : 0.0
Test Case 15/740 : 0.2688922168974205
Test Case 16/740 : 0.0
Test Case 17/740 : 0.13962590449287895
Test Case 18/740 : 0.1962667833178311
Test Case 19/740 : 0.22087029482115073
Test Case 20/740 : 0.03641727643729643
Test Case 21/740 : 0.0
Test Case 22/740 : 0.0
Test Case 23/740 : 0.05862128963230202
Test Case 24/740 : 0.0
Test Case 25/740 : 0.18863032032962518
Test Case 26/740 : 0.05870151081373922
Test Case 27/740 : 0.12422

In [5]:
print(results)

[{'avg_similarity': 0.06509395426397717, 'std_similarity': 0.10190960825776642, 'n_evaluated': 7403, 'params': {'min_df': 2, 'max_df': 0.95, 'max_features': 1000}}, {'avg_similarity': 0.0792722752897522, 'std_similarity': 0.10536853060385859, 'n_evaluated': 7403, 'params': {'min_df': 5, 'max_df': 0.9, 'max_features': 1000}}, {'avg_similarity': 0.08654991947696172, 'std_similarity': 0.1164035678790485, 'n_evaluated': 7403, 'params': {'min_df': 10, 'max_df': 0.8, 'max_features': 1000}}, {'avg_similarity': 0.10117072923348602, 'std_similarity': 0.12593207109002327, 'n_evaluated': 7403, 'params': {'min_df': 20, 'max_df': 0.85, 'max_features': 1000}}, {'avg_similarity': 0.11947815052344099, 'std_similarity': 0.1286790484143575, 'n_evaluated': 7403, 'params': {'min_df': 50, 'max_df': 0.9, 'max_features': 1000}}, {'avg_similarity': 0.1832166971491413, 'std_similarity': 0.17190384434007633, 'n_evaluated': 7403, 'params': {'min_df': 0.005, 'max_df': 0.95, 'max_features': 1000}}, {'avg_similarit

In [8]:
for result in results:
    print(f"min_df: {result['params']['min_df']} max_df: {result['params']['max_df']} avg_similarity: {result['avg_similarity']}")
    print("================")


min_df: 2 max_df: 0.95 avg_similarity: 0.06509395426397717
min_df: 5 max_df: 0.9 avg_similarity: 0.0792722752897522
min_df: 10 max_df: 0.8 avg_similarity: 0.08654991947696172
min_df: 20 max_df: 0.85 avg_similarity: 0.10117072923348602
min_df: 50 max_df: 0.9 avg_similarity: 0.11947815052344099
min_df: 0.005 max_df: 0.95 avg_similarity: 0.1832166971491413
min_df: 0.01 max_df: 0.9 avg_similarity: 0.21764167386576377
min_df: 0.02 max_df: 0.85 avg_similarity: 0.23965905626312822
min_df: 0.01 max_df: 0.75 avg_similarity: 0.21764167386576377
min_df: 0.005 max_df: 0.8 avg_similarity: 0.1832166971491413


Demo

In [9]:
tfidf = TfidfVectorizer(
    min_df=0.02,
    max_df=0.85,
    stop_words='english',
    ngram_range=(1, 2)
)

In [10]:
metadata['text_features'] = metadata['title'] + ' ' + metadata['description'] + ' ' + metadata['features']
metadata['text_features'] = metadata['text_features'].fillna('')

# Fit TF-IDF
tfidf_matrix = tfidf.fit_transform(metadata['text_features'])

# Create and fit KMeans
kmeans = KMeans(n_clusters=12, random_state=42)
kmeans.fit(tfidf_matrix)

In [13]:
# Create recommender
recommender = ContentBasedRecommender(metadata, kmeans, tfidf)

# Create mapping from product_id to ASIN
product_id_to_asin = dict(zip(X_test_filtered['product_id'].unique(), 
                             metadata['parent_asin'].unique()))

# Get a random test case with high rating
random_idx = random.choice(X_test_filtered.index)
test_case = X_test_filtered.loc[random_idx]
test_asin = product_id_to_asin[test_case['product_id']]

# Get recommendations
recommendations = recommender.get_recommendations(test_asin)

# Print the results
print("Demo Recommendation Results")
print("-" * 50)
print(f"\nTest Product:")
test_product = metadata[metadata['parent_asin'] == test_asin].iloc[0]
print(f"Title: {test_product['title']}")
print(f"ASIN: {test_asin}")
print(f"Actual Rating: {y_test_filtered[random_idx]:.2f}")

print("\nTop 5 Recommended Products:")
print("-" * 50)
for i, rec_asin in enumerate(recommendations, 1):
    rec_product = metadata[metadata['parent_asin'] == rec_asin].iloc[0]
    print(f"\n{i}. {rec_product['title']}")
    # Calculate similarity score for this recommendation
    similarity = evaluate_cosine_similarity(
        recommender, 
        [rec_asin], 
        [test_asin], 
        metadata
    )
    print(f"   Similarity Score: {similarity:.4f}")

# Calculate overall similarity for all recommendations
overall_similarity = evaluate_cosine_similarity(
    recommender,
    recommendations,
    [test_asin],
    metadata
)
print(f"\nOverall Average Similarity Score: {overall_similarity:.4f}")

Demo Recommendation Results
--------------------------------------------------

Test Product:
Title: KOLMAX HUNTER Typewriter Style Mechanical Gaming Keyboard,RGB Backlit Wired Gaming Keyboard with Blue Switch Retro Steampunk Round 104 Anti-ghosting Retro Round Keycaps for Windows/Mac/PC
ASIN: B0B6GLJ6YQ
Actual Rating: 0.75

Top 5 Recommended Products:
--------------------------------------------------

1. FELICON Mechanical Gaming Keyboard 87 Keys Compact Tenkeyless Rainbow LED Backlit Illuminate Wired Computer Keyboard with Blue Switches and Mouse Pad for Windows PC Gamers
   Similarity Score: 0.5549

2. GIM 87 Keys Gaming Keyboard TKL Mechanical Keyboard Blue Switches RGB Backlit Compact Keyboard Portable USB Wired Keyboard for Windows PC Gamers Black
   Similarity Score: 0.5818

3. MOTOSPEED Gaming Mechanical Keyboard RGB Backlit Transparent Bottom Anti-ghosting 87 Keys,Illuminated USB Gaming Keyboard for Mac/PC/Laptop White
   Similarity Score: 0.5891

4. EagleTec KG060-BR RGB LED