In [336]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Loading and transforming the dataset to meet the desired specifications

In [337]:
# Product metadata table (title, average_rating, rating_number, price, ... keyed by 'parent_asin'),
# downloaded from the project's GitHub artifacts folder. Requires network access.
metadf = pd.read_csv('https://raw.githubusercontent.com/ascottR/Recommendation_SYS/refs/heads/main/artifacts/meta_final.csv')

# Review table (user_id, rating, parent_asin, verified_purchase, ...).
# NOTE(review): 'rating', 'parent_asin' and 'verified_purchase' appear to hold stringified
# lists per user, since they are parsed and exploded further down - confirm against the CSV.
reviewdf = pd.read_csv('https://raw.githubusercontent.com/ascottR/Recommendation_SYS/refs/heads/main/artifacts/review_final.csv')

In [338]:
# List the metadata column names to see which product fields are available
metadf.columns

Index(['Unnamed: 0', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'store', 'categories', 'details',
       'parent_asin'],
      dtype='object')

In [339]:
# List the review column names to see which fields describe each user's reviews
reviewdf.columns

Index(['user_id', 'rating', 'title', 'asin', 'parent_asin', 'timestamp',
       'helpful_vote', 'verified_purchase'],
      dtype='object')

In [340]:
# Summarize dtype and non-null count for every metadata column
metadf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29356 entries, 0 to 29355
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      29356 non-null  int64  
 1   title           29356 non-null  object 
 2   average_rating  29356 non-null  float64
 3   rating_number   29356 non-null  int64  
 4   features        29356 non-null  object 
 5   description     29356 non-null  object 
 6   price           29356 non-null  float64
 7   images          29356 non-null  object 
 8   store           29338 non-null  object 
 9   categories      29356 non-null  object 
 10  details         29356 non-null  object 
 11  parent_asin     29356 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 2.7+ MB


In [341]:
# Preview the product titles; later cells look products up by these titles
metadf['title']

Unnamed: 0,title
0,"Portable Karaoke Machine for Adults, Unique Gi..."
1,"Musical Instrument Lyre Harp, 16 Metal Strings..."
2,"Set of 5 White Guitar Picks, 1.20mm Thick with..."
3,Danelectro Stock '59 Electric Guitar - Red
4,HENGYEE 52.5mm Guitar Stratocaster Tremolo Bri...
...,...
29351,American Dj Big Shot Led Variable Speed Led Po...
29352,Golden Age Tune-o-matic Bridge For Archtop Gui...
29353,Fender Vintera 60s Stratocaster Electric Guita...
29354,VocoPro (VHF4000


In [342]:
# Unweighted mean of the per-product 'average_rating' values, rounded to two decimals
print(f"Mean global rating: {round(metadf['average_rating'].mean(),2)}.")

Mean global rating: 4.28.


In [343]:
# Keep only user_id, rating, parent_asin and verified_purchase; drop() returns a
# new frame, so reviewdf itself is left untouched
processed_df = reviewdf.drop(columns=['timestamp', 'helpful_vote', 'asin', 'title'])

# Function to convert string representations of lists to actual lists
def convert_strings_to_lists(df, columns):
    """
    Parse stringified Python lists in the given columns, modifying ``df`` in place.

    Args:
        df: pandas DataFrame whose columns are rewritten in place.
        columns: iterable of column names to process.

    Returns:
        None. Cells that are strings starting with '[' are replaced by the result
        of ``ast.literal_eval``; every other cell is left unchanged.
    """
    def _parse_cell(cell):
        # Only strings that look like a list literal are evaluated
        if isinstance(cell, str) and cell.startswith('['):
            return ast.literal_eval(cell)
        return cell

    for name in columns:
        df[name] = df[name].apply(_parse_cell)

# Columns that hold one list entry per review of the user
list_columns = ['rating', 'parent_asin', 'verified_purchase']

# Turn the stringified lists into real lists (in place on processed_df)
convert_strings_to_lists(processed_df, list_columns)

# Explode the three list columns together so every review gets its own row
df_exploded = processed_df.explode(list_columns)

df_exploded.shape


(250965, 4)

In [344]:
# Re-check reviewdf: the drop() above returned a new frame, so the original columns are all still here
reviewdf.columns

Index(['user_id', 'rating', 'title', 'asin', 'parent_asin', 'timestamp',
       'helpful_vote', 'verified_purchase'],
      dtype='object')

In [345]:
# Display the exploded frame: one row per (user, product) rating. Index labels repeat
# because explode() keeps the label of the row each entry came from.
df_exploded

Unnamed: 0,user_id,rating,parent_asin,verified_purchase
0,AE2226PENZTTCDKFGRTUCUX2NU2Q,5,B01DECWM0G,True
0,AE2226PENZTTCDKFGRTUCUX2NU2Q,5,B00HU25WSQ,False
1,AE2252DKW4XJIZP5QPFMQVJBVRTA,5,B07F2NZWY5,True
1,AE2252DKW4XJIZP5QPFMQVJBVRTA,4,B091R14NS3,True
2,AE225P7FIMCUQD3TPAOF5LSGCLYQ,4,B0757Q45NK,True
...,...,...,...,...
85035,AHZZX3DQ6IZHVV7GI77GL4PAQKCA,5,B07635X9VJ,True
85036,AHZZXAWBVWME2D72MMSB6HU7TCAA,5,B09M7CRWH3,True
85036,AHZZXAWBVWME2D72MMSB6HU7TCAA,5,B0BFKQ9QXD,True
85036,AHZZXAWBVWME2D72MMSB6HU7TCAA,4,B09V91H5XM,True


In [None]:
# (rows, columns) of the product metadata table
metadf.shape

In [None]:
# Reviews per user: df_exploded has one row per review, so the group size is the review count
user_rating_counts = df_exploded.groupby('user_id').size()

# Boxplot of the counts on an explicit Axes, to make unusually active users visible
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=user_rating_counts, ax=ax)
ax.set_title('Boxplot of Number of Reviews per User', fontsize=16)
ax.set_xlabel('Number of Reviews', fontsize=12)

plt.show()

In [None]:
# Reviews per user (recomputed here so the cell does not rely on the plotting cell)
user_rating_counts = df_exploded.groupby('user_id').size()

# First and third quartile of the per-user counts, and their spread
Q1, Q3 = np.percentile(user_rating_counts, [25, 75])
IQR = Q3 - Q1

# Users whose count lies more than 1.5 * IQR outside the quartiles count as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Ids of the users inside the bounds (both ends inclusive)
within_bounds = user_rating_counts.between(lower_bound, upper_bound)
non_outlier_users = user_rating_counts[within_bounds].index

# Keep every review written by a non-outlier user
df_filtered_no_outliers = df_exploded[df_exploded['user_id'].isin(non_outlier_users)]

print("Filtered DataFrame (users without outliers):")
print(df_filtered_no_outliers.shape)

In [None]:
# Display the reviews that remain after removing outlier users
df_filtered_no_outliers

In [None]:
# Missing-value count for each column of the outlier-filtered reviews
null_values = df_filtered_no_outliers.isna().sum()

print(null_values)

# Most Popular Product Recommendation
Most popular product recommendation based on average_rating and rating_number.

In [None]:
# Histogram of how many ratings each product has received (30 bins, no KDE overlay)
fig, ax = plt.subplots()
sns.histplot(metadf['rating_number'], bins=30, kde=False, ax=ax)
ax.set_title("Distribution of the number of ratings per item")
ax.set_xlabel("Number of Ratings")
ax.set_ylabel("Count of Items")
plt.show()

In [None]:
# Snap each average rating to the nearest half point: double it, round, halve it.
# This adds a 'rounded_rating' column to metadf.
metadf['rounded_rating'] = (metadf['average_rating'] * 2).round() / 2

# Bar chart of how many products fall on each half-point rating
fig, ax = plt.subplots()
sns.countplot(data=metadf, x="rounded_rating", palette="viridis", ax=ax)
ax.set_title("Distribution of product ratings (rounded to nearest 0.5)", fontsize=14)
plt.show()

In [None]:
# Prior mean: the unweighted mean of all product average ratings
overall_average = metadf['average_rating'].mean()

# Prior weight: the 75th percentile of the per-product rating counts
C = metadf['rating_number'].quantile(0.75)
print(f"75th percentile of rating counts: {C}")

# Bayesian average = (n * product_mean + C * prior_mean) / (n + C); products with
# few ratings are pulled towards the global mean. Adds a column to metadf.
n_votes = metadf['rating_number']
weighted_sum = n_votes * metadf['average_rating'] + C * overall_average
metadf['bayesian_average'] = weighted_sum / (n_votes + C)

# Ten best products by Bayesian average
ranked_items = metadf.nlargest(10, 'bayesian_average')

# Show them, highest score first
ranked_items.sort_values('bayesian_average', ascending=False).head(10)

# Check Sparsity

In [None]:
from scipy.sparse import csr_matrix

def create_X_custom(df):
    """
    Generates a sparse user-item rating matrix from a long-format ratings DataFrame.

    The matrix is built from the ``df`` argument only (not from any notebook
    global), and ``df`` itself is not modified.

    Args:
        df: pandas DataFrame with one row per rating and the columns
            'user_id', 'parent_asin' and 'rating' (values convertible to float).

    Returns:
        X: scipy.sparse.csr_matrix of shape (n_users, n_items) holding the ratings
        user_mapper: dict that maps user id's to user indices (matrix rows)
        item_mapper: dict that maps item id's to item indices (matrix columns)
        user_inv_mapper: dict that maps user indices to user id's
        item_inv_mapper: dict that maps item indices to item id's
    """
    # Work on a new frame with a numeric rating column so the caller's data stays untouched
    df = df.assign(rating=df['rating'].astype(float))

    # Check data types
    print("Data types after explosion:")
    print(df.dtypes)

    # Sorted unique ids plus, for every row, the position of its id in that sorted array;
    # those positions are exactly the row / column indices of the matrix
    user_ids, user_index = np.unique(df['user_id'].to_numpy(), return_inverse=True)
    item_ids, item_index = np.unique(df['parent_asin'].to_numpy(), return_inverse=True)

    M = len(user_ids)  # Unique users
    N = len(item_ids)  # Unique items

    # Print M and N
    print("Number of unique users (M):", M)
    print("Number of unique items (N):", N)

    user_mapper = dict(zip(user_ids, range(M)))
    item_mapper = dict(zip(item_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    item_inv_mapper = dict(zip(range(N), item_ids))

    # Create the sparse matrix.
    # NOTE: csr_matrix sums the values of repeated (user, item) pairs, so a user who
    # rated the same product twice ends up with the sum of both ratings in that cell.
    X = csr_matrix((df['rating'].to_numpy(), (user_index, item_index)), shape=(M, N))

    return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

# Build the user-item matrix and the id <-> index lookup dicts, passing the
# outlier-filtered reviews as input
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_X_custom(df_filtered_no_outliers)

In [None]:
from scipy.io import mmwrite

# Save the sparse matrix X to a file in Matrix Market format
# (written to 'sparse_matrix.mtx' in the current working directory)
mmwrite('sparse_matrix.mtx', X)

In [None]:
# Total number of user-item cells and the number of cells that hold a stored rating
n_total = X.shape[0] * X.shape[1]
n_ratings = X.nnz

# NOTE: n_ratings / n_total is the share of FILLED cells, i.e. the matrix density.
# The variable keeps its historical name `sparsity` (and value) so later cells that
# read it keep working; the printed labels now say what each number actually is.
sparsity = n_ratings / n_total
print(f"Matrix density (share of user-item cells with a rating): {round(sparsity*100,2)}%")
print(f"Matrix sparsity (share of empty cells): {round((1 - sparsity)*100,2)}%")

In [None]:
# Number of stored entries in each row of X, i.e. rated items per user
n_ratings_per_user = X.getnnz(axis=1)
# One entry per row of X, so this equals the number of users in the matrix
len(n_ratings_per_user)

In [None]:
# Largest and smallest per-user rating counts in the matrix
print(f"Most active user rated {n_ratings_per_user.max()} items.")
print(f"Least active user rated {n_ratings_per_user.min()} items.")

In [None]:
# Number of stored entries in each column of X, i.e. ratings received per item.
# (A bare `len(n_ratings_per_item)` used to sit here; it was not the last expression
# of the cell, so its value was never displayed and it has been removed.)
n_ratings_per_item = X.getnnz(axis=0)

print(f"Most rated item has {n_ratings_per_item.max()} ratings.")
print(f"Least rated item has {n_ratings_per_item.min()} ratings.")

In [None]:
# Side-by-side density plots of ratings per user and ratings per item.
# `fill=True` replaces the deprecated `shade=True` argument of sns.kdeplot.
fig, (ax_user, ax_item) = plt.subplots(1, 2, figsize=(16, 4))

sns.kdeplot(n_ratings_per_user, fill=True, ax=ax_user)
ax_user.set_xlim(left=0)  # counts cannot be negative, so start the axis at 0
ax_user.set_title("Number of Ratings Per User", fontsize=14)
ax_user.set_xlabel("number of ratings per user")
ax_user.set_ylabel("density")

sns.kdeplot(n_ratings_per_item, fill=True, ax=ax_item)
ax_item.set_xlim(left=0)
ax_item.set_title("Number of Ratings Per Item", fontsize=14)
ax_item.set_xlabel("number of ratings per Item")
ax_item.set_ylabel("density")

plt.show()

# Collaborative Filtering

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_products_by_title(product_title, X, item_mapper, item_inv_mapper, product_titles, k, metric='cosine'):
    """
    Finds the k nearest neighbours of a product, looked up by its title.

    Args:
        product_title: Title of the product of interest (matched case-insensitively)
        X: user-item utility matrix (users as rows, items as columns)
        item_mapper: dict that maps item id's to item indices (columns of X)
        item_inv_mapper: dict that maps item indices to item id's
        product_titles: dict that maps item id's to titles
        k: number of similar products to retrieve
        metric: distance metric for kNN calculations

    Returns:
        List of at most k item id's, nearest first, never containing the query
        product itself. Fewer than k are returned when X has fewer than k + 1 items.

    Raises:
        ValueError: no product carries the given title.
        KeyError: the title exists, but none of the products carrying it has a
            column in X (e.g. it has no ratings).
    """
    # Collect every product id whose title matches; non-string titles (NaN) are skipped
    wanted = product_title.lower()
    matching_ids = [
        pid for pid, title in product_titles.items()
        if isinstance(title, str) and title.lower() == wanted
    ]
    if not matching_ids:
        raise ValueError(f"Product with title '{product_title}' not found in the dataset.")

    # Several products may share a title; use the first one that is present in the matrix
    product_id = next((pid for pid in matching_ids if pid in item_mapper), None)
    if product_id is None:
        raise KeyError(f"Product with title '{product_title}' has no ratings in the utility matrix.")

    # Transpose the user-item matrix so products are the rows
    item_matrix = X.T
    if hasattr(item_matrix, "tocsr"):
        # The transpose of a CSR matrix is CSC; convert back for row slicing
        item_matrix = item_matrix.tocsr()
    n_items = item_matrix.shape[0]

    # Get the index of the product using its ID
    product_ind = item_mapper[product_id]
    product_vec = item_matrix[product_ind]

    # A dense row comes back 1-D; kneighbors needs a 2-D (1, n_users) query
    if isinstance(product_vec, np.ndarray):
        product_vec = product_vec.reshape(1, -1)

    # Ask for k + 1 because the query product is normally part of the result,
    # but never for more neighbours than there are items
    kNN = NearestNeighbors(n_neighbors=min(k + 1, n_items), algorithm="brute", metric=metric)
    kNN.fit(item_matrix)

    # Find the nearest neighbours (single query -> take row 0 of the result)
    neighbour = kNN.kneighbors(product_vec, return_distance=False)[0]

    # Drop the query product by index rather than by position: with tied distances
    # it is not guaranteed to be the first hit
    neighbour_ids = [item_inv_mapper[int(n)] for n in neighbour if n != product_ind]

    return neighbour_ids[:k]


# Metadata lookup table keyed by 'parent_asin', restricted to the fields shown to the user
product_details = metadf.set_index('parent_asin')[['title', 'features', 'description', 'price', 'images', 'store', 'categories', 'details']]

# Map 'parent_asin' to 'title' for searching by product title
product_titles = dict(zip(metadf['parent_asin'], metadf['title']))

# Define the product_title you want to find similar items for
product_title = 'VocoPro (VHF4000'  # Replace with an actual product title from your dataset

# Find similar products based on your product title
# (raises if the title is unknown, so the lookup below always finds a match)
similar_products = find_similar_products_by_title(product_title, X, item_mapper, item_inv_mapper, product_titles, metric='cosine', k=10)

# Get the product details for the input product title (first case-insensitive match)
input_product_id = next(
    (pid for pid, title in product_titles.items() if title.lower() == product_title.lower()),
    None,
)
input_product_details = product_details.loc[input_product_id]

# Print the details of the input product
print(f"Because you viewed/bought '{input_product_details['title']}', here are similar products:")

# Display details of similar products
for similar_product_id in similar_products:
    # A neighbour comes from the review data and may have no metadata row;
    # report it instead of aborting the whole listing with a KeyError
    if similar_product_id not in product_details.index:
        print(f"\nSimilar Product: {similar_product_id} (no metadata available)")
        continue

    product_info = product_details.loc[similar_product_id]
    print("\nSimilar Product:")
    print(f"Title: {product_info['title']}")
    print(f"Features: {product_info['features']}")
    print(f"Description: {product_info['description']}")
    print(f"Price: {product_info['price']}")
    print(f"Images: {product_info['images']}")
    print(f"Store: {product_info['store']}")
    print(f"Categories: {product_info['categories']}")
    print(f"Details: {product_info['details']}")
