In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import os

# Get a list of all CSV files in the current directory
csv_files = [file for file in os.listdir('.') if file.endswith('.csv')]

# Read each CSV file and append them together
df_list = []
for file in csv_files:
    df_list.append(pd.read_csv(file))

df = pd.concat(df_list, ignore_index=True)

In [3]:
df.drop("Unnamed: 0", axis = 1, inplace = True)

In [4]:
df.columns

Index(['product', 'category', 'subcategory', 'link', 'image', 'asin', 'brand',
       'rating', 'qty_rating', 'description', 'extend_description'],
      dtype='object')

In [5]:
df.head(2)

Unnamed: 0,product,category,subcategory,link,image,asin,brand,rating,qty_rating,description,extend_description
0,"Keika Charcoal Black Soap Bar for Acne, Eczema...",Cleansers,Bars,https://www.amazon.com/gp/slredirect/picassoRe...,https://m.media-amazon.com/images/I/81PGL0+oLZ...,B07CK9L2J5,,,,,
1,Urban Skin Rx Clear Skin Cleansing Bar | 3-in-...,Cleansers,Bars,https://www.amazon.com/gp/slredirect/picassoRe...,https://m.media-amazon.com/images/I/71TIhVEebN...,B07N7N7VQY,,,,,


In [6]:
df['product'].to_list()[0]

'Keika Charcoal Black Soap Bar for Acne, Eczema, Psoriasis, Face, Body, Men Women Teens with Oily Skin, 5 oz.'

In [7]:
df.columns

Index(['product', 'category', 'subcategory', 'link', 'image', 'asin', 'brand',
       'rating', 'qty_rating', 'description', 'extend_description'],
      dtype='object')

In [15]:
data['category'].dropna().shape

(86152,)

In [23]:
# Load the data into a pandas DataFrame
data = df

# Define the feature columns to use for recommendations
feature_cols = ['category', 'subcategory', 'brand', 'description', 'extend_description']

# Create a TfidfVectorizer to transform the text data into numerical features
vectorizer = TfidfVectorizer(stop_words='english')
feature_matrix = vectorizer.fit_transform(data[feature_cols].fillna(''))
# feature_matrix = vectorizer.fit_transform(data['category'].dropna())

# Calculate the pairwise cosine similarity between all items
similarity_matrix = cosine_similarity(feature_matrix)

# Helper function to get top N recommended products for a given product
def get_recommendations(product_name, top_n=5):
    # Get the index of the given product in the DataFrame
    idx = data[data['product'] == product_name].index[0]
    
    # Get the pairwise cosine similarities between the given product and all other products
    similarities = similarity_matrix[idx]
    
    # Get the indices of the top N most similar products
    top_indices = similarities.argsort()[::-1][1:top_n+1]
    
    # Return the names of the top N most similar products
    for item in data.iloc[top_indices]['product'].tolist():
        print(item)
        print("\n")



prod_name = 'Cetaphil Bar Soap, Deep Cleansing Face and Body Bar, Pack of 3, For Dry to Normal, Sensitive Skin, Soap Free, Hypoallergenic, Paraben Free, Fragrance Free, Removes Makeup, Dirt and Oil'
get_recommendations(prod_name, top_n = 10)


CETAPHIL Gentle Cleansing Bar, 4.5 oz Bar (Pack of 6), Nourishing Cleansing Bar For Dry, Sensitive Skin, Non-Comedogenic, (Packaging May Vary)


Baxter of California Vitamin Cleansing Bar for Men | Citrus and Herbal Musk Essence | All Skin Types | 7 Oz | Holiday Gift Guide


Urban Skin Rx Clear Skin Cleansing Bar | 3-in-1 Daily Cleanser, Exfoliator and Mask Removes Excess Oil and Improves Blemishes, Formulated with Salicylic Acid, Eucalyptus and Sulfur | 3.7 Oz


Keika Charcoal Black Soap Bar for Acne, Eczema, Psoriasis, Face, Body, Men Women Teens with Oily Skin, 5 oz.




In [2]:
# import pandas as pd
# from surprise import Dataset, Reader, SVD
# from surprise.model_selection import train_test_split

# # Load the data into a pandas DataFrame
# data = df

# # Define the reader for Surprise library
# reader = Reader(rating_scale=(1, 5))

# # Load the data into a Surprise Dataset
# dataset = Dataset.load_from_df(data[['product', 'qty_rating', 'rating']], reader)

# # Split the data into train and test sets
# trainset, testset = train_test_split(dataset, test_size=0.2)

# # Define the SVD algorithm for collaborative filtering
# algo = SVD()

# # Train the algorithm on the trainset
# algo.fit(trainset)

# # Predict the ratings for the testset
# predictions = algo.test(testset)

# # Helper function to get top N recommended products for a given user
# def get_recommendations(user_id, top_n=5):
#     # Get the ratings for the given user from the DataFrame
#     user_ratings = data[data['asin'] == user_id][['product', 'rating']]
    
#     # Convert the DataFrame to a Surprise Dataset
#     user_dataset = Dataset.load_from_df(user_ratings, reader)
    
#     # Train the algorithm on the full dataset
#     full_trainset = dataset.build_full_trainset()
#     algo.fit(full_trainset)
    
#     # Predict the ratings for the user
#     user_predictions = algo.test(user_dataset.build_full_trainset().build_testset())
    
#     # Sort the predicted ratings in descending order and get the top N recommendations
#     top_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:top_n]
    
#     # Return the names of the top N recommended products
#     return [pred.iid for pred in top_predictions]

# # Example usage
# get_recommendations('UserID', top_n=10)
