In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# LOADING DATASETS
product_df = pd.read_csv('/content/product_info.csv')
review_df_01 = pd.read_csv('/content/reviews_0-250.csv', index_col = 0, dtype={'author_id':'str'})
review_df_02 = pd.read_csv('/content/reviews_250-500.csv', index_col = 0, dtype={'author_id':'str'})
review_df_03 = pd.read_csv('/content/reviews_500-750.csv', index_col = 0, dtype={'author_id':'str'})
review_df_04 = pd.read_csv('/content/reviews_750-1250.csv', index_col = 0, dtype={'author_id':'str'})
review_df_05 = pd.read_csv('/content/reviews_1250-end.csv', index_col = 0, dtype={'author_id':'str'})

In [3]:
# MERGIG ALL REVIEWS DATAFRAMES
review_df = pd.concat([review_df_01, review_df_02, review_df_03, review_df_04, review_df_05], axis=0)

In [4]:
# CHECKING COLUMNS THAT ARE COMMON IN BOTH DATAFRAMES
cols_to_use = product_df.columns.difference(review_df.columns)
cols_to_use = list(cols_to_use)
cols_to_use.append('product_id')
print(cols_to_use)

['brand_id', 'child_count', 'child_max_price', 'child_min_price', 'highlights', 'ingredients', 'limited_edition', 'loves_count', 'new', 'online_only', 'out_of_stock', 'primary_category', 'reviews', 'sale_price_usd', 'secondary_category', 'sephora_exclusive', 'size', 'tertiary_category', 'value_price_usd', 'variation_desc', 'variation_type', 'variation_value', 'product_id']


In [5]:
# AS DATAFRAMES HAVE COMMON COLUMN 'product_id', WE CAN MERGE THEM ON 'product_id'
df = pd.merge(review_df, product_df[cols_to_use], how='outer', on=['product_id', 'product_id'])
df = df.iloc[:500000]
cols = """variation_desc
sale_price_usd
value_price_usd
child_max_price
child_min_price
review_title"""
cols_list = cols.split("\n")
df.drop(columns=cols_list,axis=1,inplace=True)

In [6]:
# DROP ROWS WITH MISSING VALUES
df.dropna(axis=0,inplace=True)

# TRANSFORMING DATE COLUMNS
df['submission_time'] = pd.to_datetime(df['submission_time'])

df['year']= df['submission_time'].dt.year

df['month']= df['submission_time'].dt.month

df['day']= df['submission_time'].dt.day

df['weekday']= df['submission_time'].dt.weekday

dw_mapping={
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

In [7]:
df['dayofweek']= df['submission_time'].dt.weekday.map(dw_mapping)
# ONE-HOT ENCODING CATEGORICAL VARIABLES
categorical_columns = ['skin_tone','eye_color', 'hair_color', 'primary_category', 'secondary_category', 'size', 'tertiary_category', 'variation_type', 'variation_value', 'dayofweek','skin_type']
df = pd.get_dummies(df, columns=categorical_columns)

df_aggregated = df.groupby(['author_id', 'product_id']).agg({'rating': 'mean'}).reset_index()

user_item_matrix = df_aggregated.pivot(index='author_id', columns='product_id', values='rating').fillna(0)

interaction_matrix = csr_matrix(user_item_matrix.values)
svd = TruncatedSVD(n_components=50)
user_matrix = svd.fit_transform(interaction_matrix)
product_matrix = svd.components_.T

In [14]:
print("User IDs:")
print(user_item_matrix.index.tolist())

print("\nProduct IDs:")
print(user_item_matrix.columns.tolist())

User IDs:
['10000117144', '10000569316', '10000578851', '10000892274', '1000094971', '10001065571', '10001502449', '10001768414', '10001961830', '10002123851', '1000235057', '10003432083', '10004120452', '10004121688', '10004122080', '10004127873', '10004132823', '10004134520', '10004139630', '1000416744', '10004181168', '10004183507', '10004186478', '10004186829', '10004186986', '10004188531', '10004192165', '10004195528', '10004198466', '10004220476', '10005361161', '10005362987', '10005365525', '10005367623', '10005368394', '10005370290', '10005371490', '10005374994', '10005482830', '10005719148', '1000583638', '10005856729', '10005897913', '10005899356', '10006368415', '10006373629', '10006589497', '10006591319', '1000694573', '10008081323', '1000851435', '10009634461', '10009817763', '10009912631', '10009997993', '10010479819', '10010539300', '10010866348', '10010872433', '1001087549', '10012267831', '10013420466', '10013440363', '10013498157', '10013548412', '1001402604', '100143

In [8]:
def recommend_products(user_id, user_matrix, product_matrix, user_item_matrix, top_n=10):
    user_index = user_item_matrix.index.get_loc(user_id)
    user_ratings = user_matrix[user_index]
    scores = user_ratings.dot(product_matrix.T)
    product_indices = scores.argsort()[::-1][:top_n]
    recommended_product_ids = user_item_matrix.columns[product_indices]
    return df[df['product_id'].isin(recommended_product_ids)][['product_id', 'product_name']].drop_duplicates()

In [9]:
tfidf = TfidfVectorizer(stop_words='english')
product_descriptions = df.groupby('product_id')['review_text'].apply(lambda x: " ".join(x)).reset_index()
tfidf_matrix = tfidf.fit_transform(product_descriptions['review_text'])

In [10]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

product_indices = pd.Series(product_descriptions.index, index=product_descriptions['product_id'])

In [11]:
def recommend_similar_products(product_id, product_indices, cosine_sim=cosine_sim, top_n=10):
    idx = product_indices[product_id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    product_indices = [i[0] for i in sim_scores]
    recommended_products = product_descriptions.iloc[product_indices]['product_id']
    return df[df['product_id'].isin(recommended_products)][['product_id', 'product_name']].drop_duplicates()


In [15]:
user_id = '10000117144'
product_id = 'P114902'
recommendations = recommend_products(user_id, user_matrix, product_matrix, user_item_matrix)
print(f"Recommendations for User ID {user_id}:")
print(recommendations)
print(f"Recommendations for similar products to Product ID {product_id}:")
print(recommend_similar_products(product_id, product_indices))

# Calculate error for this instance
actual_rating = user_item_matrix.loc[user_id, product_id]
predicted_rating = user_matrix[user_item_matrix.index.get_loc(user_id)].dot(product_matrix.T)[product_indices.tolist().index(product_id)]
error = (actual_rating - predicted_rating) ** 2
print(f"Error for this instance: {error}")



Recommendations for User ID 10000117144:
       product_id                                       product_name
16140       P7880                 Soy Hydrating Gentle Face Cleanser
199022    P433520       Magic Cream Moisturizer with Hyaluronic Acid
271008     P94421  Vinoperfect Radiance Dark Spot Serum Vitamin C...
381327    P433887  Squalane + Omega Repair Deep Hydration Moistur...
383421    P432829  Adaptogen Deep Moisturizing Cream with Ashwaga...
403799    P443845                    Hyaluronic Acid Hydrating Serum
421214    P447791           Avocado Fine Line Eye Cream with Retinol
424975    P446930                     Cream Skin Toner & Moisturizer
461882    P453253                             Vegan Milk Moisturizer
495858    P443833                Salicylic Acid Acne + Pore Cleanser
Recommendations for similar products to Product ID P114902:
       product_id                                       product_name
40186     P269122          Alpha Beta Extra Strength Daily Peel Pads
85