## Content-Based Recommender
Implementing a content-based recommender system using a csv containing cosmetic products' reviews.


## Imports

In [31]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error 
from sklearn.metrics.pairwise import cosine_similarity

## Importing the dataset
The dataset contains 5626 reviews from multiple users.
Dataset url: https://www.kaggle.com/datasets/jithinanievarghese/cosmetics-and-beauty-products-reviews-top-brands

In [5]:
metadata = pd.read_csv('top_brands_cosmetics_product_reviews.csv', low_memory=False)

# keeping the products with more than 90% of the total rating count
quantile = 0.9
metadata = metadata[metadata.product_rating_count > metadata.product_rating_count.quantile(quantile)].reset_index()
pd.DataFrame(metadata.columns, columns=['columns']).T  # printing the columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
columns,index,product_id,brand_name,review_id,review_title,review_text,author,review_date,review_rating,is_a_buyer,pro_user,review_label,product_title,mrp,price,product_rating,product_rating_count,product_tags,product_url


In [6]:
metadata.product_rating_count.quantile(quantile)

98477.0

In [7]:
metadata.head()

Unnamed: 0,index,product_id,brand_name,review_id,review_title,review_text,author,review_date,review_rating,is_a_buyer,pro_user,review_label,product_title,mrp,price,product_rating,product_rating_count,product_tags,product_url
0,2630,787778,Nykaa Cosmetics,20267584,Madras kaapi 05,Easy to apply. Mini travel friendly for daily ...,Niggitha John,2021-08-31 12:27:02,5.0,True,False,Verified Buyer,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,239,203,3.9,80807,,https://www.nykaa.com/nykaa-matte-to-last-mini...
1,2631,787778,Nykaa Cosmetics,23180898,Must have,Quality is really good and since the size is s...,Tatung Yumme,2022-02-28 15:03:26,5.0,True,False,Verified Buyer,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,239,203,3.9,80807,,https://www.nykaa.com/nykaa-matte-to-last-mini...
2,2632,787778,Nykaa Cosmetics,18339680,The perfect red,"I finally end my search for the most regal,dee...",sushmita mukherji,2021-05-12 15:54:33,5.0,True,False,Verified Buyer,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,239,203,3.9,80807,,https://www.nykaa.com/nykaa-matte-to-last-mini...
3,2633,787778,Nykaa Cosmetics,19639377,Kufri 21,Just mad and in love with color and soo cute m...,Hetal Makvana,2021-07-26 14:44:37,5.0,True,False,Verified Buyer,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,239,203,3.9,80807,,https://www.nykaa.com/nykaa-matte-to-last-mini...
4,2634,787778,Nykaa Cosmetics,19458814,Recommending for normal skin tone.loved it,Super nude shade suitable for normal to dusky ...,Fathima Nisar,2021-07-17 14:46:07,5.0,True,False,Verified Buyer,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,239,203,3.9,80807,,https://www.nykaa.com/nykaa-matte-to-last-mini...


## Feature extraction
Transform the plot summaries into vector representations to be able to apply numeric machine learning algorithms 

In [13]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)
metadata.review_text = metadata.review_text.fillna('')
tfidf_model = vectorizer.fit_transform(metadata.review_text)

Each plot summary will be transformed into a sequence of words to point a high dimensional semantic space (the tf-idf model is used here). This counts the number of times a word appears in the document to decide the importance of the word in the document

In [14]:
print(f'Matrix contains {tfidf_model.shape[0]} rows and {tfidf_model.shape[1]} columns')

Matrix contains 5622 rows and 1982 columns


BOW (Bag of Words) model counts the amount of times a word appears in a document (sparse, most of the entries in the vector are 0)

TF-IDF (Term Frequency - Inverse Document Frequency) model counts the amount of times a word appears in a document, but also takes into account how often the word appears in all documents. it down-weights words that appear frequently across documents, making them less informative than those that appear rarely.

Every plot summary is encoded as a single vector whose length is equal to the size of the vocabulary of all the plot summary.

TD-IDF transforms the plot summaries into a matrix. It ignores words that appear in more than 80% of the reviews and the ones that occur in less than 2 -> the noise is reduced.

## Ispect the tf-idf model using popular makeup terms

In [15]:
popular_terms = ['cream', 'skin', 'awesome', 'shade', 'mascara', 'red', 'powder', 'blush']

columns = vectorizer.get_feature_names_out()
tdidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_model, columns=columns)
tdidf_df[popular_terms].head()

Unnamed: 0,cream,skin,awesome,shade,mascara,red,powder,blush
0,0.0,0.205144,0.0,0.149753,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.114084,0.0,0.221099,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.251692,0.0,0.183732,0.0,0.0,0.0,0.0


Apply the cosine similarity between different products based on their plot summary term frequency occurrence-signature

In [16]:
def get_content_based_recommendation(product_title, top_n=10, metric='cosine'):
    # get the index of the product that matches the title
    # the index is used to find the row in the tf-idf matrix that corresponds to the product
    idx = metadata[metadata.product_title.str.lower() == product_title.lower()].empty

    model = NearestNeighbors(n_neighbors=top_n, metric=metric)
    model.fit(tfidf_model)
    # use a k-nearest neighbors model to find the most similar products
    similar_products = model.kneighbors(tfidf_model[idx], return_distance=False)[0]

    # top 10 most similar products
    return metadata.iloc[similar_products]

In [17]:
get_content_based_recommendation('Olay Regenerist Whip Mini and Ultimate Eye Cream Combo')[
    ['product_title', 'review_rating', 'product_rating', 'product_rating_count', 'review_text']]

Unnamed: 0,product_title,review_rating,product_rating,product_rating_count,review_text
1,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,Quality is really good and since the size is s...
688,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,It's a beautiful shade and has a really pleasa...
415,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,Over all nice product but size is too small.. ...
1722,Nykaa Serial Kisser Lip Balm - Raspberry,5.0,4.0,16264,I really liked the product..!
692,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,It's very small only 1.3ml but it looks very c...
1252,Nykaa Serial Kisser Lip Balm - Grape,5.0,4.0,16264,This is a really good lip balm with a really g...
196,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,Very cute easy to carry lipstick love it
1150,Nykaa So Matte! Mini Lipstick - 26 M Bon Bon,5.0,4.0,98477,Easy to carry around as it's small. Loved the ...
75,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,It is very very good .... Very small and easy ...
2126,Nykaa So Matte! Mini Lipstick - 17 M Cranberry...,5.0,4.0,98477,Really good


In [18]:
get_content_based_recommendation('Olay Total Effects 7 In One Anti-Ageing Day Cream Normal SPF 15')[
    ['product_title', 'review_rating', 'product_rating', 'product_rating_count', 'review_text']]

Unnamed: 0,product_title,review_rating,product_rating,product_rating_count,review_text
1,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,Quality is really good and since the size is s...
688,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,It's a beautiful shade and has a really pleasa...
415,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,Over all nice product but size is too small.. ...
1722,Nykaa Serial Kisser Lip Balm - Raspberry,5.0,4.0,16264,I really liked the product..!
692,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,It's very small only 1.3ml but it looks very c...
1252,Nykaa Serial Kisser Lip Balm - Grape,5.0,4.0,16264,This is a really good lip balm with a really g...
196,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,Very cute easy to carry lipstick love it
1150,Nykaa So Matte! Mini Lipstick - 26 M Bon Bon,5.0,4.0,98477,Easy to carry around as it's small. Loved the ...
75,Nykaa Matte to Last! Mini Liquid Lipstick - Jh...,5.0,3.9,80807,It is very very good .... Very small and easy ...
2126,Nykaa So Matte! Mini Lipstick - 17 M Cranberry...,5.0,4.0,98477,Really good


## Cosine Similarity = measure of similarity between two vectors
- normalized dot product
- independent of the magnitude of the vectors
- is 0 if the vectors are orthogonal
- is 1 if the vectors are equal
- symmetric
- non negative
- similarity between 2 vectors is always between 0 and 1 

## Collaborative Filtering

In [19]:
rating_list = pd.read_csv('top_brands_cosmetics_product_reviews.csv', sep=',', usecols=['author', 'product_title', 'product_rating', 'review_date'])

items = pd.read_csv('top_brands_cosmetics_product_reviews.csv', 
                    usecols=['product_title', 'product_url', 'brand_name'], encoding='latin-1')

print(f'Number of ratings: {rating_list.author.nunique()} | Number of items: {items.product_title.nunique()}')

Number of ratings: 41669 | Number of items: 292


In [20]:
rating_list.head()

Unnamed: 0,author,review_date,product_title,product_rating
0,Ashton Dsouza,2021-01-23 15:17:18,Olay Ultra Lightweight Moisturiser: Luminous W...,4.1
1,Amrit Neelam,2020-09-07 15:30:42,Olay Ultra Lightweight Moisturiser: Luminous W...,4.1
2,Sanchi Gupta,2020-11-13 12:24:14,Olay Ultra Lightweight Moisturiser: Luminous W...,4.1
3,Ruchi Shah,2020-06-14 11:56:50,Olay Ultra Lightweight Moisturiser: Luminous W...,4.1
4,Sukanya Sarkar,2020-12-22 15:24:35,Olay Ultra Lightweight Moisturiser: Luminous W...,4.1


In [22]:
X_train, X_test, y_train, y_test = train_test_split(rating_list, rating_list.product_rating, test_size=0.1, random_state=42)

## Pivot the ratings to user-item matrix

In [24]:
ratings = X_train.pivot_table(index=['author'], columns=['product_title'], values='product_rating').fillna(0)
mean_ratings = ratings.mean(axis=1)
print(f'Number of users: {ratings.shape[0]} | Number of items: {ratings.shape[1]}')

Number of users: 38325 | Number of items: 288


In [25]:
ratings.head()

product_title,Colorbar Mogra Mist - Scent Of An India Temple - 001,Get Party Ready With Kay Beauty Lip Liner - Dance Off & Matte Lipstick - Premier,Herbal Essences Aloe & Bamboo Conditioner Soft Smooth Hair- No- Sulphates and Paraben(400gm),"Herbal Essences Aloe & Bamboo Shampoo + Conditioner For Soft Smooth Hair, No Sulphates & Paraben","Herbal Essences Aloe & Bamboo Shampoo + Conditioner Kit For Soft Hair, Sulphate & Paraben Free","Herbal Essences Aloe & Bamboo Shampoo For Soft Smooth Hair, No-Sulphates, Paraben and Silicones","Herbal Essences Aloe & Eucalyptus Shampoo + Aloe & Bamboo Conditioner, No- Sulphates & Paraben","Herbal Essences Aloe & Eucalyptus Shampoo For Soft Smooth Hair, No- Sulphates, Paraben and Silicones",Herbal Essences Argan 2 Shampoo + Conditioner,Herbal Essences Argan Oil Of Moroccan Shampoo & Conditioner With Free Neem Comb,...,Olay Eye Cream - With Niacinamide & Pentapeptides,Olay Regenerist Whip Mini and Ultimate Eye Cream Combo,Olay Total Effects 7 In One Anti-Ageing Day Cream Normal SPF 15,Olay Total Effects 7 In One Day Cream Normal SPF 15 (Trial Size) 8gm,Olay Total Effects Day Cream For Sensitive Skin - Niacinamide,Olay Ultimate Skin Regimen Kit,Olay Ultra Lightweight Moisturiser: Luminous Whip Day Cream (non SPF),Olay White Radiance Day & Night Cream for Brightening and Glow,Retinol & Hyaluronic Acid Sleeping Mask for Intense Hydration - Pack of 2,The Ultimate Nude Duo - Kay Beauty Long Stay Matte Lipstick - Debut & Nail Enamel - Nutty 21
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Monika,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
# Ho,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Udhyama# Bunga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$weety Soni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate similarity

In [27]:
def get_all_recommendations(user_id, model, use_means=True):
    distance, knn = model.kneighbors(ratings.fillna(0)) # nearest neighbors
    knn = pd.DataFrame(knn + 1, index=ratings.index)
    sim = pd.DataFrame(1 - distance, index=ratings.index) # invert the distance
    neighbors = knn.loc[user_id, 1:]
    similarities = sim.loc[user_id, 1:]
    similarities.index = ratings.loc[neighbors].index
    
    if use_means:
        return pd.Series(mean_ratings.loc[user_id] + ratings.loc[neighbors].subtract(mean_ratings.loc[neighbors], axis='index').mul(similarities, axis='index').sum(axis='index') / similarities.sum(), name='recommendation')
    else:
        return pd.Series(ratings.loc[neighbors].mul(similarities, axis='index').sum(axis='index') / similarities.sum(), name='recommendation')

## Compute a single recommendation for a given user, product and model

In [28]:
def get_recommendations (user_id, product_id, model, use_means=True):
    if product_id not in ratings.columns:
        return 2.5
    recommendations = get_all_recommendations(user_id, model, use_means=use_means)
    return recommendations.loc[product_id]

Compute Root Mean Squared Error (RMSE) to evaluate to predict ratings for all products for every user in the dataset

Then, line the predicted ratings with the actual ratings in the test set and calculate the RMSE.7

In [30]:
model = NearestNeighbors(n_neighbors=40, metric='cosine')
model.fit(ratings.fillna(0))

def get_RMSE(X_test, model, use_means=True):
    group = X_test[['product_title', 'product_rating']].groupby(X_test.author)
    mse = []
    i = 0
    for key in group.groups:
        if key not in rating_list['author']:
            continue  # Skip users not in the dataset
        predictions = get_all_recommendations(key, model=model, use_means=use_means)
        rated_products = group.get_group(key).set_index('product_title')
        df = rated_products.join(predictions).dropna().reset_index()
        mse.append(df)
        if i % 100 == 0:
            score = np.sqrt(mean_squared_error(df.product_rating, df.recommendation))
            print(f'{i}/{X_test.author.nunique()} - RMSE: {score:.4f}')
        i += 1
    mse = pd.concat(mse).reset_index(drop=True)
    score = np.sqrt(mean_squared_error(mse.product_rating, mse.recommendation))
    print(f'{X_test.author.nunique()}/{X_test.author.nunique()} - RMSE: {score:.4f}')
    
get_RMSE(X_test, model)

ValueError: No objects to concatenate

In [None]:
ratings_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3],
}

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

movielens = Dataset.load_builtin('ml-100k') 

In [None]:
trainingSet = movielens.build_full_trainset()
algo.fit(trainingSet)

In [None]:
def get_recommendation(id_user, id_movie, ratings):
    #cosine similarity of the ratings
    similarity_matrix = cosine_similarity(ratings.fillna(0), ratings.fillna(0))
    similarity_matrix_df = pd.DataFrame(similarity_matrix, index=ratings.index, columns=ratings.index)

    cosine_scores = similarity_matrix_df[id_user]
    ratings_scores = ratings[id_movie]
    ratings_scores.dropna().dot(cosine_scores[~ratings_scores.isna()]) / cosine_scores[~ratings_scores.isna()].sum()
    return np.dot(ratings_scores.dropna(), cosine_scores[~ratings_scores.isna()]) / cosine_scores[
        ~ratings_scores.isna()].sum()

Get recommendation for user 196 for movie 8

In [None]:
get_recommendation(196, 8, ratings)