# AML Project: Skincare Product Recommendation System
Dataset: [Sephora Product Review](https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews)

In [1]:
! kaggle datasets download nadyinky/sephora-products-and-skincare-reviews

Dataset URL: https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading sephora-products-and-skincare-reviews.zip to /content
 94% 138M/147M [00:01<00:00, 95.9MB/s]
100% 147M/147M [00:01<00:00, 91.0MB/s]


In [2]:
! unzip sephora-products-and-skincare-reviews.zip
! rm -rf sephora-products-and-skincare-reviews.zip

Archive:  sephora-products-and-skincare-reviews.zip
  inflating: product_info.csv        
  inflating: reviews_0-250.csv       
  inflating: reviews_1250-end.csv    
  inflating: reviews_250-500.csv     
  inflating: reviews_500-750.csv     
  inflating: reviews_750-1250.csv    


In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

import gc
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Review Text Embedding

In [None]:
! set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-base-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-base-en-v1.5').to(device)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# load raw data
reviews = pd.DataFrame()
data_dir = Path("./")
files = []
for file in data_dir.iterdir():
    if str(file).startswith("reviews"):
        files.append(file)
files

[PosixPath('reviews_250-500.csv'),
 PosixPath('reviews_500-750.csv'),
 PosixPath('reviews_0-250.csv'),
 PosixPath('reviews_1250-end.csv'),
 PosixPath('reviews_750-1250.csv')]

In [None]:
idx = 2
reviews = pd.read_csv(files[idx], index_col=0)
reviews = reviews.reset_index(drop=True)
reviews.head()

Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,2079014373,5,1.0,,0,0,0,2023-03-14,These are the only pimple patches I’ve used th...,Best Pimple Patches,medium,blue,normal,,P442857,Focuspot Micro Tip Patches,Dr. Jart+,20.0
1,12631885517,4,1.0,,0,0,0,2023-02-08,One of my ingrown hair turned inflamed and sor...,It works!,mediumTan,brown,combination,black,P442857,Focuspot Micro Tip Patches,Dr. Jart+,20.0
2,2321761961,5,1.0,1.0,1,0,1,2023-02-05,I have tried 10 different acne/blemish patches...,Good for a large or painful breakout! Sleep in...,,hazel,combination,blonde,P442857,Focuspot Micro Tip Patches,Dr. Jart+,20.0
3,1380382883,4,1.0,,0,0,0,2023-01-24,"Love these for my mid-size breakouts, specifyi...",Micro tips are a plus!!,light,brown,combination,black,P442857,Focuspot Micro Tip Patches,Dr. Jart+,20.0
4,8871759068,4,1.0,1.0,1,0,1,2023-01-15,Best so far - though still not particularly ef...,,,,,,P442857,Focuspot Micro Tip Patches,Dr. Jart+,20.0


In [None]:
reviews_text = reviews["review_text"].fillna("").tolist()
len(reviews_text)

116262

In [None]:
def embed(sentences, norm = True):
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)
    # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
    # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, cls pooling.
        sentence_embeddings = model_output[0][:, 0]
    if norm:
        # normalize embeddings
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

In [None]:
tag = str(files[idx]).split(".")[0].split("_")[-1]
tag

'500-750'

In [None]:
import os
os.makedirs(f"{tag}", exist_ok=True)
os.environ["TARGET_DIR"] = f"{tag}"

In [None]:
batchsize = 500
for i, start_idx in tqdm(enumerate(range(0, len(reviews_text), batchsize))):
    gc.collect()
    torch.cuda.empty_cache()
    if os.path.exists(f"{tag}/embeddings_{tag}_{i}_block.pt"):
        continue
    end_idx = min(start_idx + batchsize, len(reviews_text))
    batch_reviews = reviews_text[start_idx:end_idx]
    batch_embeddings = embed(batch_reviews)
    embeddings = batch_embeddings
    torch.save(embeddings, f"{tag}/embeddings_{tag}_{i}_block.pt")

233it [49:36, 12.77s/it]


In [None]:
test = torch.load(f"{tag}/embeddings_{tag}_{i-1}_block.pt")

  test = torch.load(f"{tag}/embeddings_{tag}_{i-1}_block.pt")


In [None]:
test.shape

torch.Size([500, 768])

In [None]:
! mkdir /content/drive/My\ Drive/Colab\ Notebooks/ReviewEmbed/$TARGET_DIR/

mkdir: cannot create directory ‘/content/drive/My Drive/Colab Notebooks/ReviewEmbed/500-750/’: File exists


In [None]:
! cp $TARGET_DIR/* /content/drive/My\ Drive/Colab\ Notebooks/ReviewEmbed/$TARGET_DIR/