
# 🪧 About the Dataset

This dataset was collected via a Python scraper in March 2023 and contains:

- Information about all beauty products (over 8,000) from the Sephora online store, including product and brand names, prices, ingredients, ratings, and all features.
- User reviews (over 1 million, covering over 2,000 products) of all products from the Skincare category, including user appearances and review ratings by other users.


***Sentiment Analysis***: Is the emotional tone of the review positive, negative, or neutral? Which brands or products have the most positive or negative reviews?

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Imports

import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import nltk

In [None]:
nltk.download('stopwords')

# 📦Load Data


In [None]:
df = pd.read_csv('/kaggle/input/sephora-products-and-skincare-reviews/reviews_0_250.csv')

In [None]:
df.shape

In [None]:
df.head()

**From the above output:** *Only a few columns look useful: `is_recommended`, `rating`, and `review_text`.*

Let's keep these three and drop the rest.

In [None]:
# Keep only the review text, the recommendation flag and the rating, and
# rename the first two to `text` / `label`.
column_renames = {'is_recommended': 'label', 'review_text': 'text'}
df = df[['review_text', 'is_recommended', 'rating']].rename(columns=column_renames)
df.head()

In [None]:
df.info()

In [None]:
df.label.value_counts()

In [None]:
# Print the share of each label, as a percentage of all rows in df.

label_counts = df.label.value_counts()
n_rows = len(df)

positive_pct = round(label_counts[1] / n_rows * 100, 2)
print("Positive labels percentage", positive_pct, "%")
negative_pct = round(label_counts[0] / n_rows * 100, 2)
print("Negative labels percentage", negative_pct, "%")

# 📐Dataset is imbalanced

In [None]:
# Load the second reviews file, keep the same three columns as df and apply
# the same `text` / `label` renaming.

df2 = (
    pd.read_csv('/kaggle/input/sephora-products-and-skincare-reviews/reviews_250_500.csv')
    [['review_text', 'is_recommended', 'rating']]
    .rename(columns={'is_recommended': 'label', 'review_text': 'text'})
)
df2.label.value_counts()

In [None]:
# Load a further reviews file, keep the same three columns as df and apply
# the same `text` / `label` renaming.
df3 = (
    pd.read_csv('/kaggle/input/sephora-products-and-skincare-reviews/reviews_750_1000.csv')
    [['review_text', 'is_recommended', 'rating']]
    .rename(columns={'is_recommended': 'label', 'review_text': 'text'})
)
df3.label.value_counts()

In [None]:
# Stack df2 and df3 into a single frame.
df_concat = pd.concat([df2, df3], axis=0)

# Keep only the negative (label == 0) rows, the class we are short of.
is_negative = df_concat['label'] == 0
df_neg = df_concat[is_negative]
df_neg.label.value_counts()

In [None]:
# concatenate df_neg with the original df

df = pd.concat([df, df_neg])
df['label'].value_counts()

In [None]:
# Print the share of each label, as a percentage of all rows in df.

label_counts = df.label.value_counts()
n_rows = len(df)

positive_pct = round(label_counts[1] / n_rows * 100, 2)
print("Positive labels percentage", positive_pct, "%")
negative_pct = round(label_counts[0] / n_rows * 100, 2)
print("Negative labels percentage", negative_pct, "%")

# ✂️ Downsampling the majority class

In [None]:
# Balance the classes using only rows without null values. A later cell calls
# `df.dropna()`; if incomplete rows were still present when sampling, dropping
# them afterwards would leave the two classes with different sizes again.
complete_rows = df.dropna()

df_neg = complete_rows[complete_rows['label'] == 0]

# Downsample the positive class to the number of negative rows. The fixed seed
# (the same 42 passed to train_test_split further down) makes the selected
# rows reproducible from run to run.
df_pos = complete_rows[complete_rows['label'] == 1].sample(len(df_neg), random_state=42)

In [None]:
df_neg.label.value_counts()

In [None]:
df_pos.label.value_counts()

In [None]:
# Concatenate both classes and shuffle to get the final usable dataset.
# The shuffle is seeded so the row order (and everything derived from it)
# is reproducible from run to run.

df = pd.concat([df_pos, df_neg], axis = 0)
df = shuffle(df, random_state = 42)
df.head()

In [None]:
# Print the share of each label, as a percentage of all rows in df.

label_counts = df.label.value_counts()
n_rows = len(df)

positive_pct = round(label_counts[1] / n_rows * 100, 2)
print("Positive labels percentage", positive_pct, "%")
negative_pct = round(label_counts[0] / n_rows * 100, 2)
print("Negative labels percentage", negative_pct, "%")

In [None]:
# checking null values
df.isnull().sum()

In [None]:
# Drop every row that contains a null value, then renumber the index from 0
# (the old index is discarded rather than kept as a column).

df = df.dropna().reset_index(drop=True)


In [None]:
df.isnull().sum()

In [None]:
df.info()

# 🔡 Text Preprocessing

In [None]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem import PorterStemmer

# Lazily-built (stemmer, stopword set, tokenizer) tuple shared by all calls.
_PREPROCESS_RESOURCES = None


def _get_preprocess_resources():
    """Build the stemmer, stopword set and tokenizer once and reuse them."""
    global _PREPROCESS_RESOURCES
    if _PREPROCESS_RESOURCES is None:
        _PREPROCESS_RESOURCES = (
            PorterStemmer(),
            frozenset(stopwords.words('english')),
            ToktokTokenizer(),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text: str, remove_digits: bool = True) -> str:
    """Clean one review and return it as a space-separated string of stems.

    Steps, in order: strip HTML markup, remove square-bracketed spans, delete
    every character that is not a letter or whitespace (digits are kept when
    ``remove_digits`` is False), lowercase, tokenize, drop English stopwords,
    and finally stem the remaining tokens.

    Stopwords are removed *before* stemming. The stopword list contains
    unstemmed words, so stemming first would turn e.g. "this" into "thi" and
    "was" into "wa", which would then no longer match the list and would
    leak into the output.

    Args:
        text: Raw review text.
        remove_digits: When True (default) digits are stripped together with
            punctuation; when False digits are preserved.

    Returns:
        The cleaned text; an empty string if nothing survives the filtering.
    """
    stemmer, stopword_set, tokenizer = _get_preprocess_resources()

    # Removing HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Removing square brackets together with the text they enclose
    text = re.sub(r'\[[^]]*\]', '', text)

    # Removing special characters (raw strings keep the regex escapes valid)
    disallowed = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    text = re.sub(disallowed, '', text)

    # Lowercasing
    text = text.lower()

    # Removing stopwords while the tokens are still unstemmed
    tokens = [
        token for token in tokenizer.tokenize(text)
        if token not in stopword_set
    ]

    # Stemming what is left
    return ' '.join(stemmer.stem(token) for token in tokens)


In [None]:
# Show one sample review before and after cleaning; the whole `text` column
# is run through preprocess_text in between.
sample_idx = 2

print('Before preprocessing \n', df.loc[sample_idx, 'text'])

df['text'] = df['text'].map(preprocess_text)

print('After preprocessing \n', df.loc[sample_idx, 'text'])

In [None]:
# Hold out 10% of the rows as the test set (train_test_split shuffles by default).
train_val_df, test_df = train_test_split(df, test_size=0.10, random_state=42)

# Split the remaining 90% again: a quarter of it (22.5% of all rows) becomes
# the validation set, the rest is the training set.
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.25)


In [None]:
module_url = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1" #@param ["https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1", "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1", "https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"] {allow-input: true}

In [None]:
!pip install git+https://github.com/tensorflow/docs

In [None]:
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
def train_and_evaluate_model(module_url, embed_size, name, trainable=False):
    """Train a small dense classifier on top of a TF-Hub text embedding.

    The model is a hub embedding layer followed by Dense(256, relu),
    Dense(64, relu) and a single sigmoid output unit, trained with Adam
    (learning rate 1e-4) and binary cross-entropy.

    Training data comes from the module-level ``train_df`` and ``val_df``
    frames (their ``text`` and ``label`` columns), which must exist before
    this function is called. Training runs for at most 100 epochs and stops
    early once ``val_loss`` has not improved for 2 consecutive epochs.

    Note: despite the name, no held-out test-set evaluation is performed here;
    only the training/validation metrics recorded by ``fit`` are returned.

    Args:
        module_url: TF-Hub handle passed to ``hub.KerasLayer``.
        embed_size: Embedding dimension, passed as the hub layer's output shape.
        name: Name given to the Keras model (shown by ``model.summary()``).
        trainable: Whether the hub layer's weights are updated during training.

    Returns:
        The history object returned by ``model.fit``.
    """
    hub_layer = hub.KerasLayer(
        module_url,
        input_shape=[],
        output_shape=[embed_size],
        dtype=tf.string,
        trainable=trainable,
    )
    model = tf.keras.models.Sequential(
        [
            hub_layer,
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid'),
        ],
        # Previously `name` was accepted but ignored; use it to label the model.
        name=name,
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        loss=tf.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    model.summary()

    history = model.fit(
        train_df['text'],
        train_df['label'],
        epochs=100,
        batch_size=32,
        validation_data=(val_df['text'], val_df['label']),
        callbacks=[
            tfdocs.modeling.EpochDots(),
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, mode='min'),
        ],
        # Keras' own progress output is silenced; EpochDots is the progress callback.
        verbose=0,
    )
    return history

In [None]:
histories = {}

In [None]:
histories['gnews-swivel-20dim'] = train_and_evaluate_model(module_url, embed_size = 20, name = 'gnews-swivel-20dim')

In [None]:
# Draw the accuracy history of every run stored in `histories`.
plt.rcParams['figure.figsize'] = (12, 8)
plotter = tfdocs.plots.HistoryPlotter(metric='accuracy')
plotter.plot(histories)

# Label the current axes and place the legend outside the plot area.
axes = plt.gca()
axes.set_xlabel("Epochs")
axes.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')
axes.set_title("Accuracy Curves for Models")
plt.show()

# Finetuning model

In [None]:
histories['gnews-swivel-20dim_finetuned'] = train_and_evaluate_model(module_url, embed_size = 20, name = 'gnews-swivel-20dimfinetuned', trainable = True)

In [None]:
# Redraw the accuracy history of every run now stored in `histories`.
plt.rcParams['figure.figsize'] = (12, 8)
plotter = tfdocs.plots.HistoryPlotter(metric='accuracy')
plotter.plot(histories)

# Label the current axes and place the legend outside the plot area.
axes = plt.gca()
axes.set_xlabel("Epochs")
axes.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')
axes.set_title("Accuracy Curves for Models")
plt.show()

# The fine-tuned Swivel model worked better than the non-fine-tuned Swivel model. You can also try other pretrained models with larger embedding dimensions.