# Neural Network for Sentiment Analysis

# Data exploration

Data: Sephora Products and Skincare review, found at: https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews?resource=download

Inspiration taken from the sentiment analysis task on Kaggle: \
https://www.kaggle.com/code/aashidutt3/sentiment-analysis-sephora-reviews \
last checked on Jan 23, 2024

In [1]:
import pandas as pd
import glob
from sklearn.utils import shuffle

In [2]:
def get_total_reviews_for_all_dataset(file_paths):
    """Iterate over a list of file paths and print the file path together with the total rows in it.
    Print the total reviews across all files.

    Parameter:
    -file_paths: a Python list containing file paths in string format.
    """
    total_reviews = 0
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        rows_in_file = df.shape[0]
        total_reviews += rows_in_file
        print(f"File: {file_path}, Reviews: {rows_in_file}")

    print("\nTotal reviews across all files:", total_reviews)

In [3]:
def process_csv_file(file_path):
    """Take the path to a csv file as an argument as a string, 
    read it with pandas and keep the following columns that are valuable for the task:
    -'review_text', rename to 'text'
    -'is_recommended', rename to 'label'
    -'rating', name stays unchanged.
    Return the DataFrame containing these three columns.
    
    Parameter:
    -file_path: path to a csv file."""
    
    #reading the csv file into a DataFrame with pandas
    df = pd.read_csv(file_path)

    #selecting the columns that are valuable for the task
    df = df[['review_text', 'is_recommended', 'rating']]

    #renaming the columns
    df.rename(columns={'is_recommended': 'label', 'review_text': 'text'}, inplace=True)

    return df

In [4]:
def print_label_percentages(df):
    """Take a DataFrame (df) as an argument and
    prints the percentage of positive and negative labels in it."""
    
    positive_percentage = round(df['label'].value_counts()[1] / len(df) * 100, 2)
    negative_percentage = round(df['label'].value_counts()[0] / len(df) * 100, 2)

    print("Positive labels percentage:", positive_percentage, "%")
    print("Negative labels percentage:", negative_percentage, "%")

In [5]:
def calculate_percentage_and_count_for_values(df, target_column, condition_column, values):
    """Calculate and print the percentages and counts for a list of specified values in a DataFrame.

    Parameters:
    - df: a pandas DataFrame
    - target_column: the name of the column for which to calculate the percentage in the df, provided as Python string
    - condition_column: the name of the column containing the condition for filtering, provided as Python string
    - values: a list of values to filter and calculate the percentages and counts in the df

    Returns None.
    Print the percentage and counts of the specified values."""
    
    for value in values:
        # filtering the df based on the condition
        filtered_df = df[df[condition_column] == value]

        # calculate positive and negative percentages
        positive_percentage = round(filtered_df[target_column].value_counts(normalize=True)[1] * 100, 2)
        negative_percentage = round(filtered_df[target_column].value_counts(normalize=True)[0] * 100, 2)

        # getting the counts
        positive_count = filtered_df[target_column].value_counts()[1]
        negative_count = filtered_df[target_column].value_counts()[0]

        print(f"""For {condition_column} value {value}: \n Positive {target_column}: {positive_percentage}% - count: {positive_count} \n Negative {target_column}: {negative_percentage}% - count: {negative_count} \n""")

In [6]:
def write_df_to_file(df, file_path, index=False):
    """Write a pandas DataFrame to a csv file.

    Parameters:
    - dataframe: pandas DataFrame
    - file_path: path to the output file in string format
    - index: whether to include the index in the output file as Boolean (default is False)
    """
    df.to_csv(file_path, index=index)
    print(f"df successfully written to {file_path}")

In [7]:
#checking the amount of files in the dataset
file_paths = []
for filename in glob.glob('./sephora-data/reviews*'):
    print(filename)
    file_paths.append(filename)

./sephora-data/reviews_0-250.csv
./sephora-data/reviews_1250-end.csv
./sephora-data/reviews_750-1250.csv
./sephora-data/reviews_250-500.csv
./sephora-data/reviews_500-750.csv


In [8]:
get_total_reviews_for_all_dataset(file_paths)

  df = pd.read_csv(file_path)


File: ./sephora-data/reviews_0-250.csv, Reviews: 602130


  df = pd.read_csv(file_path)


File: ./sephora-data/reviews_1250-end.csv, Reviews: 49977


  df = pd.read_csv(file_path)


File: ./sephora-data/reviews_750-1250.csv, Reviews: 119317
File: ./sephora-data/reviews_250-500.csv, Reviews: 206725
File: ./sephora-data/reviews_500-750.csv, Reviews: 116262

Total reviews across all files: 1094411


In [9]:
#reading the first file with pandas
df = pd.read_csv(file_paths[0])

  df = pd.read_csv(file_paths[0])


In [10]:
df.shape

(602130, 19)

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


In [12]:
#keeping the data that is valueable for the task
df = process_csv_file(file_paths[0])
df.head()

  df = pd.read_csv(file_path)


Unnamed: 0,text,label,rating
0,I use this with the Nudestix “Citrus Clean Bal...,1.0,5
1,I bought this lip mask after reading the revie...,0.0,1
2,My review title says it all! I get so excited ...,1.0,5
3,I’ve always loved this formula for a long time...,1.0,5
4,"If you have dry cracked lips, this is a must h...",1.0,5


In [13]:
#checking the count of reviews, labels and rating in the first file
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602130 entries, 0 to 602129
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   text    601131 non-null  object 
 1   label   484644 non-null  float64
 2   rating  602130 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 13.8+ MB


In [14]:
#checking the amount of labels per class in the first file
df.label.value_counts()

label
1.0    406094
0.0     78550
Name: count, dtype: int64

In [15]:
#printing percentage of both labels in first file. 1 stands for positive and 0 for negative reviews
print_label_percentages(df)

Positive labels percentage: 67.44 %
Negative labels percentage: 13.05 %


In [16]:
#checking the distribution per rating in the unbalanced dataset
values = [1,2,3,4,5]
calculate_percentage_and_count_for_values(df, 'label', 'rating', values)

For rating value 1: 
 Positive label: 0.91% - count: 252 
 Negative label: 99.09% - count: 27538 

For rating value 2: 
 Positive label: 3.67% - count: 896 
 Negative label: 96.33% - count: 23508 

For rating value 3: 
 Positive label: 35.35% - count: 13166 
 Negative label: 64.65% - count: 24081 

For rating value 4: 
 Positive label: 96.52% - count: 84007 
 Negative label: 3.48% - count: 3026 

For rating value 5: 
 Positive label: 99.87% - count: 307773 
 Negative label: 0.13% - count: 397 



In [17]:
# Filter to count only rows where 'label' column is not null
filtered_df = df[df['label'].notna()]

# Count the occurrences of each value in 'label' in the filtered DataFrame
value_counts = filtered_df['label'].value_counts()
print(value_counts)

# print updated percentage of both labels present
print_label_percentages(filtered_df)

label
1.0    406094
0.0     78550
Name: count, dtype: int64
Positive labels percentage: 83.79 %
Negative labels percentage: 16.21 %


## Preparing the data

In [18]:
#downsizing the majority class but also reducing the length of the corpus for experimental purposes
df_neg = filtered_df[filtered_df['label'] == 0].sample(25000)
df_pos = filtered_df[filtered_df['label'] == 1].sample(len(df_neg)) #sampling a number of rows equal to the length of negative labels (df_neg)

In [19]:
df_neg.label.value_counts()

label
0.0    25000
Name: count, dtype: int64

In [20]:
df_pos.label.value_counts()

label
1.0    25000
Name: count, dtype: int64

In [21]:
#concatenating and shuffling to get final usable dataset
final_df = pd.concat([df_pos, df_neg], axis = 0)
final_df = shuffle(final_df)
final_df.head()

Unnamed: 0,text,label,rating
388390,has helped so much reduce blackheads and pores...,1.0,5
55194,"It’s okay, not the best. I prefer the Clinque ...",0.0,2
206131,Best new addition to my skin care routine this...,1.0,5
391555,This product goes on beautifully! It IS import...,1.0,5
196929,After hearing rave reviews from a couple of fr...,0.0,1


In [22]:
# print percentage of both labels present
print_label_percentages(final_df)

Positive labels percentage: 50.0 %
Negative labels percentage: 50.0 %


In [23]:
#checking if the data contains null values
final_df.isnull().sum()

text      95
label      0
rating     0
dtype: int64

In [24]:
#dropping null values
final_df = final_df.dropna()
final_df = final_df.reset_index(drop = True)

In [25]:
final_df.isnull().sum()

text      0
label     0
rating    0
dtype: int64

In [26]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49905 entries, 0 to 49904
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    49905 non-null  object 
 1   label   49905 non-null  float64
 2   rating  49905 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ MB


In [27]:
final_df.label.value_counts()

label
0.0    24954
1.0    24951
Name: count, dtype: int64

In [28]:
#checking the final distribution per rating in the balanced dataset
calculate_percentage_and_count_for_values(final_df, 'label', 'rating', values)

For rating value 1: 
 Positive label: 0.15% - count: 13 
 Negative label: 99.85% - count: 8831 

For rating value 2: 
 Positive label: 0.91% - count: 68 
 Negative label: 99.09% - count: 7378 

For rating value 3: 
 Positive label: 9.94% - count: 848 
 Negative label: 90.06% - count: 7685 

For rating value 4: 
 Positive label: 84.61% - count: 5185 
 Negative label: 15.39% - count: 943 

For rating value 5: 
 Positive label: 99.38% - count: 18837 
 Negative label: 0.62% - count: 117 



In [29]:
#for rating 3, the majority results is negative labels ('not recommended') in the cleaned data 
#so there is no need to introduce a neutral label

In [30]:
#writing the final df into a file as the final dataset to work on this SA task
output_file_path = './sephora-data/sa-reviews_smaller.csv'

write_df_to_file(final_df, output_file_path)

df successfully written to ./sephora-data/sa-reviews_smaller.csv
