## Extract Transform and Load the Data set.

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

reviews_df = pd.read_csv("Reviews.csv")

# Only keep reviews with non-zero helpfulness rating.
helpfulnessMask = reviews_df["HelpfulnessNumerator"] > 0
reviews_df = reviews_df[helpfulnessMask]

# Compute the length of each review and filter out reviews less than 512 characters.
reviews_df['ReviewLength'] = list(map(len, reviews_df['Text']))
reviewLengthMask = reviews_df["ReviewLength"] <= 512
reviews_df = reviews_df[reviewLengthMask]

# Drop unecessary columns
reviews_df.drop(columns=["ProductId", "ProfileName", "HelpfulnessDenominator", "Time", "Summary"], inplace=True)
reviews_df.set_index("UserId", inplace=True)

# Determine how many reviews correspond to each user id.
review_counts = reviews_df.pivot_table(index=["UserId"], aggfunc='size')
review_count_df = review_counts.to_frame()
review_count_df.rename(columns={0:"ReviewCount"}, inplace=True)

# Join the dataframes and keep reviews by users with 20 or more reviews (more reviews = more average interaction)
reviews_df = reviews_df.join(review_count_df, how="outer")
reviews_df = reviews_df[reviews_df["ReviewCount"] >= 20]

print(reviews_df.shape)
reviews_df.head(20)

(7609, 6)


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,Text,ReviewLength,ReviewCount
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A106ZCP7RSXMRU,19034,1,5,We like the regular Little Bear cheese puffs s...,331,40
A106ZCP7RSXMRU,62479,1,4,"Gorilla Munch tastes a lot like Cap'n Crunch, ...",378,40
A106ZCP7RSXMRU,74434,2,5,We love Garden of Eatin' chips. Our favorites ...,246,40
A106ZCP7RSXMRU,74472,2,5,We only buy organic natural foods without any ...,426,40
A106ZCP7RSXMRU,94442,2,5,We love Garden of Eatin' chips. Our favorites ...,246,40
A106ZCP7RSXMRU,94480,2,5,We only buy organic natural foods without any ...,426,40
A106ZCP7RSXMRU,151847,3,3,"We tried McCanns oatmeal about a year ago, ord...",512,40
A106ZCP7RSXMRU,154638,2,5,These were my favorite cracker snack until we ...,413,40
A106ZCP7RSXMRU,159008,2,5,This is our favorite brand of popcorn. They do...,410,40
A106ZCP7RSXMRU,161749,3,5,We love these! It is hard not to eat the whole...,275,40


## Sentiment Analysis

In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax

sentiment_analysis_pipeline = pipeline('sentiment-analysis')

sentiment_objects = list(map(sentiment_analysis_pipeline, reviews_df['Text']))
sentiment_labels = [obj[0]['label'] for obj in sentiment_objects]
sentiment_scores = [obj[0]['score'] for obj in sentiment_objects]

reviews_df["SentimentLabels"] = sentiment_labels
reviews_df["SentimentScores"] = sentiment_scores

reviews_df.to_csv("transformed_reviews.csv", header=True, index=True)

reviews_df.head()

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
    PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.0.1+cpu)
    Python  3.9.13 (you have 3.9.13)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,Text,ReviewLength,ReviewCount,SentimentLabels,SentimentScores
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A106ZCP7RSXMRU,19034,1,5,We like the regular Little Bear cheese puffs s...,331,40,POSITIVE,0.999824
A106ZCP7RSXMRU,62479,1,4,"Gorilla Munch tastes a lot like Cap'n Crunch, ...",378,40,POSITIVE,0.997502
A106ZCP7RSXMRU,74434,2,5,We love Garden of Eatin' chips. Our favorites ...,246,40,POSITIVE,0.999241
A106ZCP7RSXMRU,74472,2,5,We only buy organic natural foods without any ...,426,40,POSITIVE,0.995697
A106ZCP7RSXMRU,94442,2,5,We love Garden of Eatin' chips. Our favorites ...,246,40,POSITIVE,0.999241


**Random Sample of 50 Positive and 50 Negative Reviews for T-test**

In [3]:
print(type(sentiment_labels))

<class 'list'>


In [26]:
pos_reviews = reviews_df[reviews_df['SentimentLabels'] == 'POSITIVE']
neg_reviews = reviews_df[reviews_df['SentimentLabels'] == 'NEGATIVE']

pos_reviews.drop(columns=["Text", "ReviewLength"], inplace=True)
neg_reviews.drop(columns=["Text", "ReviewLength"], inplace=True)

print(pos_reviews.shape)
pos_reviews.sample(50)

(5662, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_reviews.drop(columns=["Text", "ReviewLength"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_reviews.drop(columns=["Text", "ReviewLength"], inplace=True)


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,ReviewCount,SentimentLabels,SentimentScores
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A30LM7TMYK1C84,212775,4,5,25,POSITIVE,0.993092
ADMWYMH11LD27,421285,1,4,27,POSITIVE,0.99966
A1QK0XCAPHIW1L,537851,2,5,21,POSITIVE,0.999825
A2WVF9ZQ068DN0,437546,3,5,23,POSITIVE,0.999691
A2158Y2U61AU9G,369873,2,5,28,POSITIVE,0.995125
A3F3B1JPACN215,111381,13,4,51,POSITIVE,0.911933
A2FBP7NGRMG73O,101136,3,5,25,POSITIVE,0.999815
A1QRA8XPFUEKW5,178259,1,5,25,POSITIVE,0.99096
AISQLBDGS2KXR,559348,1,5,28,POSITIVE,0.99983
A1YP5WLIHGG136,264686,1,4,30,POSITIVE,0.99069


In [25]:
print(neg_reviews.shape)
neg_reviews.sample(50)

(1947, 6)


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,ReviewCount,SentimentLabels,SentimentScores
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2OEUROGZDTXUJ,26163,1,5,43,NEGATIVE,0.995191
AHUYS6XZ39TJK,530704,2,1,25,NEGATIVE,0.99473
A2C9XE9I8RSKNX,56223,1,2,27,NEGATIVE,0.998461
A3F3B1JPACN215,541419,3,5,51,NEGATIVE,0.997608
A3T3S48UAVTUE9,539369,2,5,21,NEGATIVE,0.982393
AHUYS6XZ39TJK,101094,2,1,25,NEGATIVE,0.99473
A1QK0XCAPHIW1L,139798,1,4,21,NEGATIVE,0.677489
AZ4QFFUT8QVTA,381390,1,5,25,NEGATIVE,0.985767
A3TJPSWY2HE4BS,352044,1,2,25,NEGATIVE,0.961792
A17GK9E70O7Y9R,232761,2,1,28,NEGATIVE,0.999216


In [30]:
from scipy import stats

pos_scores = pos_reviews['SentimentScores']
neg_scores = neg_reviews['SentimentScores']

t_stat, p_val = stats.ttest_ind(pos_scores, neg_scores)
print(pos_scores.mean(), pos_scores.std())
print(neg_scores.mean(), neg_scores.std())
t_stat, p_val

0.9846301892228195 0.05662112140548824
0.9591782200501769 0.08982381675198105


(14.522919069520311, 3.694540573224666e-47)