## Extract, Transform, and Load the Dataset

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

# Load the raw review data set.
reviews_df = pd.read_csv("Reviews.csv")

# Longest review (in characters) that is kept, and the minimum number of
# retained reviews a user needs for their reviews to stay in the data set.
MAX_REVIEW_LENGTH = 512
MIN_REVIEWS_PER_USER = 20

# Only keep reviews with non-zero helpfulness rating. Take an explicit copy so
# the column assignment below writes to an independent frame rather than to a
# slice of the raw data (avoids pandas' SettingWithCopyWarning).
helpfulnessMask = reviews_df["HelpfulnessNumerator"] > 0
reviews_df = reviews_df[helpfulnessMask].copy()

# Compute the length of each review and keep only reviews of at most
# MAX_REVIEW_LENGTH characters. A missing Text value gets a NaN length, which
# fails the comparison, so such rows are dropped instead of raising TypeError.
reviews_df["ReviewLength"] = reviews_df["Text"].str.len()
reviewLengthMask = reviews_df["ReviewLength"] <= MAX_REVIEW_LENGTH

# Drop unnecessary columns and index the remaining reviews by user. Chaining
# non-inplace calls avoids mutating a filtered slice in place.
reviews_df = (
    reviews_df[reviewLengthMask]
    .drop(columns=["ProductId", "ProfileName", "HelpfulnessDenominator", "Time", "Summary"])
    .set_index("UserId")
)

# Determine how many reviews correspond to each user id.
review_counts = reviews_df.groupby(level="UserId").size()
review_count_df = review_counts.to_frame(name="ReviewCount")

# Join the dataframes and keep reviews by users with MIN_REVIEWS_PER_USER or
# more reviews (more reviews = more average interaction). The copy makes the
# result an independent frame, so columns can be added to it later on.
reviews_df = reviews_df.join(review_count_df, how="outer")
reviews_df = reviews_df[reviews_df["ReviewCount"] >= MIN_REVIEWS_PER_USER].copy()

print(reviews_df.shape)
reviews_df.head(20)

## Sentiment Analysis

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax

# Build the library's default sentiment-analysis pipeline.
sentiment_analysis_pipeline = pipeline('sentiment-analysis')

# Classify every review. truncation=True guards against inputs that exceed the
# model's maximum sequence length: the reviews are capped by character count,
# which does not strictly bound the number of tokens. Each call on a single
# string returns a one-element list holding a {'label', 'score'} dict.
sentiment_objects = [
    sentiment_analysis_pipeline(text, truncation=True)
    for text in reviews_df['Text']
]
sentiment_labels = [obj[0]['label'] for obj in sentiment_objects]
sentiment_scores = [obj[0]['score'] for obj in sentiment_objects]

# Attach the predictions through assign(), which returns a new frame instead
# of writing into what may be a filtered slice (avoids SettingWithCopyWarning).
reviews_df = reviews_df.assign(
    SentimentLabels=sentiment_labels,
    SentimentScores=sentiment_scores,
)

# Persist the enriched reviews so later steps can reload them without
# re-running the model.
reviews_df.to_csv("transformed_reviews.csv", header=True, index=True)

reviews_df.head()

## Data Visualization ##

In [None]:
# Reload the saved sentiment results and plot how often each label occurs.
data_df = pd.read_csv("transformed_reviews.csv")
label_column = data_df["SentimentLabels"]
plt.hist(label_column)

**Random Sample of 50 Positive and 50 Negative Reviews for T-test**

In [None]:
# Sanity check: show which container type holds the predicted labels.
labels_type = type(sentiment_labels)
print(labels_type)

In [None]:
# Split the reviews by predicted sentiment. The text and length columns are
# dropped with a non-inplace drop(), which returns new frames; dropping in
# place on a filtered slice triggers pandas' SettingWithCopyWarning.
pos_reviews = reviews_df[reviews_df['SentimentLabels'] == 'POSITIVE'].drop(
    columns=["Text", "ReviewLength"]
)
neg_reviews = reviews_df[reviews_df['SentimentLabels'] == 'NEGATIVE'].drop(
    columns=["Text", "ReviewLength"]
)

print(pos_reviews.shape)
# Show up to 50 random positive reviews; capping at the group size keeps
# sample() from raising ValueError when fewer than 50 rows are available.
pos_reviews.sample(n=min(50, len(pos_reviews)))

In [None]:
print(neg_reviews.shape)
# Show up to 50 random negative reviews; capping at the group size keeps
# sample() from raising ValueError when fewer than 50 rows are available.
neg_reviews.sample(n=min(50, len(neg_reviews)))

In [None]:
from scipy import stats

# Compare the model's confidence scores between positive and negative reviews.
# NOTE(review): the section heading mentions 50-review samples, but the test
# below runs on every positive and negative row - confirm which is intended.
pos_scores = pos_reviews['SentimentScores']
neg_scores = neg_reviews['SentimentScores']

# Use Welch's t-test (equal_var=False): the two groups are not guaranteed to
# have the same size or variance, which the default pooled-variance test
# assumes.
t_stat, p_val = stats.ttest_ind(pos_scores, neg_scores, equal_var=False)
print(pos_scores.mean(), pos_scores.std())
print(neg_scores.mean(), neg_scores.std())
t_stat, p_val