## Extract Transform and Load the Data set.

In [128]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

reviews_df = pd.read_csv("Reviews.csv")

#Only keep reviews with a non-zero helpfulness rating.
reviews_df = reviews_df[reviews_df["HelpfulnessNumerator"] > 0]

#Compute the length of each review and delete all reviews that are longer than 512 characters.
reviews_df['ReviewLength'] = list(map(len, reviews_df['Text']))
reviews_df = reviews_df[reviews_df["ReviewLength"] <= 512]

#Drop all unnecessary columns.
reviews_df.drop(columns=["ProductId", "ProfileName", "HelpfulnessDenominator", "Time", "Summary"], inplace=True)
reviews_df.set_index("UserId", inplace=True)

#Determine how many reviews correspond to each UserId.
review_counts = reviews_df.pivot_table(index=["UserId"], aggfunc='size')
review_count_df = review_counts.to_frame()
review_count_df.rename(columns={0:"ReviewCount"}, inplace=True)

#Only keep reviews by users with atleast 20 reviews.
reviews_df = reviews_df.join(review_count_df, how="outer")
reviews_df = reviews_df[reviews_df["ReviewCount"] >=20]

print(reviews_df.shape)
reviews_df.head(20)

(7609, 6)


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,Text,ReviewLength,ReviewCount
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A106ZCP7RSXMRU,19034,1,5,We like the regular Little Bear cheese puffs s...,331,40
A106ZCP7RSXMRU,62479,1,4,"Gorilla Munch tastes a lot like Cap'n Crunch, ...",378,40
A106ZCP7RSXMRU,74434,2,5,We love Garden of Eatin' chips. Our favorites ...,246,40
A106ZCP7RSXMRU,74472,2,5,We only buy organic natural foods without any ...,426,40
A106ZCP7RSXMRU,94442,2,5,We love Garden of Eatin' chips. Our favorites ...,246,40
A106ZCP7RSXMRU,94480,2,5,We only buy organic natural foods without any ...,426,40
A106ZCP7RSXMRU,151847,3,3,"We tried McCanns oatmeal about a year ago, ord...",512,40
A106ZCP7RSXMRU,154638,2,5,These were my favorite cracker snack until we ...,413,40
A106ZCP7RSXMRU,159008,2,5,This is our favorite brand of popcorn. They do...,410,40
A106ZCP7RSXMRU,161749,3,5,We love these! It is hard not to eat the whole...,275,40


## Sentiment Analysis

In [129]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax

# sentiment_analysis_pipeline = pipeline('sentiment-analysis')

# x = sentiment_analysis_pipeline('Hello I am happy')
# print(x[0]['label'])
# reviews_df['Label'] = sentiment_analysis_pipeline(reviews_df['Text'])
# reviews_df['Label'] = list(map(sentiment_analysis_pipeline, reviews_df['Text']))

# x = list(map(sentiment_analysis_pipeline, reviews_df['Text']))

# print(x)

# reviews_df



#reviews_df.to_csv("transformed_reviews.csv", header=True, index=True)

**Random Sample of 50 Positive and 50 Negative Reviews for T-test**

In [130]:
pos_reviews = reviews_df["Score"] >= 4 # 4 stars or more is a positive review
reviews_pos = reviews_df[pos_reviews]
neg_reviews = reviews_df["Score"] <= 2 # 2 stars and below is a negative review
reviews_neg = reviews_df[neg_reviews]

reviews_pos.drop(columns=["Text", "ReviewLength"], inplace=True)
reviews_neg.drop(columns=["Text", "ReviewLength"], inplace=True)

print(reviews_pos.shape)
reviews_pos.sample(50)

(6222, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_pos.drop(columns=["Text", "ReviewLength"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_neg.drop(columns=["Text", "ReviewLength"], inplace=True)


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,ReviewCount
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A2II09GQGWOMTQ,52806,1,5,62
A3AOF4C0MXEQSP,198487,5,4,27
A20MMCAHGALRAX,293957,1,5,26
A24ZV048V7J0MT,134800,3,4,26
A1NJXFN4V5AUOL,77969,1,5,20
A3M174IC0VXOS2,422364,4,5,29
A1FP5ZLSKR07SU,80728,4,4,29
A3UUTDX2WNVUS3,126026,3,5,22
A2WVF9ZQ068DN0,499607,2,5,23
A1GV4HEM6XZC06,67930,3,5,24


In [131]:
print(reviews_neg.shape)
reviews_neg.sample(50)

(945, 4)


Unnamed: 0_level_0,Id,HelpfulnessNumerator,Score,ReviewCount
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A2L01339XV496V,462875,19,2,20
A1SB8CXAUIKT8X,90917,3,2,44
A2IYLMDQGBB4VG,294141,1,2,29
A10PJEHY3JKKQG,212680,3,1,25
A3JUM0WSKPN0NT,101098,1,1,25
A2XNJJ9TF70P4J,331544,2,1,24
A2LJJ4482V9LAW,288481,8,1,30
AQLL2R1PPR46X,376568,5,2,44
A2DBQWLG9HU8N5,47449,1,1,20
A2Q2GDB5XDYQV5,402408,9,1,27
