# 1. Extract Review Data  
Project from: [Nicholas Renotte's Sentiment Analyser for Yelp Reviews](https://www.nicholasrenotte.com/how-to-build-a-sentiment-analyser-for-yelp-reviews-in-python/)  
Date: 21 May 2024

In [8]:
import pandas as pd
from zipfile import ZipFile

In [9]:
# Unzip the downloaded dataset
with ZipFile('amazon-fine-food-reviews.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Load the dataset
df = pd.read_csv('Reviews.csv', usecols=['Text'])
df.rename(columns={'Text': 'review'}, inplace=True)

# 2. Analysis Raw Data

In [10]:
import numpy as np
# Import stopwords
import nltk
from nltk.corpus import stopwords

In [11]:
# 1. Word Count
df['word_count'] = df['review'].apply(lambda x: len(str(x).split(" ")))
# 2. Character Count
df['char_count'] = df['review'].str.len()
# 3. Average word length
def avg_word(review):
  words = review.split()
  return (sum(len(word) for word in words) / len(words))

df['avg_word'] = df['review'].apply(lambda x: avg_word(x))
# 4. Stop Word Count
stop_words = stopwords.words('english')
df['stopword_count'] = df['review'].apply(lambda x: len([x for x in x.split() if x in stop_words]))

In [12]:
df.describe()

Unnamed: 0,word_count,char_count,avg_word,stopword_count
count,568454.0,568454.0,568454.0,568454.0
mean,82.005522,436.222083,4.400934,32.074824
std,80.807102,445.339741,0.441144,32.115626
min,3.0,12.0,1.0,0.0
25%,34.0,179.0,4.12,13.0
50%,58.0,302.0,4.36,23.0
75%,100.0,527.0,4.625,40.0
max,3526.0,21409.0,35.296296,1295.0


In [13]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count
0,I have bought several of the Vitality canned d...,49,263,4.479167,21
1,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12
2,This is a confection that has been around a fe...,99,509,4.37234,42
3,If you are looking for the secret ingredient i...,43,219,4.317073,15
4,Great taffy at a great price. There was a wid...,30,140,4.111111,12


# 3. Clean Dataset

In [15]:
# 1. Lower case all words
df['review_lower'] = df['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
# 2. Remove Punctuation
df['review_nopunc'] = df['review_lower'].str.replace('[^\w\s]', '')
# 3. Remove Stopwords
stop_words = stopwords.words('english')
df['review_nopunc_nostop'] = df['review_nopunc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
# 4. Return frequency of values
freq= pd.Series(" ".join(df['review_nopunc_nostop']).split()).value_counts()[:30]
other_stopwords = ['get', 'us', 'see', 'use', 'said', 'asked', 'day', 'go' \
  'even', 'ive', 'right', 'left', 'always', 'would', 'told', \
  'get', 'us', 'would', 'get', 'one', 'ive', 'go', 'even', \
  'also', 'ever', 'x', 'take', 'let' ]
df['review_nopunc_nostop_nocommon'] = df['review_nopunc_nostop'].apply(lambda x: "".join(" ".join(x for x in x.split() if x not in other_stopwords)))

In [16]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,review_lower,review_nopunc,review_nopunc_nostop,review_nopunc_nostop_nocommon
0,I have bought several of the Vitality canned d...,49,263,4.479167,21,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,31,190,5.16129,12,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled jumbo salted peanuts.....,product arrived labeled jumbo salted peanuts.....
2,This is a confection that has been around a fe...,99,509,4.37234,42,this is a confection that has been around a fe...,this is a confection that has been around a fe...,"confection around centuries. light, pillowy ci...","confection around centuries. light, pillowy ci..."
3,If you are looking for the secret ingredient i...,43,219,4.317073,15,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,30,140,4.111111,12,great taffy at a great price. there was a wide...,great taffy at a great price. there was a wide...,great taffy great price. wide assortment yummy...,great taffy great price. wide assortment yummy...


In [24]:
# Save the processed DataFrame to a CSV file
processed_csv_filename = 'processed_reviews.csv'
df.to_csv(processed_csv_filename, index=False)
print("Processed data saved to CSV.")

Processed data saved to CSV.


# 4. Lemmatize the review

In [25]:
processed_csv_filename = 'processed_reviews.csv'
df = pd.read_csv(processed_csv_filename)

In [26]:
# Import textblob
from textblob import Word

In [27]:
# Lemmatize final review format
df['cleaned_review'] = df['review_nopunc_nostop_nocommon'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [None]:
df.head()

# 5. Sentiment Analysis

In [28]:
from textblob import TextBlob

In [31]:
# 1. Calculate polarity
df['polarity'] = df['cleaned_review'].apply(lambda x: TextBlob(x).sentiment[0])
# 2. Calculate subjectivity
df['subjectivity'] = df['cleaned_review'].apply(lambda x: TextBlob(x).sentiment[1])

In [32]:
df_metric = df[['review', 'polarity','subjectivity']]
df_metric.head()

Unnamed: 0,review,polarity,subjectivity
0,I have bought several of the Vitality canned d...,0.44,0.42
1,Product arrived labeled as Jumbo Salted Peanut...,0.216667,0.762963
2,This is a confection that has been around a fe...,0.187,0.548
3,If you are looking for the secret ingredient i...,0.15,0.65
4,Great taffy at a great price. There was a wid...,0.458333,0.6


In [33]:
review_metric_csv_filename = 'reviews_metric.csv'
df.to_csv(review_metric_csv_filename, index=False)
print("Review Metric data saved to CSV.")

Review Metric data saved to CSV.


# 6. Visualising Correlation