# Scenario: 

### Analyze customer reviews and sentiment data to categorize them as positive, negative, or neutral, providing insights for businesses to improve customer satisfaction.

# DateSet

### reviewerName: Name or identifier of the reviewer
### overall: Overall Rating that has been given by the reviewer (Scale of 1 to 5)
### reviewText: Content of the review
### reviewTime: Time when the review was posted
### day_diff: Number of days since the post was posted
### helpful_yes: Number of users that have found the review helpful
### helpful_no: Number of user that have not founf the review helpful
### total_vote: Total amount of votes both helpful ot unhelpful





## Importing Libraries 

In [60]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import string
from collections import Counter

from textblob import TextBlob

import warnings
warnings.simplefilter('ignore', category=Warning, lineno=0, append=False)

## Importing

In [61]:
data = pd.read_csv('../data/amazon_reviews.csv', index_col='Unnamed: 0')

print(data.head())

   reviewerName  overall                                         reviewText  \
0           NaN      4.0                                         No issues.   
1          0mie      5.0  Purchased this for my device, it worked as adv...   
2           1K3      4.0  it works as expected. I should have sprung for...   
3           1m2      5.0  This think has worked out great.Had a diff. br...   
4  2&amp;1/2Men      5.0  Bought it with Retail Packaging, arrived legit...   

   reviewTime  day_diff  helpful_yes  helpful_no  total_vote  \
0  2014-07-23       138            0           0           0   
1  2013-10-25       409            0           0           0   
2  2012-12-23       715            0           0           0   
3  2013-11-21       382            0           0           0   
4  2013-07-13       513            0           0           0   

   score_pos_neg_diff  score_average_rating  wilson_lower_bound  
0                   0                   0.0                 0.0  
1       

## Lowercase Text

In [62]:
data['reviewText_lower'] = data['reviewText'].str.lower()
print(data['reviewText_lower'])

0                                              no issues.
1       purchased this for my device, it worked as adv...
2       it works as expected. i should have sprung for...
3       this think has worked out great.had a diff. br...
4       bought it with retail packaging, arrived legit...
                              ...                        
4910    i bought this sandisk 16gb class 10 to use wit...
4911    used this for extending the capabilities of my...
4912    great card that is very fast and reliable. it ...
4913    good amount of space for the stuff i want to d...
4914    i've heard bad things about this 64gb micro sd...
Name: reviewText_lower, Length: 4915, dtype: object


## Punctuation Removal

In [63]:
text_punctu = [str(text).translate(str.maketrans('','',string.punctuation)) for text in data['reviewText_lower']]

## Tokenize the Data

In [64]:
tokenized_data = [nltk.word_tokenize(text) for text in text_punctu]
tokenized_data

[['no', 'issues'],
 ['purchased',
  'this',
  'for',
  'my',
  'device',
  'it',
  'worked',
  'as',
  'advertised',
  'you',
  'can',
  'never',
  'have',
  'too',
  'much',
  'phone',
  'memory',
  'since',
  'i',
  'download',
  'a',
  'lot',
  'of',
  'stuff',
  'this',
  'was',
  'a',
  'no',
  'brainer',
  'for',
  'me'],
 ['it',
  'works',
  'as',
  'expected',
  'i',
  'should',
  'have',
  'sprung',
  'for',
  'the',
  'higher',
  'capacity',
  'i',
  'think',
  'its',
  'made',
  'a',
  'bit',
  'cheesier',
  'than',
  'the',
  'earlier',
  'versions',
  'the',
  'paint',
  'looks',
  'not',
  'as',
  'clean',
  'as',
  'before'],
 ['this',
  'think',
  'has',
  'worked',
  'out',
  'greathad',
  'a',
  'diff',
  'bran',
  '64gb',
  'card',
  'and',
  'if',
  'went',
  'south',
  'after',
  '3',
  'monthsthis',
  'one',
  'has',
  'held',
  'up',
  'pretty',
  'well',
  'since',
  'i',
  'had',
  'my',
  's3',
  'now',
  'on',
  'my',
  'note3',
  'update',
  '32114ive',
  'h

## StopWords

In [65]:
stop_words_english = set(stopwords.words('english'))

## StopWords Removal

In [66]:
data_filtered = [[word for word in text if word not in stop_words_english] for text in tokenized_data]
data_filtered

[['issues'],
 ['purchased',
  'device',
  'worked',
  'advertised',
  'never',
  'much',
  'phone',
  'memory',
  'since',
  'download',
  'lot',
  'stuff',
  'brainer'],
 ['works',
  'expected',
  'sprung',
  'higher',
  'capacity',
  'think',
  'made',
  'bit',
  'cheesier',
  'earlier',
  'versions',
  'paint',
  'looks',
  'clean'],
 ['think',
  'worked',
  'greathad',
  'diff',
  'bran',
  '64gb',
  'card',
  'went',
  'south',
  '3',
  'monthsthis',
  'one',
  'held',
  'pretty',
  'well',
  'since',
  's3',
  'note3',
  'update',
  '32114ive',
  'months',
  'zero',
  'issues',
  'since',
  'transferred',
  's3',
  'note3',
  'note2',
  'card',
  'reliable',
  'solidcheers'],
 ['bought',
  'retail',
  'packaging',
  'arrived',
  'legit',
  'orange',
  'envelope',
  'english',
  'version',
  'asian',
  'like',
  'picture',
  'shows',
  'arrived',
  'quickly',
  'bought',
  '32',
  '16',
  'retail',
  'packaging',
  'htc',
  'one',
  'sv',
  'lg',
  'optimus',
  'cards',
  'working

## Lemmatization 

In [67]:
data_filtered

[['issues'],
 ['purchased',
  'device',
  'worked',
  'advertised',
  'never',
  'much',
  'phone',
  'memory',
  'since',
  'download',
  'lot',
  'stuff',
  'brainer'],
 ['works',
  'expected',
  'sprung',
  'higher',
  'capacity',
  'think',
  'made',
  'bit',
  'cheesier',
  'earlier',
  'versions',
  'paint',
  'looks',
  'clean'],
 ['think',
  'worked',
  'greathad',
  'diff',
  'bran',
  '64gb',
  'card',
  'went',
  'south',
  '3',
  'monthsthis',
  'one',
  'held',
  'pretty',
  'well',
  'since',
  's3',
  'note3',
  'update',
  '32114ive',
  'months',
  'zero',
  'issues',
  'since',
  'transferred',
  's3',
  'note3',
  'note2',
  'card',
  'reliable',
  'solidcheers'],
 ['bought',
  'retail',
  'packaging',
  'arrived',
  'legit',
  'orange',
  'envelope',
  'english',
  'version',
  'asian',
  'like',
  'picture',
  'shows',
  'arrived',
  'quickly',
  'bought',
  '32',
  '16',
  'retail',
  'packaging',
  'htc',
  'one',
  'sv',
  'lg',
  'optimus',
  'cards',
  'working

In [68]:
lem = WordNetLemmatizer()

In [69]:

lemmatized_txt = [[lem.lemmatize(word) for word in text] for text in data_filtered]
lemmatized_txt

[['issue'],
 ['purchased',
  'device',
  'worked',
  'advertised',
  'never',
  'much',
  'phone',
  'memory',
  'since',
  'download',
  'lot',
  'stuff',
  'brainer'],
 ['work',
  'expected',
  'sprung',
  'higher',
  'capacity',
  'think',
  'made',
  'bit',
  'cheesier',
  'earlier',
  'version',
  'paint',
  'look',
  'clean'],
 ['think',
  'worked',
  'greathad',
  'diff',
  'bran',
  '64gb',
  'card',
  'went',
  'south',
  '3',
  'monthsthis',
  'one',
  'held',
  'pretty',
  'well',
  'since',
  's3',
  'note3',
  'update',
  '32114ive',
  'month',
  'zero',
  'issue',
  'since',
  'transferred',
  's3',
  'note3',
  'note2',
  'card',
  'reliable',
  'solidcheers'],
 ['bought',
  'retail',
  'packaging',
  'arrived',
  'legit',
  'orange',
  'envelope',
  'english',
  'version',
  'asian',
  'like',
  'picture',
  'show',
  'arrived',
  'quickly',
  'bought',
  '32',
  '16',
  'retail',
  'packaging',
  'htc',
  'one',
  'sv',
  'lg',
  'optimus',
  'card',
  'working',
  'or

## Stemmatization

In [70]:
stem = PorterStemmer()
stem_txt = [[stem.stem(word) for word in text] for text in lemmatized_txt]
stem_txt


[['issu'],
 ['purchas',
  'devic',
  'work',
  'advertis',
  'never',
  'much',
  'phone',
  'memori',
  'sinc',
  'download',
  'lot',
  'stuff',
  'brainer'],
 ['work',
  'expect',
  'sprung',
  'higher',
  'capac',
  'think',
  'made',
  'bit',
  'cheesier',
  'earlier',
  'version',
  'paint',
  'look',
  'clean'],
 ['think',
  'work',
  'greathad',
  'diff',
  'bran',
  '64gb',
  'card',
  'went',
  'south',
  '3',
  'monthsthi',
  'one',
  'held',
  'pretti',
  'well',
  'sinc',
  's3',
  'note3',
  'updat',
  '32114ive',
  'month',
  'zero',
  'issu',
  'sinc',
  'transfer',
  's3',
  'note3',
  'note2',
  'card',
  'reliabl',
  'solidch'],
 ['bought',
  'retail',
  'packag',
  'arriv',
  'legit',
  'orang',
  'envelop',
  'english',
  'version',
  'asian',
  'like',
  'pictur',
  'show',
  'arriv',
  'quickli',
  'bought',
  '32',
  '16',
  'retail',
  'packag',
  'htc',
  'one',
  'sv',
  'lg',
  'optimu',
  'card',
  'work',
  'order',
  'probabl',
  'best',
  'price',
  'you

## Vocabulary

In [71]:
vocab = set([word for text in stem_txt for word in text])
vocab

{'toth',
 'oneim',
 'bust',
 'amazonpackag',
 'compra',
 'keyboard',
 'kingston',
 'fastestupd',
 '512',
 '34review34',
 'supporti',
 'solvedtherefor',
 'hjgh',
 'soap',
 'comput',
 'exp',
 'bandwidth',
 'showedno',
 'fastestlarg',
 'visibl',
 'w',
 'repeat',
 'iopstest',
 'tabletit',
 'removedi',
 'prepaid',
 '403',
 '1595',
 'xotherwis',
 'freak',
 '128522',
 'ok',
 'glitchi',
 'tabletupd',
 'openamazon',
 'beyondhav',
 'reliev',
 'grade',
 '30mbp',
 'pda',
 'builtin',
 'stripe',
 'basic',
 'stunt',
 '34cheaper34',
 'fifti',
 'thrill',
 'psdu32g10efs2claim',
 'cyanogenmod',
 'mom',
 'abruptli',
 '64gin',
 '620',
 'preship',
 'mbytesh2testw',
 'www',
 'disappoint',
 'smartphoneconnon',
 'ssd430',
 'unexpect',
 'willi',
 'cardinde',
 'backdecemb',
 'rubwindow',
 'thumb',
 'mpg',
 'yesloc',
 'boxno',
 'amamzon',
 's13',
 'adapterthi',
 'advers',
 'gig',
 'shippedoveral',
 'symptom',
 '3656',
 'describ',
 'crowd',
 'weigh',
 'devicesandisk',
 'do',
 'greengray',
 'begun',
 'en',
 't1000'

## Text PreProcessing Function

In [72]:
def preprocess_text(txt):
    # Converting the text to lowercase
    txt = txt.lower()
    # removing punctuation
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    # tokenizing the text
    tokens = word_tokenize(txt)
    # removing the stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

### Preprocess the Data


In [73]:
#preprocessing the data
data['reviewText'].fillna('',inplace=True)
data_series_preprocessed = data['reviewText'].apply(preprocess_text)
data_series_preprocessed

0                                                  issues
1       purchased device worked advertised never much ...
2       works expected sprung higher capacity think ma...
3       think worked greathad diff bran 64gb card went...
4       bought retail packaging arrived legit orange e...
                              ...                        
4910    bought sandisk 16gb class 10 use htc inspire 3...
4911    used extending capabilities samsung galaxy not...
4912    great card fast reliable comes optional adapte...
4913          good amount space stuff want fits gopro say
4914    ive heard bad things 64gb micro sd card crappi...
Name: reviewText, Length: 4915, dtype: object

In [74]:
#Intializing and Perfomring sentiment analysis

sia = SentimentIntensityAnalyzer()

sentiment = data_series_preprocessed.apply(lambda x: sia.polarity_scores(x)['compound'])
sentiment

0       0.0000
1       0.0000
2       0.4019
3       0.6486
4       0.8591
         ...  
4910    0.0772
4911    0.1761
4912    0.8481
4913    0.4939
4914    0.5267
Name: reviewText, Length: 4915, dtype: float64

## Sentiment Analyser

In [75]:
#categorize reviews as positive, negative, or neutral
senti_class = sentiment.apply(lambda x: 'positive' if x>0 else ( 'neutral' if x== 0 else 'negative'))

In [76]:
#Sentiment scores are added to the DataFrame
data_with_sentiment = pd.DataFrame({'Reviews': data['reviewText'], 'Sentiments': senti_class})

#DataFrame is displayed together with the sentiment scores
data_with_sentiment

Unnamed: 0,Reviews,Sentiments
0,No issues.,neutral
1,"Purchased this for my device, it worked as adv...",neutral
2,it works as expected. I should have sprung for...,positive
3,This think has worked out great.Had a diff. br...,positive
4,"Bought it with Retail Packaging, arrived legit...",positive
...,...,...
4910,I bought this Sandisk 16GB Class 10 to use wit...,positive
4911,Used this for extending the capabilities of my...,positive
4912,Great card that is very fast and reliable. It ...,positive
4913,Good amount of space for the stuff I want to d...,positive
