In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')

import nltk

In [None]:
# Fetch the NLTK resources listed below, in the same order as before.
NLTK_RESOURCES = (
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
    'words',
    'vader_lexicon',
    'punkt_tab',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


In [None]:
# Load the reviews CSV into a DataFrame and preview the first rows.
REVIEWS_CSV = '/content/drive/MyDrive/Colab Notebooks/Amazon Reviews/Reviews.csv'
df = pd.read_csv(REVIEWS_CSV)
df.head()

In [None]:
# Text of the first review (positional lookup).
df['Text'].iloc[0]

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Keep only the first N_REVIEWS rows for the rest of the notebook.
N_REVIEWS = 500
df = df.iloc[:N_REVIEWS]
print(df.shape)

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Number of reviews per star rating, ordered by rating.
# Axis labels are set explicitly so the figure stands alone, and plt.show()
# keeps the Axes repr out of the cell output.
ax = (
    df['Score']
    .value_counts()
    .sort_index()
    .plot(kind='bar', title='Reviews by Stars', figsize=(10, 5))
)
ax.set_xlabel('Review Stars')
ax.set_ylabel('Number of Reviews')
plt.show()

In [None]:
# Basic NLTK: use the review whose index label is 50 as the running example.
example = df.loc[50, 'Text']
example

In [None]:
# Show the full token list produced by NLTK for the example review.
nltk.word_tokenize(example)

In [None]:
# Tokenize the example review and keep the tokens for the tagging step;
# only the first ten are displayed.
tokens = nltk.word_tokenize(example)
tokens[:10]

In [None]:
# Part-of-speech tag each token (e.g. oatmeal -> NN, a noun);
# only the first ten (token, tag) pairs are displayed.
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Chunk the POS-tagged tokens with NLTK's ne_chunk and print the resulting tree.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

# *VADER Sentiment Analysis*


In [None]:
# VADER reports positive, negative and neutral scores: it looks up the words of a
# sentence, assigns them values, and outputs how positive, negative and neutral
# the sentence is.
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance reused by every later cell.
sia = SentimentIntensityAnalyzer()



In [None]:
# Quick check of the analyzer on a short hand-written sentence.
sia.polarity_scores('I fucked up real bad')

In [None]:
# Print the example review, then show its VADER scores.
print(example)
sia.polarity_scores(example)

In [None]:
# VADER scores for every review, keyed by the review's Id.
res = {
    myid: sia.polarity_scores(text)
    for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df))
}

In [None]:
# Turn the {Id: scores} mapping into one row per review, then attach the
# original review columns.
# The join key is spelled out: without `on`, merge() joins on every column name
# the two frames share, so an unrelated same-named column would silently change
# the result.
vaders = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left', on='Id')
)

In [None]:
# Display the merged VADER-scores frame.
vaders

In [None]:
# Now we have the sentiment scores and the review metadata in one frame.
# `vaders` is the merged frame; `df` alone carries no sentiment columns.
vaders.head()

In [None]:
# Compare star ratings: do 1-star reviews get a lower compound score than 5-star ones?
ax = sns.barplot(data=vaders, x='Score', y='compound')
ax.set_title('Compound Score by Amazon Star Review')
plt.show()

In [None]:
# Same comparison for the positive component on its own.
ax = sns.barplot(data=vaders, x='Score', y='pos')
ax.set_title('Positive Score by Amazon Star Review')
plt.show()

In [None]:
# One panel per VADER component, each broken down by star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for ax, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Load the pretrained sentiment checkpoint and its tokenizer from the
# Hugging Face hub. MODEL is a plain string: the f-prefix had no placeholders.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER example: print the review and its VADER scores again as a baseline
# for the RoBERTa comparison.
print(example)
sia.polarity_scores(example)

In [None]:
# Run the RoBERTa model on the example review

encoded_text = tokenizer(example, return_tensors = "pt")
output = model(**encoded_text)

# Take the first row of the first model output, convert it to NumPy,
# then apply softmax to it
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# Label the three softmaxed values by position (0, 1, 2)
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

In [None]:
def polarity_scores_roberta(example, max_length=None) :
  """Score one piece of text with the module-level RoBERTa `tokenizer`/`model`.

  Args:
    example: the text to score.
    max_length: optional token limit. When given, the tokenizer is asked to
      truncate the input to this many tokens; when None (the default) the
      tokenizer is called exactly as before, with no truncation.

  Returns:
    dict with keys 'roberta_neg', 'roberta_neu' and 'roberta_pos' holding the
    softmaxed values at positions 0, 1 and 2 of the model's first output row.

  Raises:
    Whatever the tokenizer/model raise for inputs they cannot handle; callers
    in this notebook catch RuntimeError (presumably for over-long inputs when
    no max_length is given - TODO confirm).
  """
  # Local import so the function is self-contained; torch is needed only for
  # the no_grad context below.
  import torch

  tokenizer_kwargs = {'return_tensors' : "pt"}
  if max_length is not None :
    tokenizer_kwargs.update(truncation = True, max_length = max_length)
  encoded_text = tokenizer(example, **tokenizer_kwargs)

  # Inference only: skip autograd bookkeeping for the forward pass.
  with torch.no_grad() :
    output = model(**encoded_text)

  # save this as numpy (detach is kept as a cheap safeguard)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
  }
  return scores_dict

In [None]:
# Score every review with both VADER and RoBERTa, keyed by the review's Id.
res = {}
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)) :
  try :
    res[myid] = {**sia.polarity_scores(text), **polarity_scores_roberta(text)}
  except RuntimeError :
    # A review that fails with RuntimeError is reported and left out of `res`.
    print(f'broke for id {myid}')

In [None]:
# One row per successfully scored review (VADER + RoBERTa columns), with the
# original review columns attached.
# The join key is spelled out: without `on`, merge() joins on every column name
# the two frames share, so an unrelated same-named column would silently change
# the result.
results_df = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left', on='Id')
)

In [None]:
# Preview the combined VADER + RoBERTa results.
results_df.head()

In [None]:
# Compare scores between models: pairwise plots of the VADER and RoBERTa
# components, coloured by star rating.
score_columns = ['neg', 'neu', 'pos', 'roberta_neg', 'roberta_neu', 'roberta_pos']
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

In [None]:
# Review example: the 1-star review with the highest RoBERTa positive score.
(
    results_df
    .query('Score == 1')
    .sort_values('roberta_pos', ascending=False)['Text']
    .iloc[0]
)

In [None]:
# Review example: the 1-star review with the highest VADER positive score.
(
    results_df
    .query('Score == 1')
    .sort_values('pos', ascending=False)['Text']
    .iloc[0]
)

In [None]:
# Negative-sentiment 5-star review: highest RoBERTa negative score among 5-star reviews.
(
    results_df
    .query('Score == 5')
    .sort_values('roberta_neg', ascending=False)['Text']
    .iloc[0]
)

In [None]:
# Negative-sentiment 5-star review: highest VADER negative score among 5-star reviews.
(
    results_df
    .query('Score == 5')
    .sort_values('neg', ascending=False)['Text']
    .iloc[0]
)

In [None]:
# Hugging Face Transformers pipeline
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the library's implicit
# default for this task, so results stay the same across transformers versions.
# NOTE(review): this id is believed to be the current default for
# "sentiment-analysis" - confirm against the installed transformers version.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Try the pipeline on a full sentence.
sentiment_pipeline("I love sentiment analysis with hugging face")

In [None]:
# Try the pipeline on a single word.
sentiment_pipeline("positive")