<a href="https://colab.research.google.com/github/abdulraafaykhan/RoBERTa-VADER-Sentiment/blob/main/Sentiment_Analysis_using_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV read later in this
# notebook (under /content/drive/MyDrive/...) is reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# NLTK data packages required by the calls made further down in this notebook.
# Tokenizer data for nltk.word_tokenize:
nltk.download('punkt_tab')
# Tagger model for nltk.pos_tag:
nltk.download('averaged_perceptron_tagger_eng')
# Named-entity chunker model and its word list for nltk.chunk.ne_chunk:
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
# Lexicon backing nltk.sentiment.SentimentIntensityAnalyzer (VADER):
nltk.download('vader_lexicon')

# Use the ggplot look for every matplotlib/seaborn figure in the notebook.
plt.style.use('ggplot')

In [None]:
# Kaggle Dataset on Amazon Food Reviews -- https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews
# The CSV is read from the Google Drive mount created at the top of the notebook.
reviews_csv = '/content/drive/MyDrive/My_Projects/data/Reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
print(df.shape)

In [None]:
df['Text'].values[2]

In [None]:
# Work on the first 500 reviews only so the per-review scoring loops below
# stay quick. The shape is printed before and after to confirm the cut.
print(df.shape)
df = df.iloc[:500]
print(df.shape)

In [None]:
# Bar chart of how many reviews were given each star rating, ordered by rating.
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review by Stars')
plt.show()

### Basic NLTK

In [None]:
# Use the review stored under index label 50 as the running example for the
# NLTK, VADER and RoBERTa demonstrations that follow.
example = df.loc[50, 'Text']
print(example)

In [None]:
# Split the example review into tokens with NLTK and preview the first 15.
tokens = nltk.word_tokenize(example)
tokens[:15]

In [None]:
# Attach a part-of-speech tag to every token and preview the first 15 results.
tagged = nltk.pos_tag(tokens)
tagged[:15]

In [None]:
# Group the tagged tokens into named-entity chunks and pretty-print the result.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

### VADER

In [None]:
!pip install ipywidgets

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# tqdm.notebook is used (instead of plain tqdm) for the widget-based progress bar.
# Single VADER analyzer instance reused by every scoring cell below.
sia = SentimentIntensityAnalyzer()

In [None]:
sia.polarity_scores('I am very happy!')

In [None]:
sia.polarity_scores('This is the worst thing ever.')

In [None]:
sia.polarity_scores(example)

In [None]:
# Score every review with VADER; the resulting dict maps review Id -> the
# polarity-score dict returned by the analyzer.
res = {
    review_id: sia.polarity_scores(review_text)
    for review_id, review_text in tqdm(zip(df['Id'], df['Text']), total=len(df))
}

In [None]:
# Build one row per review from the {Id: scores} dict, expose the Id as a
# regular column, then attach the review metadata. The merge names no key,
# so pandas joins on the columns the two frames have in common.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

In [None]:
vaders.head()

### VADER Results

In [None]:
# Mean VADER compound score for each star rating.
fig, ax = plt.subplots()
sns.barplot(data=vaders, x='Score', y='compound', ax=ax)
ax.set_title('Compound Score by Amazon Star Review')
plt.show()

In [None]:
# One panel per VADER component (positive / neutral / negative), each showing
# the component's value against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (component, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=component, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

### RoBERTa Pretrained Model

In [None]:
# !pip install torch

In [None]:
# !pip install transformers

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hugging Face Hub id of the sentiment checkpoint. The tokenizer and the
# sequence-classification model are both loaded from the same id so that
# they match each other.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER results on example
print(example)
sia.polarity_scores(example)

In [None]:
# Run the RoBERTa model on the example review: tokenize to PyTorch tensors,
# take the logits of the single input, and turn them into probabilities.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
logits = output[0][0].detach().numpy()
scores = softmax(logits)
label_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = dict(zip(label_names, scores))
print(scores_dict)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the RoBERTa sentiment model loaded in this notebook.

    Args:
        example: The text to classify.
        max_length: Maximum number of tokens fed to the model. Longer inputs
            are truncated to this length instead of making the forward pass
            fail. The default of 512 is the usual RoBERTa-base limit
            (NOTE(review): confirm against the checkpoint's config).

    Returns:
        dict with keys 'roberta_neg', 'roberta_neu' and 'roberta_pos' holding
        the softmax probabilities of the model's three output classes.
    """
    # Local import: torch is not imported at notebook level, but it is
    # already required by return_tensors='pt'.
    import torch

    # The explicit max_length makes truncation independent of whatever limit
    # the tokenizer config declares.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single input in the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [None]:
# Score every review with both VADER and RoBERTa. `res` maps review Id -> a
# single dict holding the vader_* and roberta_* scores for that review.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Read the row's fields before the try block so the failure message
    # below always names the row that actually failed.
    text = row['Text']
    myid = row['Id']
    try:
        # Prefix the VADER keys so they cannot be confused with the RoBERTa ones.
        vader_result_rename = {
            f"vader_{key}": value
            for key, value in sia.polarity_scores(text).items()
        }
        roberta_result = polarity_scores_roberta(text)
    except (RuntimeError, IndexError):
        # A review too long for the model fails in the forward pass; depending
        # on the installed torch/transformers versions this surfaces as a
        # RuntimeError or an IndexError. Skip such reviews and keep going.
        print(f'Broke for id {myid}')
        continue
    res[myid] = {**vader_result_rename, **roberta_result}

# Merging Both Models Results

In [None]:
# One row per scored review: transpose the {Id: scores} dict, expose the Id as
# a column, then attach the review metadata. The merge names no key, so
# pandas joins on the columns the two frames have in common.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

In [None]:
results_df.head(3)

In [None]:
results_df.columns

# VADER & ROBERTA Result Comparison

In [None]:
# Pairwise scatter plots of all six model scores, coloured by star rating.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

## Review Examples
#### Positive 1-Star and Negative 5-Star Reviews
#### Examples where the model scoring and review score differ the most.

#### Positive sentiment 1-Star review

In [None]:
# Text of the 1-star review that RoBERTa rates as most positive.
one_star = results_df[results_df['Score'] == 1]
one_star.sort_values('roberta_pos', ascending=False)['Text'].iloc[0]

In [None]:
# Text of the 1-star review that VADER rates as most positive.
one_star = results_df[results_df['Score'] == 1]
one_star.sort_values('vader_pos', ascending=False)['Text'].iloc[0]

#### Negative sentiment 5-Star review

In [None]:
# Text of the 5-star review that RoBERTa rates as most negative.
five_star = results_df[results_df['Score'] == 5]
five_star.sort_values('roberta_neg', ascending=False)['Text'].iloc[0]

In [None]:
# Text of the 5-star review that VADER rates as most negative.
five_star = results_df[results_df['Score'] == 5]
five_star.sort_values('vader_neg', ascending=False)['Text'].iloc[0]

# Trying Transformers pipeline
#### Easy way to run sentiment analysis using the Hugging Face Transformers pipeline

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the library's implicit
# default for "sentiment-analysis": the run stays reproducible if that default
# ever changes, and the "no model was supplied" warning goes away.
# NOTE(review): this is believed to be the current default checkpoint — confirm.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Two hand-written product reviews, one clearly positive and one clearly
# negative. The em dash in the first sentence was mis-encoded and is restored.
out_1 = sent_pipeline("The battery life on this new laptop is phenomenal—I can finally get a full day's work done without needing a charger.")
out_2 = sent_pipeline("This app constantly crashes when I try to save my progress; it's practically unusable and a huge waste of time")

print(" Sentence 1: ")
print(out_1)


print(" Sentence 2: ")
print(out_2)


In [None]:
# Two social-media style posts ending in an emoji. The emojis were mis-encoded
# and are restored; the labels are numbered 3 and 4 to match out_3 / out_4.
out_3 = sent_pipeline("Just saw the trailer for the new sci-fi series and I'm totally hyped! It looks like a masterpiece in the making.🤩")
out_4 = sent_pipeline("Can't believe they raised the price of the monthly subscription again. Seriously considering canceling, this is ridiculous. 😠")

print(" Sentence 3: ")
print(out_3)


print(" Sentence 4: ")
print(out_4)
