In [1]:
import pandas as pd
import glob
import config
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report,f1_score
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1  import Features, EntitiesOptions, KeywordsOptions, SentimentOptions, CategoriesOptions

In [2]:
# Load every tab-separated file matching data/*.txt into one frame; column
# names are supplied explicitly as 'reviews' and 'sentiments'.
# glob.glob() returns matches in arbitrary, OS-dependent order, so the paths
# are sorted to keep the row order of `reviews` identical from run to run.
# NOTE(review): the default CSV quoting treats '"' as a quote character; if the
# review text contains unbalanced double quotes, several lines can be merged
# into one row. Confirm the row count against the raw files and consider
# quoting=csv.QUOTE_NONE if rows are missing.
reviews = pd.concat(
    [
        pd.read_csv(f, sep='\t', names=['reviews', 'sentiments'])
        for f in sorted(glob.glob('data/*.txt'))
    ],
    ignore_index=True,
)

In [3]:
reviews.head()

Unnamed: 0,reviews,sentiments
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
reviews.shape

(2748, 2)

In [5]:
# Row count of the least frequent sentiment label. sample_minimum() uses this
# as the number of rows to draw.
minimum_value = reviews['sentiments'].value_counts().min()

In [6]:
def sample_minimum(reviews, n=None, random_state=None):
    """Return a random sample of rows from ``reviews``.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame (or the group handed over by ``groupby().apply``) to sample from.
    n : int, optional
        Number of rows to draw. When omitted, the notebook-level
        ``minimum_value`` is used, exactly as before.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample`` so the draw can be repeated.
        ``None`` (the default) leaves the sample unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, drawn without replacement.
    """
    # Fall back to the global only when the caller gives no explicit size, so
    # existing calls such as ``g.apply(sample_minimum)`` keep working.
    sample_size = minimum_value if n is None else n
    return reviews.sample(n=sample_size, random_state=random_state)

    

In [7]:
# Group the reviews by their sentiment label so that each class can be
# sampled separately.
g = reviews.groupby('sentiments')

In [8]:
# Apply sample_minimum to each sentiment group, then drop the index produced
# by groupby().apply() so the rows are renumbered from 0.
new_df = g.apply(sample_minimum).reset_index(drop=True)
new_df.head()

Unnamed: 0,reviews,sentiments
0,Why are these sad little vegetables so overcoo...,0
1,The problem I have is that they charge $11.99 ...,0
2,I was mortified.,0
3,This is essentially a communications tool that...,0
4,Mediocre food.,0


In [9]:
new_df['sentiments'].value_counts()

1    1362
0    1362
Name: sentiments, dtype: int64

### For VADER Sentiment

In [10]:
# Single VADER analyser instance, reused by vader_sentiment() for every review.
analyser = SentimentIntensityAnalyzer()


In [11]:
def vader_sentiment(text):
    """Label ``text`` as positive (1) or non-positive (0) using VADER.

    The label is 1 only when the ``compound`` entry of the analyser's
    polarity scores is strictly greater than zero; otherwise it is 0.
    """
    compound = analyser.polarity_scores(text)['compound']
    return 1 if compound > 0 else 0

### For TextBlob

In [12]:
def text_blob(text):
    """Label ``text`` as positive (1) or non-positive (0) using TextBlob.

    Returns 1 when the TextBlob sentiment polarity is strictly positive and
    0 when it is zero or negative.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1
    return 0

### For IBM-Watson

In [13]:
# Service endpoint handed to the Natural Language Understanding client.
URL = "https://gateway-lon.watsonplatform.net/natural-language-understanding/api"
# SECURITY: an API key that was written out in a comment here has been removed.
# Credentials must never appear in the notebook, not even commented out; the
# key is read from the local `config` module instead.
# NOTE(review): treat the removed key as exposed and revoke it if it was ever
# valid.

# Client used by Sentiment_score() for every Watson request.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2018-11-16',
    iam_apikey=config.API,  # supplied by the `config` module, not hard-coded
    url=URL,
)

  import sys


In [14]:
def Sentiment_score(input_text):
    """Label ``input_text`` as positive (1) or non-positive (0) using Watson NLU.

    ``input_text`` may be a sentence, a paragraph or a whole document. The
    document-level sentiment score (between -1 and 1) is read from the
    response; 1 is returned when it is strictly positive, 0 otherwise.
    """
    requested = Features(sentiment=SentimentOptions())
    response = natural_language_understanding.analyze(
        text=input_text,
        features=requested,
        language='en',
    ).get_result()
    document_sentiment = response.get('sentiment').get('document')
    return 1 if document_sentiment.get('score') > 0 else 0

In [17]:
# Add one 0/1 prediction column per local library by running each labelling
# function over every review text.
new_df['vader_sent'] = new_df['reviews'].apply(vader_sentiment)
new_df['textblob_sent'] = new_df['reviews'].apply(text_blob)


In [18]:
# Add the Watson 0/1 prediction column. Sentiment_score() calls the Watson
# client's analyze() once per review, so this runs one request per row.
new_df['IBM_sent'] = new_df['reviews'].apply(Sentiment_score)

In [39]:
# Prediction columns to evaluate, in the order they should be reported.
col = ['vader_sent', 'textblob_sent', 'IBM_sent']


def pd_score(df, col, target, labels=None):
    """Compute the accuracy of each prediction column against ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one prediction column per model.
    col : list of str
        Names of the prediction columns in ``df`` to score.
    target : array-like
        True labels, compared with each column by ``accuracy_score``.
    labels : list of str, optional
        Index labels for the result, one per entry of ``col``. When omitted,
        the known columns are shown as 'VADER', 'TextBlob' and 'IBM_Watson'
        and any other column keeps its own name.

    Returns
    -------
    pandas.Series
        Accuracy per column, in the order of ``col``.

    Raises
    ------
    ValueError
        If ``labels`` is given with a different length than ``col``.
    """
    # Labels are derived from ``col`` rather than hard-coded, so a different
    # column list can no longer be mislabelled or fail on a length mismatch.
    display_names = {
        'vader_sent': 'VADER',
        'textblob_sent': 'TextBlob',
        'IBM_sent': 'IBM_Watson',
    }
    if labels is None:
        labels = [display_names.get(name, name) for name in col]
    elif len(labels) != len(col):
        raise ValueError('labels must have one entry per column in col')
    accuracies = [accuracy_score(target, df[name]) for name in col]
    return pd.Series(accuracies, index=labels, dtype=float)
    

In [40]:
# Accuracy of every prediction column in `col` against the true sentiments.
scores = pd_score(new_df,col,new_df['sentiments'])

In [22]:
rate = accuracy_score(new_df['sentiments'], new_df['vader_sent'])

In [23]:
rate

0.8146108663729809

In [24]:
rate = accuracy_score(new_df['sentiments'], new_df['textblob_sent'])

In [25]:
rate

0.777165932452276

In [26]:
rate = accuracy_score(new_df['sentiments'], new_df['IBM_sent'])

In [27]:
rate

0.8784875183553598

In [28]:
new_df.head()

Unnamed: 0,reviews,sentiments,vader_sent,textblob_sent,IBM_sent
0,Why are these sad little vegetables so overcoo...,0,0,0,0
1,The problem I have is that they charge $11.99 ...,0,0,1,0
2,I was mortified.,0,0,0,0
3,This is essentially a communications tool that...,0,0,0,0
4,Mediocre food.,0,0,0,0


In [29]:
score = TextBlob(new_df['reviews'][1]).sentiment.polarity

In [30]:
score

0.275

In [31]:
score = analyser.polarity_scores(new_df['reviews'][1])

In [32]:
score

{'neg': 0.164, 'neu': 0.738, 'pos': 0.097, 'compound': -0.25}

In [42]:
print(scores)

VADER         0.814611
TextBlob      0.777166
IBM_Watson    0.878488
dtype: float64


In [44]:
confusion_matrix(new_df['sentiments'], new_df['IBM_sent'])

array([[1251,  111],
       [ 220, 1142]])

In [45]:
confusion_matrix(new_df['sentiments'], new_df['vader_sent'])

array([[1122,  240],
       [ 265, 1097]])

In [46]:
confusion_matrix(new_df['sentiments'], new_df['textblob_sent'])

array([[1034,  328],
       [ 279, 1083]])

In [48]:
print(classification_report(new_df['sentiments'], new_df['IBM_sent']))

              precision    recall  f1-score   support

           0       0.85      0.92      0.88      1362
           1       0.91      0.84      0.87      1362

    accuracy                           0.88      2724
   macro avg       0.88      0.88      0.88      2724
weighted avg       0.88      0.88      0.88      2724



In [49]:
print(classification_report(new_df['sentiments'], new_df['vader_sent']))

              precision    recall  f1-score   support

           0       0.81      0.82      0.82      1362
           1       0.82      0.81      0.81      1362

    accuracy                           0.81      2724
   macro avg       0.81      0.81      0.81      2724
weighted avg       0.81      0.81      0.81      2724



In [50]:
print(classification_report(new_df['sentiments'], new_df['textblob_sent']))

              precision    recall  f1-score   support

           0       0.79      0.76      0.77      1362
           1       0.77      0.80      0.78      1362

    accuracy                           0.78      2724
   macro avg       0.78      0.78      0.78      2724
weighted avg       0.78      0.78      0.78      2724



In [52]:
# Majority vote of the three classifiers for each review.
# DataFrame.mode(axis=1) returns a DataFrame (one column per tied mode), so the
# first column is selected explicitly instead of assigning a whole frame to a
# single column. With three 0/1 votes there is never a tie, so that first
# column is the majority label.
new_df['mode'] = new_df[['vader_sent', 'textblob_sent', 'IBM_sent']].mode(axis=1).iloc[:, 0]

In [54]:
new_df.head(30)

Unnamed: 0,reviews,sentiments,vader_sent,textblob_sent,IBM_sent,mode
0,Why are these sad little vegetables so overcoo...,0,0,0,0,0
1,The problem I have is that they charge $11.99 ...,0,0,1,0,0
2,I was mortified.,0,0,0,0,0
3,This is essentially a communications tool that...,0,0,0,0,0
4,Mediocre food.,0,0,0,0,0
5,I asked multiple times for the wine list and a...,0,0,0,0,0
6,"I started this review with two stars, but I'm ...",0,0,0,1,0
7,the spaghetti is nothing special whatsoever.,0,0,1,0,0
8,So just beware.,0,0,0,0,0
9,These are the central themes of the film and t...,0,0,0,0,0


In [55]:
confusion_matrix(new_df['sentiments'], new_df['mode'])

array([[1203,  159],
       [ 239, 1123]])

In [56]:
accuracy_score(new_df['sentiments'], new_df['mode'])

0.8538913362701909

In [58]:
print(classification_report(new_df['sentiments'], new_df['mode']))


              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1362
           1       0.88      0.82      0.85      1362

    accuracy                           0.85      2724
   macro avg       0.86      0.85      0.85      2724
weighted avg       0.86      0.85      0.85      2724

