# Get acquainted with the data

#### Importing libraries

In [1]:
import pandas as pd
import glob
import config
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report,f1_score
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1  import Features, EntitiesOptions, KeywordsOptions, SentimentOptions, CategoriesOptions

#### Loading the datasets into a single DataFrame

In [2]:
# Read every tab-separated review file and stack them into one frame.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so the paths
# are sorted to make the row order -- and therefore the seeded sampling done
# further down -- reproducible from one machine to the next.
reviews = pd.concat(
    [
        pd.read_csv(f, sep='\t', names=['reviews', 'sentiments'])
        for f in sorted(glob.glob('data/*.txt'))
    ],
    ignore_index=True,
)

#### Balancing the classes

In [3]:
# Row count of the rarest sentiment label; every label is cut down to this size.
minimum_value = reviews['sentiments'].value_counts().min()


def sample_minimum(reviews):
    """Return `minimum_value` randomly drawn rows of the given frame (fixed seed 42)."""
    return reviews.sample(n=minimum_value, random_state=42)


# Draw the same number of rows from each sentiment group, then flatten the
# result back to a plain 0..n-1 index.
g = reviews.groupby('sentiments')
new_df = g.apply(sample_minimum)
new_df = new_df.reset_index(drop=True)

In [4]:
new_df.head()

Unnamed: 0,reviews,sentiments
0,That's a huge design flaw (unless I'm not usin...,0
1,The plot was the same as pretty much every oth...,0
2,"In fact, this stinker smells like a direct-to-...",0
3,Took an hour to get our food only 4 tables in ...,0
4,Omit watching this.,0


In [5]:
new_df['sentiments'].value_counts()

1    1362
0    1362
Name: sentiments, dtype: int64

# Build the analyzers

#### Instantiate the analyzers

In [6]:
# VADER sentiment analyzer (used by analyze_text for the 'VADER' option)
analyser = SentimentIntensityAnalyzer()

# IBM Watson Natural Language Understanding client.
# The API key is read from the local `config` module so it is not stored in the notebook.
URL = "https://gateway-lon.watsonplatform.net/natural-language-understanding/api"
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2018-11-16',
    iam_apikey=config.API,
    url=URL,
)


#### Create a function that scores a text with the chosen analyzer and returns a binary sentiment label

In [7]:
def analyze_text(input_text, analyzer):
    """Label ``input_text`` as positive (1) or not positive (0).

    Parameters
    ----------
    input_text : str
        The text to score.
    analyzer : str
        ``'VADER'`` uses the compound score of the module-level ``analyser``;
        ``'TextBlob'`` uses TextBlob's sentiment polarity; any other value
        sends the text to the module-level IBM Watson client and uses the
        document sentiment score of the response.

    Returns
    -------
    int
        1 when the score is strictly greater than 0, otherwise 0
        (a score of exactly 0 is therefore labelled 0).
    """
    if analyzer == 'VADER':
        score = analyser.polarity_scores(input_text)['compound']
    elif analyzer == 'TextBlob':
        score = TextBlob(input_text).sentiment.polarity
    else:
        # Every name other than the two above falls through to Watson.
        requested = Features(sentiment=SentimentOptions())
        response = natural_language_understanding.analyze(
            text=input_text, features=requested, language='en'
        ).get_result()
        score = response.get('sentiment').get('document').get('score')
    return 1 if score > 0 else 0
        
    

In [8]:
# Add one binary prediction column per analyzer: (new column name, analyzer key).
# 'Ibm' is not matched by name in analyze_text; it reaches Watson through the
# function's fall-through branch.
for sent_column, analyzer_key in [
    ('vader_sent', 'VADER'),
    ('textblob_sent', 'TextBlob'),
    ('IBM_sent', 'Ibm'),
]:
    new_df[sent_column] = new_df['reviews'].apply(analyze_text, analyzer=analyzer_key)

In [9]:
new_df.head()

Unnamed: 0,reviews,sentiments,vader_sent,textblob_sent,IBM_sent
0,That's a huge design flaw (unless I'm not usin...,0,1,1,0
1,The plot was the same as pretty much every oth...,0,0,1,0
2,"In fact, this stinker smells like a direct-to-...",0,0,0,0
3,Took an hour to get our food only 4 tables in ...,0,1,1,0
4,Omit watching this.,0,0,0,0


# Evaluate the Analyzers

#### Using Confusion matrix

In [10]:
col_name = ['True_Negative', 'False_Positive', 'False_Negative', 'True_Positive']
columns = ['vader_sent','textblob_sent', 'IBM_sent']
def matrix_and_array(target_column, columns, df=None):
    """Return one flattened binary confusion matrix per prediction column.

    Parameters
    ----------
    target_column : str
        Name of the column holding the true labels.
    columns : iterable of str
        Names of the columns holding the predicted labels.
    df : mapping of column name -> labels, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``new_df`` so existing two-argument calls keep working.

    Returns
    -------
    list of list of int
        For each prediction column, the counts in the order
        ``[TN, FP, FN, TP]`` (the row-major flattening of the 2x2 matrix
        whose rows are true labels and whose columns are predicted labels).
    """
    if df is None:
        df = new_df
    matrix_list = []
    for predicted_column in columns:
        # labels=[0, 1] pins the matrix to 2x2 in a fixed order, so the flattened
        # vector always has the four cells the `col_name` headers expect.
        matrix = confusion_matrix(df[target_column], df[predicted_column], labels=[0, 1])
        matrix_list.append(np.asarray(matrix).ravel().tolist())
    return matrix_list

In [11]:
# Row labels, in the same order as the prediction columns listed in `columns`.
index = ['VADER', 'TextBlob', 'IBM Watson']
matrix_list = matrix_and_array('sentiments', columns)
# One row per analyzer, one column per confusion-matrix cell.
truth = pd.DataFrame(data=matrix_list, index=index, columns=col_name)
truth

Unnamed: 0,True_Negative,False_Positive,False_Negative,True_Positive
VADER,1122,240,260,1102
TextBlob,1034,328,276,1086
IBM Watson,1251,111,217,1145


#### Using Accuracy Score, Precision and Recall

In [12]:
# For every analyzer (one row of `truth`):
#   precision = TP / (TP + FP)
#   recall    = TP / (TP + FN)
#   accuracy  = (TP + TN) / (TN + FP + FN + TP)
precision = []
recall = []
accuracy = []
# Pick the counts by column name so the arithmetic stays correct whatever the
# column order of `truth` is (a later cell reorders its columns), and leave the
# row label out of the tuples so the loop no longer overwrites the `index` list
# used to build `truth`.
count_columns = ['True_Negative', 'False_Positive', 'False_Negative', 'True_Positive']
for TN, FP, FN, TP in truth[count_columns].itertuples(index=False):
    p = TP/(TP + FP)
    r = TP/(TP+FN)
    a = (TP + TN)/(TN + FP + FN +TP)
    precision.append(round(p,2))
    recall.append(round(r,2))
    accuracy.append(round(a,2))

In [13]:
# Stack the three metric lists as rows, then transpose so that each analyzer
# becomes a row, ordered from most to least accurate.
data = [accuracy, precision, recall]
evaluate = (
    pd.DataFrame(
        data,
        index=['Accuracy_Score', 'Precision', 'Recall'],
        columns=['VADER', 'TextBlob', 'IBM Watson'],
    )
    .T
    .sort_values(by='Accuracy_Score', ascending=False)
)
evaluate

Unnamed: 0,Accuracy_Score,Precision,Recall
IBM Watson,0.88,0.91,0.84
VADER,0.82,0.82,0.81
TextBlob,0.78,0.77,0.8


# Using Voting Method

#### Get the most frequent sentiment in each row

In [15]:
# Majority vote of the three analyzers. DataFrame.mode(axis=1) returns a
# DataFrame (one column per tied mode), so take its first column explicitly
# instead of assigning the whole frame to a single column. The three votes are
# each 0 or 1, so every row has exactly one most frequent value.
new_df['Mode_sent'] = new_df[['vader_sent','textblob_sent', 'IBM_sent']].mode(axis=1).iloc[:, 0]

In [16]:
new_df.sample(10 , random_state=42)

Unnamed: 0,reviews,sentiments,vader_sent,textblob_sent,IBM_sent,Mode_sent
2227,great...no problems at all!.,1,0,0,1,0
1583,"It has everything you could want... suspense, ...",1,0,0,0,0
1462,These are certainly very comfortable and funct...,1,1,1,1,1
1832,It's a great item.,1,1,1,1,1
605,"Anyways, The food was definitely not filling a...",0,1,1,0,1
1533,You won't forget this movie!,1,1,0,0,0
291,The RI style calamari was a joke.,0,1,0,0,0
1057,Main thing I didn't enjoy is that the crowd is...,0,0,1,0,0
1133,The live music on Fridays totally blows.,0,0,1,0,0
2232,Go To Place for Gyros.,1,0,0,0,0


### Evaluate the analyser

###### Using Confusion Matrix

In [17]:
# Confusion matrix of the majority-vote column against the true labels
# (rows: true label, columns: predicted label).
mode_column = confusion_matrix(
    y_true=new_df['sentiments'],
    y_pred=new_df['Mode_sent'],
)
mode_column

array([[1203,  159],
       [ 234, 1128]])

In [18]:
# Flatten the 2x2 matrix row by row ([0][0], [0][1], [1][0], [1][1]) and store
# it as the 'Mode' row of `truth`.
truth.loc['Mode'] = list(mode_column.reshape(-1))

In [19]:
truth

Unnamed: 0,True_Negative,False_Positive,False_Negative,True_Positive
VADER,1122,240,260,1102
TextBlob,1034,328,276,1086
IBM Watson,1251,111,217,1145
Mode,1203,159,234,1128


###### Using Accuracy_Score, Precision and Recall

In [20]:
# Counts of the 'Mode' row as a plain array, in `truth`'s column order.
mode = truth.loc['Mode', :].values

In [21]:
# Name the four counts instead of indexing by position; the order matches the
# positions the formulas below have always used (0, 1, 2, 3).
mode_tn, mode_fp, mode_fn, mode_tp = mode
mode_accuracy = round((mode_tp + mode_tn) / (mode_tp + mode_tn + mode_fp + mode_fn), 2)
mode_precision = round(mode_tp / (mode_tp + mode_fp), 2)
mode_recall = round(mode_tp / (mode_tp + mode_fn), 2)

In [22]:
# Append the voting result as a row (values in the order Accuracy_Score,
# Precision, Recall) and show all rows ranked by accuracy.
evaluate.loc['Mode'] = [mode_accuracy, mode_precision, mode_recall]
evaluate.sort_values(by='Accuracy_Score', ascending=False)

Unnamed: 0,Accuracy_Score,Precision,Recall
IBM Watson,0.88,0.91,0.84
Mode,0.86,0.88,0.83
VADER,0.82,0.82,0.81
TextBlob,0.78,0.77,0.8


# EXTRAS

In [23]:
truth

Unnamed: 0,True_Negative,False_Positive,False_Negative,True_Positive
VADER,1122,240,260,1102
TextBlob,1034,328,276,1086
IBM Watson,1251,111,217,1145
Mode,1203,159,234,1128


In [24]:
# Put the two "correct" counts first, followed by the two error counts.
truth = truth.loc[:, ['True_Positive', 'True_Negative', 'False_Positive', 'False_Negative']]
truth

Unnamed: 0,True_Positive,True_Negative,False_Positive,False_Negative
VADER,1102,1122,240,260
TextBlob,1086,1034,328,276
IBM Watson,1145,1251,111,217
Mode,1128,1203,159,234


In [25]:
# Rank the three individual analyzers (voting row excluded) by true positives.
truth.drop(index='Mode').sort_values(by='True_Positive', ascending=False)

Unnamed: 0,True_Positive,True_Negative,False_Positive,False_Negative
IBM Watson,1145,1251,111,217
VADER,1102,1122,240,260
TextBlob,1086,1034,328,276


In [14]:
# col = ['vader_sent','textblob_sent', 'IBM_sent']
# def pd_score(df,col,target):
#     score = []
#     for items in col:
#         rate = accuracy_score(target, df[items])
#         score.append(rate)
#     scores = pd.Series(score, index=['VADER', 'TextBlob', 'IBM_Watson'])
#     return scores

# scores = pd_score(new_df,col,new_df['sentiments'])
# scores.sort_values(ascending=False)