In [2]:
# VADER = Valence Aware Dictionary and sEntiment Reasoner
# Primarily, VADER sentiment analysis uses a dictionary (lexicon) that maps lexical features to emotion intensities, known as sentiment scores.
# Each word in the lexicon has a negative or positive sentiment score attached to it.

In [3]:
import nltk

In [4]:
# Download the 'vader_lexicon' resource into the local nltk_data directory.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alessandroalberga/nltk_data...


True

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [6]:
# Create the analyzer. sid.polarity_scores(text) takes a string and returns a
# dict of scores with the keys 'neg', 'neu', 'pos' and 'compound'.
sid = SentimentIntensityAnalyzer()

In [9]:
# Example sentence: plain, mildly positive wording.
a = 'This is a good movie'

In [10]:
# Score the current example sentence held in `a`.
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [11]:
# Example sentence: superlatives, capitalised words and exclamation marks.
a = 'This was the best, most awesome movie EVER MADE!!!'

In [12]:
# Score the current example sentence held in `a`.
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [13]:
# Example sentence: strongly negative wording with a capitalised 'WORST'.
a = 'This was the WORST movie that has ever disgraced the screen'

In [14]:
# Score the current example sentence held in `a`.
sid.polarity_scores(a)

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}

In [15]:
# Load the Amazon reviews dataset (a tab-separated file) into a DataFrame.
import pandas as pd

reviews_path = '../TextFiles/amazonreviews.tsv'
df = pd.read_csv(
    reviews_path,
    sep='\t',
)

In [16]:
# Preview the first rows to confirm the 'label' and 'review' columns loaded.
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [17]:
# Count how many reviews carry each label to check the class balance.
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [18]:
# Remove rows that contain missing values, modifying df in place.
df.dropna(inplace=True)

In [19]:
# Collect the index labels of reviews that consist only of whitespace.
# Iterating over the 'review' column by name (instead of unpacking every row
# into exactly three values) keeps this cell re-runnable after later cells
# have added more columns to df.
blank_indexes = [
    idx
    for idx, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]

In [20]:
# Inspect the collected index labels; an empty list means no whitespace-only reviews.
blank_indexes

[]

In [21]:
# Drop the rows whose index labels are listed in blank_indexes, modifying df in place.
# With an empty list this leaves df unchanged.
df.drop(blank_indexes, inplace=True)

In [22]:
# Sanity check: score the text of the first review in the dataset.
sid.polarity_scores(df['review'].iloc[0])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [24]:
# Show the full text of the first review for comparison with its scores.
df['review'].iloc[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [25]:
# Score every review; each entry of 'scores' is the dict returned by
# sid.polarity_scores for that review's text.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [26]:
# Preview the first rows, now including the 'scores' column of score dicts.
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [28]:
# Pull the 'compound' value out of each score dict into its own numeric column.
df['compound'] = [review_scores['compound'] for review_scores in df['scores']]

In [29]:
# Inspect the extracted compound scores.
df['compound']

0       0.9454
1       0.8957
2       0.9858
3       0.9814
4       0.9781
         ...  
9995    0.9610
9996    0.9544
9997    0.9102
9998   -0.3595
9999    0.9107
Name: compound, Length: 10000, dtype: float64

In [30]:
# Turn the compound score into a predicted label: scores of 0 or above
# (including exactly 0) become 'pos', everything else becomes 'neg'.
is_non_negative = df['compound'] >= 0
df['comp_score'] = is_non_negative.map({True: 'pos', False: 'neg'})

In [31]:
# Display the frame with the derived 'scores', 'compound' and 'comp_score' columns.
# NOTE(review): the saved output of this cell also shows a 'compund' column that
# no cell in this notebook creates; it looks like leftover kernel state from an
# earlier, misspelled assignment. Re-run from a fresh kernel to regenerate it.
df

Unnamed: 0,label,review,scores,compund,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,0.9781,pos
...,...,...,...,...,...,...
9995,pos,A revelation of life in small town America in ...,"{'neg': 0.017, 'neu': 0.846, 'pos': 0.136, 'co...",0.9610,0.9610,pos
9996,pos,Great biography of a very interesting journali...,"{'neg': 0.0, 'neu': 0.868, 'pos': 0.132, 'comp...",0.9544,0.9544,pos
9997,neg,Interesting Subject; Poor Presentation: You'd ...,"{'neg': 0.084, 'neu': 0.754, 'pos': 0.162, 'co...",0.9102,0.9102,pos
9998,neg,Don't buy: The box looked used and it is obvio...,"{'neg': 0.091, 'neu': 0.909, 'pos': 0.0, 'comp...",-0.3595,-0.3595,neg


In [32]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [33]:
# Fraction of reviews whose predicted label ('comp_score') matches the true 'label'.
accuracy_score(df['label'], df['comp_score'])

0.7097

In [35]:
# Per-class precision, recall and F1, with true labels first and predictions second.
# print() renders the report's line breaks so it displays as a table.
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [36]:
# Confusion matrix: rows are the true labels and columns the predicted labels,
# in the order ('neg', 'pos'). The top-right cell therefore counts negative
# reviews that were predicted as positive.
print(confusion_matrix(df['label'], df['comp_score']))

[[2629 2468]
 [ 435 4468]]
