In [38]:
import pandas as pd
import numpy as np
import nltk
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from utils import load_sst_nltk, load_imdb_nltk
from sklearn.metrics import accuracy_score

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [112]:
class VaderClassifier:
    '''Binary sentiment classifier driven by VADER compound scores.

    dataset selects how a single text is scored:
      * 'IMDB' - the text is split into sentences, each sentence is scored,
        and the median compound score decides the label.
      * 'SST'  - the whole text is scored in one call.
    A compound score >= 0 maps to 1 (positive), anything below to 0.
    '''
    def __init__(self, dataset='IMDB'):
        self.dataset = dataset
        self.sid = SentimentIntensityAnalyzer()

    @staticmethod
    def _as_list(values):
        '''Normalises an input to a list of examples.

        A lone string (or any non-iterable scalar) becomes a one-element
        list; tuples, arrays, Series and other iterables are expanded.
        '''
        if isinstance(values, list):
            return values
        # A string is iterable, but it is ONE example, not a sequence of chars.
        if isinstance(values, (str, bytes)):
            return [values]
        try:
            return list(values)
        except TypeError:
            # Not iterable (e.g. a single int label).
            return [values]

    def predict(self, X):
        '''Predicts sentiment for one or many examples.
        Returns an int list (one 0/1 label per example)
        '''
        return [self._predict(x) for x in self._as_list(X)]

    def _predict(self, x):
        '''Predicts sentiment for a single example.
        Returns 1 if pos, 0 otherwise (based on compound score)
        Raises ValueError if self.dataset is neither 'IMDB' nor 'SST'
        '''
        if self.dataset == 'IMDB':
            # Only the compound score is used, so collect plain floats
            # instead of building a DataFrame for every review.
            compounds = [
                self.sid.polarity_scores(sentence)['compound']
                for sentence in tokenize.sent_tokenize(x)
            ]
            # Text with no sentences is treated as neutral (compound 0.0)
            # rather than failing on an empty collection.
            median_compound = np.median(compounds) if compounds else 0.0

            return int(median_compound >= 0)
        elif self.dataset == 'SST':
            score = self.sid.polarity_scores(x)

            return int(score['compound'] >= 0)
        else:
            raise ValueError(f"Invalid dataset: {self.dataset}")

    def score(self, X, y):
        '''
        Returns accuracy score for X and y
        '''
        y_true = self._as_list(y)
        y_pred = self.predict(X)
        # accuracy_score expects (y_true, y_pred) in that order.
        return accuracy_score(y_true, y_pred)

## SST

In [113]:
# Unpack the four values returned by load_sst_nltk() (train/test inputs and labels, judging by the names).
# NOTE(review): `y_test__raw` has a doubled underscore (probably meant `y_test_raw`); it is not
# referenced by any visible cell - confirm before renaming.
x_train_raw, y_train_raw, x_test_raw, y_test__raw = load_sst_nltk()



Getting splits
Loading splits


In [114]:
len(x_train_raw)

2202

In [115]:
x_train_raw[0]

"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."

In [116]:
y_train_raw[0]

'positive'

In [117]:
clf = VaderClassifier(dataset='SST')
X = x_train_raw
# Binary targets: 'negative' -> 0, every other label -> 1.
y = [0 if label == 'negative' else 1 for label in y_train_raw]

In [118]:
X[0]

"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."

In [119]:
y[0]

1

In [120]:
clf.score(X, y)

0.7275204359673024

## IMDB

In [127]:
# Unpack the four values returned by load_imdb_nltk() (train/test inputs and labels, judging by the names).
# NOTE(review): `y_test__raw` has a doubled underscore (probably meant `y_test_raw`); it is not
# referenced by any visible cell - confirm before renaming.
x_train_raw, y_train_raw, x_test_raw, y_test__raw = load_imdb_nltk()



Getting splits
Loading splits


In [128]:
# Show one raw training example and its label, separated by a blank line.
idx = 20000
x, y = x_train_raw[idx], y_train_raw[idx]

print(x, y, sep='\n\n')

This movie tries hard, but completely lacks the fun of the 1960s TV series, that I am sure people do remember with fondness. Although I am 17, I watched some of the series on YouTube a long time ago and it was enjoyable and fun. Sadly, this movie does little justice to the series.<br /><br />The special effects are rather substandard, and this wasn't helped by the flat camera-work. The script also was dull and lacked any sense of wonder and humour. Other films with under-par scripting are Home Alone 4, Cat in the Hat, Thomas and the Magic Railroad and Addams Family Reunion.<br /><br />Now I will say I liked the idea of the story, but unfortunately it was badly executed and ran out of steam far too early, and I am honestly not sure for this reason this is something for the family to enjoy. And I was annoyed by the talking suit, despite spirited voice work from Wayne Knight.<br /><br />But the thing that angered me most about this movie was that it wasted the talents of Christopher Lloyd

In [129]:
clf = VaderClassifier(dataset='IMDB')
X = x_train_raw
# Binary targets: 'neg' -> 0, every other label -> 1.
y = [0 if label == 'neg' else 1 for label in y_train_raw]

In [130]:
clf.score(X, y)

0.63728

# \*\*Hacking section\*\*

In [123]:
# Frame holding the first 3000 entries of x_train_raw / y_train_raw as the 'text' / 'label' columns.
df = pd.DataFrame({'text': x_train_raw[:3000], 'label': y_train_raw[:3000]})

In [124]:
df

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,pos
1,Homelessness (or Houselessness as George Carli...,pos
2,Brilliant over-acting by Lesley Ann Warren. Be...,pos
3,This is easily the most underrated film inn th...,pos
4,This is not the typical Mel Brooks film. It wa...,pos
...,...,...
2995,Has Al Pacino ever been in a bad movie? His na...,pos
2996,I read the negative comments before viewing th...,pos
2997,Across the country and especially in the polit...,pos
2998,This is one of the best crime-drama movies dur...,pos


In [88]:
def predict(x, sid):
    '''Predicts sentiment for a single example.

    x is split into sentences, every sentence is scored with
    sid.polarity_scores, and the median compound score decides the label.
    Returns 1 if pos, 0 otherwise (based on compound score)
    '''
    # Only the compound score is used, so collect plain floats instead of
    # building a DataFrame for every example.
    compounds = [
        sid.polarity_scores(sentence)['compound']
        for sentence in tokenize.sent_tokenize(x)
    ]
    # Text with no sentences is treated as neutral (compound 0.0) rather
    # than failing on an empty collection.
    median_compound = np.median(compounds) if compounds else 0.0

    return int(median_compound >= 0)

In [89]:
sid = SentimentIntensityAnalyzer()

In [90]:
# Label every review; `sid` is forwarded to predict as its second positional argument.
df['text'].apply(predict, args=(sid,))

KeyboardInterrupt: 

In [92]:
df.text.str.split('.')

0        [Bromwell High is a cartoon comedy,  It ran at...
1        [Homelessness (or Houselessness as George Carl...
2        [Brilliant over-acting by Lesley Ann Warren,  ...
3        [This is easily the most underrated film inn t...
4        [This is not the typical Mel Brooks film,  It ...
                               ...                        
24995    [Towards the end of the movie, I felt it was t...
24996    [This is the kind of movie that my enemies con...
24997    [I saw 'Descent' last night at the Stockholm F...
24998    [Some films that you pick up for a pound turn ...
24999    [This is one of the dumbest films, I've ever s...
Name: text, Length: 25000, dtype: object

In [96]:
# One row per sentence (index repeats the review's row label), mapped to its compound score.
(
    df['text']
    .apply(tokenize.sent_tokenize)
    .explode()
    .apply(lambda sentence: sid.polarity_scores(sentence)['compound'])
)

KeyboardInterrupt: 