In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_movie_reviews():
    """Load the NLTK ``movie_reviews`` corpus into a pandas DataFrame.

    If the corpus cannot be found locally, ``nltk.download('movie_reviews')``
    is called once and loading is attempted again. Any error raised while
    reading the corpus after that attempt propagates to the caller.

    Returns:
        pandas.DataFrame: one row per corpus file, with two columns:

        * ``review`` - the file's tokens, each one prefixed by a single
          space (a non-empty review therefore starts with a space; a file
          with no tokens yields ``''``).
        * ``label`` - the corpus category the file is listed under.
    """
    from nltk.corpus import movie_reviews

    try:
        # Probe the corpus so a missing download is detected up front.
        movie_reviews.categories()
    except LookupError:
        # NLTK reports a missing resource with LookupError. Catch only that,
        # so unrelated failures (and KeyboardInterrupt) are not swallowed
        # and turned into a surprise download.
        import nltk
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = [
        {
            # One linear-time join instead of repeated ``+=`` on a string;
            # the resulting text is identical, leading space included.
            'review': ''.join(' ' + word for word in movie_reviews.words(fileid)),
            'label': category,
        }
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
    ]

    return pd.DataFrame(raw_data)

In [3]:
def drop_empty_data(df):
    """Remove rows with missing values or blank text from ``df`` in place.

    A row is dropped when any of its columns holds a missing value
    (``NaN``/``None``), or a string that is empty or made up only of
    whitespace. Non-string values are never treated as blank.

    Args:
        df (pandas.DataFrame): frame to clean. It is modified in place.

    Returns:
        pandas.DataFrame: the same ``df`` object, after the rows were removed.
    """
    df.dropna(inplace=True)

    # ``name=None`` yields plain tuples ``(index_label, value_1, ..., value_n)``.
    # Scanning every value keeps the check independent of how many columns the
    # frame has and of their order; unpacking a fixed ``(i, lb, rv)`` triple
    # would inspect the wrong column whenever the text is not the second one.
    blanks = [
        row[0]
        for row in df.itertuples(index=True, name=None)
        if any(
            isinstance(value, str) and (not value or value.isspace())
            for value in row[1:]
        )
    ]
    df.drop(blanks, inplace=True)

    return df

In [4]:
# Build the review DataFrame from the NLTK corpus and preview its first rows.
df = load_movie_reviews()
df.head()

Unnamed: 0,label,review
0,neg,plot : two teen couples go to a church party ...
1,neg,the happy bastard ' s quick movie review damn...
2,neg,it is movies like these that make a jaded mov...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing...


In [5]:
# Drop the rows drop_empty_data flags as empty, then count the remaining
# reviews per label.
df = drop_empty_data(df)
df['label'].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

In [6]:
# Download NLTK's 'vader_lexicon' package; presumably required by the VADER
# SentimentIntensityAnalyzer created in the next cell.
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ariel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single analyzer instance; the labelling cell below calls
# sid.polarity_scores(...) once per review.
sid = SentimentIntensityAnalyzer()



In [8]:
# Label each review from its 'compound' polarity score: a score >= 0 becomes
# 'pos' (so a score of exactly 0 counts as positive), anything below 0 'neg'.
df['sentiment'] = df['review'].apply(lambda review: 'pos' if sid.polarity_scores(review)['compound'] >=0 else 'neg')

# Preview the frame with the new 'sentiment' column.
df.head()

Unnamed: 0,label,review,sentiment
0,neg,plot : two teen couples go to a church party ...,pos
1,neg,the happy bastard ' s quick movie review damn...,pos
2,neg,it is movies like these that make a jaded mov...,pos
3,neg,""" quest for camelot "" is warner bros . ' firs...",neg
4,neg,synopsis : a mentally unstable man undergoing...,pos


In [9]:
from sklearn import metrics
def print_metrics(y_test,predictions):
    """Print a classification report, the accuracy and the confusion matrix.

    Each metric is computed with ``sklearn.metrics`` and printed right away,
    in that order.

    Args:
        y_test: the reference labels.
        predictions: the predicted labels, compared against ``y_test``.

    Returns:
        None. The results are only written to standard output.
    """
    report = metrics.classification_report(y_test, predictions)
    print(report)

    accuracy = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: ', accuracy)

    matrix = metrics.confusion_matrix(y_test, predictions)
    print('Confusion matrix:\n', matrix)

In [10]:
# Score the VADER-derived 'sentiment' column against the corpus 'label' column.
print_metrics(df['label'],df['sentiment'])

precision    recall  f1-score   support

         neg       0.72      0.42      0.53      1000
         pos       0.59      0.84      0.69      1000

   micro avg       0.63      0.63      0.63      2000
   macro avg       0.65      0.63      0.61      2000
weighted avg       0.65      0.63      0.61      2000

Accuracy:  0.628
Confusion matrix:
 [[420 580]
 [164 836]]
