In [14]:
import pandas as pd
import numpy as np

In [15]:
def load_movie_reviews():

    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except:
        import nltk
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            review_words = movie_reviews.words(fileid)
            review_text = ''

            for word in review_words:
                review_text += ' ' + word

            review_dictionary = {
                'review': review_text,
                'label': category
            }

            raw_data.append(review_dictionary)

    return pd.DataFrame(raw_data) 

In [16]:
def drop_empty_data(df):
    df.dropna(inplace=True)
    blanks = []
    for i,lb,rv in df.itertuples():
        if type(rv)==str:
            if rv.isspace():
                blanks.append(i)
    df.drop(blanks, inplace=True)

    return df

In [17]:
df = load_movie_reviews()
df.head()

Unnamed: 0,label,review
0,neg,plot : two teen couples go to a church party ...
1,neg,the happy bastard ' s quick movie review damn...
2,neg,it is movies like these that make a jaded mov...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing...


In [18]:
df = drop_empty_data(df)
df['label'].value_counts()

pos    1000
neg    1000
Name: label, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

In [21]:
text_clf_lsvc.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [22]:
predictions = text_clf_lsvc.predict(X_test)

In [23]:
from sklearn import metrics
def print_metrics(y_test,predictions):
    print(metrics.classification_report(y_test,predictions))
    print('Accuracy: ',metrics.accuracy_score(y_test,predictions))
    print('Confusion matrix:\n',metrics.confusion_matrix(y_test,predictions))

In [24]:
print_metrics(y_test,predictions)

precision    recall  f1-score   support

         neg       0.86      0.80      0.83       330
         pos       0.81      0.87      0.84       330

   micro avg       0.83      0.83      0.83       660
   macro avg       0.84      0.83      0.83       660
weighted avg       0.84      0.83      0.83       660

Accuracy:  0.8348484848484848
Confusion matrix:
 [[265  65]
 [ 44 286]]


In [25]:
def predict_review_label(review_text):
    print('Positive' if text_clf_lsvc.predict([review_text])[0] == 'pos' else 'Negative')

In [26]:
predict_review_label('the best movie of this year, I\'d watch it every day')

Positive
