In [1]:
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]


[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [4]:
import random
random.shuffle(documents)
documents[0:5]

[(['john', 'cusack', 'is', 'the', 'kind', 'of', 'actor', ...], 'pos'),
 (['one', 'never', 'quite', 'knows', 'what', 'one', 'is', ...], 'pos'),
 (['have', 'you', 'ever', 'been', 'in', 'an', ...], 'neg'),
 (['plot', ':', 'odin', 'is', 'a', 'great', 'high', ...], 'pos'),
 (['"', 'living', 'out', 'loud', ',', '"', 'is', 'the', ...], 'pos')]

In [5]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [6]:
lemmatizer=WordNetLemmatizer()

In [7]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [8]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_words=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_words.lower())
    return output_words            

In [9]:
documents=[(clean_review(document),category) for document, category in documents]

In [10]:
documents[0]

(['john',
  'cusack',
  'kind',
  'actor',
  'seem',
  'effortlessly',
  'slide',
  'respective',
  'film',
  'role',
  'effortlessly',
  'people',
  'tend',
  'forget',
  'much',
  'way',
  'people',
  'rarely',
  'recall',
  'many',
  'great',
  'character',
  'actor',
  'anyone',
  'put',
  'name',
  'james',
  'rebhorn',
  'actor',
  'face',
  'invite',
  'treat',
  'product',
  'one',
  'sponsor',
  'example',
  'day',
  'mother',
  'ask',
  'expert',
  'course',
  'movie',
  'worth',
  'see',
  'never',
  'mind',
  'taste',
  'divergent',
  '---',
  'never',
  'forget',
  'day',
  'recommend',
  'go',
  'see',
  'night',
  'roxbury',
  'god',
  'sake',
  'fair',
  'mightily',
  'piss',
  'tell',
  'go',
  'lot',
  'fun',
  'see',
  'anything',
  'futile',
  'attempt',
  'conversation',
  'mutter',
  'little',
  'trepidation',
  'might',
  'enjoy',
  'high',
  'fidelity',
  'respond',
  'usual',
  'query',
  'whose',
  'one',
  'question',
  'whose',
  'answer',
  'seem',
  'immed

In [11]:
training_documents=documents[0:1500]
testing_documents=documents[1500:]

In [12]:
all_words=[]
for doc in training_documents:
    all_words+= doc[0]

In [13]:
import nltk
freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)
features=[ i[0] for i in common]

In [14]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'go',
 'see',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'bad',
 'come',
 'life',
 'two',
 'look',
 'give',
 'know',
 'way',
 'end',
 '--',
 'seem',
 'first',
 'work',
 'year',
 'thing',
 'plot',
 'say',
 'really',
 'play',
 'show',
 'people',
 'little',
 'star',
 'could',
 'never',
 'love',
 'man',
 'great',
 'try',
 'performance',
 'director',
 'best',
 'many',
 'new',
 'big',
 'action',
 'actor',
 'want',
 'watch',
 'u',
 'find',
 'role',
 'think',
 'act',
 'another',
 'back',
 'something',
 'turn',
 'audience',
 'still',
 'world',
 'however',
 'day',
 'old',
 'set',
 'begin',
 'comedy',
 'use',
 'every',
 'part',
 'real',
 'feel',
 'guy',
 'around',
 'though',
 'interest',
 'point',
 'last',
 'enough',
 'cast',
 'run',
 'write',
 'may',
 'young',
 'name',
 'minute',
 'fact',
 'actually',
 'woman',
 'long',
 'script',
 'lot',
 'friend',
 'right',
 'nothing',
 'altho

In [15]:
def get_feature_dict(words):
    current_features={}
    word_set= set(words)
    for w in features:
        current_features[w]= w in word_set
    return current_features    

In [23]:
training_data=[(get_feature_dict(doc),category) for doc, category in training_documents]
testing_data=[(get_feature_dict(doc),category) for doc, category in testing_documents]

In [17]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': False,
  'like': True,
  'character': True,
  'get': True,
  'go': True,
  'see': True,
  'time': False,
  'well': False,
  'scene': False,
  'even': True,
  'good': True,
  'story': False,
  'take': True,
  'would': True,
  'much': True,
  'also': True,
  'bad': False,
  'come': True,
  'life': False,
  'two': False,
  'look': True,
  'give': True,
  'know': True,
  'way': True,
  'end': False,
  '--': False,
  'seem': True,
  'first': True,
  'work': True,
  'year': True,
  'thing': True,
  'plot': True,
  'say': True,
  'really': True,
  'play': True,
  'show': True,
  'people': True,
  'little': True,
  'star': False,
  'could': False,
  'never': True,
  'love': True,
  'man': True,
  'great': True,
  'try': False,
  'performance': True,
  'director': True,
  'best': True,
  'many': True,
  'new': True,
  'big': True,
  'action': False,
  'actor': True,
  'want': True,
  'watch': False,
  'u': True,
  'find': False,
  'role'

In [19]:
from nltk import NaiveBayesClassifier

In [21]:
classfier= NaiveBayesClassifier.train(training_data)

In [24]:
nltk.classify.accuracy(classfier,testing_data)

0.786

In [26]:
classfier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =      9.4 : 1.0
                   inept = True              neg : pos    =      9.1 : 1.0
                    lame = True              neg : pos    =      8.3 : 1.0
                  alicia = True              neg : pos    =      8.2 : 1.0
               ludicrous = True              neg : pos    =      7.7 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                flawless = True              pos : neg    =      7.1 : 1.0
                   mulan = True              pos : neg    =      7.1 : 1.0
             wonderfully = True              pos : neg    =      6.9 : 1.0
                religion = True              pos : neg    =      6.8 : 1.0
               fantastic = True              pos : neg    =      6.8 : 1.0
                  castle = True              pos : neg    =      6.7 : 1.0
              uninspired = True              neg : pos    =      6.6 : 1.0

In [33]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [34]:
svc=SVC()
classifier_sklearn=SklearnClassifier(svc)

In [35]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))>

In [36]:
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.848

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
rbf=RandomForestClassifier()
classifier_sklearn1=SklearnClassifier(rbf)

In [44]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False))>

In [45]:
nltk.classify.accuracy(classifier_sklearn1,testing_data)

0.818