In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.fileids()
len(movie_reviews.fileids())

2000

In [4]:
movie_reviews.fileids('neg')

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [5]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))

In [7]:
import random
random.shuffle(documents)
documents[0:5]

[(['here', 'is', 'a', 'movie', 'that', 'sadly', ...], 'neg'),
 (['"', 'sometimes', 'the', "'", 'green', 'mile', "'", ...], 'pos'),
 (['after', 'sixteen', 'years', 'francis', 'ford', ...], 'pos'),
 (['seen', 'september', '5', ',', '1998', 'at', '10', ...], 'pos'),
 (['movie', 'concepts', 'are', 'often', 'pitched', 'to', ...], 'neg')]

In [8]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))
import string
punctuations = list(string.punctuation)
stops.update(punctuations)

In [11]:
from nltk import pos_tag
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [12]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            #pos = pos_tag([w])
            #clean_word= lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(w.lower())
    return output_words

In [13]:
documents = [(clean_review(document),category) for document,category in documents]

In [14]:
len(documents)

2000

In [15]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [16]:
all_words = []
for doc in training_documents:
    all_words+=doc[0]

In [17]:
import nltk

In [18]:
freq = nltk.FreqDist(all_words)

In [19]:
common = freq.most_common(3000)
features = [i[0] for i in common]

In [20]:
features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'get',
 'also',
 'well',
 'two',
 'characters',
 'see',
 '--',
 'first',
 'way',
 'make',
 'really',
 'life',
 'plot',
 'films',
 'little',
 'bad',
 'could',
 'people',
 'scene',
 'man',
 'never',
 'best',
 'new',
 'scenes',
 'director',
 'know',
 'many',
 'movies',
 'love',
 'action',
 'another',
 'great',
 'go',
 'us',
 'made',
 'big',
 'something',
 'back',
 'still',
 'end',
 'seems',
 'work',
 'world',
 'makes',
 'however',
 'though',
 'audience',
 'around',
 'seen',
 'every',
 'better',
 'take',
 'real',
 'enough',
 'role',
 'performance',
 'old',
 'may',
 'going',
 'funny',
 'gets',
 'year',
 'think',
 'fact',
 'comedy',
 'thing',
 'say',
 'long',
 'actually',
 'years',
 'look',
 'things',
 'right',
 'last',
 'nothing',
 'played',
 'find',
 'john',
 'almost',
 'script',
 'come',
 'plays',
 'ever',
 'since',
 'cast',
 'comes',
 'part',
 'original',
 'although',
 'show',
 'you

In [21]:
def get_feature_dict(words):
    current_features={}
    word_set=set(words)
    for w in features:
        current_features[w]=w in word_set
    return current_features

In [22]:
get_feature_dict(training_documents[0][0])

{'film': True,
 'one': True,
 'movie': True,
 'like': True,
 'even': True,
 'good': False,
 'time': False,
 'story': False,
 'would': False,
 'much': False,
 'character': False,
 'get': False,
 'also': True,
 'well': False,
 'two': False,
 'characters': True,
 'see': False,
 '--': False,
 'first': False,
 'way': True,
 'make': False,
 'really': False,
 'life': True,
 'plot': True,
 'films': False,
 'little': False,
 'bad': False,
 'could': True,
 'people': False,
 'scene': False,
 'man': False,
 'never': False,
 'best': False,
 'new': False,
 'scenes': False,
 'director': False,
 'know': True,
 'many': False,
 'movies': True,
 'love': False,
 'action': False,
 'another': False,
 'great': False,
 'go': True,
 'us': True,
 'made': True,
 'big': True,
 'something': False,
 'back': False,
 'still': False,
 'end': False,
 'seems': True,
 'work': False,
 'world': True,
 'makes': True,
 'however': True,
 'though': False,
 'audience': True,
 'around': False,
 'seen': False,
 'every': False,
 '

In [23]:
training_data = [ (get_feature_dict(doc), category) for doc , category in training_documents]

In [24]:
testing_data = [ (get_feature_dict(doc), category) for doc , category in testing_documents]

In [25]:
from nltk import NaiveBayesClassifier

In [26]:
cls = NaiveBayesClassifier.train(training_data)

In [27]:
accuracy = nltk.classify.accuracy(cls, testing_data)

In [28]:
accuracy

0.818

In [29]:
cls.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     11.6 : 1.0
               stupidity = True              neg : pos    =     10.5 : 1.0
                  finest = True              pos : neg    =      9.5 : 1.0
                lifeless = True              neg : pos    =      7.2 : 1.0
             wonderfully = True              pos : neg    =      6.5 : 1.0
                 freddie = True              neg : pos    =      6.3 : 1.0
                  wasted = True              neg : pos    =      6.2 : 1.0
                 idiotic = True              neg : pos    =      6.0 : 1.0
                  prinze = True              neg : pos    =      5.9 : 1.0
                  seagal = True              neg : pos    =      5.9 : 1.0
                    lame = True              neg : pos    =      5.7 : 1.0
                    jedi = True              pos : neg    =      5.7 : 1.0
                     eve = True              neg : pos    =      5.6 : 1.0

# Movie-Reviews-SKLearn

In [32]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [33]:
svm =SVC()
classifier_sklearn = SklearnClassifier(svm)

In [34]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [37]:
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.854

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rfc = RandomForestClassifier()
classifier_sklearn = SklearnClassifier(rfc)

In [40]:
classifier_sklearn.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [41]:
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.806