In [29]:
import random
import nltk
from nltk.corpus import movie_reviews

In [34]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


In [36]:
# shuffle the documents
random.shuffle(documents)


In [44]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
print(all_words[0])
all_words = nltk.FreqDist(all_words)

print('Most common words: {}'.format(all_words.most_common(15)))
print('The word happy: {}'.format(all_words["happy"]))

plot
Most common words: [(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
The word happy: 215


In [46]:
# We'll use the 4000 most common words as features
print(len(all_words))
word_features = list(all_words.keys())[:4000]
len(word_features)

39768


4000

In [75]:
# The find_features function will determine which of the 3000 word features are contained in the review
def find_features(document):
    words = set(document)
  
    features = {}
    for w in word_features:
        features[w] = (w in words)
   # print(features)
    return features


In [78]:
# Lets use an example from a negative review
features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
    if value == True:
        print (key)

downshifts
package
deal
before
throughout
her
actors
cool
people
need
start
new
problem
playing
t
make
correctly
drive
good
guess
neighborhood
turning
little
seems
lazy
sitting
came
us
bad
but
character
generally
trying
accident
things
couples
video
horror


In [82]:
# Now lets do it for all the documents
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [83]:
featuresets[0]

({'sacked': False,
  'cap': False,
  '007': False,
  'es': False,
  'delpy': False,
  'belittling': False,
  'goodwin': False,
  'sharp': False,
  'cites': False,
  'flank': False,
  'crue': False,
  'fervour': False,
  '_real_': False,
  'card': False,
  'insinuating': False,
  'popeye': False,
  'base': False,
  'absolut': False,
  'taj': False,
  'piggy': False,
  '_halloween_': False,
  'dysfuntion': False,
  'dubbing': False,
  'feely': False,
  'expansive': False,
  'markedly': False,
  'blink': False,
  'nominees': False,
  'willis': False,
  'understand': False,
  'displaced': False,
  'extraction': False,
  'flanked': False,
  'unchanged': False,
  'massachusetts': False,
  'miko': False,
  'xvi': False,
  'superimposed': False,
  'enthusiasts': False,
  'lambs': False,
  'relaxes': False,
  'providing': False,
  'padding': False,
  'hairline': False,
  'vicki': False,
  'slooooow': False,
  'warda': False,
  'rebirth': False,
  'vcr': False,
  'parading': False,
  'animalitie

In [84]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

# define a seed for reproducibility
seed = 1

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 70.0
