In [1]:
# get data
import nltk
# nltk.download('movie_reviews')

In [3]:
from nltk.corpus import movie_reviews

In [4]:
# data size
len(movie_reviews.fileids())

2000

In [5]:
# data categs
movie_reviews.categories()

['neg', 'pos']

In [6]:
# cteg len
'neg data len :', len(movie_reviews.fileids('neg')), 'pos data len :', len(movie_reviews.fileids('pos'))

('neg data len :', 1000, 'pos data len :', 1000)

In [7]:
movie_reviews.fileids('pos')[0]

'pos/cv000_29590.txt'

convert to list all docs

In [8]:
docs = []
for categ in movie_reviews.categories():
    for fileid in data.fileids(categ):
        docs.append((data.words(fileid), categ))

In [9]:
len(docs)

2000

In [10]:
docs[0]

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')

In [11]:
# shufle data
from random import shuffle
shuffle(docs)

# Feature Extraction

To classify the text into any category, we need to define some criteria. On the basis of those criteria, our classifier will learn that a particular kind of text falls in a particular category. This kind of criteria is known as feature. We can define one or more feature to train our classifier.

In this example, we will use the top-N words feature.

#### Fetch all words from the movie reviews corpus

We first fetch all the words from all the movie reviews and create a list.

In [12]:
all_words = [word.lower() for word in data.words()]

In [13]:
all_words[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

#### Create Frequency Distribution of all words

Frequency Distribution will calculate the number of occurence of each word in the entire list of words.

In [14]:
from nltk import FreqDist

In [26]:
all_words_frequency = FreqDist(all_words)
all_words_frequency

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [16]:
all_words_frequency.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

#### Removing Punctuation and Stopwords

From the above frequency distribution of words, we can see the most frequently occurring words are either punctuation marks or stopwords.

Stop words are those frequently words which do not carry any significant meaning in text analysis. For example, I, me, my, the, a, and, is, are, he, she, we, etc.

Punctuation marks like comma, fullstop. inverted comma, etc. occur highly in any text data.

We will do data cleaning by removing stop words and punctuations.

#### Remove Stop Words

In [17]:
# download stopwords from nltk but once time only
# nltk.download('stopwords')

In [18]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# remove stopwords from all_words
all_words_removed_stopwords = [word for word in all_words if word not in stop_words]
all_words_removed_stopwords

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 ',',
 'drink',
 'drive',
 '.',
 'get',
 'accident',
 '.',
 'one',
 'guys',
 'dies',
 ',',
 'girlfriend',
 'continues',
 'see',
 'life',
 ',',
 'nightmares',
 '.',
 "'",
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 '"',
 'find',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'mind',
 '-',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 ',',
 'presents',
 'bad',
 'package',
 '.',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 ',',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 ',',
 'mess',
 'head',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 ',',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 ',',
 'folks',
 "'",
 'snag',
 'one',
 'correctly',
 '.',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 ',',
 'executed',
 'terribly',
 '.',
 'problems',
 'movie',
 '?',
 'well',
 ',',
 'main',
 'problem',
 "'",
 'simply',
 'jumbled',


#### Remove Punctuation

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
all_words_removed_punctuation = [word for word in all_words_removed_stopwords if word not in string.punctuation]
all_words_removed_punctuation

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [23]:
# do last two step together
cleaned_all_words = [word for word in all_words if word not in stop_words and word not in string.punctuation]
cleaned_all_words

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

#### Frequency Distribution of cleaned words list

Below is the frequency distribution of the new list after removing stopwords and punctuation.

In [27]:
len(all_words_frequency)

39768

In [28]:
all_words_frequency = FreqDist(cleaned_words)
all_words_frequency

FreqDist({'film': 9517, 'one': 5852, 'movie': 5771, 'like': 3690, 'even': 2565, 'good': 2411, 'time': 2411, 'story': 2169, 'would': 2109, 'much': 2049, ...})

In [30]:
len(all_words_frequency)

39586

Previously, before removing stopwords and punctuation, the frequency distribution was:

FreqDist with 39768 samples and 1583820 outcomes

Now, the frequency distribution is:

FreqDist with 39586 samples and 710578 outcomes

This shows that after removing around 200 stop words and punctuation, the outcomes/words number has reduced to around half of the original size.

The most common words or highly occurring words list has also got meaningful words in the list. Before, the first 10 frequently occurring words were only stop-words and punctuations.

#### Create Word Feature using 2000 most frequently occurring words

We take 2000 most frequently occurring words as our feature.

In [31]:
most_common_words = all_words_frequency.most_common(2000)
most_common_words

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906),
 ('characters', 1859),
 ('first', 1836),
 ('--', 1815),
 ('see', 1749),
 ('way', 1693),
 ('make', 1642),
 ('life', 1586),
 ('really', 1558),
 ('films', 1536),
 ('plot', 1513),
 ('little', 1501),
 ('people', 1455),
 ('could', 1427),
 ('scene', 1397),
 ('man', 1396),
 ('bad', 1395),
 ('never', 1374),
 ('best', 1333),
 ('new', 1292),
 ('scenes', 1274),
 ('many', 1268),
 ('director', 1237),
 ('know', 1217),
 ('movies', 1206),
 ('action', 1172),
 ('great', 1148),
 ('another', 1121),
 ('love', 1119),
 ('go', 1113),
 ('made', 1084),
 ('us', 1073),
 ('big', 1064),
 ('end', 1062),
 ('something', 1061),
 ('back', 1060),
 ('still', 1047),
 ('world', 1037),
 ('seems', 1033),
 ('work', 1020),
 ('makes', 992),
 ('however', 989),
 ('every', 947)

In [32]:
# the most common words list's elements are in the form of tuple
# get only the first element of each tuple of the word list
word_features = [row[0] for row in most_common_words]
word_features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well',
 'characters',
 'first',
 '--',
 'see',
 'way',
 'make',
 'life',
 'really',
 'films',
 'plot',
 'little',
 'people',
 'could',
 'scene',
 'man',
 'bad',
 'never',
 'best',
 'new',
 'scenes',
 'many',
 'director',
 'know',
 'movies',
 'action',
 'great',
 'another',
 'love',
 'go',
 'made',
 'us',
 'big',
 'end',
 'something',
 'back',
 'still',
 'world',
 'seems',
 'work',
 'makes',
 'however',
 'every',
 'though',
 'better',
 'real',
 'audience',
 'enough',
 'seen',
 'take',
 'around',
 'going',
 'year',
 'performance',
 'role',
 'old',
 'gets',
 'may',
 'things',
 'think',
 'years',
 'last',
 'comedy',
 'funny',
 'actually',
 'long',
 'look',
 'almost',
 'thing',
 'fact',
 'nothing',
 'say',
 'right',
 'john',
 'although',
 'played',
 'find',
 'script',
 'come',
 'ever',
 'cast',
 'since',
 'star',
 'plays',
 'young',
 'show',
 'comes',
 'part',

#### Create Feature Set

Now, we write a function that will be used to create feature set. The feature set is used to train the classifier.

We define a feature extractor function that checks if the words in a given document are present in the word_features list or not.

In [33]:
def doc_features(doc):
    # "set" function will remove repeated/duplicate tokens in the given list
    doc_words = set(doc)
    feetures = {}
    for word in word_features:
        feetures['contains(%s)' % word] = (word in doc_words)
    return feetures
# get first neg movie review
movie_review_file = movie_reviews.fileids('neg')[0]
movie_review_file

'neg/cv000_29416.txt'

In [34]:
doc_features(movie_reviews.words(movie_review_file))

{'contains(film)': True,
 'contains(one)': True,
 'contains(movie)': True,
 'contains(like)': True,
 'contains(even)': True,
 'contains(good)': True,
 'contains(time)': False,
 'contains(story)': False,
 'contains(would)': True,
 'contains(much)': False,
 'contains(character)': True,
 'contains(also)': True,
 'contains(get)': True,
 'contains(two)': True,
 'contains(well)': True,
 'contains(characters)': True,
 'contains(first)': False,
 'contains(--)': False,
 'contains(see)': True,
 'contains(way)': True,
 'contains(make)': True,
 'contains(life)': True,
 'contains(really)': True,
 'contains(films)': True,
 'contains(plot)': True,
 'contains(little)': True,
 'contains(people)': True,
 'contains(could)': False,
 'contains(scene)': False,
 'contains(man)': False,
 'contains(bad)': True,
 'contains(never)': False,
 'contains(best)': False,
 'contains(new)': True,
 'contains(scenes)': True,
 'contains(many)': False,
 'contains(director)': True,
 'contains(know)': True,
 'contains(movies)

In the beginning of this article, we have created the documents list which contains data of all the movie reviews. Its elements are tuples with word list as first item and review category as the second item of the tuple.

In [36]:
# print first tuple of the documents list
docs[0]

(['synopsis', ':', 'nice', 'girl', 'susanne', 'has', ...], 'neg')

We now loop through the documents list and create a feature set list using the features_doc function defined above.

– Each item of the feature_set list is a tuple.

– The first item of the tuple is the dictionary returned from document_features function

– The second item of the tuple is the category (pos or neg) of the movie review

In [37]:
feature_set = [(doc_features(doc),category) for (doc, category) in docs]

In [38]:
feature_set[0]

({'contains(film)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(like)': True,
  'contains(even)': False,
  'contains(good)': True,
  'contains(time)': True,
  'contains(story)': False,
  'contains(would)': False,
  'contains(much)': False,
  'contains(character)': False,
  'contains(also)': False,
  'contains(get)': False,
  'contains(two)': False,
  'contains(well)': True,
  'contains(characters)': False,
  'contains(first)': False,
  'contains(--)': False,
  'contains(see)': True,
  'contains(way)': False,
  'contains(make)': False,
  'contains(life)': False,
  'contains(really)': False,
  'contains(films)': False,
  'contains(plot)': False,
  'contains(little)': False,
  'contains(people)': True,
  'contains(could)': False,
  'contains(scene)': True,
  'contains(man)': False,
  'contains(bad)': False,
  'contains(never)': False,
  'contains(best)': False,
  'contains(new)': False,
  'contains(scenes)': True,
  'contains(many)': False,
  'contains(director)'

# Training Classifier
From the feature set we created above, we now create a separate training set and a separate testing/validation set. The train set is used to train the classifier and the test set is used to test the classifier to check how accurately it classifies the given text.

#### Creating Train and Test Dataset

In this example, we use the first 400 elements of the feature set array as a test set and the rest of the data as a train set. Generally, 80/20 percent is a fair split between training and testing set, i.e. 80 percent training set and 20 percent testing set.


In [39]:
len(feature_set)

2000

In [41]:
# 80% train
train_set = feature_set[400:]
# 20% test
test_set = feature_set[:400]
len(train_set),len(test_set)

(1600, 400)

#### Training a Classifier

Now, we train a classifier using the training dataset. There are different kind of classifiers namely Naive Bayes Classifier, Maximum Entropy Classifier, Decision Tree Classifier, Support Vector Machine Classifier, etc.

In this example, we use the Naive Bayes Classifier. It’s a simple, fast, and easy classifier which performs well for small datasets. It’s a simple probabilistic classifier based on applying Bayes’ theorem. Bayes’ theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event.

In [42]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)

#### Testing the trained Classifier

Let’s see the accuracy percentage of the trained classifier. The accuracy value changes each time you run the program because of the names array being shuffled above.

In [43]:
from nltk import classify

In [44]:
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8175

Let’s see the output of the classifier by providing some custom reviews.

In [47]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmeterdogan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [49]:
from nltk import word_tokenize
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)
# negative review correctly classified as 'neg'

'neg'

In [50]:
# probability result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [52]:
prob_result.max()

'neg'

In [54]:
prob_result.prob('neg'), prob_result.prob('pos')
# e min less then 0 

(0.9999976737495611, 2.326250453636635e-06)

In [55]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Positive review is classified as negative
 
We need to improve our feature set for more accurate prediction

In [58]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result, prob_result.max()

(<ProbDist with 2 samples>, 'neg')

In [59]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.9999459511691823, 5.404883083514626e-05)

Let’s see the most informative features among the entire features in the feature set.

In [60]:
classifier.show_most_informative_features(10)

Most Informative Features
         contains(damon) = True              pos : neg    =     14.6 : 1.0
   contains(outstanding) = True              pos : neg    =      8.6 : 1.0
        contains(seagal) = True              neg : pos    =      8.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
    contains(ridiculous) = True              neg : pos    =      6.0 : 1.0
         contains(lucas) = True              pos : neg    =      6.0 : 1.0
          contains(lame) = True              neg : pos    =      5.6 : 1.0
         contains(bland) = True              neg : pos    =      5.5 : 1.0
        contains(poorly) = True              neg : pos    =      5.2 : 1.0
         contains(waste) = True              neg : pos    =      5.2 : 1.0


The result shows that the word outstanding is used in positive reviews 14.7 times more often than it is used in negative reviews the word poorly is used in negative reviews 7.7 times more often than it is used in positive reviews. Similarly, for other letters. These ratios are also called likelihood ratios.

Therefore, a review has a high chance to be classified as positive if it contains words like outstanding and wonderfully. Similarly, a review has a high chance of being classified as negative if it contains words like poorly, awful, waste, etc.

### Note: 

You can modify the document_features function to generate the feature set which can improve the accuracy of the trained classifier. Feature extractors are built through a process of trail-and-error & guided by intuitions.

# Bag of Words Feature
In the above example, we used top-N words feature. We used 2000 most frequently occurring words as our top-N words feature. The classifier identified negative review as negative. However, the classifier was not able to classify positive review correctly. It classified a positive review as negative.

### ''
### Top-N words feature

– The top-N words feature is also a bag-of-words feature.
– But in the top-N feature, we only used the top 2000 words in the feature set.
– We combined the positive and negative reviews into a single list, randomized the list, and then separated the train and test set.
– This approach can result in the un-even distribution of positive and negative reviews across the train and test set.

### Bag-of-words feature shown below

In the bag-of-words feature as shown below:

– We will use all the useful words of each review while creating the feature set.
– We take a fixed number of positive and negative reviews for train and test set.
– This result in equal distribution of positive and negative reviews across train and test set.

### ''
In the approach shown below, we will modify the feature extractor function.

– We form a list of unique words of each review.
– The category (pos or neg) is assigned to each bag of words.
– Then the category of any given text is calculated by matching the different bag-of-words & their respective category.

In [1]:
from nltk.corpus import movie_reviews

In [2]:
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)

In [3]:
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)

In [4]:
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [5]:
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [6]:
pos_reviews[0][:20]

['films',
 'adapted',
 'from',
 'comic',
 'books',
 'have',
 'had',
 'plenty',
 'of',
 'success',
 ',',
 'whether',
 'they',
 "'",
 're',
 'about',
 'superheroes',
 '(',
 'batman',
 ',']

In [7]:
neg_reviews[0][:20]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

#### Feature Extraction

We use the bag-of-words feature. Here, we clean the word list (i.e. remove stop words and punctuation). Then, we create a dictionary of cleaned words.

In [8]:
from nltk.corpus import stopwords
import string

In [10]:
stopwords_english = stopwords.words('english')

In [11]:
# feature extractor function
def bag_of_words(words):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dict = dict([word, True] for word in words_clean)
    return words_dict

In [12]:
# using dict will remove duplicate words from the words list
# note the output: stopword 'the' is also removed
bag_of_words(['the', 'the', 'good', 'bad', 'the', 'good'])

{'good': True, 'bad': True}

#### Create Feature Set

We use the bag-of-words feature and tag each review with its respective category as positive or negative.

In [13]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

In [15]:
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [16]:
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as a test set. The remaining negative and positive reviews will be taken as a training set.

#### Note:
– There is difference between pos_reviews & pos_reviews_set array which are defined above.

– pos_reviews array contains words list only

– pos_reviews_set array contains words feature list

– pos_reviews_set & neg_reviews_set arrays are used to create train and test set as shown below

In [23]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program

from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [24]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [25]:
from nltk import classify, NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy =  classify.accuracy(classifier, test_set)
accuracy

0.6725

In [27]:
classifier.show_most_informative_features(10)

Most Informative Features
                   sucks = True              neg : pos    =     13.0 : 1.0
               marvelous = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
               affecting = True              pos : neg    =     11.7 : 1.0
                    taxi = True              pos : neg    =     11.0 : 1.0
                  hatred = True              pos : neg    =     10.3 : 1.0
                headache = True              neg : pos    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0
                  avoids = True              pos : neg    =      9.7 : 1.0


#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [31]:
#probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [32]:
prob_result.max()

'neg'

In [33]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.6896168633261128, 0.31038313667388745)

In [38]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [40]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [41]:
prob_result.max()

'pos'

In [42]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9033209854909708, 0.0966790145090289)

# Bi-gram Features
N-grams are common terms in text processing and analysis. N-grams are related with words of a text. There are different n-grams like unigram, bigram, trigram, etc.

Unigram = Item having a single word, i.e. the n-gram of size 1. For example, good.

Bigram = Item having two words, i.e. the n-gram of size 2. For example, very good.

Trigram = Item having three words, i.e. the n-gram of size 3. For example, not so good.

In the above bag-of-words model, we only used the unigram feature. In the example below, we will use both unigram and bigram feature, i.e. we will deal with both single words and double words.

#### Feature Extraction

In this case, both unigrams and bigrams are used as features.

We define two functions:

– bag_of_words: that extracts only unigram features from the movie review words

– bag_of_ngrams: that extracts only bigram features from the movie review words

We then define another function:

– bag_of_all_words: that combines both unigram and bigram features

In [43]:
from nltk import ngrams
from nltk.corpus import stopwords
import string

stopwords_english = stopwords.words('english')

In [44]:
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    return words_clean

In [45]:
# feature extractor function for unigram
def bag_of_words(words):
    words_dict = dict([word, True] for word in words)
    return words_dict

In [52]:
# feature extractor function for ngrams (bigram)
def bag_of_unigrms(words, n=2):
    words_ng = []
    for item in iter(ngrams(words,n)):
        words_ng.append(item)
    words_dict = dict([word, True] for word in words_ng)
    return words_dict
    

'''
 Alternative Bi-gram feature extractor 
 using BigramCollocationFinder module
 
 Collocations are multiple words which commonly co-occur.

http://www.nltk.org/howto/collocations.html

https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
 
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
feature extractor function for ngrams (bigram)

get 200 most frequently occurring bigrams from every review

def bag_of_ngrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    
    bigram_finder = BigramCollocationFinder.from_words(words)
    
    bigrams = bigram_finder.nbest(score_fn, n)
    
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
'''

In [53]:
from nltk.tokenize import word_tokenize

In [54]:
text = "It was a very good movie."

words = word_tokenize(text.lower())
words

['it', 'was', 'a', 'very', 'good', 'movie', '.']

In [55]:
bag_of_words(words)

{'it': True,
 'was': True,
 'a': True,
 'very': True,
 'good': True,
 'movie': True,
 '.': True}

In [56]:
bag_of_unigrms(words)

{('it', 'was'): True,
 ('was', 'a'): True,
 ('a', 'very'): True,
 ('very', 'good'): True,
 ('good', 'movie'): True,
 ('movie', '.'): True}

In [57]:
# working with cleaning words
# i.e. removing stopwords and punctuation
words_clean = clean_words(words, stopwords_english)
words_clean

['good', 'movie']

In [58]:
# cleaning words is find for unigrams
# but this can omit important words for bigrams
# for example, stopwords like very, over, under, so, etc. are important for bigrams
# we create a new stopwords list specifically for bigrams by omitting such important words
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

In [59]:
stopwords_english_for_bigrams = set(stopwords_english) - set(important_words)

In [62]:
words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
words_clean_for_bigram

['very', 'good', 'movie']

In [63]:
# We will use general stopwords for unigrams 
# And special stopwords list for bigrams
unigram_features = bag_of_words(words_clean)
unigram_features

{'good': True, 'movie': True}

In [66]:
bigram_features = bag_of_unigrms(words_clean_for_bigram)
bigram_features

{('very', 'good'): True, ('good', 'movie'): True}

In [67]:
# combine both unigram and bigram features
all_features = unigram_features.copy()
all_features.update(bigram_features)
all_features

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [68]:
# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
    
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_unigrms(words_clean_for_bigram)
    
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
    
    return all_features
bag_of_all_words(words)

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

Working with NLTK’s movie reviews corpus

In [70]:
from nltk.corpus import movie_reviews 

In [71]:
pos_reviews = []
for filed in movie_reviews.fileids('pos'):
    words = movie_reviews.words(filed)
    pos_reviews.append(words)
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [72]:
neg_reviews = []
for filed in movie_reviews.fileids('neg'):
    words = movie_reviews.words(filed)
    neg_reviews.append(words)
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

#### Create Feature Set

In [75]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [76]:
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as the test set. The remaining negative and positive reviews will be taken as the training set.

In [77]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [78]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [79]:
from nltk import classify, NaiveBayesClassifier

In [80]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8275

In [81]:
classifier.show_most_informative_features(10)

Most Informative Features
        ('one', 'worst') = True              neg : pos    =     17.0 : 1.0
                   sucks = True              neg : pos    =     14.3 : 1.0
       ('waste', 'time') = True              neg : pos    =     11.3 : 1.0
             outstanding = True              pos : neg    =     11.2 : 1.0
     ('saving', 'grace') = True              neg : pos    =     11.0 : 1.0
               maintains = True              pos : neg    =     11.0 : 1.0
       ('one', 'better') = True              pos : neg    =     11.0 : 1.0
                 idiotic = True              neg : pos    =     10.6 : 1.0
              dreamworks = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0


#### Note:

– The accuracy of the classifier has significantly increased when trained with combined feature set (unigram + bigram).

– Accuracy was 67.25% while using only Unigram features.

– Accuracy has increased to 82.75% while using combined (unigram + bigram) features.

#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.



In [84]:
from nltk.tokenize import word_tokenize

In [85]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [86]:
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [87]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [88]:
prob_result.max()

'neg'

In [89]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.963986611471882, 0.03601338852811978)

In [90]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [91]:
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [92]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [93]:
prob_result.max()

'pos'

In [94]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9710979916165804, 0.02890200838342214)

In [1]:
# get data
import nltk
# nltk.download('movie_reviews')

In [3]:
from nltk.corpus import movie_reviews

In [4]:
# data size
len(movie_reviews.fileids())

2000

In [5]:
# data categs
movie_reviews.categories()

['neg', 'pos']

In [6]:
# cteg len
'neg data len :', len(movie_reviews.fileids('neg')), 'pos data len :', len(movie_reviews.fileids('pos'))

('neg data len :', 1000, 'pos data len :', 1000)

In [7]:
movie_reviews.fileids('pos')[0]

'pos/cv000_29590.txt'

convert to list all docs

In [8]:
docs = []
for categ in movie_reviews.categories():
    for fileid in data.fileids(categ):
        docs.append((data.words(fileid), categ))

In [9]:
len(docs)

2000

In [10]:
docs[0]

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')

In [11]:
# shufle data
from random import shuffle
shuffle(docs)

# Feature Extraction

To classify the text into any category, we need to define some criteria. On the basis of those criteria, our classifier will learn that a particular kind of text falls in a particular category. This kind of criteria is known as feature. We can define one or more feature to train our classifier.

In this example, we will use the top-N words feature.

#### Fetch all words from the movie reviews corpus

We first fetch all the words from all the movie reviews and create a list.

In [12]:
all_words = [word.lower() for word in data.words()]

In [13]:
all_words[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

#### Create Frequency Distribution of all words

Frequency Distribution will calculate the number of occurence of each word in the entire list of words.

In [14]:
from nltk import FreqDist

In [26]:
all_words_frequency = FreqDist(all_words)
all_words_frequency

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [16]:
all_words_frequency.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

#### Removing Punctuation and Stopwords

From the above frequency distribution of words, we can see the most frequently occurring words are either punctuation marks or stopwords.

Stop words are those frequently words which do not carry any significant meaning in text analysis. For example, I, me, my, the, a, and, is, are, he, she, we, etc.

Punctuation marks like comma, fullstop. inverted comma, etc. occur highly in any text data.

We will do data cleaning by removing stop words and punctuations.

#### Remove Stop Words

In [17]:
# download stopwords from nltk but once time only
# nltk.download('stopwords')

In [18]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# remove stopwords from all_words
all_words_removed_stopwords = [word for word in all_words if word not in stop_words]
all_words_removed_stopwords

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 ',',
 'drink',
 'drive',
 '.',
 'get',
 'accident',
 '.',
 'one',
 'guys',
 'dies',
 ',',
 'girlfriend',
 'continues',
 'see',
 'life',
 ',',
 'nightmares',
 '.',
 "'",
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 '"',
 'find',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'mind',
 '-',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 ',',
 'presents',
 'bad',
 'package',
 '.',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 ',',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 ',',
 'mess',
 'head',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 ',',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 ',',
 'folks',
 "'",
 'snag',
 'one',
 'correctly',
 '.',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 ',',
 'executed',
 'terribly',
 '.',
 'problems',
 'movie',
 '?',
 'well',
 ',',
 'main',
 'problem',
 "'",
 'simply',
 'jumbled',


#### Remove Punctuation

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
all_words_removed_punctuation = [word for word in all_words_removed_stopwords if word not in string.punctuation]
all_words_removed_punctuation

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [23]:
# do last two step together
cleaned_all_words = [word for word in all_words if word not in stop_words and word not in string.punctuation]
cleaned_all_words

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

#### Frequency Distribution of cleaned words list

Below is the frequency distribution of the new list after removing stopwords and punctuation.

In [27]:
len(all_words_frequency)

39768

In [28]:
all_words_frequency = FreqDist(cleaned_words)
all_words_frequency

FreqDist({'film': 9517, 'one': 5852, 'movie': 5771, 'like': 3690, 'even': 2565, 'good': 2411, 'time': 2411, 'story': 2169, 'would': 2109, 'much': 2049, ...})

In [30]:
len(all_words_frequency)

39586

Previously, before removing stopwords and punctuation, the frequency distribution was:

FreqDist with 39768 samples and 1583820 outcomes

Now, the frequency distribution is:

FreqDist with 39586 samples and 710578 outcomes

This shows that after removing around 200 stop words and punctuation, the outcomes/words number has reduced to around half of the original size.

The most common words or highly occurring words list has also got meaningful words in the list. Before, the first 10 frequently occurring words were only stop-words and punctuations.

#### Create Word Feature using 2000 most frequently occurring words

We take 2000 most frequently occurring words as our feature.

In [31]:
most_common_words = all_words_frequency.most_common(2000)
most_common_words

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906),
 ('characters', 1859),
 ('first', 1836),
 ('--', 1815),
 ('see', 1749),
 ('way', 1693),
 ('make', 1642),
 ('life', 1586),
 ('really', 1558),
 ('films', 1536),
 ('plot', 1513),
 ('little', 1501),
 ('people', 1455),
 ('could', 1427),
 ('scene', 1397),
 ('man', 1396),
 ('bad', 1395),
 ('never', 1374),
 ('best', 1333),
 ('new', 1292),
 ('scenes', 1274),
 ('many', 1268),
 ('director', 1237),
 ('know', 1217),
 ('movies', 1206),
 ('action', 1172),
 ('great', 1148),
 ('another', 1121),
 ('love', 1119),
 ('go', 1113),
 ('made', 1084),
 ('us', 1073),
 ('big', 1064),
 ('end', 1062),
 ('something', 1061),
 ('back', 1060),
 ('still', 1047),
 ('world', 1037),
 ('seems', 1033),
 ('work', 1020),
 ('makes', 992),
 ('however', 989),
 ('every', 947)

In [32]:
# the most common words list's elements are in the form of tuple
# get only the first element of each tuple of the word list
word_features = [row[0] for row in most_common_words]
word_features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well',
 'characters',
 'first',
 '--',
 'see',
 'way',
 'make',
 'life',
 'really',
 'films',
 'plot',
 'little',
 'people',
 'could',
 'scene',
 'man',
 'bad',
 'never',
 'best',
 'new',
 'scenes',
 'many',
 'director',
 'know',
 'movies',
 'action',
 'great',
 'another',
 'love',
 'go',
 'made',
 'us',
 'big',
 'end',
 'something',
 'back',
 'still',
 'world',
 'seems',
 'work',
 'makes',
 'however',
 'every',
 'though',
 'better',
 'real',
 'audience',
 'enough',
 'seen',
 'take',
 'around',
 'going',
 'year',
 'performance',
 'role',
 'old',
 'gets',
 'may',
 'things',
 'think',
 'years',
 'last',
 'comedy',
 'funny',
 'actually',
 'long',
 'look',
 'almost',
 'thing',
 'fact',
 'nothing',
 'say',
 'right',
 'john',
 'although',
 'played',
 'find',
 'script',
 'come',
 'ever',
 'cast',
 'since',
 'star',
 'plays',
 'young',
 'show',
 'comes',
 'part',

#### Create Feature Set

Now, we write a function that will be used to create feature set. The feature set is used to train the classifier.

We define a feature extractor function that checks if the words in a given document are present in the word_features list or not.

In [33]:
def doc_features(doc):
    # "set" function will remove repeated/duplicate tokens in the given list
    doc_words = set(doc)
    feetures = {}
    for word in word_features:
        feetures['contains(%s)' % word] = (word in doc_words)
    return feetures
# get first neg movie review
movie_review_file = movie_reviews.fileids('neg')[0]
movie_review_file

'neg/cv000_29416.txt'

In [34]:
doc_features(movie_reviews.words(movie_review_file))

{'contains(film)': True,
 'contains(one)': True,
 'contains(movie)': True,
 'contains(like)': True,
 'contains(even)': True,
 'contains(good)': True,
 'contains(time)': False,
 'contains(story)': False,
 'contains(would)': True,
 'contains(much)': False,
 'contains(character)': True,
 'contains(also)': True,
 'contains(get)': True,
 'contains(two)': True,
 'contains(well)': True,
 'contains(characters)': True,
 'contains(first)': False,
 'contains(--)': False,
 'contains(see)': True,
 'contains(way)': True,
 'contains(make)': True,
 'contains(life)': True,
 'contains(really)': True,
 'contains(films)': True,
 'contains(plot)': True,
 'contains(little)': True,
 'contains(people)': True,
 'contains(could)': False,
 'contains(scene)': False,
 'contains(man)': False,
 'contains(bad)': True,
 'contains(never)': False,
 'contains(best)': False,
 'contains(new)': True,
 'contains(scenes)': True,
 'contains(many)': False,
 'contains(director)': True,
 'contains(know)': True,
 'contains(movies)

In the beginning of this article, we have created the documents list which contains data of all the movie reviews. Its elements are tuples with word list as first item and review category as the second item of the tuple.

In [36]:
# print first tuple of the documents list
docs[0]

(['synopsis', ':', 'nice', 'girl', 'susanne', 'has', ...], 'neg')

We now loop through the documents list and create a feature set list using the features_doc function defined above.

– Each item of the feature_set list is a tuple.

– The first item of the tuple is the dictionary returned from document_features function

– The second item of the tuple is the category (pos or neg) of the movie review

In [37]:
feature_set = [(doc_features(doc),category) for (doc, category) in docs]

In [38]:
feature_set[0]

({'contains(film)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(like)': True,
  'contains(even)': False,
  'contains(good)': True,
  'contains(time)': True,
  'contains(story)': False,
  'contains(would)': False,
  'contains(much)': False,
  'contains(character)': False,
  'contains(also)': False,
  'contains(get)': False,
  'contains(two)': False,
  'contains(well)': True,
  'contains(characters)': False,
  'contains(first)': False,
  'contains(--)': False,
  'contains(see)': True,
  'contains(way)': False,
  'contains(make)': False,
  'contains(life)': False,
  'contains(really)': False,
  'contains(films)': False,
  'contains(plot)': False,
  'contains(little)': False,
  'contains(people)': True,
  'contains(could)': False,
  'contains(scene)': True,
  'contains(man)': False,
  'contains(bad)': False,
  'contains(never)': False,
  'contains(best)': False,
  'contains(new)': False,
  'contains(scenes)': True,
  'contains(many)': False,
  'contains(director)'

# Training Classifier
From the feature set we created above, we now create a separate training set and a separate testing/validation set. The train set is used to train the classifier and the test set is used to test the classifier to check how accurately it classifies the given text.

#### Creating Train and Test Dataset

In this example, we use the first 400 elements of the feature set array as a test set and the rest of the data as a train set. Generally, 80/20 percent is a fair split between training and testing set, i.e. 80 percent training set and 20 percent testing set.


In [39]:
len(feature_set)

2000

In [41]:
# 80% train
train_set = feature_set[400:]
# 20% test
test_set = feature_set[:400]
len(train_set),len(test_set)

(1600, 400)

#### Training a Classifier

Now, we train a classifier using the training dataset. There are different kind of classifiers namely Naive Bayes Classifier, Maximum Entropy Classifier, Decision Tree Classifier, Support Vector Machine Classifier, etc.

In this example, we use the Naive Bayes Classifier. It’s a simple, fast, and easy classifier which performs well for small datasets. It’s a simple probabilistic classifier based on applying Bayes’ theorem. Bayes’ theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event.

In [42]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)

#### Testing the trained Classifier

Let’s see the accuracy percentage of the trained classifier. The accuracy value changes each time you run the program because of the names array being shuffled above.

In [43]:
from nltk import classify

In [44]:
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8175

Let’s see the output of the classifier by providing some custom reviews.

In [47]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmeterdogan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [49]:
from nltk import word_tokenize
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)
# negative review correctly classified as 'neg'

'neg'

In [50]:
# probability result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [52]:
prob_result.max()

'neg'

In [54]:
prob_result.prob('neg'), prob_result.prob('pos')
# e min less then 0 

(0.9999976737495611, 2.326250453636635e-06)

In [55]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Positive review is classified as negative
 
We need to improve our feature set for more accurate prediction

In [58]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result, prob_result.max()

(<ProbDist with 2 samples>, 'neg')

In [59]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.9999459511691823, 5.404883083514626e-05)

Let’s see the most informative features among the entire features in the feature set.

In [60]:
classifier.show_most_informative_features(10)

Most Informative Features
         contains(damon) = True              pos : neg    =     14.6 : 1.0
   contains(outstanding) = True              pos : neg    =      8.6 : 1.0
        contains(seagal) = True              neg : pos    =      8.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
    contains(ridiculous) = True              neg : pos    =      6.0 : 1.0
         contains(lucas) = True              pos : neg    =      6.0 : 1.0
          contains(lame) = True              neg : pos    =      5.6 : 1.0
         contains(bland) = True              neg : pos    =      5.5 : 1.0
        contains(poorly) = True              neg : pos    =      5.2 : 1.0
         contains(waste) = True              neg : pos    =      5.2 : 1.0


The result shows that the word outstanding is used in positive reviews 14.7 times more often than it is used in negative reviews the word poorly is used in negative reviews 7.7 times more often than it is used in positive reviews. Similarly, for other letters. These ratios are also called likelihood ratios.

Therefore, a review has a high chance to be classified as positive if it contains words like outstanding and wonderfully. Similarly, a review has a high chance of being classified as negative if it contains words like poorly, awful, waste, etc.

### Note: 

You can modify the document_features function to generate the feature set which can improve the accuracy of the trained classifier. Feature extractors are built through a process of trail-and-error & guided by intuitions.

# Bag of Words Feature
In the above example, we used top-N words feature. We used 2000 most frequently occurring words as our top-N words feature. The classifier identified negative review as negative. However, the classifier was not able to classify positive review correctly. It classified a positive review as negative.

### ''
### Top-N words feature

– The top-N words feature is also a bag-of-words feature.
– But in the top-N feature, we only used the top 2000 words in the feature set.
– We combined the positive and negative reviews into a single list, randomized the list, and then separated the train and test set.
– This approach can result in the un-even distribution of positive and negative reviews across the train and test set.

### Bag-of-words feature shown below

In the bag-of-words feature as shown below:

– We will use all the useful words of each review while creating the feature set.
– We take a fixed number of positive and negative reviews for train and test set.
– This result in equal distribution of positive and negative reviews across train and test set.

### ''
In the approach shown below, we will modify the feature extractor function.

– We form a list of unique words of each review.
– The category (pos or neg) is assigned to each bag of words.
– Then the category of any given text is calculated by matching the different bag-of-words & their respective category.

In [1]:
from nltk.corpus import movie_reviews

In [2]:
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)

In [3]:
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)

In [4]:
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [5]:
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [6]:
pos_reviews[0][:20]

['films',
 'adapted',
 'from',
 'comic',
 'books',
 'have',
 'had',
 'plenty',
 'of',
 'success',
 ',',
 'whether',
 'they',
 "'",
 're',
 'about',
 'superheroes',
 '(',
 'batman',
 ',']

In [7]:
neg_reviews[0][:20]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

#### Feature Extraction

We use the bag-of-words feature. Here, we clean the word list (i.e. remove stop words and punctuation). Then, we create a dictionary of cleaned words.

In [8]:
from nltk.corpus import stopwords
import string

In [10]:
stopwords_english = stopwords.words('english')

In [11]:
# feature extractor function
def bag_of_words(words):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dict = dict([word, True] for word in words_clean)
    return words_dict

In [12]:
# using dict will remove duplicate words from the words list
# note the output: stopword 'the' is also removed
bag_of_words(['the', 'the', 'good', 'bad', 'the', 'good'])

{'good': True, 'bad': True}

#### Create Feature Set

We use the bag-of-words feature and tag each review with its respective category as positive or negative.

In [13]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

In [15]:
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [16]:
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as a test set. The remaining negative and positive reviews will be taken as a training set.

#### Note:
– There is difference between pos_reviews & pos_reviews_set array which are defined above.

– pos_reviews array contains words list only

– pos_reviews_set array contains words feature list

– pos_reviews_set & neg_reviews_set arrays are used to create train and test set as shown below

In [23]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program

from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [24]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [25]:
from nltk import classify, NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy =  classify.accuracy(classifier, test_set)
accuracy

0.6725

In [27]:
classifier.show_most_informative_features(10)

Most Informative Features
                   sucks = True              neg : pos    =     13.0 : 1.0
               marvelous = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
               affecting = True              pos : neg    =     11.7 : 1.0
                    taxi = True              pos : neg    =     11.0 : 1.0
                  hatred = True              pos : neg    =     10.3 : 1.0
                headache = True              neg : pos    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0
                  avoids = True              pos : neg    =      9.7 : 1.0


#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [31]:
#probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [32]:
prob_result.max()

'neg'

In [33]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.6896168633261128, 0.31038313667388745)

In [38]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [40]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [41]:
prob_result.max()

'pos'

In [42]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9033209854909708, 0.0966790145090289)

# Bi-gram Features
N-grams are common terms in text processing and analysis. N-grams are related with words of a text. There are different n-grams like unigram, bigram, trigram, etc.

Unigram = Item having a single word, i.e. the n-gram of size 1. For example, good.

Bigram = Item having two words, i.e. the n-gram of size 2. For example, very good.

Trigram = Item having three words, i.e. the n-gram of size 3. For example, not so good.

In the above bag-of-words model, we only used the unigram feature. In the example below, we will use both unigram and bigram feature, i.e. we will deal with both single words and double words.

#### Feature Extraction

In this case, both unigrams and bigrams are used as features.

We define two functions:

– bag_of_words: that extracts only unigram features from the movie review words

– bag_of_ngrams: that extracts only bigram features from the movie review words

We then define another function:

– bag_of_all_words: that combines both unigram and bigram features

In [43]:
from nltk import ngrams
from nltk.corpus import stopwords
import string

stopwords_english = stopwords.words('english')

In [44]:
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    return words_clean

In [45]:
# feature extractor function for unigram
def bag_of_words(words):
    words_dict = dict([word, True] for word in words)
    return words_dict

In [52]:
# feature extractor function for ngrams (bigram)
def bag_of_unigrms(words, n=2):
    words_ng = []
    for item in iter(ngrams(words,n)):
        words_ng.append(item)
    words_dict = dict([word, True] for word in words_ng)
    return words_dict
    

'''
 Alternative Bi-gram feature extractor 
 using BigramCollocationFinder module
 
 Collocations are multiple words which commonly co-occur.

http://www.nltk.org/howto/collocations.html

https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
 
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
feature extractor function for ngrams (bigram)

get 200 most frequently occurring bigrams from every review

def bag_of_ngrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    
    bigram_finder = BigramCollocationFinder.from_words(words)
    
    bigrams = bigram_finder.nbest(score_fn, n)
    
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
'''

In [53]:
from nltk.tokenize import word_tokenize

In [54]:
text = "It was a very good movie."

words = word_tokenize(text.lower())
words

['it', 'was', 'a', 'very', 'good', 'movie', '.']

In [55]:
bag_of_words(words)

{'it': True,
 'was': True,
 'a': True,
 'very': True,
 'good': True,
 'movie': True,
 '.': True}

In [56]:
bag_of_unigrms(words)

{('it', 'was'): True,
 ('was', 'a'): True,
 ('a', 'very'): True,
 ('very', 'good'): True,
 ('good', 'movie'): True,
 ('movie', '.'): True}

In [57]:
# working with cleaning words
# i.e. removing stopwords and punctuation
words_clean = clean_words(words, stopwords_english)
words_clean

['good', 'movie']

In [58]:
# cleaning words is find for unigrams
# but this can omit important words for bigrams
# for example, stopwords like very, over, under, so, etc. are important for bigrams
# we create a new stopwords list specifically for bigrams by omitting such important words
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

In [59]:
stopwords_english_for_bigrams = set(stopwords_english) - set(important_words)

In [62]:
words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
words_clean_for_bigram

['very', 'good', 'movie']

In [63]:
# We will use general stopwords for unigrams 
# And special stopwords list for bigrams
unigram_features = bag_of_words(words_clean)
unigram_features

{'good': True, 'movie': True}

In [66]:
bigram_features = bag_of_unigrms(words_clean_for_bigram)
bigram_features

{('very', 'good'): True, ('good', 'movie'): True}

In [67]:
# combine both unigram and bigram features
all_features = unigram_features.copy()
all_features.update(bigram_features)
all_features

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [68]:
# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
    
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_unigrms(words_clean_for_bigram)
    
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
    
    return all_features
bag_of_all_words(words)

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

Working with NLTK’s movie reviews corpus

In [70]:
from nltk.corpus import movie_reviews 

In [71]:
pos_reviews = []
for filed in movie_reviews.fileids('pos'):
    words = movie_reviews.words(filed)
    pos_reviews.append(words)
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [72]:
neg_reviews = []
for filed in movie_reviews.fileids('neg'):
    words = movie_reviews.words(filed)
    neg_reviews.append(words)
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

#### Create Feature Set

In [75]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [76]:
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as the test set. The remaining negative and positive reviews will be taken as the training set.

In [77]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [78]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [79]:
from nltk import classify, NaiveBayesClassifier

In [80]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8275

In [81]:
classifier.show_most_informative_features(10)

Most Informative Features
        ('one', 'worst') = True              neg : pos    =     17.0 : 1.0
                   sucks = True              neg : pos    =     14.3 : 1.0
       ('waste', 'time') = True              neg : pos    =     11.3 : 1.0
             outstanding = True              pos : neg    =     11.2 : 1.0
     ('saving', 'grace') = True              neg : pos    =     11.0 : 1.0
               maintains = True              pos : neg    =     11.0 : 1.0
       ('one', 'better') = True              pos : neg    =     11.0 : 1.0
                 idiotic = True              neg : pos    =     10.6 : 1.0
              dreamworks = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0


#### Note:

– The accuracy of the classifier has significantly increased when trained with combined feature set (unigram + bigram).

– Accuracy was 67.25% while using only Unigram features.

– Accuracy has increased to 82.75% while using combined (unigram + bigram) features.

#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.



In [84]:
from nltk.tokenize import word_tokenize

In [85]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [86]:
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [87]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [88]:
prob_result.max()

'neg'

In [89]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.963986611471882, 0.03601338852811978)

In [90]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [91]:
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [92]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [93]:
prob_result.max()

'pos'

In [94]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9710979916165804, 0.02890200838342214)

In [1]:
# get data
import nltk
# nltk.download('movie_reviews')

In [3]:
from nltk.corpus import movie_reviews

In [4]:
# data size
len(movie_reviews.fileids())

2000

In [5]:
# data categs
movie_reviews.categories()

['neg', 'pos']

In [6]:
# cteg len
'neg data len :', len(movie_reviews.fileids('neg')), 'pos data len :', len(movie_reviews.fileids('pos'))

('neg data len :', 1000, 'pos data len :', 1000)

In [7]:
movie_reviews.fileids('pos')[0]

'pos/cv000_29590.txt'

convert to list all docs

In [8]:
docs = []
for categ in movie_reviews.categories():
    for fileid in data.fileids(categ):
        docs.append((data.words(fileid), categ))

In [9]:
len(docs)

2000

In [10]:
docs[0]

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')

In [11]:
# shufle data
from random import shuffle
shuffle(docs)

# Feature Extraction

To classify the text into any category, we need to define some criteria. On the basis of those criteria, our classifier will learn that a particular kind of text falls in a particular category. This kind of criteria is known as feature. We can define one or more feature to train our classifier.

In this example, we will use the top-N words feature.

#### Fetch all words from the movie reviews corpus

We first fetch all the words from all the movie reviews and create a list.

In [12]:
all_words = [word.lower() for word in data.words()]

In [13]:
all_words[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

#### Create Frequency Distribution of all words

Frequency Distribution will calculate the number of occurence of each word in the entire list of words.

In [14]:
from nltk import FreqDist

In [26]:
all_words_frequency = FreqDist(all_words)
all_words_frequency

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [16]:
all_words_frequency.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

#### Removing Punctuation and Stopwords

From the above frequency distribution of words, we can see the most frequently occurring words are either punctuation marks or stopwords.

Stop words are those frequently words which do not carry any significant meaning in text analysis. For example, I, me, my, the, a, and, is, are, he, she, we, etc.

Punctuation marks like comma, fullstop. inverted comma, etc. occur highly in any text data.

We will do data cleaning by removing stop words and punctuations.

#### Remove Stop Words

In [17]:
# download stopwords from nltk but once time only
# nltk.download('stopwords')

In [18]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# remove stopwords from all_words
all_words_removed_stopwords = [word for word in all_words if word not in stop_words]
all_words_removed_stopwords

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 ',',
 'drink',
 'drive',
 '.',
 'get',
 'accident',
 '.',
 'one',
 'guys',
 'dies',
 ',',
 'girlfriend',
 'continues',
 'see',
 'life',
 ',',
 'nightmares',
 '.',
 "'",
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 '"',
 'find',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'mind',
 '-',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 ',',
 'presents',
 'bad',
 'package',
 '.',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 ',',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 ',',
 'mess',
 'head',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 ',',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 ',',
 'folks',
 "'",
 'snag',
 'one',
 'correctly',
 '.',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 ',',
 'executed',
 'terribly',
 '.',
 'problems',
 'movie',
 '?',
 'well',
 ',',
 'main',
 'problem',
 "'",
 'simply',
 'jumbled',


#### Remove Punctuation

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
all_words_removed_punctuation = [word for word in all_words_removed_stopwords if word not in string.punctuation]
all_words_removed_punctuation

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [23]:
# do last two step together
cleaned_all_words = [word for word in all_words if word not in stop_words and word not in string.punctuation]
cleaned_all_words

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

#### Frequency Distribution of cleaned words list

Below is the frequency distribution of the new list after removing stopwords and punctuation.

In [27]:
len(all_words_frequency)

39768

In [28]:
all_words_frequency = FreqDist(cleaned_words)
all_words_frequency

FreqDist({'film': 9517, 'one': 5852, 'movie': 5771, 'like': 3690, 'even': 2565, 'good': 2411, 'time': 2411, 'story': 2169, 'would': 2109, 'much': 2049, ...})

In [30]:
len(all_words_frequency)

39586

Previously, before removing stopwords and punctuation, the frequency distribution was:

FreqDist with 39768 samples and 1583820 outcomes

Now, the frequency distribution is:

FreqDist with 39586 samples and 710578 outcomes

This shows that after removing around 200 stop words and punctuation, the outcomes/words number has reduced to around half of the original size.

The most common words or highly occurring words list has also got meaningful words in the list. Before, the first 10 frequently occurring words were only stop-words and punctuations.

#### Create Word Feature using 2000 most frequently occurring words

We take 2000 most frequently occurring words as our feature.

In [31]:
most_common_words = all_words_frequency.most_common(2000)
most_common_words

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906),
 ('characters', 1859),
 ('first', 1836),
 ('--', 1815),
 ('see', 1749),
 ('way', 1693),
 ('make', 1642),
 ('life', 1586),
 ('really', 1558),
 ('films', 1536),
 ('plot', 1513),
 ('little', 1501),
 ('people', 1455),
 ('could', 1427),
 ('scene', 1397),
 ('man', 1396),
 ('bad', 1395),
 ('never', 1374),
 ('best', 1333),
 ('new', 1292),
 ('scenes', 1274),
 ('many', 1268),
 ('director', 1237),
 ('know', 1217),
 ('movies', 1206),
 ('action', 1172),
 ('great', 1148),
 ('another', 1121),
 ('love', 1119),
 ('go', 1113),
 ('made', 1084),
 ('us', 1073),
 ('big', 1064),
 ('end', 1062),
 ('something', 1061),
 ('back', 1060),
 ('still', 1047),
 ('world', 1037),
 ('seems', 1033),
 ('work', 1020),
 ('makes', 992),
 ('however', 989),
 ('every', 947)

In [32]:
# the most common words list's elements are in the form of tuple
# get only the first element of each tuple of the word list
word_features = [row[0] for row in most_common_words]
word_features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well',
 'characters',
 'first',
 '--',
 'see',
 'way',
 'make',
 'life',
 'really',
 'films',
 'plot',
 'little',
 'people',
 'could',
 'scene',
 'man',
 'bad',
 'never',
 'best',
 'new',
 'scenes',
 'many',
 'director',
 'know',
 'movies',
 'action',
 'great',
 'another',
 'love',
 'go',
 'made',
 'us',
 'big',
 'end',
 'something',
 'back',
 'still',
 'world',
 'seems',
 'work',
 'makes',
 'however',
 'every',
 'though',
 'better',
 'real',
 'audience',
 'enough',
 'seen',
 'take',
 'around',
 'going',
 'year',
 'performance',
 'role',
 'old',
 'gets',
 'may',
 'things',
 'think',
 'years',
 'last',
 'comedy',
 'funny',
 'actually',
 'long',
 'look',
 'almost',
 'thing',
 'fact',
 'nothing',
 'say',
 'right',
 'john',
 'although',
 'played',
 'find',
 'script',
 'come',
 'ever',
 'cast',
 'since',
 'star',
 'plays',
 'young',
 'show',
 'comes',
 'part',

#### Create Feature Set

Now, we write a function that will be used to create feature set. The feature set is used to train the classifier.

We define a feature extractor function that checks if the words in a given document are present in the word_features list or not.

In [33]:
def doc_features(doc):
    # "set" function will remove repeated/duplicate tokens in the given list
    doc_words = set(doc)
    feetures = {}
    for word in word_features:
        feetures['contains(%s)' % word] = (word in doc_words)
    return feetures
# get first neg movie review
movie_review_file = movie_reviews.fileids('neg')[0]
movie_review_file

'neg/cv000_29416.txt'

In [34]:
doc_features(movie_reviews.words(movie_review_file))

{'contains(film)': True,
 'contains(one)': True,
 'contains(movie)': True,
 'contains(like)': True,
 'contains(even)': True,
 'contains(good)': True,
 'contains(time)': False,
 'contains(story)': False,
 'contains(would)': True,
 'contains(much)': False,
 'contains(character)': True,
 'contains(also)': True,
 'contains(get)': True,
 'contains(two)': True,
 'contains(well)': True,
 'contains(characters)': True,
 'contains(first)': False,
 'contains(--)': False,
 'contains(see)': True,
 'contains(way)': True,
 'contains(make)': True,
 'contains(life)': True,
 'contains(really)': True,
 'contains(films)': True,
 'contains(plot)': True,
 'contains(little)': True,
 'contains(people)': True,
 'contains(could)': False,
 'contains(scene)': False,
 'contains(man)': False,
 'contains(bad)': True,
 'contains(never)': False,
 'contains(best)': False,
 'contains(new)': True,
 'contains(scenes)': True,
 'contains(many)': False,
 'contains(director)': True,
 'contains(know)': True,
 'contains(movies)

In the beginning of this article, we have created the documents list which contains data of all the movie reviews. Its elements are tuples with word list as first item and review category as the second item of the tuple.

In [36]:
# print first tuple of the documents list
docs[0]

(['synopsis', ':', 'nice', 'girl', 'susanne', 'has', ...], 'neg')

We now loop through the documents list and create a feature set list using the features_doc function defined above.

– Each item of the feature_set list is a tuple.

– The first item of the tuple is the dictionary returned from document_features function

– The second item of the tuple is the category (pos or neg) of the movie review

In [37]:
feature_set = [(doc_features(doc),category) for (doc, category) in docs]

In [38]:
feature_set[0]

({'contains(film)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(like)': True,
  'contains(even)': False,
  'contains(good)': True,
  'contains(time)': True,
  'contains(story)': False,
  'contains(would)': False,
  'contains(much)': False,
  'contains(character)': False,
  'contains(also)': False,
  'contains(get)': False,
  'contains(two)': False,
  'contains(well)': True,
  'contains(characters)': False,
  'contains(first)': False,
  'contains(--)': False,
  'contains(see)': True,
  'contains(way)': False,
  'contains(make)': False,
  'contains(life)': False,
  'contains(really)': False,
  'contains(films)': False,
  'contains(plot)': False,
  'contains(little)': False,
  'contains(people)': True,
  'contains(could)': False,
  'contains(scene)': True,
  'contains(man)': False,
  'contains(bad)': False,
  'contains(never)': False,
  'contains(best)': False,
  'contains(new)': False,
  'contains(scenes)': True,
  'contains(many)': False,
  'contains(director)'

# Training Classifier
From the feature set we created above, we now create a separate training set and a separate testing/validation set. The train set is used to train the classifier and the test set is used to test the classifier to check how accurately it classifies the given text.

#### Creating Train and Test Dataset

In this example, we use the first 400 elements of the feature set array as a test set and the rest of the data as a train set. Generally, 80/20 percent is a fair split between training and testing set, i.e. 80 percent training set and 20 percent testing set.


In [39]:
len(feature_set)

2000

In [41]:
# 80% train
train_set = feature_set[400:]
# 20% test
test_set = feature_set[:400]
len(train_set),len(test_set)

(1600, 400)

#### Training a Classifier

Now, we train a classifier using the training dataset. There are different kind of classifiers namely Naive Bayes Classifier, Maximum Entropy Classifier, Decision Tree Classifier, Support Vector Machine Classifier, etc.

In this example, we use the Naive Bayes Classifier. It’s a simple, fast, and easy classifier which performs well for small datasets. It’s a simple probabilistic classifier based on applying Bayes’ theorem. Bayes’ theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event.

In [42]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)

#### Testing the trained Classifier

Let’s see the accuracy percentage of the trained classifier. The accuracy value changes each time you run the program because of the names array being shuffled above.

In [43]:
from nltk import classify

In [44]:
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8175

Let’s see the output of the classifier by providing some custom reviews.

In [47]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmeterdogan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [49]:
from nltk import word_tokenize
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)
# negative review correctly classified as 'neg'

'neg'

In [50]:
# probability result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [52]:
prob_result.max()

'neg'

In [54]:
prob_result.prob('neg'), prob_result.prob('pos')
# e min less then 0 

(0.9999976737495611, 2.326250453636635e-06)

In [55]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Positive review is classified as negative
 
We need to improve our feature set for more accurate prediction

In [58]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result, prob_result.max()

(<ProbDist with 2 samples>, 'neg')

In [59]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.9999459511691823, 5.404883083514626e-05)

Let’s see the most informative features among the entire features in the feature set.

In [60]:
classifier.show_most_informative_features(10)

Most Informative Features
         contains(damon) = True              pos : neg    =     14.6 : 1.0
   contains(outstanding) = True              pos : neg    =      8.6 : 1.0
        contains(seagal) = True              neg : pos    =      8.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
    contains(ridiculous) = True              neg : pos    =      6.0 : 1.0
         contains(lucas) = True              pos : neg    =      6.0 : 1.0
          contains(lame) = True              neg : pos    =      5.6 : 1.0
         contains(bland) = True              neg : pos    =      5.5 : 1.0
        contains(poorly) = True              neg : pos    =      5.2 : 1.0
         contains(waste) = True              neg : pos    =      5.2 : 1.0


The result shows that the word outstanding is used in positive reviews 14.7 times more often than it is used in negative reviews the word poorly is used in negative reviews 7.7 times more often than it is used in positive reviews. Similarly, for other letters. These ratios are also called likelihood ratios.

Therefore, a review has a high chance to be classified as positive if it contains words like outstanding and wonderfully. Similarly, a review has a high chance of being classified as negative if it contains words like poorly, awful, waste, etc.

### Note: 

You can modify the document_features function to generate the feature set which can improve the accuracy of the trained classifier. Feature extractors are built through a process of trail-and-error & guided by intuitions.

# Bag of Words Feature
In the above example, we used top-N words feature. We used 2000 most frequently occurring words as our top-N words feature. The classifier identified negative review as negative. However, the classifier was not able to classify positive review correctly. It classified a positive review as negative.

### ''
### Top-N words feature

– The top-N words feature is also a bag-of-words feature.
– But in the top-N feature, we only used the top 2000 words in the feature set.
– We combined the positive and negative reviews into a single list, randomized the list, and then separated the train and test set.
– This approach can result in the un-even distribution of positive and negative reviews across the train and test set.

### Bag-of-words feature shown below

In the bag-of-words feature as shown below:

– We will use all the useful words of each review while creating the feature set.
– We take a fixed number of positive and negative reviews for train and test set.
– This result in equal distribution of positive and negative reviews across train and test set.

### ''
In the approach shown below, we will modify the feature extractor function.

– We form a list of unique words of each review.
– The category (pos or neg) is assigned to each bag of words.
– Then the category of any given text is calculated by matching the different bag-of-words & their respective category.

In [1]:
from nltk.corpus import movie_reviews

In [2]:
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)

In [3]:
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)

In [4]:
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [5]:
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [6]:
pos_reviews[0][:20]

['films',
 'adapted',
 'from',
 'comic',
 'books',
 'have',
 'had',
 'plenty',
 'of',
 'success',
 ',',
 'whether',
 'they',
 "'",
 're',
 'about',
 'superheroes',
 '(',
 'batman',
 ',']

In [7]:
neg_reviews[0][:20]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

#### Feature Extraction

We use the bag-of-words feature. Here, we clean the word list (i.e. remove stop words and punctuation). Then, we create a dictionary of cleaned words.

In [8]:
from nltk.corpus import stopwords
import string

In [10]:
stopwords_english = stopwords.words('english')

In [11]:
# feature extractor function
def bag_of_words(words):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dict = dict([word, True] for word in words_clean)
    return words_dict

In [12]:
# using dict will remove duplicate words from the words list
# note the output: stopword 'the' is also removed
bag_of_words(['the', 'the', 'good', 'bad', 'the', 'good'])

{'good': True, 'bad': True}

#### Create Feature Set

We use the bag-of-words feature and tag each review with its respective category as positive or negative.

In [13]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

In [15]:
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [16]:
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as a test set. The remaining negative and positive reviews will be taken as a training set.

#### Note:
– There is difference between pos_reviews & pos_reviews_set array which are defined above.

– pos_reviews array contains words list only

– pos_reviews_set array contains words feature list

– pos_reviews_set & neg_reviews_set arrays are used to create train and test set as shown below

In [23]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program

from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [24]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [25]:
from nltk import classify, NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy =  classify.accuracy(classifier, test_set)
accuracy

0.6725

In [27]:
classifier.show_most_informative_features(10)

Most Informative Features
                   sucks = True              neg : pos    =     13.0 : 1.0
               marvelous = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
               affecting = True              pos : neg    =     11.7 : 1.0
                    taxi = True              pos : neg    =     11.0 : 1.0
                  hatred = True              pos : neg    =     10.3 : 1.0
                headache = True              neg : pos    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0
                  avoids = True              pos : neg    =      9.7 : 1.0


#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [31]:
#probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [32]:
prob_result.max()

'neg'

In [33]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.6896168633261128, 0.31038313667388745)

In [38]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [40]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [41]:
prob_result.max()

'pos'

In [42]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9033209854909708, 0.0966790145090289)

# Bi-gram Features
N-grams are common terms in text processing and analysis. N-grams are related with words of a text. There are different n-grams like unigram, bigram, trigram, etc.

Unigram = Item having a single word, i.e. the n-gram of size 1. For example, good.

Bigram = Item having two words, i.e. the n-gram of size 2. For example, very good.

Trigram = Item having three words, i.e. the n-gram of size 3. For example, not so good.

In the above bag-of-words model, we only used the unigram feature. In the example below, we will use both unigram and bigram feature, i.e. we will deal with both single words and double words.

#### Feature Extraction

In this case, both unigrams and bigrams are used as features.

We define two functions:

– bag_of_words: that extracts only unigram features from the movie review words

– bag_of_ngrams: that extracts only bigram features from the movie review words

We then define another function:

– bag_of_all_words: that combines both unigram and bigram features

In [43]:
from nltk import ngrams
from nltk.corpus import stopwords
import string

stopwords_english = stopwords.words('english')

In [44]:
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    return words_clean

In [45]:
# feature extractor function for unigram
def bag_of_words(words):
    words_dict = dict([word, True] for word in words)
    return words_dict

In [52]:
# feature extractor function for ngrams (bigram)
def bag_of_unigrms(words, n=2):
    words_ng = []
    for item in iter(ngrams(words,n)):
        words_ng.append(item)
    words_dict = dict([word, True] for word in words_ng)
    return words_dict
    

'''
 Alternative Bi-gram feature extractor 
 using BigramCollocationFinder module
 
 Collocations are multiple words which commonly co-occur.

http://www.nltk.org/howto/collocations.html

https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
 
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
feature extractor function for ngrams (bigram)

get 200 most frequently occurring bigrams from every review

def bag_of_ngrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    
    bigram_finder = BigramCollocationFinder.from_words(words)
    
    bigrams = bigram_finder.nbest(score_fn, n)
    
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
'''

In [53]:
from nltk.tokenize import word_tokenize

In [54]:
text = "It was a very good movie."

words = word_tokenize(text.lower())
words

['it', 'was', 'a', 'very', 'good', 'movie', '.']

In [55]:
bag_of_words(words)

{'it': True,
 'was': True,
 'a': True,
 'very': True,
 'good': True,
 'movie': True,
 '.': True}

In [56]:
bag_of_unigrms(words)

{('it', 'was'): True,
 ('was', 'a'): True,
 ('a', 'very'): True,
 ('very', 'good'): True,
 ('good', 'movie'): True,
 ('movie', '.'): True}

In [57]:
# working with cleaning words
# i.e. removing stopwords and punctuation
words_clean = clean_words(words, stopwords_english)
words_clean

['good', 'movie']

In [58]:
# cleaning words is find for unigrams
# but this can omit important words for bigrams
# for example, stopwords like very, over, under, so, etc. are important for bigrams
# we create a new stopwords list specifically for bigrams by omitting such important words
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

In [59]:
stopwords_english_for_bigrams = set(stopwords_english) - set(important_words)

In [62]:
words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
words_clean_for_bigram

['very', 'good', 'movie']

In [63]:
# We will use general stopwords for unigrams 
# And special stopwords list for bigrams
unigram_features = bag_of_words(words_clean)
unigram_features

{'good': True, 'movie': True}

In [66]:
bigram_features = bag_of_unigrms(words_clean_for_bigram)
bigram_features

{('very', 'good'): True, ('good', 'movie'): True}

In [67]:
# combine both unigram and bigram features
all_features = unigram_features.copy()
all_features.update(bigram_features)
all_features

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [68]:
# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
    
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_unigrms(words_clean_for_bigram)
    
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
    
    return all_features
bag_of_all_words(words)

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

Working with NLTK’s movie reviews corpus

In [70]:
from nltk.corpus import movie_reviews 

In [71]:
pos_reviews = []
for filed in movie_reviews.fileids('pos'):
    words = movie_reviews.words(filed)
    pos_reviews.append(words)
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [72]:
neg_reviews = []
for filed in movie_reviews.fileids('neg'):
    words = movie_reviews.words(filed)
    neg_reviews.append(words)
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

#### Create Feature Set

In [75]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [76]:
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as the test set. The remaining negative and positive reviews will be taken as the training set.

In [77]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [78]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [79]:
from nltk import classify, NaiveBayesClassifier

In [80]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8275

In [81]:
classifier.show_most_informative_features(10)

Most Informative Features
        ('one', 'worst') = True              neg : pos    =     17.0 : 1.0
                   sucks = True              neg : pos    =     14.3 : 1.0
       ('waste', 'time') = True              neg : pos    =     11.3 : 1.0
             outstanding = True              pos : neg    =     11.2 : 1.0
     ('saving', 'grace') = True              neg : pos    =     11.0 : 1.0
               maintains = True              pos : neg    =     11.0 : 1.0
       ('one', 'better') = True              pos : neg    =     11.0 : 1.0
                 idiotic = True              neg : pos    =     10.6 : 1.0
              dreamworks = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0


#### Note:

– The accuracy of the classifier has significantly increased when trained with combined feature set (unigram + bigram).

– Accuracy was 67.25% while using only Unigram features.

– Accuracy has increased to 82.75% while using combined (unigram + bigram) features.

#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.



In [84]:
from nltk.tokenize import word_tokenize

In [85]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [86]:
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [87]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [88]:
prob_result.max()

'neg'

In [89]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.963986611471882, 0.03601338852811978)

In [90]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [91]:
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [92]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [93]:
prob_result.max()

'pos'

In [94]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9710979916165804, 0.02890200838342214)

In [1]:
# get data
import nltk
# nltk.download('movie_reviews')

In [3]:
from nltk.corpus import movie_reviews

In [4]:
# data size
len(movie_reviews.fileids())

2000

In [5]:
# data categs
movie_reviews.categories()

['neg', 'pos']

In [6]:
# cteg len
'neg data len :', len(movie_reviews.fileids('neg')), 'pos data len :', len(movie_reviews.fileids('pos'))

('neg data len :', 1000, 'pos data len :', 1000)

In [7]:
movie_reviews.fileids('pos')[0]

'pos/cv000_29590.txt'

convert to list all docs

In [8]:
docs = []
for categ in movie_reviews.categories():
    for fileid in data.fileids(categ):
        docs.append((data.words(fileid), categ))

In [9]:
len(docs)

2000

In [10]:
docs[0]

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')

In [11]:
# shufle data
from random import shuffle
shuffle(docs)

# Feature Extraction

To classify the text into any category, we need to define some criteria. On the basis of those criteria, our classifier will learn that a particular kind of text falls in a particular category. This kind of criteria is known as feature. We can define one or more feature to train our classifier.

In this example, we will use the top-N words feature.

#### Fetch all words from the movie reviews corpus

We first fetch all the words from all the movie reviews and create a list.

In [12]:
all_words = [word.lower() for word in data.words()]

In [13]:
all_words[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

#### Create Frequency Distribution of all words

Frequency Distribution will calculate the number of occurence of each word in the entire list of words.

In [14]:
from nltk import FreqDist

In [26]:
all_words_frequency = FreqDist(all_words)
all_words_frequency

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [16]:
all_words_frequency.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

#### Removing Punctuation and Stopwords

From the above frequency distribution of words, we can see the most frequently occurring words are either punctuation marks or stopwords.

Stop words are those frequently words which do not carry any significant meaning in text analysis. For example, I, me, my, the, a, and, is, are, he, she, we, etc.

Punctuation marks like comma, fullstop. inverted comma, etc. occur highly in any text data.

We will do data cleaning by removing stop words and punctuations.

#### Remove Stop Words

In [17]:
# download stopwords from nltk but once time only
# nltk.download('stopwords')

In [18]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# remove stopwords from all_words
all_words_removed_stopwords = [word for word in all_words if word not in stop_words]
all_words_removed_stopwords

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 ',',
 'drink',
 'drive',
 '.',
 'get',
 'accident',
 '.',
 'one',
 'guys',
 'dies',
 ',',
 'girlfriend',
 'continues',
 'see',
 'life',
 ',',
 'nightmares',
 '.',
 "'",
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 '"',
 'find',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'mind',
 '-',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 ',',
 'presents',
 'bad',
 'package',
 '.',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 ',',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 ',',
 'mess',
 'head',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 ',',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 ',',
 'folks',
 "'",
 'snag',
 'one',
 'correctly',
 '.',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 ',',
 'executed',
 'terribly',
 '.',
 'problems',
 'movie',
 '?',
 'well',
 ',',
 'main',
 'problem',
 "'",
 'simply',
 'jumbled',


#### Remove Punctuation

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
all_words_removed_punctuation = [word for word in all_words_removed_stopwords if word not in string.punctuation]
all_words_removed_punctuation

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [23]:
# do last two step together
cleaned_all_words = [word for word in all_words if word not in stop_words and word not in string.punctuation]
cleaned_all_words

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

#### Frequency Distribution of cleaned words list

Below is the frequency distribution of the new list after removing stopwords and punctuation.

In [27]:
len(all_words_frequency)

39768

In [28]:
all_words_frequency = FreqDist(cleaned_words)
all_words_frequency

FreqDist({'film': 9517, 'one': 5852, 'movie': 5771, 'like': 3690, 'even': 2565, 'good': 2411, 'time': 2411, 'story': 2169, 'would': 2109, 'much': 2049, ...})

In [30]:
len(all_words_frequency)

39586

Previously, before removing stopwords and punctuation, the frequency distribution was:

FreqDist with 39768 samples and 1583820 outcomes

Now, the frequency distribution is:

FreqDist with 39586 samples and 710578 outcomes

This shows that after removing around 200 stop words and punctuation, the outcomes/words number has reduced to around half of the original size.

The most common words or highly occurring words list has also got meaningful words in the list. Before, the first 10 frequently occurring words were only stop-words and punctuations.

#### Create Word Feature using 2000 most frequently occurring words

We take 2000 most frequently occurring words as our feature.

In [31]:
most_common_words = all_words_frequency.most_common(2000)
most_common_words

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906),
 ('characters', 1859),
 ('first', 1836),
 ('--', 1815),
 ('see', 1749),
 ('way', 1693),
 ('make', 1642),
 ('life', 1586),
 ('really', 1558),
 ('films', 1536),
 ('plot', 1513),
 ('little', 1501),
 ('people', 1455),
 ('could', 1427),
 ('scene', 1397),
 ('man', 1396),
 ('bad', 1395),
 ('never', 1374),
 ('best', 1333),
 ('new', 1292),
 ('scenes', 1274),
 ('many', 1268),
 ('director', 1237),
 ('know', 1217),
 ('movies', 1206),
 ('action', 1172),
 ('great', 1148),
 ('another', 1121),
 ('love', 1119),
 ('go', 1113),
 ('made', 1084),
 ('us', 1073),
 ('big', 1064),
 ('end', 1062),
 ('something', 1061),
 ('back', 1060),
 ('still', 1047),
 ('world', 1037),
 ('seems', 1033),
 ('work', 1020),
 ('makes', 992),
 ('however', 989),
 ('every', 947)

In [32]:
# the most common words list's elements are in the form of tuple
# get only the first element of each tuple of the word list
word_features = [row[0] for row in most_common_words]
word_features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well',
 'characters',
 'first',
 '--',
 'see',
 'way',
 'make',
 'life',
 'really',
 'films',
 'plot',
 'little',
 'people',
 'could',
 'scene',
 'man',
 'bad',
 'never',
 'best',
 'new',
 'scenes',
 'many',
 'director',
 'know',
 'movies',
 'action',
 'great',
 'another',
 'love',
 'go',
 'made',
 'us',
 'big',
 'end',
 'something',
 'back',
 'still',
 'world',
 'seems',
 'work',
 'makes',
 'however',
 'every',
 'though',
 'better',
 'real',
 'audience',
 'enough',
 'seen',
 'take',
 'around',
 'going',
 'year',
 'performance',
 'role',
 'old',
 'gets',
 'may',
 'things',
 'think',
 'years',
 'last',
 'comedy',
 'funny',
 'actually',
 'long',
 'look',
 'almost',
 'thing',
 'fact',
 'nothing',
 'say',
 'right',
 'john',
 'although',
 'played',
 'find',
 'script',
 'come',
 'ever',
 'cast',
 'since',
 'star',
 'plays',
 'young',
 'show',
 'comes',
 'part',

#### Create Feature Set

Now, we write a function that will be used to create feature set. The feature set is used to train the classifier.

We define a feature extractor function that checks if the words in a given document are present in the word_features list or not.

In [33]:
def doc_features(doc):
    # "set" function will remove repeated/duplicate tokens in the given list
    doc_words = set(doc)
    feetures = {}
    for word in word_features:
        feetures['contains(%s)' % word] = (word in doc_words)
    return feetures
# get first neg movie review
movie_review_file = movie_reviews.fileids('neg')[0]
movie_review_file

'neg/cv000_29416.txt'

In [34]:
doc_features(movie_reviews.words(movie_review_file))

{'contains(film)': True,
 'contains(one)': True,
 'contains(movie)': True,
 'contains(like)': True,
 'contains(even)': True,
 'contains(good)': True,
 'contains(time)': False,
 'contains(story)': False,
 'contains(would)': True,
 'contains(much)': False,
 'contains(character)': True,
 'contains(also)': True,
 'contains(get)': True,
 'contains(two)': True,
 'contains(well)': True,
 'contains(characters)': True,
 'contains(first)': False,
 'contains(--)': False,
 'contains(see)': True,
 'contains(way)': True,
 'contains(make)': True,
 'contains(life)': True,
 'contains(really)': True,
 'contains(films)': True,
 'contains(plot)': True,
 'contains(little)': True,
 'contains(people)': True,
 'contains(could)': False,
 'contains(scene)': False,
 'contains(man)': False,
 'contains(bad)': True,
 'contains(never)': False,
 'contains(best)': False,
 'contains(new)': True,
 'contains(scenes)': True,
 'contains(many)': False,
 'contains(director)': True,
 'contains(know)': True,
 'contains(movies)

In the beginning of this article, we have created the documents list which contains data of all the movie reviews. Its elements are tuples with word list as first item and review category as the second item of the tuple.

In [36]:
# print first tuple of the documents list
docs[0]

(['synopsis', ':', 'nice', 'girl', 'susanne', 'has', ...], 'neg')

We now loop through the documents list and create a feature set list using the features_doc function defined above.

– Each item of the feature_set list is a tuple.

– The first item of the tuple is the dictionary returned from document_features function

– The second item of the tuple is the category (pos or neg) of the movie review

In [37]:
feature_set = [(doc_features(doc),category) for (doc, category) in docs]

In [38]:
feature_set[0]

({'contains(film)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(like)': True,
  'contains(even)': False,
  'contains(good)': True,
  'contains(time)': True,
  'contains(story)': False,
  'contains(would)': False,
  'contains(much)': False,
  'contains(character)': False,
  'contains(also)': False,
  'contains(get)': False,
  'contains(two)': False,
  'contains(well)': True,
  'contains(characters)': False,
  'contains(first)': False,
  'contains(--)': False,
  'contains(see)': True,
  'contains(way)': False,
  'contains(make)': False,
  'contains(life)': False,
  'contains(really)': False,
  'contains(films)': False,
  'contains(plot)': False,
  'contains(little)': False,
  'contains(people)': True,
  'contains(could)': False,
  'contains(scene)': True,
  'contains(man)': False,
  'contains(bad)': False,
  'contains(never)': False,
  'contains(best)': False,
  'contains(new)': False,
  'contains(scenes)': True,
  'contains(many)': False,
  'contains(director)'

# Training Classifier
From the feature set we created above, we now create a separate training set and a separate testing/validation set. The train set is used to train the classifier and the test set is used to test the classifier to check how accurately it classifies the given text.

#### Creating Train and Test Dataset

In this example, we use the first 400 elements of the feature set array as a test set and the rest of the data as a train set. Generally, 80/20 percent is a fair split between training and testing set, i.e. 80 percent training set and 20 percent testing set.


In [39]:
len(feature_set)

2000

In [41]:
# 80% train
train_set = feature_set[400:]
# 20% test
test_set = feature_set[:400]
len(train_set),len(test_set)

(1600, 400)

#### Training a Classifier

Now, we train a classifier using the training dataset. There are different kind of classifiers namely Naive Bayes Classifier, Maximum Entropy Classifier, Decision Tree Classifier, Support Vector Machine Classifier, etc.

In this example, we use the Naive Bayes Classifier. It’s a simple, fast, and easy classifier which performs well for small datasets. It’s a simple probabilistic classifier based on applying Bayes’ theorem. Bayes’ theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event.

In [42]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)

#### Testing the trained Classifier

Let’s see the accuracy percentage of the trained classifier. The accuracy value changes each time you run the program because of the names array being shuffled above.

In [43]:
from nltk import classify

In [44]:
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8175

Let’s see the output of the classifier by providing some custom reviews.

In [47]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmeterdogan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [49]:
from nltk import word_tokenize
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)
# negative review correctly classified as 'neg'

'neg'

In [50]:
# probability result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [52]:
prob_result.max()

'neg'

In [54]:
prob_result.prob('neg'), prob_result.prob('pos')
# e min less then 0 

(0.9999976737495611, 2.326250453636635e-06)

In [55]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = doc_features(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Positive review is classified as negative
 
We need to improve our feature set for more accurate prediction

In [58]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result, prob_result.max()

(<ProbDist with 2 samples>, 'neg')

In [59]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.9999459511691823, 5.404883083514626e-05)

Let’s see the most informative features among the entire features in the feature set.

In [60]:
classifier.show_most_informative_features(10)

Most Informative Features
         contains(damon) = True              pos : neg    =     14.6 : 1.0
   contains(outstanding) = True              pos : neg    =      8.6 : 1.0
        contains(seagal) = True              neg : pos    =      8.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
    contains(ridiculous) = True              neg : pos    =      6.0 : 1.0
         contains(lucas) = True              pos : neg    =      6.0 : 1.0
          contains(lame) = True              neg : pos    =      5.6 : 1.0
         contains(bland) = True              neg : pos    =      5.5 : 1.0
        contains(poorly) = True              neg : pos    =      5.2 : 1.0
         contains(waste) = True              neg : pos    =      5.2 : 1.0


The result shows that the word outstanding is used in positive reviews 14.7 times more often than it is used in negative reviews the word poorly is used in negative reviews 7.7 times more often than it is used in positive reviews. Similarly, for other letters. These ratios are also called likelihood ratios.

Therefore, a review has a high chance to be classified as positive if it contains words like outstanding and wonderfully. Similarly, a review has a high chance of being classified as negative if it contains words like poorly, awful, waste, etc.

### Note: 

You can modify the document_features function to generate the feature set which can improve the accuracy of the trained classifier. Feature extractors are built through a process of trail-and-error & guided by intuitions.

# Bag of Words Feature
In the above example, we used top-N words feature. We used 2000 most frequently occurring words as our top-N words feature. The classifier identified negative review as negative. However, the classifier was not able to classify positive review correctly. It classified a positive review as negative.

### ''
### Top-N words feature

– The top-N words feature is also a bag-of-words feature.
– But in the top-N feature, we only used the top 2000 words in the feature set.
– We combined the positive and negative reviews into a single list, randomized the list, and then separated the train and test set.
– This approach can result in the un-even distribution of positive and negative reviews across the train and test set.

### Bag-of-words feature shown below

In the bag-of-words feature as shown below:

– We will use all the useful words of each review while creating the feature set.
– We take a fixed number of positive and negative reviews for train and test set.
– This result in equal distribution of positive and negative reviews across train and test set.

### ''
In the approach shown below, we will modify the feature extractor function.

– We form a list of unique words of each review.
– The category (pos or neg) is assigned to each bag of words.
– Then the category of any given text is calculated by matching the different bag-of-words & their respective category.

In [1]:
from nltk.corpus import movie_reviews

In [2]:
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)

In [3]:
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)

In [4]:
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [5]:
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [6]:
pos_reviews[0][:20]

['films',
 'adapted',
 'from',
 'comic',
 'books',
 'have',
 'had',
 'plenty',
 'of',
 'success',
 ',',
 'whether',
 'they',
 "'",
 're',
 'about',
 'superheroes',
 '(',
 'batman',
 ',']

In [7]:
neg_reviews[0][:20]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

#### Feature Extraction

We use the bag-of-words feature. Here, we clean the word list (i.e. remove stop words and punctuation). Then, we create a dictionary of cleaned words.

In [8]:
from nltk.corpus import stopwords
import string

In [10]:
stopwords_english = stopwords.words('english')

In [11]:
# feature extractor function
def bag_of_words(words):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dict = dict([word, True] for word in words_clean)
    return words_dict

In [12]:
# using dict will remove duplicate words from the words list
# note the output: stopword 'the' is also removed
bag_of_words(['the', 'the', 'good', 'bad', 'the', 'good'])

{'good': True, 'bad': True}

#### Create Feature Set

We use the bag-of-words feature and tag each review with its respective category as positive or negative.

In [13]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

In [15]:
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [16]:
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as a test set. The remaining negative and positive reviews will be taken as a training set.

#### Note:
– There is difference between pos_reviews & pos_reviews_set array which are defined above.

– pos_reviews array contains words list only

– pos_reviews_set array contains words feature list

– pos_reviews_set & neg_reviews_set arrays are used to create train and test set as shown below

In [23]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program

from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [24]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [25]:
from nltk import classify, NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy =  classify.accuracy(classifier, test_set)
accuracy

0.6725

In [27]:
classifier.show_most_informative_features(10)

Most Informative Features
                   sucks = True              neg : pos    =     13.0 : 1.0
               marvelous = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
               affecting = True              pos : neg    =     11.7 : 1.0
                    taxi = True              pos : neg    =     11.0 : 1.0
                  hatred = True              pos : neg    =     10.3 : 1.0
                headache = True              neg : pos    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0
                  avoids = True              pos : neg    =      9.7 : 1.0


#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [31]:
#probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [32]:
prob_result.max()

'neg'

In [33]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.6896168633261128, 0.31038313667388745)

In [38]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [40]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [41]:
prob_result.max()

'pos'

In [42]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9033209854909708, 0.0966790145090289)

# Bi-gram Features
N-grams are common terms in text processing and analysis. N-grams are related with words of a text. There are different n-grams like unigram, bigram, trigram, etc.

Unigram = Item having a single word, i.e. the n-gram of size 1. For example, good.

Bigram = Item having two words, i.e. the n-gram of size 2. For example, very good.

Trigram = Item having three words, i.e. the n-gram of size 3. For example, not so good.

In the above bag-of-words model, we only used the unigram feature. In the example below, we will use both unigram and bigram feature, i.e. we will deal with both single words and double words.

#### Feature Extraction

In this case, both unigrams and bigrams are used as features.

We define two functions:

– bag_of_words: that extracts only unigram features from the movie review words

– bag_of_ngrams: that extracts only bigram features from the movie review words

We then define another function:

– bag_of_all_words: that combines both unigram and bigram features

In [43]:
from nltk import ngrams
from nltk.corpus import stopwords
import string

stopwords_english = stopwords.words('english')

In [44]:
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
    for word in words:
        word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    return words_clean

In [45]:
# feature extractor function for unigram
def bag_of_words(words):
    words_dict = dict([word, True] for word in words)
    return words_dict

In [52]:
# feature extractor function for ngrams (bigram)
def bag_of_unigrms(words, n=2):
    words_ng = []
    for item in iter(ngrams(words,n)):
        words_ng.append(item)
    words_dict = dict([word, True] for word in words_ng)
    return words_dict
    

'''
 Alternative Bi-gram feature extractor 
 using BigramCollocationFinder module
 
 Collocations are multiple words which commonly co-occur.

http://www.nltk.org/howto/collocations.html

https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
 
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
feature extractor function for ngrams (bigram)

get 200 most frequently occurring bigrams from every review

def bag_of_ngrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    
    bigram_finder = BigramCollocationFinder.from_words(words)
    
    bigrams = bigram_finder.nbest(score_fn, n)
    
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
'''

In [53]:
from nltk.tokenize import word_tokenize

In [54]:
text = "It was a very good movie."

words = word_tokenize(text.lower())
words

['it', 'was', 'a', 'very', 'good', 'movie', '.']

In [55]:
bag_of_words(words)

{'it': True,
 'was': True,
 'a': True,
 'very': True,
 'good': True,
 'movie': True,
 '.': True}

In [56]:
bag_of_unigrms(words)

{('it', 'was'): True,
 ('was', 'a'): True,
 ('a', 'very'): True,
 ('very', 'good'): True,
 ('good', 'movie'): True,
 ('movie', '.'): True}

In [57]:
# working with cleaning words
# i.e. removing stopwords and punctuation
words_clean = clean_words(words, stopwords_english)
words_clean

['good', 'movie']

In [58]:
# cleaning words is find for unigrams
# but this can omit important words for bigrams
# for example, stopwords like very, over, under, so, etc. are important for bigrams
# we create a new stopwords list specifically for bigrams by omitting such important words
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

In [59]:
stopwords_english_for_bigrams = set(stopwords_english) - set(important_words)

In [62]:
words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
words_clean_for_bigram

['very', 'good', 'movie']

In [63]:
# We will use general stopwords for unigrams 
# And special stopwords list for bigrams
unigram_features = bag_of_words(words_clean)
unigram_features

{'good': True, 'movie': True}

In [66]:
bigram_features = bag_of_unigrms(words_clean_for_bigram)
bigram_features

{('very', 'good'): True, ('good', 'movie'): True}

In [67]:
# combine both unigram and bigram features
all_features = unigram_features.copy()
all_features.update(bigram_features)
all_features

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [68]:
# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigram = clean_words(words, stopwords_english_for_bigrams)
    
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_unigrms(words_clean_for_bigram)
    
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
    
    return all_features
bag_of_all_words(words)

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

Working with NLTK’s movie reviews corpus

In [70]:
from nltk.corpus import movie_reviews 

In [71]:
pos_reviews = []
for filed in movie_reviews.fileids('pos'):
    words = movie_reviews.words(filed)
    pos_reviews.append(words)
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [72]:
neg_reviews = []
for filed in movie_reviews.fileids('neg'):
    words = movie_reviews.words(filed)
    neg_reviews.append(words)
neg_reviews[0]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

#### Create Feature Set

In [75]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
pos_reviews_set[0]

({'films': True,
  'adapted': True,
  'comic': True,
  'books': True,
  'plenty': True,
  'success': True,
  'whether': True,
  'superheroes': True,
  'batman': True,
  'superman': True,
  'spawn': True,
  'geared': True,
  'toward': True,
  'kids': True,
  'casper': True,
  'arthouse': True,
  'crowd': True,
  'ghost': True,
  'world': True,
  'never': True,
  'really': True,
  'book': True,
  'like': True,
  'hell': True,
  'starters': True,
  'created': True,
  'alan': True,
  'moore': True,
  'eddie': True,
  'campbell': True,
  'brought': True,
  'medium': True,
  'whole': True,
  'new': True,
  'level': True,
  'mid': True,
  '80s': True,
  '12': True,
  'part': True,
  'series': True,
  'called': True,
  'watchmen': True,
  'say': True,
  'thoroughly': True,
  'researched': True,
  'subject': True,
  'jack': True,
  'ripper': True,
  'would': True,
  'saying': True,
  'michael': True,
  'jackson': True,
  'starting': True,
  'look': True,
  'little': True,
  'odd': True,
  'grap

In [76]:
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))
neg_reviews_set[0]

({'plot': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'church': True,
  'party': True,
  'drink': True,
  'drive': True,
  'get': True,
  'accident': True,
  'one': True,
  'guys': True,
  'dies': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'life': True,
  'nightmares': True,
  'deal': True,
  'watch': True,
  'movie': True,
  'sorta': True,
  'find': True,
  'critique': True,
  'mind': True,
  'fuck': True,
  'generation': True,
  'touches': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'bad': True,
  'package': True,
  'makes': True,
  'review': True,
  'even': True,
  'harder': True,
  'write': True,
  'since': True,
  'generally': True,
  'applaud': True,
  'films': True,
  'attempt': True,
  'break': True,
  'mold': True,
  'mess': True,
  'head': True,
  'lost': True,
  'highway': True,
  'memento': True,
  'good': True,
  'ways': True,
  'making': True,
  'types': True,
  'folks': True,
  'snag': True,
  'correctly'

#### Create Train and Test Set

There are 1000 positive reviews set and 1000 negative reviews set. We take 20% (i.e. 200) of positive reviews and 20% (i.e. 200) of negative reviews as the test set. The remaining negative and positive reviews will be taken as the training set.

In [77]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

In [78]:
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
len(train_set), len(test_set)

(1600, 400)

#### Training Classifier and Calculating Accuracy

We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the trained classifier using the test set.

In [79]:
from nltk import classify, NaiveBayesClassifier

In [80]:
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
accuracy

0.8275

In [81]:
classifier.show_most_informative_features(10)

Most Informative Features
        ('one', 'worst') = True              neg : pos    =     17.0 : 1.0
                   sucks = True              neg : pos    =     14.3 : 1.0
       ('waste', 'time') = True              neg : pos    =     11.3 : 1.0
             outstanding = True              pos : neg    =     11.2 : 1.0
     ('saving', 'grace') = True              neg : pos    =     11.0 : 1.0
               maintains = True              pos : neg    =     11.0 : 1.0
       ('one', 'better') = True              pos : neg    =     11.0 : 1.0
                 idiotic = True              neg : pos    =     10.6 : 1.0
              dreamworks = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =      9.8 : 1.0


#### Note:

– The accuracy of the classifier has significantly increased when trained with combined feature set (unigram + bigram).

– Accuracy was 67.25% while using only Unigram features.

– Accuracy has increased to 82.75% while using combined (unigram + bigram) features.

#### Testing Classifier with Custom Review

We provide custom review text and check the classification output of the trained classifier. The classifier correctly predicts both negative and positive reviews provided.



In [84]:
from nltk.tokenize import word_tokenize

In [85]:
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [86]:
classifier.classify(custom_review_set)

'neg'

Negative review correctly classified as negative

In [87]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [88]:
prob_result.max()

'neg'

In [89]:
prob_result.prob('neg'), prob_result.prob('pos')

(0.963986611471882, 0.03601338852811978)

In [90]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)

In [91]:
classifier.classify(custom_review_set)

'pos'

Positive review correctly classified as positive

In [92]:
# probablity result
prob_result = classifier.prob_classify(custom_review_set)
prob_result

<ProbDist with 2 samples>

In [93]:
prob_result.max()

'pos'

In [94]:
prob_result.prob('pos'), prob_result.prob('neg')

(0.9710979916165804, 0.02890200838342214)