In [18]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

True

In [19]:
# Extract features from the input list of words
def extract_features(words):
  return dict([(word, True) for word in words])

In [20]:
if __name__=='__main__':
  # Load the reviews from the corpus
  fileids_pos = movie_reviews.fileids('pos')
  fileids_neg = movie_reviews.fileids('neg')

In [21]:
  # Extract the features from the reviews
  features_pos = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive') for f in fileids_pos]
  features_neg = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in fileids_neg]

In [22]:
  # Define the train and test split (80% and 20%)
  threshold = 0.8
  num_pos = int(threshold * len(features_pos))
  num_neg = int(threshold * len(features_neg))

In [23]:
  # Create training and training datasets
  features_train = features_pos[:num_pos] + features_neg[:num_neg]
  features_test = features_pos[num_pos:] + features_neg[num_neg:]

In [24]:
  # Print the number of datapoints used
  print('\nNumber of training datapoints:', len(features_train))
  print('Number of test datapoints:', len(features_test))


Number of training datapoints: 1600
Number of test datapoints: 400


In [25]:
  # Train a Naive Bayes classifier
  classifier = NaiveBayesClassifier.train(features_train)
  print('\nAccuracy of the classifier:', nltk_accuracy(classifier, features_test))


Accuracy of the classifier: 0.735


In [26]:
  N = 15
  print('\nTop ' +str(N) + ' most informative words')
  for i, item in enumerate(classifier.most_informative_features()):
    print(str(i+1) + '. ' + item[0])
    if i == N - 1:
      break


Top 15 most informative words
1. outstanding
2. insulting
3. vulnerable
4. ludicrous
5. uninvolving
6. astounding
7. avoids
8. fascination
9. affecting
10. animators
11. anna
12. darker
13. seagal
14. symbol
15. idiotic


In [27]:
  # Test input movie reviews
  input_reviews = [
      'The costumes in this movie were great',
      'I think the story was terrible and the characters were very weak',
      'People say that the director of the moview is amazing',
      'This is such an idiotic movie. I will not recommend it to anyone.'
  ]

In [32]:
  print('\nMoview review predictions:')
  for review in input_reviews:
    print('\nReview:', review)

        # Compute the probabilities
    probabilities = classifier.prob_classify(extract_features(review.split()))

        # Pick the maximum value
    predicted_sentiment = probabilities.max()

        # Print outputs
    print('Predicted sentiment:', predicted_sentiment)
    print('Probability:', round(probabilities.prob(predicted_sentiment), 2))


Moview review predictions:

Review: The costumes in this movie were great
Predicted sentiment: Positive
Probability: 0.59

Review: I think the story was terrible and the characters were very weak
Predicted sentiment: Negative
Probability: 0.8

Review: People say that the director of the moview is amazing
Predicted sentiment: Negative
Probability: 0.65

Review: This is such an idiotic movie. I will not recommend it to anyone.
Predicted sentiment: Negative
Probability: 0.87
