In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
from nltk.corpus import movie_reviews
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

In [3]:
def extract_features(word_list):
    return dict([(word, True) for word in word_list])  # 返回字典,提取所有唯一性的词；因为nltk classifier 只接受字典格式的数据

In [4]:
# test
extract_features(['I','am','fine','.'])
extract_features('I am fine.')
extract_features(['I am fine.'])
extract_features(['I','am','fine','miao','miao','miao'])

{'.': True, 'I': True, 'am': True, 'fine': True}

In [7]:
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')
#positive_fileids  #返回列表，元素是 .txt 文件

In [30]:
# test
print(movie_reviews.raw(positive_fileids[1]))

every now and then a movie comes along from a suspect studio , with every indication that it will be a stinker , and to everybody's surprise ( perhaps even the studio ) the film becomes a critical darling . 
mtv films' _election , a high school comedy starring matthew broderick and reese witherspoon , is a current example . 
did anybody know this film existed a week before it opened ? 
the plot is deceptively simple . 
george washington carver high school is having student elections . 
tracy flick ( reese witherspoon ) is an over-achiever with her hand raised at nearly every question , way , way , high . 
mr . " m " ( matthew broderick ) , sick of the megalomaniac student , encourages paul , a popular-but-slow jock to run . 
and paul's nihilistic sister jumps in the race as well , for personal reasons . 
the dark side of such sleeper success is that , because expectations were so low going in , the fact that this was quality stuff made the reviews even more enthusiastic than they have 

In [8]:
# test:
positive_fileids[1]  # 取某一个 .txt 文件
movie_reviews.words(positive_fileids[1]) # 解析 .txt，并分词，返回列表
extract_features(movie_reviews.words(positive_fileids[1])) # 提取特征，返回字典

'pos/cv001_18431.txt'

['every', 'now', 'and', 'then', 'a', 'movie', 'comes', ...]

{'"': True,
 "'": True,
 '(': True,
 ')': True,
 ',': True,
 '-': True,
 '.': True,
 '/': True,
 ':': True,
 '?': True,
 '_election': True,
 '_election_': True,
 '_ferris': True,
 '_rushmore_': True,
 'a': True,
 'achiever': True,
 'add': True,
 'adult': True,
 'affair': True,
 'airy': True,
 'alexander': True,
 'all': True,
 'along': True,
 'amount': True,
 'an': True,
 'and': True,
 'anti': True,
 'any': True,
 'anybody': True,
 'are': True,
 'as': True,
 'at': True,
 'aware': True,
 'bad': True,
 'baggage': True,
 'be': True,
 'because': True,
 'becomes': True,
 'bee': True,
 'been': True,
 'before': True,
 'between': True,
 'bill': True,
 'both': True,
 'bothering': True,
 'broderick': True,
 'bueller_': True,
 'bumbling': True,
 'but': True,
 'campbell': True,
 'can': True,
 'carver': True,
 'caught': True,
 'character': True,
 'clout': True,
 'clubs': True,
 'comedy': True,
 'comes': True,
 'coming': True,
 'contain': True,
 'contains': True,
 'contrast': True,
 'contributed': Tr

In [9]:
# 整合上面的语句：
features_positive = [(extract_features(movie_reviews.words(fileids=[f])),'Positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])),'Negative') for f in negative_fileids]
# 返回的是 [({},'Positive'),] 列表[元组(字典{},'Positive')]; 一个.txt文件作一个列表元素，即一个datapoint
# nltk classifier 只接受字典格式的数据

## Split the data into train and test

In [16]:
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))

features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
print('\nNumber of training datapoints:', len(features_train)) # 一个.txt文件作一个列表元素，即一个datapoint
print('\nNumber of test datapoints:', len(features_test))


Number of training datapoints: 1600

Number of test datapoints: 400


## Naive Bayes classifier

In [19]:
classifier = NaiveBayesClassifier.train(features_train)
print('\nAccuracy of the classifier:',nltk.classify.util.accuracy(classifier, features_test)) # 这个


Accuracy of the classifier: 0.735


## object: classifier.most_informative_features 

能给出最具信息量的词。在被划分为正面或负面的影评里，这些词基本上有很强的代表性和表达力。

比如 outstanding 表示该影评更可能是正面的，insulting 表示负面影评的可能性较大

In [25]:
print('\nTop 10 most informative words:\n')
for item in classifier.most_informative_features()[:10]:
    print(item[0])


Top 10 most informative words:

outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
animators
darker


## Predict

In [41]:
input_reviews = [
        "It is an amazing movie", 
        "This is a dull movie. I would never recommend it to anyone.",
        "The cinematography is pretty great in this movie", 
        "The direction was terrible and the story was all over the place" 
    ]

print('\nPredictions:')
for review in input_reviews:
    print('\nReview:',review)
    probdist = classifier.prob_classify(extract_features(review.split()))  # 总的object
    pred_sentiment = probdist.max()
    print('Predictd sentiment:', pred_sentiment)
    print('Probability:', round(probdist.prob(pred_sentiment), 2))


Predictions:

Review: It is an amazing movie
Predictd sentiment: Positive
Probability: 0.61

Review: This is a dull movie. I would never recommend it to anyone.
Predictd sentiment: Negative
Probability: 0.77

Review: The cinematography is pretty great in this movie
Predictd sentiment: Positive
Probability: 0.67

Review: The direction was terrible and the story was all over the place
Predictd sentiment: Negative
Probability: 0.63
