# 使用NLTK训练情感分析器

In [1]:
import nltk
if __name__ == '__main__':
    nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [2]:
# 导入movie review数据
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [3]:
# 我们需要训练数据，这里将用NLTK提供的电影评论数据
if __name__ == '__main__':
# 加载积极与消极评论
    positive_fileids = movie_reviews.fileids('pos')# list类型 1000条数据 每一条是一个txt文件
    negative_fileids = movie_reviews.fileids('neg')

In [4]:
# 定义一个函数，用来提取特征数据
# 输入一段文本返回形如：{'It': True, 'movie': True, 'amazing': True, 'is': True, 'an': True}
# 返回类型是一个dict
def extract_features(word_list):
    return dict([(word,True) for word in word_list])
# 将这些评论数据分成积极评论和消极评论
# movie_reviews.words(fileids=[f])表示每一个txt文本里面的内容，结果是单词的列表：['films', 'adapted', 'from', 'comic', 'books', 'have', ...]
# features_positive 结果为一个list
# 结果形如：[({'shakesp: True, 'limit': True, 'mouth': True, ..., 'such': True, 'prophetic': True}, 'Positive'), ..., ({...}, 'Positive'), ...]
features_positive =[(extract_features(movie_reviews.words(fileids=[f])),
                     'Positive') for f in positive_fileids]
features_negative =[(extract_features(movie_reviews.words(fileids=[f])),
                     'Negative') for f in negative_fileids]

In [5]:
# 分成训练数据集（80%）和测试数据集（20%）
threshold_factor = 0.8
threshold_positive =int(threshold_factor*len(features_positive))# 800
threshold_negative =int(threshold_factor*len(features_negative))# 800
# 提取特征 800个积极文本800个消极文本构成训练集  200+200构成测试文本
features_train = features_positive[:threshold_positive]+ features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] +features_negative[threshold_negative:]
print("\n训练数据点的数量:", len(features_train))
print("测试数据点的数量:", len(features_test))


训练数据点的数量: 1600
测试数据点的数量: 400


In [6]:
# 训练朴素贝叶斯分类器
classifier = NaiveBayesClassifier.train(features_train)
print("\n分类器的准确性:",nltk.classify.util.accuracy(classifier, features_test))

print("\n十大信息最丰富的单词:")
for item in classifier.most_informative_features()[:10]:
    print(item[0])


分类器的准确性: 0.735

十大信息最丰富的单词:
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
symbol
affecting


In [7]:
# 输入一些简单的评论
input_reviews = [
    "It is an amazing movie",
    "This is a dull movie. I would never recommend it toanyone.",
    "A complete and utter destruction of one of the mosticonic superheroes. 0.1 effort and thought put into thestoryline. A coming of age awkward teenage movie with a'spiderman' stamp put on it. Bad jokes aimed atteenagers (at best). A complete caricature of a villain,a complete caricature of a Spiderman. Just please stopmaking this garbage Put some god damn effort! A totalwaste of time",
    "Just staving off some negative reviews. Fits wellinto the Marvel movies to date and is an excellentfollow up to Avengers: Endgame."]

In [9]:
# ⽤我们之前训练出的分类器预测这些⽂本的分类
# 运行分类器，获得预测结果
print("\n预测:")
for review in input_reviews:
    print("\n评论:", review)
    probdist = classifier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
# 打印输出
    print("预测情绪:", pred_sentiment)
    print("可能性:", round(probdist.prob(pred_sentiment), 2))


预测:

评论: It is an amazing movie
预测情绪: Positive
可能性: 0.61

评论: This is a dull movie. I would never recommend it toanyone.
预测情绪: Negative
可能性: 0.77

评论: A complete and utter destruction of one of the mosticonic superheroes. 0.1 effort and thought put into thestoryline. A coming of age awkward teenage movie with a'spiderman' stamp put on it. Bad jokes aimed atteenagers (at best). A complete caricature of a villain,a complete caricature of a Spiderman. Just please stopmaking this garbage Put some god damn effort! A totalwaste of time
预测情绪: Negative
可能性: 0.98

评论: Just staving off some negative reviews. Fits wellinto the Marvel movies to date and is an excellentfollow up to Avengers: Endgame.
预测情绪: Positive
可能性: 0.67
