# 导入所需库

In [1]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
import nltk
from nltk.corpus import stopwords

### 用pandas读取训练数据

In [3]:
# Locate the labeled training set relative to the current working directory.
BASE_PATH = os.getcwd()
training_file_path = os.path.join(BASE_PATH, 'data/labeledTrainData.tsv')

# Tab-separated input; backslash is treated as the escape character.
df = pd.read_csv(
    training_file_path,
    sep='\t',
    escapechar='\\',
)

In [4]:
# Report how many rows (reviews) the training frame holds.
print('Num of reviews: {}'.format(df.shape[0]))

Num of reviews: 25000


In [5]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [6]:
# Look at one full raw review (row label 1).
df['review'][1]

'"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.'

`sentiment`列中，1表示正面评论，0表示负面评论。

### 对影评数据做预处理，大概有以下环节：

1. 去掉html标签
1. 移除标点
1. 切分成词/token
1. 去掉停用词
1. 重组为新的句子

In [7]:
def display(text, title):
    """Print ``title``, a separator banner, then ``text``, each via ``print``.

    NOTE: inside a notebook session this definition shadows IPython's own
    ``display`` helper for the rest of the kernel's lifetime.
    """
    banner = '\n-----我是分割线-----\n'
    for part in (title, banner, text):
        print(part)

In [9]:
# Use the review at row label 1 as the running example for the cleaning steps.
raw_example = df['review'][1]
display(raw_example, '原始数据')

原始数据

-----我是分割线-----

"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.


In [13]:
# Parse the review as HTML and keep only its text content.
example = BeautifulSoup(raw_example, 'html.parser').get_text()
display(example, '去掉HTML标签的数据')

去掉HTML标签的数据

-----我是分割线-----

"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.


In [14]:
# Replace every non-letter character with a space. The upper-case range must
# run to 'Z': the previous 'A-X' also blanked out capital 'Y' and 'Z'. This
# now matches the pattern used inside clean_text.
example_letters = re.sub(r'[^a-zA-Z]', ' ', example)
display(example_letters, '去掉标点的数据')

去掉标点的数据

-----我是分割线-----

 The Classic War of the Worlds  by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H  G  Wells  classic book  Mr  Hines succeeds in doing so  I  and those who watched his film with me  appreciated the fact that it was not the standard  predictable Hollywood fare that comes out every year  e g  the Spielberg version with Tom Cruise that had only the slightest resemblance to the book  Obviously  everyone looks for different things in a movie  Those who envision themselves as amateur  critics  look only to criticize everything they can  Others rate a movie on more important bases like being entertained  which is why most people never agree with the  critics   We enjoyed the effort Mr  Hines put into being faithful to H G  Wells  classic novel  and we found it to be very entertaining  This made it easy to overlook what the  critics  perceive to be its shortcomings 


In [15]:
# Lower-case the text and split on whitespace to get a list of word tokens.
words = example_letters.lower().split()
display(words, '纯词列表数据')

纯词列表数据

-----我是分割线-----

['the', 'classic', 'war', 'of', 'the', 'worlds', 'by', 'timothy', 'hines', 'is', 'a', 'very', 'entertaining', 'film', 'that', 'obviously', 'goes', 'to', 'great', 'effort', 'and', 'lengths', 'to', 'faithfully', 'recreate', 'h', 'g', 'wells', 'classic', 'book', 'mr', 'hines', 'succeeds', 'in', 'doing', 'so', 'i', 'and', 'those', 'who', 'watched', 'his', 'film', 'with', 'me', 'appreciated', 'the', 'fact', 'that', 'it', 'was', 'not', 'the', 'standard', 'predictable', 'hollywood', 'fare', 'that', 'comes', 'out', 'every', 'year', 'e', 'g', 'the', 'spielberg', 'version', 'with', 'tom', 'cruise', 'that', 'had', 'only', 'the', 'slightest', 'resemblance', 'to', 'the', 'book', 'obviously', 'everyone', 'looks', 'for', 'different', 'things', 'in', 'a', 'movie', 'those', 'who', 'envision', 'themselves', 'as', 'amateur', 'critics', 'look', 'only', 'to', 'criticize', 'everything', 'they', 'can', 'others', 'rate', 'a', 'movie', 'on', 'more', 'important', 'bases', 'like', 'being

In [16]:
# Needed once to download the stop-word list and other corpora
#nltk.download()

In [25]:
# Load the custom stop-word list (one entry per line) as the keys of a dict so
# membership tests are fast. The file is opened in a `with` block so the
# handle is closed as soon as the list has been read.
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus, so
# `stopwords.words('english')` is no longer available after this cell runs.
with open('./stopwords.txt') as stopwords_file:
    stopwords = dict.fromkeys(line.strip() for line in stopwords_file)

In [26]:
# Inspect the loaded stop-word mapping (every value is None).
stopwords

{"'d": None,
 "'ll": None,
 "'m": None,
 "'re": None,
 "'s": None,
 "'t": None,
 "'ve": None,
 'ZT': None,
 'ZZ': None,
 'a': None,
 "a's": None,
 'able': None,
 'about': None,
 'above': None,
 'abst': None,
 'accordance': None,
 'according': None,
 'accordingly': None,
 'across': None,
 'act': None,
 'actually': None,
 'added': None,
 'adj': None,
 'adopted': None,
 'affected': None,
 'affecting': None,
 'affects': None,
 'after': None,
 'afterwards': None,
 'again': None,
 'against': None,
 'ah': None,
 "ain't": None,
 'all': None,
 'allow': None,
 'allows': None,
 'almost': None,
 'alone': None,
 'along': None,
 'already': None,
 'also': None,
 'although': None,
 'always': None,
 'am': None,
 'among': None,
 'amongst': None,
 'an': None,
 'and': None,
 'announce': None,
 'another': None,
 'any': None,
 'anybody': None,
 'anyhow': None,
 'anymore': None,
 'anyone': None,
 'anything': None,
 'anyway': None,
 'anyways': None,
 'anywhere': None,
 'apart': None,
 'apparently': None,
 'ap

In [27]:
# Keep only the tokens that are not keys of the stop-word dict.
words_nostop = [w for w in words if w not in stopwords]
#words_nostop = [w for w in words if w not in stopwords.words('english')] # nltk.corpus.stopwords
display(words_nostop, '去掉停用词数据')

去掉停用词数据

-----我是分割线-----

['classic', 'war', 'worlds', 'timothy', 'hines', 'entertaining', 'film', 'effort', 'lengths', 'faithfully', 'recreate', 'classic', 'book', 'hines', 'succeeds', 'watched', 'film', 'appreciated', 'standard', 'predictable', 'hollywood', 'fare', 'spielberg', 'version', 'tom', 'cruise', 'slightest', 'resemblance', 'book', 'movie', 'envision', 'amateur', 'critics', 'criticize', 'rate', 'movie', 'bases', 'entertained', 'people', 'agree', 'critics', 'enjoyed', 'effort', 'hines', 'faithful', 'classic', 'entertaining', 'easy', 'overlook', 'critics', 'perceive', 'shortcomings']


In [37]:
# eng_stopwords = set(stopwords.words('english')) # nltk.corpus.stopwords
# Set built from the keys of the `stopwords` dict; clean_text reads this
# module-level name when filtering tokens.
eng_stopwords = set(stopwords)

def clean_text(text):
    """Normalize one raw review into a space-separated string of kept words.

    Steps: strip HTML markup with BeautifulSoup, replace every non-letter
    character with a space, lower-case and split on whitespace, drop tokens
    found in the module-level ``eng_stopwords`` set, then re-join the
    remaining tokens with single spaces.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    kept = (
        token
        for token in letters_only.lower().split()
        if token not in eng_stopwords
    )
    return ' '.join(kept)

In [38]:
# Run the full cleaning pipeline on the example review.
clean_text(raw_example)

'classic war worlds timothy hines entertaining film effort lengths faithfully recreate classic book hines succeeds watched film appreciated standard predictable hollywood fare spielberg version tom cruise slightest resemblance book movie envision amateur critics criticize rate movie bases entertained people agree critics enjoyed effort hines faithful classic entertaining easy overlook critics perceive shortcomings'

### 清洗数据添加到dataframe里

In [39]:
# Add the cleaned text as a new column by applying clean_text to every review.
df['clean_review'] = df.review.apply(clean_text)
df.head()

Unnamed: 0,id,sentiment,review,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff moment mj ve started listening music wat...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...",classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell investors ro...
3,3630_4,0,It must be assumed that those who praised this...,assumed praised film filmed opera didn read do...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...


In [36]:
# Show the cleaned text of the first review.
# NOTE(review): the saved output for this cell carries an earlier execution
# count than the clean_text definition and still contains punctuation and
# digits, so it looks stale — re-run the notebook top to bottom to refresh it.
df['clean_review'][0]

"ith stuff moment mj started listening music, watching odd documentary there, watched wiz watched moonwalker again. insight guy cool eighties mind guilty innocent. moonwalker biography, feature film remember cinema originally released. subtle messages mj's feeling press obvious message drugs bad m'kay.visually impressive michael jackson remotely mj hate boring. call mj egotist consenting movie mj fans fans true nice him.the actual feature film bit finally starts 20 minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord. mj dead bad me. mj overheard plans? nah, joe pesci's character ranted people supplying drugs dunno, hates mj's music.lots cool mj car robot speed demon sequence. also, director patience saint filming kiddy bad sequence directors hate kid bunch performing complex dance scene.bottom line, movie people mj level (which people). not, stay away. wholesome message ironically mj's bestest buddy movie girl! michael jackson talented people

### 抽取bag of words特征(用sklearn的CountVectorizer)

In [40]:
# Build bag-of-words count features, keeping at most 5000 vocabulary terms.
vectorizer = CountVectorizer(max_features=5000)
# fit_transform learns the vocabulary from the training reviews; toarray()
# converts the result to a dense array.
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
train_data_features.shape

(25000, 5000)

### 训练分类器

In [48]:
# Fix the seed so the fitted forest (and therefore every prediction made with
# it) is reproducible across "Restart & Run All"; without it each run trains
# a different model.
RANDOM_SEED = 42
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
forest = forest.fit(train_data_features, df.sentiment)

#### 在训练集上做个predict看看效果如何

In [49]:
# Confusion matrix of true labels vs. predictions on the same data the forest
# was fitted on, so it reflects training fit rather than generalization.
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

#### 删除不再使用且占用内存的变量

In [54]:
# Drop the references to the training frame and its dense feature matrix.
del df, train_data_features

### 读取测试数据进行预测

In [55]:
# Load the test set with the same parsing options as the training data.
test_file_path = os.path.join(BASE_PATH, 'data/testData.tsv')
df = pd.read_csv(
    test_file_path,
    sep='\t',
    escapechar='\\',
)

# Report how many rows (reviews) the test frame holds.
print('Number of reviews: {}'.format(df.shape[0]))


Number of reviews: 25000


#### 在测试集上应用clean_text，同样的方式清洗数据

In [56]:
# Clean the test reviews with the same clean_text function used for training.
df['clean_review'] = df.review.apply(clean_text)

In [57]:
# Preview the first rows of the test frame, including the cleaned column.
df.head()

Unnamed: 0,id,review,clean_review
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main themes mortality nostalgia...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster disaster film action scenes mea...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kids tonight child loved kid excitement ...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression screenplays writte...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction time mob life filmed jersey...


#### 在测试集上，用训练集上拟合好的vectorizer（词表为训练集中出现频率最高的5000个词）对clean_review进行编码

In [60]:
# Encode the test reviews with the already-fitted vectorizer (transform only,
# no refit) and convert the result to a dense array.
test_data_features = vectorizer.transform(df.clean_review).toarray()
test_data_features.shape

(25000, 5000)

In [63]:
# Predict a sentiment label for every test review.
result = forest.predict(test_data_features)
# Pair each review id with its predicted label.
output = pd.DataFrame({'id': df.id, 'sentiment': result})

In [64]:
# Preview the first rows of the prediction frame.
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [65]:
# Write the predictions to disk without the index column.
# NOTE(review): the file is named .tsv but to_csv is called without `sep`, so
# it is written with the default comma separator — confirm which format is
# actually wanted and align the extension or the separator.
output.to_csv(os.path.join(BASE_PATH, 'data/Bag_of_Words_model_submission.tsv'), index=False)


In [66]:
# Drop the references to the test frame and its dense feature matrix.
del df, test_data_features