### 语料处理与模型建立 ： bag of words 而不是用word2vec

In [1]:
import os
import re
import numpy as np  # 计算
import pandas as pd

from bs4 import BeautifulSoup  # 截取网页的

from sklearn.feature_extraction.text import CountVectorizer  # 抽取文本特征： 简单计数，统计每个词出现的次数
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix  # 预留一部分数据用作评估，这是是评估准则

In [2]:
import nltk
#nltk.download()  # 没有使用nltk 自带的data
from nltk.corpus import stopwords # 自己准备的英文sotpwords 

### 1.用pandas读入训练数据 正面的评论为sentiment 1 ，负面的评论为sentiment 0

In [3]:
# Load the labeled training reviews: tab-separated, with backslash as the escape character.
datafile = os.path.join('..', 'data', 'labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
print("数据行数表示样本数量，列变数样本特征，sentiment 表示标准便签 0 或者1 , view 是文本")
# df.head() must be the cell's last expression: Jupyter only renders the value of the
# final expression, so the preview was silently discarded while a print() followed it.
df.head()

Number of reviews: 25000
数据行数表示样本数量，列变数样本特征，sentiment 表示标准便签 0 或者1 , view 是文本


#### 1.1 数据是 HTML 形式，里面含有换行标签 `<br />`，需要先做数据预处理

In [4]:

#df['review']

### 2 对影评数据做预处理，大概有以下环节：

1. 去掉html标签
1. 移除标点
1. 切分成词/token
1. 去掉停用词
1. 重组为新的句子

In [5]:
def display(text, title):
    """Print ``title`` on one line, followed by ``text`` on the next line."""
    # NOTE(review): this name shadows the `display` helper that IPython/Jupyter
    # exposes in notebooks — confirm that is intended.
    print(title, text, sep='\n')

#### 2.1 原始数据含有 `<br /><br />` 标签，需要通过文本预处理来去掉

In [6]:
# Use the review stored at row label 2 as the worked example for the cleaning steps below.
raw_example = df.loc[2, 'review']
# Uncomment to print the raw text:
# display(raw_example, '原始数据是：')

### 2.2 去掉 HTML 标签（例如 `<br />`）

In [7]:
# Parse the review as HTML, then keep only its text content (the tags are dropped).
soup = BeautifulSoup(raw_example, 'html.parser')
example = soup.get_text()
# Uncomment to print the tag-free text:
# display(example, '去掉HTML标签后的数据是：')

### 2.3 对于英文数据，将 a-zA-Z 之外的字符全部替换成空格

In [8]:
# Replace every character that is not an ASCII letter with a single space.
example_letters = re.compile(r'[^a-zA-Z]').sub(' ', example)
# Uncomment to print the letters-only text:
# display(example_letters, '去掉标点后的数据')

### 2.4全部转化成小写的然后分割开：

In [9]:
# Lower-case the text first, then split on whitespace to get a list of tokens.
lowered = example_letters.lower()
words = lowered.split()
# Uncomment to print the token list:
# display(words, '纯词列表数据(转化成小写的形式)')

In [10]:
#下载停用词和其他语料会用到
#nltk.download()  # 使用nltk的数据stopwords

In [18]:
#words_nostop = [w for w in words if w not in stopwords.words('english')]
# Load the local stop-word list, one word per line (trailing whitespace stripped).
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` import of the same name.
# The `with` block closes the file handle, which the bare open() call left open.
with open('../stopwordslib/stopwords.txt') as stopword_file:
    stopwords = dict.fromkeys(line.rstrip() for line in stopword_file)
#words_nostop = [w for w in words if w not in stopwords]  # keep only the non-stop-words
#display(words_nostop, '去掉停用词数据')

### 2.5 将上述文本清洗过程写成函数的形式

In [75]:
#eng_stopwords = set(stopwords.words('english'))
# Build the set once so clean_text does a set-membership test per token.
eng_stopwords = set(stopwords)

def clean_text(text):
    """Turn one raw review into a space-joined string of lower-case, non-stop-word tokens.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, and drop tokens found in ``eng_stopwords``.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # separator=' ' keeps the words on either side of a tag apart; with the default
    # empty separator, "good<br />movie" would collapse into the bogus token "goodmovie".
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)

In [76]:
# Sanity check: run the cleaning function on the example review and show the result.
clean_text(raw_example)

'film starts manager nicholas bell investors robert carradine primal park secret project mutating primal animal fossilized dna jurassik park scientists resurrect nature fearsome predators sabretooth tiger smilodon scientific ambition deadly voltage fence creature escape savagely stalking prey human visitors tourists scientific youngsters enter restricted security center attacked pack pre historical animals deadlier bigger addition security agent stacy haiduk mate brian wimmer fight carnivorous smilodons sabretooths real star stars astounding terrifyingly convincing giant animals savagely stalking prey afoul fight nature fearsome predators sabretooth dangerous slow stalks victims movie delivers lots blood gore beheading hair raising chills scares sabretooths mediocre special effects story exciting stirring entertainment boring giant animals majority computer generator totally lousy middling performances players reacting appropriately food actors vigorously physical performances dodging 

### 3.清洗数据添加到dataframe里,并且添加一列"cleaned_review"

In [79]:
# Run clean_text over every review and store the result in a 'cleaned_review' column.
df['cleaned_review'] = df['review'].apply(clean_text)
df.head()

Unnamed: 0,id,sentiment,review,clean_review,cleaned_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff moment mj ve started listening music wat...,stuff moment mj ve started listening music wat...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...",classic war worlds timothy hines entertaining ...,classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell investors ro...,film starts manager nicholas bell investors ro...
3,3630_4,0,It must be assumed that those who praised this...,assumed praised film filmed opera didn read do...,assumed praised film filmed opera didn read do...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...,superbly trashy wondrously unpretentious explo...


### 4. 统计所有词的词频，取 top 5000 的高频词做成词袋特征，并转成 array

In [91]:
# Bag of words: cap the vocabulary at 5000 features and count occurrences per review.
vectorizer = CountVectorizer(max_features=5000)
# Fit the vocabulary on the cleaned training reviews, then convert the result to a dense array.
train_data_features = vectorizer.fit_transform(df['cleaned_review']).toarray()


In [92]:
# Inspect the dense count matrix (NumPy abbreviates the repr of a large array).
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [93]:
# Shape check: (number of reviews, number of vocabulary features).
train_data_features.shape

(25000, 5000)

In [94]:
# The label vector must have one entry per row of the feature matrix.
df['sentiment'].shape

(25000,)

### 5.训练分类器 - 随机森林

In [95]:
# A fixed random_state makes the fitted forest, and every prediction derived from it,
# reproducible across kernel restarts; without it each run trains a different model.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# X = bag-of-words counts, y = sentiment labels. fit() returns the estimator itself;
# re-assigning keeps the cell from echoing the estimator repr.
forest = forest.fit(train_data_features, df.sentiment)

### 5.1 在训练集上使用confusion_matrix做预测结果

In [96]:
# NOTE(review): the predictions here are made on the same rows the forest was fitted on,
# so this matrix shows fit to the training data, not performance on unseen reviews.
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]], dtype=int64)

### 5.2 删除不再使用、占内存的变量

In [97]:
#del df
#del train_data_features

### 6. 对测试数据同样进行类似的数据预处理过程

In [98]:
# Load the test reviews and clean them with the same clean_text function.
# NOTE: `datafile` and `df` are rebound here, so `df` no longer refers to the training frame.
datafile = os.path.join('..', 'data', 'testData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df['test_cleaned_review'] = df['review'].apply(clean_text)
df.head()

Number of reviews: 25000


Unnamed: 0,id,review,test_cleaned_review
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main themes mortality nostalgia...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster disaster film action scenes mea...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kids tonight child loved kid excitement ...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression screenplays writte...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction time mob life filmed jersey...


In [99]:
# transform() only (no fit): re-use the vectorizer fitted earlier on the training reviews.
test_data_features = vectorizer.transform(df['test_cleaned_review']).toarray()
test_data_features.shape

(25000, 5000)

### 6.1 预测结果：output

In [100]:
# Predict a sentiment label for every test review and pair it with the review id.
result = forest.predict(test_data_features)
output = pd.DataFrame({'id': df['id'], 'sentiment': result})

In [101]:
# Preview the first rows of the prediction table.
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [26]:
# Write the predictions to ../data/Bag_of_Words_model.csv, omitting the index column.
submission_path = os.path.join('..', 'data', 'Bag_of_Words_model.csv')
output.to_csv(submission_path, index=False)

In [27]:
# Unbind the test frame and its dense feature matrix now that the predictions are saved.
del df, test_data_features