In [1]:
import pandas as pd

In [2]:
train=pd.read_csv('labeledTrainData.tsv',header=0,delimiter='\t',quoting=3)

In [3]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


- 数据清洗和文本处理

In [4]:
#用Beautifulsoup来清理html
from bs4 import BeautifulSoup

In [5]:
#在一条评论上初始化一个beautifulsoup对象
example1=BeautifulSoup(train['review'][0],'lxml')

In [6]:
#比较原始文本和处理后文本，用get_text()得到处理后结果
print(train['review'][0])
print()
print(example1.get_text())

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [7]:
#用正则表达式去除停用词，符号
import re

In [8]:
letters_only=re.sub('[^a-zA-Z]',' ',example1.get_text())

In [9]:
letters_only

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

In [10]:
#小写
lower_case=letters_only.lower()

In [11]:
words=lower_case.split()

In [12]:
from nltk.corpus import stopwords

In [13]:
import nltk

In [14]:
 nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/duanxiaoer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
#取出停用词
words=[w for w in words if not w in stopwords.words('english')]

In [16]:
words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [17]:
def review_2words(raw_review):
    #1 取出HTML
    review_text=BeautifulSoup(raw_review,'lxml').get_text()
    #去除非文本
    letters_only=re.sub('^a-zA-Z',' ',review_text)
    #小写，划分为词
    words=letters_only.lower().split()
    #去除停用词
    stops=set(stopwords.words('english'))
    mean_words=[w for w in words if not w in stops]
    return (' '.join(mean_words))

In [18]:
num_reviews=train['review'].size
clean_train_reviews=[]
for i in range(0,num_reviews):
    clean_train_reviews.append(review_2words(train['review'][i]))

In [19]:
num_reviews

25000

In [20]:
#使用skilearn，从词袋中创建特征
print('创建词袋模型...\n')
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(analyzer='word',
                          tokenizer=None,
                          preprocessor=None,
                          stop_words=None,
                          max_features=5000)
train_data_feature=vectorizer.fit_transform(clean_train_reviews)
train_data_feature=train_data_feature.toarray()

创建词袋模型...



In [21]:
print(train_data_feature.shape)

(25000, 5000)


In [24]:
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test = train_test_split(train_data_feature,train['sentiment'],test_size=0.3)

In [22]:
#随机森林
print('随机森林')
from sklearn.ensemble import RandomForestClassifier
forest=RandomForestClassifier(n_estimators=100)
forest.fit(train_data_feature,train['sentiment'])

随机森林


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
#测试
test=pd.read_csv('testData.tsv',header=0,delimiter='\t',
                 quoting=3)
print(test.shape)

(25000, 2)


In [25]:
num_reviews=len(test['review'])
clean_test_reviews=[]
print('cleaning and parsing the test set movie reviews..\n')
for i in range(0,num_reviews):
    if ((i+1)%1000==0):
        print('review %d of %d \n'%(i+1,num_reviews))
    clean_reviews=review_2words(test['review'][i])
    clean_test_reviews.append(clean_reviews)

cleaning and parsing the test set movie reviews..

review 1000 of 25000 

review 2000 of 25000 

review 3000 of 25000 

review 4000 of 25000 

review 5000 of 25000 

review 6000 of 25000 

review 7000 of 25000 

review 8000 of 25000 

review 9000 of 25000 

review 10000 of 25000 

review 11000 of 25000 

review 12000 of 25000 

review 13000 of 25000 

review 14000 of 25000 

review 15000 of 25000 

review 16000 of 25000 

review 17000 of 25000 

review 18000 of 25000 

review 19000 of 25000 

review 20000 of 25000 

review 21000 of 25000 

review 22000 of 25000 

review 23000 of 25000 

review 24000 of 25000 

review 25000 of 25000 



In [26]:
#get a bag of words for the test set convert to array
test_data_features=vectorizer.transform(clean_test_reviews)
test_data_features=test_data_features.toarray()


In [27]:
#预测
result=forest.predict(test_data_features)

In [29]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )