# 在word2vec上训练情感分析模型

In [4]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

### 和之前的操作一致

In [5]:
def load_dataset(name, nrows=None):
    """Read one of the known review TSV files from ``../data`` into a DataFrame.

    Prints the number of loaded rows to stdout as a side effect.

    Args:
        name: Dataset key; one of 'unlabeled_train', 'labeled_train' or 'test'.
        nrows: Optional cap on the number of rows to read, forwarded to
            ``pandas.read_csv``; ``None`` reads the whole file.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If ``name`` is not one of the known dataset keys.
    """
    file_names = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    file_name = file_names.get(name)
    if file_name is None:
        raise ValueError(name)

    # The files are tab-separated and use a backslash as the escape character.
    tsv_path = os.path.join('..', 'data', file_name)
    frame = pd.read_csv(tsv_path, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(frame)))
    return frame

In [6]:
# English stop words as a set, for fast membership tests in clean_text.
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    """Turn raw review text into a list of lowercase alphabetic tokens.

    Args:
        text: Raw review text, possibly containing HTML markup.
        remove_stopwords: When true, drop tokens found in ``eng_stopwords``.

    Returns:
        A list of lowercase word strings, in their original order.
    """
    # Strip HTML markup first, then blank out every non-letter character.
    plain = BeautifulSoup(text, 'html.parser').get_text()
    tokens = re.sub(r'[^a-zA-Z]', ' ', plain).lower().split()
    if not remove_stopwords:
        return tokens
    return [tok for tok in tokens if tok not in eng_stopwords]

### 读入之前训练好的Word2Vec模型

In [11]:
# Load the previously trained Word2Vec model from ../models.
# The file must already exist there; the output recorded below shows the
# FileNotFoundError raised when it does not.
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('..', 'models', model_name))

FileNotFoundError: [Errno 2] No such file or directory: '..\\models\\300features_40minwords_10context.model'

### 我们可以根据word2vec的结果去对影评文本进行编码

编码方式比较简单粗暴：对评论中每个在词表里的词取词向量，再求平均，作为整条评论的向量（不在词表中的词会被忽略）。

In [8]:
# Load the labeled training reviews and preview the first rows.
df = load_dataset('labeled_train')
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [9]:
def to_review_vector(review):
    """Encode a review as the mean of its in-vocabulary word vectors.

    The review is cleaned with ``clean_text`` (stop words removed); words
    missing from the Word2Vec vocabulary are skipped.

    Args:
        review: Raw review text.

    Returns:
        A ``pandas.Series`` with one entry per embedding dimension. A review
        with no in-vocabulary words yields an all-zero vector, so every row
        has the same length and contains no NaN.
    """
    # gensim >= 1.0 exposes word lookups on ``model.wv`` (indexing the model
    # itself was removed in 4.0); older releases only support the model object.
    vectors = getattr(model, 'wv', model)
    words = clean_text(review, remove_stopwords=True)
    known = [vectors[w] for w in words if w in vectors]
    if not known:
        # Averaging an empty array would give a single NaN instead of a
        # full-width row, which breaks the feature matrix built by ``apply``.
        return pd.Series(np.zeros(model.vector_size))
    return pd.Series(np.array(known).mean(axis=0))

In [10]:
# Encode every training review with to_review_vector and preview the result.
# Requires the global `model` from the loading cell; the output recorded
# below shows the NameError raised when that cell did not succeed.
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

NameError: name 'model' is not defined

### 用随机森林构建分类器

In [8]:
# Fit a 100-tree random forest (fixed seed) on the averaged word-vector features.
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_data_features, df.sentiment)

##### 同样在训练集上试试，确保模型能正常工作

In [9]:
# Sanity check only: predictions are made on the same data the forest was
# fitted on, so this confirms the pipeline runs, not how well it generalizes.
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

### 清理占用内存的变量

In [8]:
# Drop the training frame and its feature matrix before loading the test set.
del df, train_data_features

### 预测测试集结果并上传kaggle

In [15]:
# Load the unlabeled test reviews and preview the first rows.
df = load_dataset('test')
df.head()

Number of reviews: 25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [16]:
# Encode every test review with the same averaged-vector scheme used for training.
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.019753,-0.005689,0.015961,-0.038633,-0.041745,-0.04468,-0.01279,0.004908,0.053838,0.00849,...,-0.00552,0.034378,-0.02725,0.010244,-0.008976,0.010181,-0.027196,0.010429,0.021153,0.015764
1,0.000497,-0.00414,0.019237,0.011341,-0.02086,-0.013085,-0.005469,0.015154,0.022737,0.009717,...,0.005757,0.018115,-0.010495,-0.00765,0.000969,0.018796,-0.003173,0.001657,0.014491,0.026732
2,-0.015999,-0.012097,0.022069,-0.014368,-0.020226,-0.015809,-0.000826,0.01013,0.033976,0.0057,...,0.001799,0.012403,-0.022812,0.011651,0.001775,0.009241,0.003241,-0.002865,0.027701,0.028418
3,-0.015196,-0.013445,0.010499,-0.035669,-0.040131,-0.018273,-0.020452,-0.003197,0.026555,0.008284,...,0.01172,0.010397,-0.029256,0.007422,-0.000662,0.020593,0.001274,-0.014059,0.024905,0.024326
4,-0.01614,-0.015608,0.010962,-0.008424,-0.022619,-0.022396,-0.018043,0.012519,0.032103,0.009743,...,-0.00182,0.004578,-0.008875,0.009702,-0.012013,0.010689,-0.003468,-0.003109,0.026661,0.005735


In [17]:
# Predict a sentiment label for each test review and save the id/sentiment
# pairs as a CSV without an index column under ../data (the file intended for
# the Kaggle upload), then preview the first rows.
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,1


In [None]:
# Release the test frame, its features and the fitted forest.
del df, test_data_features, forest

------------------
### 对词向量进行聚类研究和编码
使用KMeans进行聚类，簇的数量取词表大小的十分之一（平均每个簇约10个词）

In [5]:
# gensim >= 1.0 keeps the vectors on ``model.wv``; earlier releases expose
# them on the model object itself.
keyed_vectors = getattr(model, 'wv', model)
# The embedding matrix is named ``vectors`` in newer gensim and ``syn0`` in
# older releases (``syn0`` no longer exists in gensim 4.0).
word_vectors = (keyed_vectors.vectors if hasattr(keyed_vectors, 'vectors')
                else keyed_vectors.syn0)
# One cluster per ten rows of the embedding matrix (integer division).
num_clusters = word_vectors.shape[0] // 10

In [6]:
%%time

kmeans_clustering = KMeans(n_clusters = num_clusters, n_jobs=4)
idx = kmeans_clustering.fit_predict(word_vectors)

CPU times: user 2.03 s, sys: 377 ms, total: 2.41 s
Wall time: 13min 19s


In [7]:
# Map each vocabulary word to the cluster id assigned to its vector.
# gensim 4.0 renamed ``index2word`` to ``index_to_key`` (on ``model.wv``);
# fall back to the old attribute on earlier releases.
keyed_vectors = getattr(model, 'wv', model)
vocab_words = getattr(keyed_vectors, 'index_to_key', None)
if vocab_words is None:
    vocab_words = keyed_vectors.index2word
word_centroid_map = dict(zip(vocab_words, idx))

In [8]:
import pickle

# Persist the word -> cluster-id mapping under ../models so it can be reloaded
# without repeating the clustering step.
filename = 'word_centroid_map_10avg.pickle'
map_path = os.path.join('..', 'models', filename)
with open(map_path, 'bw') as f:
    pickle.dump(word_centroid_map, f)

# To reuse a saved mapping instead of re-clustering, uncomment the lines below
# (only unpickle files you created yourself):
#with open(os.path.join('..', 'models', filename), 'br') as f:
#    word_centroid_map = pickle.load(f)

### 输出前10个cluster看看

In [9]:
# Show the member words of the first ten clusters.
for cluster in range(10):
    print("\nCluster %d" % cluster)
    members = [word for word, cluster_id in word_centroid_map.items()
               if cluster_id == cluster]
    print(members)


Cluster 0
['praised', 'appreciated', 'noted', 'avoided', 'criticized', 'admired']

Cluster 1
['misfit', 'con', 'hoodlum', 'spy', 'rogue']

Cluster 2
['contrasts', 'healthy', 'glamour', 'eroticism', 'sensual']

Cluster 3
['matthew', 'kingsley', 'klein', 'hackman', 'meyers', 'perry', 'simpson', 'pullman', 'dana', 'olsen', 'ryan', 'barrie', 'caan', 'tho', 'farina', 'stiller', 'hutton', 'sparks', 'lillard', 'broderick', 'kline', 'reprise', 'mcconaughey', 'carvey', 'harrelson']

Cluster 4
['wolves', 'papillon', 'continent']

Cluster 5
['tick', 'drain', 'nailed', 'puke', 'boil', 'stalk']

Cluster 6
['cotton', 'denver', 'windsor', 'marsh', 'bell']

Cluster 7
['lighting', 'costumes', 'sfx', 'props', 'design', 'costuming', 'designs', 'makeup']

Cluster 8
['decline', 'swashbuckling', 'swashbuckler', 'prestige', 'potboiler', 'latter', 'glory', 'untouchables', 'fame']

Cluster 9
['slashed', 'butchered', 'mutilated', 'eaten', 'slaughtered', 'continually']


### 把评论数据转成cluster bag vectors

In [11]:
# Words that have a cluster assignment, for quick membership tests.
wordset = set(word_centroid_map.keys())

def make_cluster_bag(review):
    """Count how many words of a review fall into each word-vector cluster.

    Args:
        review: Raw review text; cleaned with ``clean_text`` (stop words
            removed). Words without a cluster assignment are ignored.

    Returns:
        A ``pandas.Series`` indexed ``0..num_clusters`` (inclusive) holding
        the per-cluster word counts, with 0 for clusters that do not occur.
    """
    words = clean_text(review, remove_stopwords=True)
    cluster_ids = [word_centroid_map[w] for w in words if w in wordset]
    # Explicit integer dtype: a review with no clustered words produces an
    # empty list, and pandas would otherwise have to guess a dtype for it
    # (older releases emit a deprecation warning for that default).
    # NOTE(review): the index has num_clusters + 1 slots; if cluster ids run
    # 0..num_clusters-1 the last column is always 0. Kept as-is so the feature
    # width does not change -- confirm before trimming.
    return (pd.Series(cluster_ids, dtype='int64')
              .value_counts()
              .reindex(range(num_clusters+1), fill_value=0))

In [12]:
# Reload the labeled training reviews (the earlier frame was deleted) and preview them.
df = load_dataset('labeled_train')
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [13]:
# Convert every training review into its per-cluster word-count vector.
train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,5,0,0,0,0,0,0,0,0


### 再用随机森林算法建模

In [14]:
# Fit a 100-tree random forest (fixed seed) on the bag-of-clusters features.
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_data_features, df.sentiment)

##### 在训练集上试一试效果

In [15]:
# Sanity check only: predictions are made on the same data the forest was
# fitted on, so this confirms the pipeline runs, not how well it generalizes.
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

#### 去掉无用的占内存的变量

In [16]:
# Drop the training frame and its feature matrix before loading the test set.
del df, train_data_features

### 载入测试数据做预测

In [17]:
# Load the unlabeled test reviews and preview the first rows.
df = load_dataset('test')
df.head()

Number of reviews: 25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [18]:
# Convert every test review into its per-cluster word-count vector.
test_data_features = df.review.apply(make_cluster_bag)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Predict a sentiment label for each test review and save the id/sentiment
# pairs as a CSV without an index column under ../data, then preview the
# first rows.
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_BagOfClusters.csv'), index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [None]:
# Release the test frame, its features and the fitted forest.
del df, test_data_features, forest