In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
# Read the training CSV and the public test CSV, in that order.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test_public.csv'))

# Show each frame's dimensions followed by its first rows.
train.shape
train.head()
test.shape
test.head()

(9947, 5)

Unnamed: 0,content_id,content,subject,sentiment_value,sentiment_word
0,vUXizsqexyZVRdFH,因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。,价格,0,影响
1,4QroPd9hNfnCHVt7,四驱价格貌似挺高的，高的可以看齐XC60了，看实车前脸有点违和感。不过大众的车应该不会差。,价格,-1,高
2,QmqJ2AvM5GplaRyz,斯柯达要说质量，似乎比大众要好一点，价格也低一些，用料完全一样。我听说过野帝，但没听说过你说...,价格,1,低
3,KMT1gFJiU4NWrVDn,这玩意都是给有钱任性又不懂车的土豪用的，这价格换一次我妹夫EP020可以换三锅了,价格,-1,有钱任性
4,nVIlGd5yMmc37t1o,17价格忒高，估计也就是14-15左右。,价格,-1,高


(2364, 2)

Unnamed: 0,content_id,content
0,XuPwKCnA2fqNh5vm,欧蓝德，价格便宜，森林人太贵啦！
1,2jNbDn85goX3IuPE,楼主什么时候提的车，南昌优惠多少啊
2,hLgEADQ8sUnvGFK9,吉林，2.5优惠20000，送三年九次保养，贴膜
3,nZmM7LQsfr03wUaz,便宜2万的豪华特装，实用配制提升，优惠还给力，确实划算。
4,pwd8MnrthDqLZafe,如果实在想买就等车展期间，优惠2万，我24.98万入的2.5豪


In [3]:
# Count of non-null content_id values per subject.
# count() followed by rename() replaces the dict-renaming form of
# SeriesGroupBy.agg, which is deprecated and rejected by newer pandas.
(train.groupby(['subject'], as_index=False)['content_id']
      .count()
      .rename(columns={'content_id': 'subject_num'}))

# Count of non-null content_id values per sentiment label.
(train.groupby(['sentiment_value'], as_index=False)['content_id']
      .count()
      .rename(columns={'content_id': 'sentiment_value_num'}))

Unnamed: 0,subject,subject_num
0,价格,1273
1,内饰,536
2,动力,2732
3,外观,489
4,安全性,573
5,操控,1036
6,油耗,1082
7,空间,442
8,舒适性,931
9,配置,853


Unnamed: 0,sentiment_value,sentiment_value_num
0,-1,1616
1,0,6661
2,1,1670


In [4]:
# Fill the two label columns on the test frame with placeholder values.
for column, placeholder in (('subject', '暂无'), ('sentiment_value', 99)):
    test[column] = placeholder

In [5]:
# Stack train on top of test and renumber the rows from zero,
# discarding the indexes inherited from the two source frames.
df = pd.concat([train, test], axis=0).reset_index(drop=True)

df.shape

(12311, 5)

## 预处理

In [6]:
import jieba
import re

In [7]:
# Filters: stop words, numbers, letters
# The patterns are compiled once here instead of once per token.
_NON_WORD_RE = re.compile(r'[^\w]')
_NUMBER_TAIL_RE = re.compile(r'-?([1-9]\d*\.\d*|0\.\d*[1-9]\d*|0?\.0+|0)$')
_ASCII_ALNUM_RE = re.compile(r'^[A-Za-z0-9]+$')


def sent2words(sentence):
    """Segment ``sentence`` with jieba and return the tokens that pass all filters.

    A token is kept only when every one of these holds:

    * it contains no non-word character;
    * it is not in the module-level ``stopwords`` collection (looked up at
      call time, so ``stopwords`` must be defined before the first call);
    * the numeric pattern does not match it -- that pattern is anchored only
      at the end, so it also rejects tokens that merely end in ``0``;
    * it is not made up solely of ASCII letters and digits.

    Parameters
    ----------
    sentence : str
        Raw text handed to ``jieba.cut``.

    Returns
    -------
    list of str
        The surviving tokens, in segmentation order.
    """
    # One hash-based snapshot per call turns each stop-word test into an O(1)
    # lookup instead of a scan over the whole list.
    stop_set = frozenset(stopwords)
    return [
        w for w in jieba.cut(sentence)
        if _NON_WORD_RE.search(w) is None
        and w not in stop_set
        and _NUMBER_TAIL_RE.search(w) is None
        and _ASCII_ALNUM_RE.search(w) is None
    ]

In [8]:
# Load the stop-word file (one entry per line) and drop duplicates.
with open('stopwords/chinese.txt', 'r', encoding='utf-8') as f:
    # rstrip('\n') removes only the line terminator; slicing with [:-1] would
    # also cut the last real character of a final line that has no newline.
    stopwords = list({line.rstrip('\n') for line in f})
len(stopwords)

1472

## Word2Vec
- 全部
- 无停用词
- 无字母数字
- 无停用词和字母数字

In [12]:
import os
import gensim
from collections import Counter

In [10]:
# Number of tokens that survive sent2words() for every text in df.content,
# with a notebook progress bar over the column.
sentcut_len = [
    len(sent2words(text)) for text in tqdm_notebook(df.content)
]

A Jupyter Widget

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZERO\AppData\Local\Temp\jieba.cache
Loading model cost 0.713 seconds.
Prefix dict has been built succesfully.





In [13]:
# Histogram of filtered sentence lengths: {token count: number of sentences}.
Counter(sentcut_len)  # max:79

Counter({1: 22,
         2: 127,
         3: 391,
         4: 668,
         5: 790,
         6: 807,
         7: 722,
         8: 768,
         9: 692,
         10: 625,
         11: 572,
         12: 589,
         13: 532,
         14: 451,
         15: 460,
         16: 435,
         17: 340,
         18: 316,
         19: 311,
         20: 253,
         21: 235,
         22: 204,
         23: 194,
         24: 196,
         25: 169,
         26: 171,
         27: 111,
         28: 96,
         29: 106,
         30: 91,
         31: 86,
         32: 77,
         33: 58,
         34: 58,
         35: 42,
         36: 40,
         37: 23,
         38: 44,
         39: 48,
         40: 35,
         41: 47,
         42: 24,
         43: 29,
         44: 33,
         45: 28,
         46: 7,
         47: 22,
         48: 16,
         49: 20,
         50: 12,
         51: 11,
         52: 16,
         53: 20,
         54: 17,
         55: 6,
         56: 11,
         57: 8,
         58: 9,


In [14]:
class MySentences(object):
    """Re-iterable corpus reader yielding one token list per line of a text file.

    Every ``iter()`` call reopens the file, so the object can be traversed
    more than once.

    Parameters
    ----------
    dirname : str
        Path handed directly to ``open()``; despite the name it must point to
        a file, not a directory.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield ``line.split()`` for each line; blank lines yield ``[]``."""
        # The with-block guarantees the handle is closed once iteration ends
        # (or the generator is closed) instead of leaving it open.
        with open(self.dirname, 'r', encoding='utf-8') as corpus_file:
            for line in corpus_file:
                yield line.split()

In [135]:
# Hyper-parameters shared by every model trained below.
# NOTE(review): the keyword names `size` and `iter` are kept as written; they
# match the gensim release this notebook was run with -- confirm before upgrading.
w2v_params = dict(
    sg=1,
    min_count=5,
    size=64,
    hs=1,
    compute_loss=True,
    window=30,
    workers=8,
    sample=1e-4,
    iter=10,
    seed=1024,
)

# Train one model per corpus file, in the order os.listdir returns them.
models = []
for file in os.listdir('./feature/'):
    corpus_path = './feature/' + (file)
    # Materialise the whole corpus in memory before training.
    sentences = list(MySentences(corpus_path))
    model = gensim.models.Word2Vec(sentences, **w2v_params)
    models.append(model)
    print(file, 'train success')

dict(full).txt train success
dict(noNumLetter).txt train success
dict(noStop).txt train success
dict(noStopnoNumLetter).txt train success


In [136]:
# Create the output directory up front so the save does not fail when
# './w2v' is missing.
os.makedirs('./w2v', exist_ok=True)

# NOTE: models[idx] is paired with the idx-th entry of a fresh directory
# listing, so './feature/' must list the same files in the same order as when
# `models` was built.
for idx, file in enumerate(os.listdir('./feature/')):
    # file[:-4] drops the last four characters of the corpus file name
    # (its '.txt' extension) to form the output name.
    models[idx].wv.save_word2vec_format(
        './w2v/%s.model' % (file[:-4]), binary=False)

In [137]:
# Training loss reported by the fourth trained model.
models[3].get_latest_training_loss()
# Nearest neighbours of '汽车', queried through the model's KeyedVectors
# (`.wv`) like the save call elsewhere in this notebook; the model-level
# most_similar() shortcut is deprecated in gensim.
models[3].wv.most_similar(['汽车'])

16134405.0

[('估值', 0.7698969841003418),
 ('页面', 0.7649967074394226),
 ('之家', 0.7508060932159424),
 ('官网', 0.718483567237854),
 ('家', 0.6834395527839661),
 ('老车', 0.6385050415992737),
 ('报价', 0.6321704387664795),
 ('电话', 0.6303287148475647),
 ('身边', 0.6217993497848511),
 ('咨询', 0.6178069114685059)]

In [128]:
# Embedding of '汽车' from the first trained model, read through `.wv`;
# indexing the model object directly is deprecated in gensim.
models[0].wv['汽车']

array([-0.29320782,  0.17514901, -0.2793455 , -0.727252  ,  0.61787194,
        0.18017343,  0.09768555,  0.5884758 ,  0.5044426 ,  0.62650007,
       -0.08547639, -0.20605849, -0.12815753,  0.18777895,  0.8084431 ,
       -0.5289974 ,  0.03194837, -0.53922343,  0.19737102, -0.346159  ,
       -0.10677625,  0.22649789, -0.09826554, -0.04177133,  0.04161321,
        0.13826741, -0.2572783 , -0.1773619 , -1.1376679 ,  0.15198044,
       -0.3670909 ,  0.08238709, -0.47551388,  0.4484195 ,  0.24288955,
       -0.50507325,  0.25255582,  0.08679068, -0.30211857,  0.36555174,
       -0.55638665, -0.3381527 ,  0.29136539,  0.70238346,  0.82825685,
        0.02208399, -0.26053947, -0.13771176,  0.11087234,  0.14119835,
       -0.4200934 ,  0.7024862 ,  0.18627398,  0.10363261,  0.17139427,
       -0.4963966 ,  0.0422016 ,  0.46446633, -0.5936849 ,  0.49129108,
       -0.01314174,  1.0727158 , -0.32537478, -0.324537  ], dtype=float32)