# 1. 数据预处理

## 1.1 加载数据

In [2]:
import pandas as pd

data = pd.read_csv('./datasets/todo_classification.csv')

data.head()

Unnamed: 0,label,text
0,1,小张，你明天需要把产品方案输出，然后给小王。小王后天把交互做一下，大后天评审。
1,1,我明天应该可以把这个文档搞定。
2,1,统计异常数据那么简单的事情，怎么可能要3天时间呢！今天下班前，务必把这个事情搞定！
3,1,李总说的这个想法，一定要在8月10日之前形成一个成熟的方案。
4,1,走廊里的空箱子太多了，小张你明天把它们全部处理掉。


In [3]:
data.shape

(130, 2)

In [4]:
print('有意义文本：%d' %data[data.label==1].shape[0])
print('无意义文本：%d' %data[data.label==0].shape[0])
# for name,group in data.groupby(data.columns[0]):
#     print(name,len(group))

有意义文本：65
无意义文本：65


## 1.2 使用jieba库制作分词列表wordlist

In [5]:
import jieba
import time

wordlist = []
startT = time.time()
for row in data['text']:
    # splitrow = jieba.cut(row,True)
    words = [i for i in jieba.cut(row,True) if i not in ['，','。','！','？','、','.']]
    wordlist.append(words)
print('文本分词使用时间：',time.time()-startT)
# print(wordlist[:2])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/vz/l8r_5knx1g3c4yz2yq3mmjnw0000gn/T/jieba.cache
Loading model cost 3.747 seconds.
Prefix dict has been built successfully.
文本分词使用时间： 3.879559278488159


## 1.3 保存分词结果至本地txt文件

In [6]:
txtFilepath = './datasets/todotrain.txt'
with open(txtFilepath,'w',encoding='utf-8') as fp:
    for sentence in wordlist:
        fp.write(' '.join(sentence))
        fp.write('\n')

## 1.4 从本地txt文件加载分词

In [7]:
txtFilepath = './datasets/todotrain.txt'
with open(txtFilepath,'r',encoding='utf-8') as fp:
    wordlist = [k.strip().split(' ') for k in fp.readlines()]

In [8]:
print(wordlist[:2])

[['小张', '你', '明天', '需要', '把', '产品', '方案', '输出', '然后', '给', '小', '王', '小', '王后', '后天', '把', '交互', '做', '一下', '大后天', '后天', '评审'], ['我', '明天', '应该', '可以', '把', '这个', '文档', '搞定']]


# 2. word2vec词向量模型

## 2.1 实例化word2vec对象

In [9]:
from gensim.models import Word2Vec
import time
startTime = time.time()
word2vec_model = Word2Vec(wordlist, size=500, iter=10, min_count=1)
# sentences = wordlist
# sentences：可以是一个list，对于大语料集，建议使用BrownCorpus，Text8Corpus或lineSentence构建
# size：是指特征向量的维度，默认为100。大的size需要更多的训练数据，但是效果会更好，推荐值为几十到几百
# min_count：可以对字典做截断，词频少于min_count次数的单词会被丢弃掉, 默认值为5
usedTime = time.time() - startTime
print('形成word2vec模型共花费%.2f秒' %usedTime)

形成word2vec模型共花费1.68秒


## 2.2 通过word2vec对象的most_similar方法获取词义相近的次

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
word2vec_model.wv.most_similar('周一')

[('的', 0.6049918532371521),
 ('公司', 0.5968180298805237),
 ('一下', 0.5958437919616699),
 ('等', 0.5909460783004761),
 ('为', 0.5864100456237793),
 ('有', 0.5846232771873474),
 ('月', 0.5777654647827148),
 ('我', 0.5730652213096619),
 ('行业', 0.5712067484855652),
 ('和', 0.5682703256607056)]

In [12]:
print(word2vec_model[u'文档'])

[ 5.61756839e-04  1.16959738e-04 -9.87790176e-04  6.63977058e-04
  7.32941320e-04 -4.13161877e-04  3.69930567e-05  4.47949045e-04
  9.05527500e-04 -7.61229428e-04  5.14284766e-04 -5.21588023e-04
  8.61329318e-04  6.65424392e-04  7.55101442e-04  5.96803613e-04
 -4.11115121e-04  2.15845546e-04  7.77378271e-04  1.31619337e-04
 -5.20858099e-04  5.22769871e-04  3.35594668e-04  5.20189642e-04
 -9.67365922e-04 -6.66490116e-04 -8.21034890e-04 -1.46806793e-04
  9.01843887e-05  1.02256867e-03  3.37564648e-04 -4.36384784e-04
 -6.35581382e-04 -5.78270759e-04  1.54905691e-04 -2.07292633e-05
  3.63018538e-04 -8.49527132e-05 -6.13602970e-05  3.19443905e-04
  3.17792961e-04  7.89852988e-04 -3.75509087e-04 -1.20603159e-04
  7.58316251e-04 -2.02676543e-04 -9.97727737e-04  9.22703766e-04
 -2.51987018e-04 -5.21067006e-04  7.33412046e-04 -6.36105775e-04
  9.29108937e-04 -1.17426513e-04  1.84606048e-04 -4.87781363e-04
 -8.43831396e-04 -3.33187985e-04  1.82192161e-04  1.53232744e-04
  4.27756837e-04 -5.49746

## 2.3 word2vec词向量模型的保存与调用

In [13]:
# 保存模型
word2vec_model.save( 'word2vec_model.w2v' )

In [14]:
# 加载模型
word2vec_model = Word2Vec.load( 'word2vec_model.w2v' )

# 3. 语料向量模型

## 3.1 每条语料转化为向量

In [15]:
import numpy as np

def getVector(wordlist, word2vec_model):
    vector_list = [ word2vec_model.wv[k] for k in wordlist if k in word2vec_model]
    wordVector = np.array(vector_list).mean(axis = 0)
    return wordVector

In [16]:
import time
import numpy as np

starttime = time.time()
contentVector = []
for i in range(len(wordlist)):
    word = wordlist[i]
    usedTime = time.time() - starttime
    contentVector.append(getVector(word, word2vec_model))
print('语料向量形成使用时间为：',usedTime)
X = np.array(contentVector)

语料向量形成使用时间为： 0.18552803993225098


## 3.2 使用ndarray对象的dump方法保存文章向量化结果X

In [17]:
X.dump('./datasets/todo_X.txt')

# 4. 模型训练

## 4.1 标签编码转换为矩阵

In [18]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(data['label'])

In [19]:
print(y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [20]:
print(X.shape, y.shape)

(130, 500) (130,)


## 4.2 把X，y拆分为训练集和测试集

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 3)

## 4.3 模型训练与评估

In [22]:
print(y_test)

[0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0]


In [23]:
# 1）Logistic Regression 模型
def LR_Classify(X_train,y_train):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty = 'l2', max_iter = 10000)
    model.fit(X_train,y_train)
    return model

LR_model = LR_Classify(X_train,y_train)
LR_predict = LR_model.predict(X_test)

print('y_test is    ', y_test)
print('LR_predict is', LR_predict)
LR_model.score(X_test,y_test)

from sklearn import metrics
LR_accuracy = metrics.accuracy_score(y_test, LR_predict)
print("Logistic Regression Accurary Is %.5f" %float(LR_accuracy))

y_test is     [0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0]
LR_predict is [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Logistic Regression Accurary Is 0.48485


In [24]:
# 2）Random Forest 模型
def RF_Classifier(x_train, y_train):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=8)
    model.fit(x_train, y_train)
    return model

RF_model = RF_Classifier(X_train,y_train)
RF_predict = RF_model.predict(X_test)
print('y_test is    ', y_test)
print('RF_predict is', RF_predict)
RF_model.score(X_test,y_test)

from sklearn import metrics
RF_accuracy = metrics.accuracy_score(y_test, RF_predict)
print("Random Forest Accurary Is %.5f" %float(RF_accuracy))

y_test is     [0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0]
RF_predict is [0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 0 1 1 1 1 0 0 0]
Random Forest Accurary Is 0.66667


In [25]:
# 3）SVM 模型
def SVM_Classifier(x_train, y_train):
    from sklearn.svm import SVC
    model = SVC(C=10,kernel='rbf')
    model.fit(x_train, y_train)  
    return model

SVM_model = SVM_Classifier(X_train,y_train)
SVM_predict = SVM_model.predict(X_test)
print('y_test is     ', y_test)
print('SVM_predict is', SVM_predict)
SVM_model.score(X_test,y_test)

from sklearn import metrics
SVM_accuracy = metrics.accuracy_score(y_test, SVM_predict)
print("SVM Accurary Is %.5f" %float(SVM_accuracy))

y_test is      [0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0]
SVM_predict is [0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 0 0 1 0 1 1 0]
SVM Accurary Is 0.96970


# 5. 模型测试

## 5.1 输入测试数据

In [26]:
text = [
    '周二就是发布会了，小王，你今天下午就把演示PPT发给我',
    '你们产品评审做到哪一步了，今天下午能不能把方案定稿出来',
    '这外卖这么难吃给个差评不过分吧',
    '王总购买我们的产品尾款还一直没有付，小李你去催一下看看下个月初能不能结清',
    '你们现在效率不太行啊，上班集中一点注意力，今天下午把你们拖一周的产品文案写完发到我这',
    '现在上学也太难了',
    '公司是一家专注于民生领域信息化的高新技术企业，公司的主营业务为医疗卫生、民政等民生领域的软件开发及硬件销售、技术服务业务，主要面向各级医疗卫生行政管理机构、医院、社区卫生服务中心、新农合、民政行政管理机构等领域的客户，提供软件产品及整体解决方案。公司2013年、2014年、2015年1-10月的营业收入分别为33,681,276.13元、48,088,344.70元、40,972,648.75元。其中，主营业务收入占比分别为99.72%、99.63%、99.64%，主营业务突出。自公司成立以来，主营业务未发生重大变化。',
    '我感觉我就是背着电脑换了个地方干和我在产业楼干的一样的事情',
    '老王，你下周三把这个程序问题解决掉，不要再让我问第三次',
    '周日之前必须把发布需要用的样本做完，不然不够发布的出货',
    '别说废话了，再说撕烂你的嘴'
]
# [1 1 0 1 1 0 0 0 1 1 0]

## 5.2 将输入数据转换为X矩阵

In [27]:
def TextToX(text,word2vec_model):
    textwords = []
    for i in range(len(text)):
        sentence = text[i]
        words = [k for k in jieba.cut(sentence) if k not in ['，','。','！','？','、']]
        textwords.append(words)
    textwords_Vec = []
    for i in range(len(textwords)):
        textword = textwords[i]
        textwords_Vec.append(getVector(textword, word2vec_model))
    X_text = np.array(textwords_Vec)
    return X_text

X_text = TextToX(text,word2vec_model)

## 5.3 传入预测模型

In [28]:
RF_predict = RF_model.predict(X_text)
print('y_text is     [1 1 0 1 1 0 0 0 1 1 0]')
print('RF_predict is',RF_predict)

y_text is     [1 1 0 1 1 0 0 0 1 1 0]
RF_predict is [1 1 0 1 1 0 0 0 1 0 0]
