In [1]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision      # dataset / vision utilities
import numpy as np
import pandas as pd
import gensim
import jieba
import jieba.posseg as pseg
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
try:
    # `sklearn.cross_validation` was deprecated and later removed from
    # scikit-learn; `train_test_split` now lives in `sklearn.model_selection`.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for very old scikit-learn installations.
    from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
torch.manual_seed(1)    # seed torch's RNG so runs are reproducible



<torch._C.Generator at 0x599ea70>

## 1. 加载数据，处理数据

In [2]:
# Load the knowledge-base questions (tab-separated, UTF-8).
data = pd.read_csv('data/zhishiku.csv', encoding='utf-8', sep='\t')
# Keep only the columns used downstream. The label column is derived in the
# next section, so it is not computed here (it would be dropped by this
# selection anyway). `.copy()` makes `data` an independent frame so that later
# column assignments do not write into a slice of the raw frame.
data = data[['categ_id', 'standard_question']].copy()

### 1.1 对label进行数值化处理

In [3]:
# Encode each category id as an integer class index in a new `label` column.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['categ_id'])

### 1.2 对X进行数值化处理 用之前训练好的词向量

In [4]:
# Words that are missing from the word-vector vocabulary are replaced by this
# all-zero vector of length 400.
UNK = np.zeros(400)

### 1.3 用jieba切词并将每句固定为6个词：去停用词后词数不超过6个时，不足部分用UNK补齐；超过6个时优先保留名词(n)和动词(v)，若不足6个再用m、t、a词性的词补充，仍不足则用UNK补齐，超过6个则只取前6个

In [5]:
# Register the project's custom dictionary with jieba before any segmentation.
jieba.load_userdict('data/user_dict.txt')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\16121360\AppData\Local\Temp\jieba.cache
Loading model cost 0.973 seconds.
Prefix dict has been built succesfully.


In [6]:
# Load the stop words, one entry per line. The context manager guarantees the
# file handle is closed once the list has been built.
with open('data/stop_words.txt', 'r', encoding='utf-8') as f:
    stop_words_list = [line.strip() for line in f]

In [7]:
def ff(s, max_len=6):
    """Segment a sentence and return exactly ``max_len`` word strings.

    Steps:
      1. Tokenise ``s`` with jieba's part-of-speech tagger and drop stop words.
      2. If at most ``max_len`` tokens remain, keep them all.
      3. Otherwise keep the first ``max_len`` nouns/verbs; if that yields fewer
         than ``max_len`` tokens, top up with numerals/time words/adjectives
         (flags ``m``, ``t``, ``a``), appended after the nouns/verbs.
      4. Right-pad the result with the literal string ``'UNK'``.

    Parameters
    ----------
    s : str
        Sentence to segment.
    max_len : int, default 6
        Fixed number of tokens to return.

    Returns
    -------
    list of str
        A list of length ``max_len``.
    """
    # Stop words are removed once here, so no further filtering is needed below.
    tokens = [x for x in pseg.lcut(s) if x.word not in stop_words_list]
    if len(tokens) <= max_len:
        words = [x.word for x in tokens]
    else:
        noun_verb_flags = ('n', 'nr', 'ns', 'nt', 'nz', 'v', 'vn', 'vd', 'vg')
        picked = [x for x in tokens if x.flag in noun_verb_flags][:max_len]
        missing = max_len - len(picked)
        if missing > 0:
            picked += [x for x in tokens if x.flag in ('m', 't', 'a')][:missing]
        words = [x.word for x in picked]
    return words + ['UNK'] * (max_len - len(words))

In [8]:
def ff2(l):
    """Manually split the token '分期付款' into '分期' and '付款'.

    jieba keeps '分期付款' as a single token, so it is split by hand here.
    Every occurrence is split, and the result is truncated to the length of
    the input so the fixed sentence length is preserved (trailing tokens,
    normally 'UNK' padding, are dropped to make room).

    The input list is not modified; a new list is returned.

    Parameters
    ----------
    l : list of str
        Token list for one sentence.

    Returns
    -------
    list of str
        New token list with the same length as ``l``.
    """
    result = []
    for word in l:
        if word == '分期付款':
            result.extend(['分期', '付款'])
        else:
            result.append(word)
    # Keep the original length: each split adds one token, so drop the overflow.
    return result[:len(l)]

In [9]:
%%time
pg3 = data.standard_question.apply(ff)
pg3 = pg3.apply(ff2)

Wall time: 5.85 s


In [10]:
# Load the previously trained word2vec model from disk.
model=Word2Vec.load('model/skip_dia.model')

In [11]:
# Turn each token list into an array of word vectors; tokens that are not in
# the word2vec vocabulary fall back to the all-zero UNK vector.
# Membership is tested with `s in model.wv`, which works on both old and new
# gensim releases (`model.wv.vocab` no longer exists in gensim 4).
data['X'] = pg3.apply(lambda x: np.array([model.wv[s] if s in model.wv else UNK for s in x]))

In [14]:
# 保存dataframe，用csv保存方法发现有点问题
# from sklearn.externals import joblib #jbolib模块
# joblib.dump(data[['label','X']], 'data/all_numerical_data.pkl')

#### train_test_split

In [12]:
def train_test_sep(X, test_size = 0.3, stratify = None, random_state = 1001):
    """Split ``X`` into a train part and a test part.

    Thin wrapper around ``train_test_split`` that fixes the notebook's default
    test fraction and random seed.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    parts = train_test_split(
        X,
        test_size=test_size,
        stratify=stratify,
        random_state=random_state,
    )
    return parts[0], parts[1]

# Only the numeric label and the vectorised sentence are needed from here on.
labelled = data[['label','X']]
train1, test = train_test_sep(labelled)

### 1.4 统计一下训练数据每个类别的count

In [13]:
# Number of training rows per class label.
train1.groupby('label').count()

Unnamed: 0_level_0,X
label,Unnamed: 1_level_1
0,1258
1,890
2,117
3,66
4,288
5,92
6,574
7,461
8,198
9,73


### 1.5 对类别count少的进行上采样 train1

In [17]:
# Planned upsampling multipliers per class label. This bare string is only
# displayed as the cell output; it is not assigned to any name.
'''
类别2*2倍
类别3*4倍
类别5*2倍
类别8*1倍
类别9*3倍
类别10*1倍
'''

'\n类别2*2倍\n类别3*4倍\n类别5*2倍\n类别8*1倍\n类别9*3倍\n类别10*1倍\n'

In [14]:
# Optional upsampling of the rare classes.
# Previously the upsampled frame was always built and then immediately
# overwritten by `train = train1`; the choice is now an explicit flag, so the
# unused concat is skipped. The default (False) keeps the existing behaviour:
# train on the un-resampled split.
UPSAMPLE = False
# (label, number of extra copies appended after the original rows)
EXTRA_COPIES = [(2, 2), (3, 4), (5, 2), (8, 1), (9, 3), (10, 1)]

if UPSAMPLE:
    extra_frames = [
        train1[train1.label == label]
        for label, copies in EXTRA_COPIES
        for _ in range(copies)
    ]
    train = pd.concat([train1] + extra_frames, axis=0)
else:
    train = train1

In [15]:
# Shuffle the training rows; a random row order tends to help training.
# A fixed random_state keeps the order reproducible across kernel restarts
# (only torch's RNG is seeded at the top of the notebook).
train = shuffle(train, random_state=1)

In [16]:
# Inspect the structure of the training frame (first 10 rows).
train.head(10)

Unnamed: 0,label,X
3859,6,"[[-0.18964225053787231, 0.13085795938968658, 0..."
3224,6,"[[-0.4948671758174896, -0.07261338084936142, -..."
4303,8,"[[-0.12191753089427948, 0.32593342661857605, 0..."
51,4,"[[-0.33129289746284485, 0.2676745057106018, -0..."
1429,0,"[[-0.155685693025589, -0.08552972972393036, 0...."
622,0,"[[-0.04397333785891533, -0.028153445571660995,..."
4373,8,"[[-0.04569557, 0.10023516, 0.09676585, 0.02597..."
4615,9,"[[-0.24380038678646088, 0.39492788910865784, 0..."
5318,7,"[[-0.05486312508583069, -0.05952757969498634, ..."
3391,6,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


## 2. 跑CNN神经网络 Conv2D

### 2.1 数据处理成tensor，dataset，dataloader形式

In [17]:
EPOCH = 3           # number of passes over the whole training set; kept at three to save time
BATCH_SIZE = 64
LR = 0.003          # learning rate

In [18]:
# Conv2d expects a channel axis, so `unsqueeze` inserts one of size 1:
# a sentence of shape 6 x 400 becomes 1 x 6 x 400 (an RGB image would have 3).
# Labels are cast to int64 explicitly because CrossEntropyLoss requires long
# targets; this matches the Conv1D section of the notebook.
print('conv2d training data')
trainX_tensor = torch.from_numpy(np.array(list(train.X)))
print(trainX_tensor.shape)
trainX = trainX_tensor.unsqueeze(1).float()
print(trainX.shape)
trainy_tensor = torch.from_numpy(np.array(train.label)).long()
print(trainy_tensor.shape)
print('\n')
#--------------------------------
print('conv2d testing data')
testX_tensor = torch.from_numpy(np.array(list(test.X)))
print(testX_tensor.shape)
testX = testX_tensor.unsqueeze(1).float()
print(testX.shape)
testy_tensor = torch.from_numpy(np.array(test.label)).long()
print(testy_tensor.shape)

conv2d training data
torch.Size([4202, 6, 400])
torch.Size([4202, 1, 6, 400])
torch.Size([4202])


conv2d testing data
torch.Size([1801, 6, 400])
torch.Size([1801, 1, 6, 400])
torch.Size([1801])


In [19]:
# Wrap the tensors in a dataset and feed it to a DataLoader so that training
# runs on shuffled mini-batches.
torch_dataset = Data.TensorDataset(trainX, trainy_tensor)
loader_options = dict(
    batch_size=BATCH_SIZE,      # mini-batch size
    shuffle=True,               # reshuffle the data every epoch
    num_workers=2,              # worker subprocesses used for loading
)
train_loader = Data.DataLoader(torch_dataset, **loader_options)

### 2.2 构建cnn模型

In [20]:
class CNN(nn.Module):
    """Two-stage 2-D convolutional classifier for 6 x 400 sentence matrices.

    Input:  ``(batch, 1, 6, 400)`` float tensor (one channel, 6 words,
            400-dimensional word vectors).
    Output: ``(batch, num_classes)`` unnormalised class scores (logits).

    Parameters
    ----------
    num_classes : int, default 11
        Size of the output layer. The default matches the 11 categories used
        in this notebook.
    """

    def __init__(self, num_classes=11):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(  # input shape (1, 6, 400)
            nn.Conv2d(
                in_channels=1,      # number of input channels
                out_channels=16,    # number of filters / output channels
                kernel_size=3,      # 3 x 3 filter
                stride=1,           # filter step
                padding=1,          # padding=(kernel_size-1)/2 keeps height/width when stride=1
            ),      # output shape (16, 6, 400)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),    # 2 x 2 down-sampling, output shape (16, 3, 200)
        )
        self.conv2 = nn.Sequential(  # input shape (16, 3, 200)
            nn.Conv2d(16, 32, 3, 1, 1),  # output shape (32, 3, 200)
            nn.ReLU(),
            nn.MaxPool2d(2),  # output shape (32, 1, 100)
        )
        # Fully connected layer mapping the flattened features to class scores.
        self.out = nn.Linear(32 * 1 * 100, num_classes)

    def forward(self, x):
        """Return class logits for a batch ``x`` of shape (batch, 1, 6, 400)."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)   # flatten to (batch_size, 32 * 1 * 100)
        output = self.out(x)
        return output

cnn = CNN()
print(cnn)  # net architecture

CNN(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (out): Linear(in_features=3200, out_features=11, bias=True)
)


### 2.3 训练

In [21]:
# Cross-entropy takes integer class indices as targets (no one-hot encoding).
loss_func = nn.CrossEntropyLoss()
# Adam updates every parameter of the network.
optimizer = torch.optim.Adam(params=cnn.parameters(), lr=LR)

In [22]:
%%time
# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader):   # 分配 batch data, normalize x when iterate train_loader
        output = cnn(b_x)               # cnn output
        loss = loss_func(output, b_y)   # cross entropy loss
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients

Wall time: 19.4 s


In [23]:
# Inference only: switch to eval mode and disable autograd so no computation
# graph is built for the whole test set.
cnn.eval()
with torch.no_grad():
    test_output = cnn(testX)
# Predicted class = index of the largest score in each row.
pred_y = torch.max(test_output, 1)[1].numpy().squeeze()
# Accuracy on the held-out test set.
accuracy_score(testy_tensor.numpy(), pred_y)

## 3. 保存模型

#### 方法一

In [84]:
# Save: pickles the whole model object.
torch.save(cnn, 'cnn1.pkl')

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
# Load the whole model object back from disk.
# NOTE(review): torch.load unpickles the file, so only load files from a
# trusted source.
cnn = torch.load('cnn1.pkl')

#### 方法二（推荐）

In [85]:
# Save only the model parameters (state dict).
torch.save(cnn.state_dict(), 'cnn1_params.pkl')

In [48]:
# Load: re-declare the same architecture that was saved, then restore its
# parameters from the state dict.
class CNN(nn.Module):
    """Conv2d classifier; the layer layout must match the saved state dict."""
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(  # input shape (1, 6, 400)
            nn.Conv2d(
                in_channels=1,      # number of input channels
                out_channels=16,    # n_filters
                kernel_size=3,      # filter size
                stride=1,           # filter movement/step
                padding=1,      # padding=(kernel_size-1)/2 keeps height/width unchanged when stride=1
            ),      # output shape (16, 6, 400)
            nn.ReLU(),    # activation
            nn.MaxPool2d(kernel_size=2),    # down-sample over 2x2 windows, output shape (16, 3, 200)
        )
        self.conv2 = nn.Sequential(  # input shape (16, 3, 200)
            nn.Conv2d(16, 32, 3, 1, 1),  # output shape (32, 3, 200)
            nn.ReLU(),  # activation
            nn.MaxPool2d(2),  # output shape (32, 1, 100)
        )
        self.out = nn.Linear(32 * 1 * 100, 11)   # fully connected layer, output 11 classes

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)   # flatten the feature maps to (batch_size, 32 * 1 * 100)
        output = self.out(x)
        return output

cnn = CNN()
# With the same structure as at save time defined, load the model parameters.
cnn.load_state_dict(torch.load('cnn1_params.pkl'))

In [None]:
#############

## 2(2). 用 Conv1D 跑 CNN 模型

In [18]:
# Conv1d needs no extra channel axis (there is no height dimension), so the
# (N, 6, 400) array is used as-is. Inputs must be float32 and labels int64.
print('conv1d training data')
trainX = torch.from_numpy(np.array(list(train.X))).float()
print(trainX.shape)
trainy_tensor = torch.from_numpy(np.array(train.label)).long()
print(trainy_tensor.shape) 
print('\n')
#--------------------------------
print('conv1d testing data')
testX = torch.from_numpy(np.array(list(test.X))).float()
print(testX.shape)
testy_tensor = torch.from_numpy(np.array(test.label)).long()
print(testy_tensor.shape)

# Wrap the tensors in a dataset and a DataLoader for mini-batch training.
torch_dataset = Data.TensorDataset(trainX, trainy_tensor)
train_loader = Data.DataLoader(
    torch_dataset,
    batch_size=BATCH_SIZE,      # mini-batch size
    shuffle=True,               # reshuffle the data every epoch
    num_workers=2,              # worker subprocesses used for loading
)

conv1d training data
torch.Size([4202, 6, 400])
torch.Size([4202])


conv1d testing data
torch.Size([1801, 6, 400])
torch.Size([1801])


#### 构建cnn模型 conv1d

In [19]:
class CNN(nn.Module):
    """Two-stage 1-D convolutional classifier.

    Input:  ``(batch, 6, 400)`` -- the 6 word positions are treated as the
            input channels and the convolution slides along the 400 vector
            components.
    Output: ``(batch, 11)`` unnormalised class scores.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: (6, 400) -> conv -> (16, 400) -> pool -> (16, 200)
        first_conv = nn.Conv1d(6, 16, kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Sequential(first_conv, nn.ReLU(), nn.MaxPool1d(2))
        # Stage 2: (16, 200) -> conv -> (32, 200) -> pool -> (32, 100)
        second_conv = nn.Conv1d(
            in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1
        )
        self.conv2 = nn.Sequential(second_conv, nn.ReLU(), nn.MaxPool1d(kernel_size=2))
        # Classifier head over the flattened 32 * 100 features, 11 classes.
        self.out = nn.Linear(in_features=32 * 100, out_features=11)

    def forward(self, x):
        """Return class scores for a batch ``x`` of shape (batch, 6, 400)."""
        features = self.conv2(self.conv1(x))
        flat = features.view(features.size(0), -1)   # (batch_size, 32 * 100)
        return self.out(flat)

cnn = CNN()
print(cnn)  # net architecture

CNN(
  (conv1): Sequential(
    (0): Conv1d(6, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (out): Linear(in_features=3200, out_features=11, bias=True)
)


In [20]:
# Adam over all network parameters; cross-entropy on integer class targets.
optimizer = torch.optim.Adam(params=cnn.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()
# Mini-batch training: EPOCH passes over the loader.
for _ in range(EPOCH):
    for batch_x, batch_y in train_loader:
        logits = cnn(batch_x)                   # forward pass
        batch_loss = loss_func(logits, batch_y) # cross-entropy loss
        optimizer.zero_grad()                   # reset gradients from the previous step
        batch_loss.backward()                   # back-propagate
        optimizer.step()                        # update the weights

In [21]:
# Inference only: switch to eval mode and disable autograd so no computation
# graph is built for the whole test set.
cnn.eval()
with torch.no_grad():
    test_output = cnn(testX)
# Predicted class = index of the largest score in each row.
pred_y = torch.max(test_output, 1)[1].numpy().squeeze()
# Accuracy on the held-out test set.
accuracy_score(testy_tensor.numpy(), pred_y)

0.9722376457523598

In [None]:
##########