In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import torchvision      # 数据库模块
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
torch.manual_seed(1)    # reproducible



<torch._C.Generator at 0x7fb205a5e7b0>

# 1.构建embedding矩阵 train和test要合起来

In [2]:
import numpy as np
from tqdm import tqdm

class BOW(object):
    def __init__(self, X, min_count=10, maxlen=100):
        """
        X: [[w1, w2],]]
        """
        self.X = X
        self.min_count = min_count
        self.maxlen = maxlen
        self.__word_count()
        self.__idx()
        self.__doc2num()

    def __word_count(self):
        wc = {}
        for ws in tqdm(self.X, desc='   Word Count'):
            for w in ws:
                if w in wc:
                    wc[w] += 1
                else:
                    wc[w] = 1
        self.word_count = {i: j for i, j in wc.items() if j >= self.min_count}

    def __idx(self):
        self.idx2word = {i + 1: j for i, j in enumerate(self.word_count)}
        self.word2idx = {j: i for i, j in self.idx2word.items()}

    def __doc2num(self):
        doc2num = []
        for text in tqdm(self.X, desc='Doc To Number'):
            s = [self.word2idx.get(i, 0) for i in text[:self.maxlen]]
            doc2num.append(s + [0]*(self.maxlen-len(s)))  # 未登录词全部用0表示
        self.doc2num = np.asarray(doc2num)

In [3]:
def get_ids(qids):
    ids = []
    for t_ in qids:
        ids.append(int(t_[1:]))
    return np.asarray(ids)


def get_texts(file_path, question_path):
    qes = pd.read_csv(question_path)
    file = pd.read_csv(file_path)
    q1id, q2id = file['q1'], file['q2']
    id1s, id2s = get_ids(q1id), get_ids(q2id)
    all_words = qes['words']
    texts = []
    for t_ in zip(id1s, id2s):
        texts.append(all_words[t_[0]] + ' ' + all_words[t_[1]])
    return texts
TRAIN_PATH = 'mojing/train.csv'
TEST_PATH = 'mojing/test.csv'
QUESTION_PATH = 'mojing/question.csv'
train_texts = get_texts(TRAIN_PATH, QUESTION_PATH)
test_texts = get_texts(TEST_PATH, QUESTION_PATH)
a = train_texts + test_texts
a1 = [x.split(' ') for x in a]

In [4]:
%%time
bow = BOW(a1,min_count=1,maxlen=24) # count大于1，句子(q1,q2)相加最大长度为24

   Word Count: 100%|██████████| 427342/427342 [00:01<00:00, 348343.04it/s]
Doc To Number: 100%|██████████| 427342/427342 [00:02<00:00, 155920.04it/s]


CPU times: user 4.75 s, sys: 97.2 ms, total: 4.84 s
Wall time: 4.82 s


## 得到词向量

In [5]:
%%time
word_embed = pd.read_csv('mojing/word_embed.txt',header=None)
word_embed.columns = ['wv']
word_embed_dict = dict()
for s in word_embed.wv.values:
    l = s.split(' ')
    word_embed_dict[l[0]] = list(map(float,l[1:]))
word_embed_dict['UNK'] = [0]*300

CPU times: user 2.43 s, sys: 184 ms, total: 2.62 s
Wall time: 3.44 s


In [6]:
vocab_size = len(word_embed_dict)
embedding_matrix = np.zeros((vocab_size+1,300))

In [8]:
for key, value in bow.word2idx.items():
    embedding_matrix[value] = word_embed_dict.get(key)

In [63]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 2.29952765, -4.29687977,  3.71340919, ...,  0.99011242,
         0.41728863,  3.15365911],
       [-1.52279055,  2.12538552, -0.3590863 , ..., -2.17771411,
         1.37241161, -3.44047666],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
# np.save('save/embedding_matrix.npz',embedding_matrix)
# embedding_matrix = np.load('save/embedding_matrix.npz.npy')

# 2. 导入数据，处理数据

## 2.1 拼接数据 将q1和q2问题的词拼接起来，做cnn

In [9]:
## save
# train[['label','words']].to_csv('mojing1/train_words2.csv',index=False)
# test[['words']].to_csv('mojing1/test_words2.csv',index=False)
# load
train = pd.read_csv('mojing1/train_words2.csv')
test = pd.read_csv('mojing1/test_words2.csv')

In [10]:
def fill_unkQ1(s):
    l1 = s.split(',')[0].split(' ')[:12]
    l1 += ['UNK']*(12-len(l1))
    l1 = [bow.word2idx.get(x) if x in bow.word2idx.keys() else 0 for x in l1]
    return l1
def fill_unkQ2(s):
    l2 = s.split(',')[1].split(' ')[:12]
    l2 += ['UNK']*(12-len(l2))
    l2 = [bow.word2idx.get(x) if x in bow.word2idx.keys() else 0 for x in l2]
    return l2
# train = train.words.apply(lambda x : x.split(',')[0].split(' ')[:12]).tolist()
# test = test.words.apply(lambda x : x.split(',')[0].split(' ')[:12]).tolist()
train_q1 = train.words.apply(fill_unkQ1).tolist()
train_q2 = train.words.apply(fill_unkQ2).tolist()

# 3.构建CNN模型

In [11]:
from BasicModule import BasicModule
import torch as t
import numpy as np
from torch import nn

kernel_sizes =  [1,2,3,4]
kernel_sizes2 = [1,2,3,4]
class MultiCNNTextBNDeep(BasicModule): 
    def __init__(self, opt ):
        super(MultiCNNTextBNDeep, self).__init__()
        self.model_name = 'MultiCNNTextBNDeep'
        self.opt=opt
        self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim)

        title_convs = [ nn.Sequential(
                                nn.Conv1d(in_channels = opt.embedding_dim,
                                        out_channels = opt.title_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(opt.title_dim),
                                nn.ReLU(inplace=True),

                                nn.Conv1d(in_channels = opt.title_dim,
                                out_channels = opt.title_dim,
                                kernel_size = kernel_size),
                                nn.BatchNorm1d(opt.title_dim),
                                nn.ReLU(inplace=True),
                                nn.MaxPool1d(kernel_size = (opt.title_seq_len - kernel_size*2 + 2))
                            )
         for kernel_size in kernel_sizes]

        content_convs = [ nn.Sequential(
                                nn.Conv1d(in_channels = opt.embedding_dim,
                                        out_channels = opt.content_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(opt.content_dim),
                                nn.ReLU(inplace=True),

                                nn.Conv1d(in_channels = opt.content_dim,
                                        out_channels = opt.content_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(opt.content_dim),
                                nn.ReLU(inplace=True),
                                nn.MaxPool1d(kernel_size = (opt.content_seq_len - kernel_size*2 + 2))
                            )
            for kernel_size in kernel_sizes ]

        self.title_convs = nn.ModuleList(title_convs)
        self.content_convs = nn.ModuleList(content_convs)

        self.fc = nn.Sequential(
            nn.Linear(len(kernel_sizes)*(opt.title_dim+opt.content_dim),opt.linear_hidden_size),
            nn.BatchNorm1d(opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(opt.linear_hidden_size,opt.num_classes),
            nn.Sigmoid()
        )
        

        if opt.embedding_path:
            self.encoder.weight.data.copy_(t.from_numpy(np.load(opt.embedding_path)))

    def forward(self, title, content):
        title = self.encoder(title)
        content = self.encoder(content)

        if self.opt.static:
            title.detach()
            content.detach()

        title_out = [ title_conv(title.permute(0, 2, 1)) for title_conv in self.title_convs]
        content_out = [ content_conv(content.permute(0,2,1)) for content_conv in self.content_convs]
        conv_out = t.cat((title_out+content_out),dim=1)
        reshaped = conv_out.view(conv_out.size(0), -1)
        sigmoid = self.fc((reshaped))
        return sigmoid

    # def get_optimizer(self):  
    #    return  t.optim.Adam([
    #             {'params': self.title_conv.parameters()},
    #             {'params': self.content_conv.parameters()},
    #             {'params': self.fc.parameters()},
    #             {'params': self.encoder.parameters(), 'lr': 5e-4}
    #         ], lr=self.opt.lr)
    # # end method forward

# 4.开始跑模型

In [12]:
EPOCH = 3           # 训练整批数据多少次,  我只训练了3次
BATCH_SIZE = 128
LR = 0.002         # 学习率

In [13]:
%%time
if __name__ == '__main__':
    from config import opt
    it = 1
    opt.content_dim = 100
    opt.title_dim = 100
    m = MultiCNNTextBNDeep(opt)
    if t.cuda.is_available():
        m.cuda()
    # 数据处理成tensor
    label_tensor = torch.from_numpy(np.array(train.label).reshape(-1,1)).float()
    title_tensor = t.autograd.Variable(t.from_numpy(np.array(train_q1))).long()
    content_tensor = t.autograd.Variable(t.from_numpy(np.array(train_q2))).long()
    # train test split
    train_idx, test_idx = train_test_split(np.arange(len(train_q1)),test_size=0.2)
    train_label_tensor = label_tensor[train_idx]
    train_title_tensor = title_tensor[train_idx]
    train_content_tensor = content_tensor[train_idx]
    valid_label_tensor = label_tensor[test_idx]
    valid_title_tensor = title_tensor[test_idx]
    valid_content_tensor = content_tensor[test_idx]
    del train
    del train_q1
    del train_q2
    del label_tensor
    del title_tensor
    del content_tensor
    #----------------------
    # train torch dataset
    torch_dataset = Data.TensorDataset(train_title_tensor, train_content_tensor, train_label_tensor)
    train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=4,              # subprocesses for loading data
    )
    # valid torch dataset
    valid_torch_dataset = Data.TensorDataset(valid_title_tensor, valid_content_tensor, valid_label_tensor)
    valid_train_loader = Data.DataLoader(
        dataset=valid_torch_dataset,      # torch TensorDataset format
        batch_size=128,      # 预测的batch size 可以大一点1024
        num_workers=4,              # subprocesses for loading data
    )
    # optimizer, loss_func
    optimizer = torch.optim.Adam(m.parameters(), lr=LR)   # optimize all cnn parameters;Adam比较好用
    # loss_func = nn.CrossEntropyLoss()   # the target label is not one-hotted 适用于多分类
    loss_func = nn.BCELoss() # binary
    loss_func.cuda()
    for epoch in tqdm(range(EPOCH)):
        for step, (title, content, b_y) in enumerate(train_loader):   # 分配 batch data, normalize x when iterate train_loader
            title, content, b_y = title.cuda(), content.cuda(), b_y.cuda()
            output = m(title,content)
            loss = loss_func(output, b_y)
            if it % 200 == 0:
                val_loss_list = []
                # 写一个验证集的迭代器得到valid logloss
                for step, (val_title, val_content, val_b_y) in enumerate(valid_train_loader):   # 分配 batch data, normalize x when iterate train_loader
                    val_title, val_content, val_b_y = val_title.cuda(), val_content.cuda(), val_b_y.cuda()
                    tmp_m = m(val_title, val_content)
                    val_loss_tmp = loss_func(tmp_m, val_b_y) # loss_func可以调整为False，输出的是batch size个logloss而不是平均值
                    del tmp_m
                    val_loss_list.append(val_loss_tmp.cpu().data.numpy().tolist())
                    del val_loss_tmp
                loss_print = loss.cpu().data.numpy().tolist()
                print('train logloss: ', loss_print)
                print('valid logloss: ', np.mean(val_loss_list))
                del loss_print
                del val_loss_list
            optimizer.zero_grad()           # clear gradients for this training step
            loss.backward()                 # backpropagation, compute gradients
            optimizer.step()                # apply gradients
            it += 1
        print('epoch %d is done'%epoch)

  0%|          | 0/3 [00:00<?, ?it/s]

train logloss:  0.43701791763305664
valid logloss:  0.46073699566587134
train logloss:  0.3912964463233948
valid logloss:  0.41645912271947716
train logloss:  0.3772851824760437
valid logloss:  0.3964012126677
train logloss:  0.34570229053497314
valid logloss:  0.37854946378487436
train logloss:  0.4105028808116913
valid logloss:  0.369999797649719
train logloss:  0.3139829933643341
valid logloss:  0.35444292474781447
train logloss:  0.2828434109687805
valid logloss:  0.3387564829920405


 33%|███▎      | 1/3 [01:13<02:26, 73.04s/it]

epoch 0 is done
train logloss:  0.2947517931461334
valid logloss:  0.33039405721066584
train logloss:  0.297991544008255
valid logloss:  0.32518049921072906
train logloss:  0.2922203540802002
valid logloss:  0.31840508015611063
train logloss:  0.267525315284729
valid logloss:  0.31637792525129704
train logloss:  0.3234981894493103
valid logloss:  0.31094805679129595
train logloss:  0.24049006402492523
valid logloss:  0.3059733941327387
train logloss:  0.23675303161144257
valid logloss:  0.29659887421969794
train logloss:  0.2970202565193176
valid logloss:  0.3003741011547683


 67%|██████▋   | 2/3 [02:29<01:14, 74.60s/it]

epoch 1 is done
train logloss:  0.27917879819869995
valid logloss:  0.28867411931705234
train logloss:  0.22653546929359436
valid logloss:  0.29121695237992395
train logloss:  0.2424982786178589
valid logloss:  0.29961863744198974
train logloss:  0.17280668020248413
valid logloss:  0.2884653770743902
train logloss:  0.2548966705799103
valid logloss:  0.28675260619452253
train logloss:  0.3268270790576935
valid logloss:  0.2830558728108454
train logloss:  0.1753063201904297
valid logloss:  0.2805648595843483
train logloss:  0.2063988298177719
valid logloss:  0.27560979521004997


100%|██████████| 3/3 [03:45<00:00, 75.06s/it]

epoch 2 is done
CPU times: user 2min 44s, sys: 1min 2s, total: 3min 47s
Wall time: 3min 52s





In [16]:
# 保存模型参数
torch.save(m.state_dict(), 'save/m1_params.pkl')

# 5.跑测试数据，输出result

In [21]:
test_q1 = test.words.apply(fill_unkQ1).tolist()
test_q2 = test.words.apply(fill_unkQ2).tolist()

In [23]:
test_title_tensor = t.autograd.Variable(t.from_numpy(np.array(test_q1))).long()
test_content_tensor = t.autograd.Variable(t.from_numpy(np.array(test_q2))).long()

In [54]:
def batch_gen(title,content):  # 定义batch数据生成器
    idx = 0
    length = len(title)
    while True:
        if idx+2000>length:
            yield title[idx:idx+2000],content[idx:idx+2000]
            break
        start = idx
        idx += 2000
        yield title[start:start+2000],content[start:start+2000]
gen = batch_gen(test_title_tensor,test_content_tensor)

In [55]:
result = []
for title,content in tqdm(gen):
    title, content = title.cuda(), content.cuda()
    output = m(title,content)
    output = list(np.squeeze(output.cpu().data.numpy().tolist()))
    result += output
    del output

87it [00:06, 13.58it/s]


In [61]:
# 保存result提交
pd.DataFrame(result,columns=['y_pre']).to_csv('save/result.csv',index=False)