In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import torchvision      # 数据库模块
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
# from gensim.models import Word2Vec
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
torch.manual_seed(1)    # reproducible



<torch._C.Generator at 0x7fa108ade750>

# 1.构建embedding矩阵 train和test要合起来

In [2]:
import numpy as np
from tqdm import tqdm

class BOW(object):
    """Vocabulary builder and fixed-length integer encoder for tokenised text.

    Constructing an instance immediately:
      1. counts every token in ``X`` and keeps those seen at least
         ``min_count`` times (``word_count``),
      2. numbers the kept tokens from 1 in first-seen order
         (``idx2word`` / ``word2idx``),
      3. encodes each document as a row of ``maxlen`` ids (``doc2num``),
         using 0 both for tokens outside the vocabulary and for padding.
    """

    def __init__(self, X, min_count=10, maxlen=100):
        """
        X: tokenised documents, e.g. [[w1, w2], ...]. It is iterated twice
           (once to count, once to encode).
        min_count: smallest frequency a token needs to enter the vocabulary.
        maxlen: number of ids per encoded document (longer ones are cut,
                shorter ones are right-padded with 0).
        """
        self.X = X
        self.min_count = min_count
        self.maxlen = maxlen
        self.__word_count()
        self.__idx()
        self.__doc2num()

    def __word_count(self):
        """Tally token frequencies and keep the ones reaching ``min_count``."""
        tally = {}
        for tokens in tqdm(self.X, desc='   Word Count'):
            for token in tokens:
                tally[token] = tally.get(token, 0) + 1
        kept = {}
        for token, freq in tally.items():
            if freq >= self.min_count:
                kept[token] = freq
        self.word_count = kept

    def __idx(self):
        """Build the id -> token and token -> id tables; ids start at 1."""
        self.idx2word = dict(enumerate(self.word_count, start=1))
        self.word2idx = {token: idx for idx, token in self.idx2word.items()}

    def __doc2num(self):
        """Encode every document as exactly ``maxlen`` ids."""
        rows = []
        for tokens in tqdm(self.X, desc='Doc To Number'):
            ids = [self.word2idx.get(token, 0) for token in tokens[:self.maxlen]]
            # Out-of-vocabulary tokens and padding positions are both 0.
            ids.extend([0] * (self.maxlen - len(ids)))
            rows.append(ids)
        self.doc2num = np.asarray(rows)

In [3]:
def get_ids(qids):
    """Return a numpy array holding, for each id string, the integer that
    follows its first character (e.g. 'Q12' -> 12)."""
    return np.asarray([int(qid[1:]) for qid in qids])


def get_texts(file_path, question_path):
    """Build one "q1-words q2-words" string per row of the pair file.

    file_path: CSV with columns 'q1' and 'q2' holding question ids.
    question_path: CSV with a 'words' column.

    The numeric part of each id (see get_ids) is used as the index label into
    the 'words' column.
    NOTE(review): this presumes the question table's default row labels line
    up with the numeric ids - confirm against the data.

    Returns a list of strings, one per pair, joined by a single space.
    """
    questions = pd.read_csv(question_path)
    pairs = pd.read_csv(file_path)
    first_ids = get_ids(pairs['q1'])
    second_ids = get_ids(pairs['q2'])
    words = questions['words']
    return [words[i] + ' ' + words[j] for i, j in zip(first_ids, second_ids)]
# Input files: labelled pairs, unlabelled pairs, and the question table.
TRAIN_PATH = 'mojing/train.csv'
TEST_PATH = 'mojing/test.csv'
QUESTION_PATH = 'mojing/question.csv'

# One "q1 q2" string per pair for each split.
train_texts = get_texts(file_path=TRAIN_PATH, question_path=QUESTION_PATH)
test_texts = get_texts(file_path=TEST_PATH, question_path=QUESTION_PATH)

# Pool train and test so the vocabulary covers both, then tokenise on spaces.
a = train_texts + test_texts
a1 = [text.split(' ') for text in a]

In [4]:
%%time
bow = BOW(a1,min_count=1,maxlen=24) # min_count=1 keeps every word (count >= 1); each combined (q1, q2) document is cut or zero-padded to 24 ids

   Word Count: 100%|██████████| 427342/427342 [00:01<00:00, 237201.38it/s]
Doc To Number: 100%|██████████| 427342/427342 [00:02<00:00, 151976.34it/s]


CPU times: user 5.37 s, sys: 130 ms, total: 5.5 s
Wall time: 5.48 s


In [5]:
%%time
# Read the pre-trained embedding file as a single text column, one entry per line.
word_embed = pd.read_csv('mojing/word_embed.txt',header=None)
word_embed.columns = ['wv']

# Each line is "<token> <float> <float> ..."; map token -> list of floats.
word_embed_dict = {}
for row in word_embed['wv'].values:
    token, *numbers = row.split(' ')
    word_embed_dict[token] = [float(number) for number in numbers]

# All-zero 300-dimensional vector stored under the 'UNK' key.
word_embed_dict['UNK'] = [0]*300

CPU times: user 2.47 s, sys: 1.69 s, total: 4.16 s
Wall time: 5.89 s


In [6]:
# Allocate the embedding table, initialised to zero: one row per entry of
# word_embed_dict (which includes 'UNK') plus one extra row, 300 columns.
# NOTE(review): rows are later addressed by bow.word2idx ids, so this assumes
# the BOW vocabulary is no larger than the embedding vocabulary - TODO confirm.
vocab_size = len(word_embed_dict)
embedding_matrix = np.zeros((vocab_size+1,300))

In [7]:
# Copy each vocabulary word's pre-trained vector into the row given by its id.
# A word with no pre-trained vector falls back to the all-zero 'UNK' vector;
# a bare .get(key) would return None for it, which NumPy does not store as a
# usable vector (it is coerced to NaN in a float array).
unk_vector = word_embed_dict['UNK']
for key, value in bow.word2idx.items():
    embedding_matrix[value] = word_embed_dict.get(key, unk_vector)

In [8]:
# Show the filled matrix; row 0 stays all-zero because vocabulary ids start at 1.
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 2.29952765, -4.29687977,  3.71340919, ...,  0.99011242,
         0.41728863,  3.15365911],
       [-1.52279055,  2.12538552, -0.3590863 , ..., -2.17771411,
         1.37241161, -3.44047666],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

# 2. 导入数据，处理数据

## 2.1 拼接数据 将q1和q2问题的词处理，到时候分别输入lstm

In [9]:
## Save step, kept for reference: it wrote the two CSV files that are loaded below.
# train[['label','words']].to_csv('mojing1/train_words2.csv',index=False)
# test[['words']].to_csv('mojing1/test_words2.csv',index=False)
# Load the saved tables; the code below reads train.label, train.words and test.words.
train = pd.read_csv('mojing1/train_words2.csv')
test = pd.read_csv('mojing1/test_words2.csv')

In [10]:
def _question_ids(s, part, maxlen=12):
    """Encode one comma-separated part of `s` as exactly `maxlen` vocabulary ids.

    s: string of the form "<q1 words>,<q2 words>", words separated by spaces.
    part: 0 for the text before the first comma, 1 for the text after it.
    maxlen: number of ids returned; longer questions are cut, shorter ones are
            padded with the 'UNK' token before the lookup.

    Tokens missing from bow.word2idx (normally including 'UNK') map to 0.
    Raises IndexError when `s` has no part with that index.
    """
    tokens = s.split(',')[part].split(' ')[:maxlen]
    tokens += ['UNK'] * (maxlen - len(tokens))
    # dict.get with a default replaces the former "in keys() then get()" double lookup.
    return [bow.word2idx.get(token, 0) for token in tokens]


def fill_unkQ1(s):
    """Return the 12 vocabulary ids of the first question in `s`."""
    return _question_ids(s, 0)


def fill_unkQ2(s):
    """Return the 12 vocabulary ids of the second question in `s`."""
    return _question_ids(s, 1)
# Earlier variant, kept for reference: it kept only the first 12 raw tokens of
# the first question, without padding or id lookup.
# train = train.words.apply(lambda x : x.split(',')[0].split(' ')[:12]).tolist()
# test = test.words.apply(lambda x : x.split(',')[0].split(' ')[:12]).tolist()
# Encode each training row into two lists of 12 ids (first / second question).
train_q1 = train.words.apply(fill_unkQ1).tolist()
train_q2 = train.words.apply(fill_unkQ2).tolist()

# 3.构建LSTM模型

In [11]:
from BasicModule import BasicModule
import torch as t
import numpy as np
from torch import nn


def kmax_pooling(x, dim, k):
    """Keep the k largest entries of `x` along `dim`, in their original order.

    The positions of the top-k values are sorted ascending before gathering,
    so the result preserves the order in which the values appear in `x`.
    """
    _, top_positions = x.topk(k, dim=dim)
    ordered_positions, _ = top_positions.sort(dim=dim)
    return x.gather(dim, ordered_positions)

class LSTMText(BasicModule):
    """Two-branch bidirectional-LSTM classifier over a pair of id sequences.

    Both inputs share one embedding table; each goes through its own LSTM,
    the LSTM outputs are k-max pooled over the sequence axis, concatenated,
    flattened and passed through a small fully connected head ending in a
    sigmoid.

    `opt` must expose: kernel_size, vocab_size, embedding_dim, hidden_size,
    num_layers, kmax_pooling, linear_hidden_size, num_classes,
    embedding_path and static.
    """

    def __init__(self, opt):
        super().__init__()
        self.model_name = 'LSTMText'
        self.opt = opt

        kernel_size = opt.kernel_size  # read from opt but not used by this model

        # One embedding table shared by both inputs.
        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)

        # Both branches use the same LSTM configuration (sequence-first input).
        lstm_settings = dict(
            input_size=opt.embedding_dim,
            hidden_size=opt.hidden_size,
            num_layers=opt.num_layers,
            bias=True,
            batch_first=False,
            dropout=0.5,
            bidirectional=True,
        )
        self.title_lstm = nn.LSTM(**lstm_settings)
        self.content_lstm = nn.LSTM(**lstm_settings)

        # Head input width: k pooled steps x (2 directions x hidden) x 2 branches.
        fc_in_features = opt.kmax_pooling * (opt.hidden_size * 2 * 2)
        head_layers = [
            nn.Linear(fc_in_features, opt.linear_hidden_size),
            nn.Dropout(0.2),
            nn.BatchNorm1d(opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(opt.linear_hidden_size, opt.num_classes),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*head_layers)

        # Optionally initialise the embedding table from a saved numpy array.
        if opt.embedding_path:
            pretrained = t.from_numpy(np.load(opt.embedding_path))
            self.encoder.weight.data.copy_(pretrained)

    def forward(self, title, content):
        """Return the sigmoid output of the head for a batch of id sequences.

        title, content: integer id tensors; the permutes below treat them as
        (batch, seq) - TODO confirm against the caller.
        """
        title_emb = self.encoder(title)
        content_emb = self.encoder(content)
        if self.opt.static:
            # Cut the embeddings out of the autograd graph (frozen embeddings).
            title_emb = title_emb.detach()
            content_emb = content_emb.detach()

        k = self.opt.kmax_pooling

        # Put the sequence axis first for the LSTM, then move the sequence
        # axis last so the pooling acts on dim 2.
        title_states, _ = self.title_lstm(title_emb.permute(1, 0, 2))
        title_feats = kmax_pooling(title_states.permute(1, 2, 0), 2, k)

        content_states, _ = self.content_lstm(content_emb.permute(1, 0, 2))
        content_feats = kmax_pooling(content_states.permute(1, 2, 0), 2, k)

        merged = t.cat((title_feats, content_feats), dim=1)
        flat = merged.view(merged.size(0), -1)
        return self.fc(flat)

    # Unused optimizer sketch kept from the original. It refers to title_conv /
    # content_conv, which this class does not define.
    # def get_optimizer(self):
    #    return  t.optim.Adam([
    #             {'params': self.title_conv.parameters()},
    #             {'params': self.content_conv.parameters()},
    #             {'params': self.fc.parameters()},
    #             {'params': self.encoder.parameters(), 'lr': 5e-4}
    #         ], lr=self.opt.lr)

# 4.开始跑模型

In [12]:
EPOCH = 5           # number of full passes over the training data (5 were used for this run)
BATCH_SIZE = 128
LR = 0.002         # learning rate

In [13]:
%%time
def validation_logloss(model, loader, loss_func, device):
    """Return the mean loss of `model` over every sample served by `loader`.

    model: module called as model(title, content).
    loader: iterable of (title, content, label) tensor batches.
    loss_func: loss returning the mean over a batch (the default behaviour of
               nn.BCELoss); each batch loss is weighted by its row count so a
               smaller final batch does not skew the average.
    device: device the batches are moved to before the forward pass.

    The model runs in eval mode with gradients disabled, and its previous
    train/eval mode is restored before returning. Returns float('nan') when
    the loader yields no samples.
    """
    was_training = model.training
    model.eval()  # disable dropout / use running batch-norm statistics
    total_loss, total_rows = 0.0, 0
    with torch.no_grad():  # no autograd graph is needed for validation
        for val_title, val_content, val_b_y in loader:
            val_title = val_title.to(device)
            val_content = val_content.to(device)
            val_b_y = val_b_y.to(device)
            batch_loss = loss_func(model(val_title, val_content), val_b_y)
            rows = val_b_y.size(0)
            total_loss += batch_loss.item() * rows
            total_rows += rows
    model.train(was_training)
    return total_loss / total_rows if total_rows else float('nan')


if __name__ == '__main__':
    from config import opt
    it = 1  # global step counter; a report is printed every 200 steps
    opt.content_dim = 100
    opt.title_dim = 100

    # Use the GPU when one is present, otherwise fall back to the CPU; every
    # module and tensor below is moved with .to(device) so both cases work.
    device = t.device('cuda' if t.cuda.is_available() else 'cpu')
    m = LSTMText(opt).to(device)

    # Turn the labels and the two encoded questions into tensors.
    label_tensor = torch.from_numpy(np.array(train.label).reshape(-1,1)).float()
    title_tensor = t.from_numpy(np.array(train_q1)).long()
    content_tensor = t.from_numpy(np.array(train_q2)).long()

    # Hold out 5% of the rows for validation; the fixed random_state makes the
    # split repeatable (torch.manual_seed does not seed scikit-learn).
    train_idx, test_idx = train_test_split(
        np.arange(len(train_q1)), test_size=0.05, random_state=1)
    train_label_tensor = label_tensor[train_idx]
    train_title_tensor = title_tensor[train_idx]
    train_content_tensor = content_tensor[train_idx]
    valid_label_tensor = label_tensor[test_idx]
    valid_title_tensor = title_tensor[test_idx]
    valid_content_tensor = content_tensor[test_idx]
    # Drop the full-size tensors created above. train, train_q1 and train_q2
    # are deliberately left defined so that this cell can be run again.
    del label_tensor
    del title_tensor
    del content_tensor
    #----------------------
    # train torch dataset
    torch_dataset = Data.TensorDataset(train_title_tensor, train_content_tensor, train_label_tensor)
    train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=4,              # subprocesses for loading data
    )
    # valid torch dataset
    valid_torch_dataset = Data.TensorDataset(valid_title_tensor, valid_content_tensor, valid_label_tensor)
    valid_train_loader = Data.DataLoader(
        dataset=valid_torch_dataset,      # torch TensorDataset format
        batch_size=128,      # the prediction batch size could be larger, e.g. 1024
        num_workers=4,              # subprocesses for loading data
    )
    # optimizer, loss_func
    optimizer = torch.optim.Adam(m.parameters(), lr=LR)   # optimise all model parameters
    # loss_func = nn.CrossEntropyLoss()   # for multi-class targets that are not one-hot
    loss_func = nn.BCELoss().to(device)  # binary target
    for epoch in tqdm(range(EPOCH)):
        for title, content, b_y in train_loader:
            title, content, b_y = title.to(device), content.to(device), b_y.to(device)
            output = m(title,content)
            loss = loss_func(output, b_y)
            if it % 200 == 0:
                # Report the current batch loss and the full validation loss.
                print('train logloss: ', loss.item())
                print('valid logloss: ', validation_logloss(m, valid_train_loader, loss_func, device))
            optimizer.zero_grad()           # clear gradients for this training step
            loss.backward()                 # backpropagation, compute gradients
            optimizer.step()                # apply gradients
            it += 1
        print('epoch %d is done'%epoch)

  0%|          | 0/5 [00:00<?, ?it/s]

train logloss:  0.5203981399536133
valid logloss:  0.4832713657617569
train logloss:  0.4543154537677765
valid logloss:  0.46938819229602813
train logloss:  0.44766563177108765
valid logloss:  0.41660228043794634
train logloss:  0.41195303201675415
valid logloss:  0.40250764340162276
train logloss:  0.37728428840637207
valid logloss:  0.4089146041870117
train logloss:  0.41779762506484985
valid logloss:  0.3801661291718483
train logloss:  0.3788602948188782
valid logloss:  0.38513225942850116
train logloss:  0.34949246048927307
valid logloss:  0.3846263459324837
train logloss:  0.2873624265193939
valid logloss:  0.36341639041900636


 20%|██        | 1/5 [01:49<07:19, 109.79s/it]

epoch 0 is done
train logloss:  0.3050110340118408
valid logloss:  0.3632037815451622
train logloss:  0.39709538221359253
valid logloss:  0.36454082041978836
train logloss:  0.340201735496521
valid logloss:  0.3601865091919899
train logloss:  0.25215864181518555
valid logloss:  0.3529305890202522
train logloss:  0.30501216650009155
valid logloss:  0.3462480956315994
train logloss:  0.31545740365982056
valid logloss:  0.34580546751618385
train logloss:  0.357025146484375
valid logloss:  0.35067665100097656
train logloss:  0.2753453552722931
valid logloss:  0.33336266726255415
train logloss:  0.28542983531951904
valid logloss:  0.32995369225740434


 40%|████      | 2/5 [03:39<05:29, 109.78s/it]

epoch 1 is done
train logloss:  0.3565730154514313
valid logloss:  0.3302577315270901
train logloss:  0.31011927127838135
valid logloss:  0.3339629453420639
train logloss:  0.2464858889579773
valid logloss:  0.3411118359863758
train logloss:  0.3291729688644409
valid logloss:  0.33318966209888456
train logloss:  0.3360821604728699
valid logloss:  0.3218403685092926
train logloss:  0.33644187450408936
valid logloss:  0.32533564627170564
train logloss:  0.2882845103740692
valid logloss:  0.32076655730605125
train logloss:  0.3483291566371918
valid logloss:  0.3246272438764572
train logloss:  0.237955242395401
valid logloss:  0.31924026027321817
train logloss:  0.3384959101676941
valid logloss:  0.31946116253733636


 60%|██████    | 3/5 [05:31<03:40, 110.44s/it]

epoch 2 is done
train logloss:  0.2170129120349884
valid logloss:  0.31919210359454153
train logloss:  0.23013758659362793
valid logloss:  0.3091035702824593
train logloss:  0.32513362169265747
valid logloss:  0.3210012054443359
train logloss:  0.3542996644973755
valid logloss:  0.30835054486989977
train logloss:  0.20002661645412445
valid logloss:  0.30600378423929214
train logloss:  0.29175055027008057
valid logloss:  0.30958916142582893
train logloss:  0.2737938165664673
valid logloss:  0.304500747770071
train logloss:  0.2920929491519928
valid logloss:  0.2969368878006935
train logloss:  0.20922386646270752
valid logloss:  0.2959528116881847


 80%|████████  | 4/5 [07:21<01:50, 110.41s/it]

epoch 3 is done
train logloss:  0.2762778401374817
valid logloss:  0.29851986959576604
train logloss:  0.239698588848114
valid logloss:  0.29731502816081046
train logloss:  0.26651179790496826
valid logloss:  0.30353258177638054
train logloss:  0.23750382661819458
valid logloss:  0.30335391268134115
train logloss:  0.24961994588375092
valid logloss:  0.30159954965114594
train logloss:  0.26876991987228394
valid logloss:  0.2965726514160633
train logloss:  0.3215482234954834
valid logloss:  0.290412195622921
train logloss:  0.2949153482913971
valid logloss:  0.2922604976594448
train logloss:  0.2729884386062622
valid logloss:  0.2881654557585716
train logloss:  0.22499841451644897
valid logloss:  0.2849304476380348


100%|██████████| 5/5 [09:14<00:00, 110.80s/it]

epoch 4 is done
CPU times: user 5min 27s, sys: 3min 45s, total: 9min 12s
Wall time: 9min 21s





In [14]:
# Save the trained model's parameters (state_dict) to disk.
torch.save(m.state_dict(), 'save/m2_params.pkl')

# 5.跑测试数据，输出result

In [14]:
# Encode both questions of every test row as 12 ids, then build int64 tensors.
# The deprecated torch.autograd.Variable wrapper is dropped: plain tensors are
# what the model consumes.
test_q1 = test.words.apply(fill_unkQ1).tolist()
test_q2 = test.words.apply(fill_unkQ2).tolist()
test_title_tensor = t.from_numpy(np.array(test_q1)).long()
test_content_tensor = t.from_numpy(np.array(test_q2)).long()

In [15]:
def batch_gen(title, content, batch_size=2000):
    """Yield aligned (title, content) slices of at most `batch_size` rows.

    title, content: sliceable sequences; len(title) sets the number of rows.
    batch_size: rows per batch (default 2000, the size previously hard-coded).

    Only non-empty batches are produced: an input whose length is an exact
    multiple of `batch_size`, or is zero, no longer ends with an empty slice.
    Raises ValueError (on first iteration) when `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    for start in range(0, len(title), batch_size):
        stop = start + batch_size
        yield title[start:stop], content[start:stop]
# Single-use generator over the test tensors; the prediction loop below consumes it.
gen = batch_gen(test_title_tensor,test_content_tensor)

In [16]:
# Predict on the test batches.
# The training loop leaves the model in train mode, so switch to eval mode
# first: otherwise dropout and batch-norm batch statistics distort predictions.
m.eval()
device = next(m.parameters()).device  # run on whichever device the model lives on
result = []
with torch.no_grad():  # inference only, no autograd graph needed
    for title, content in tqdm(gen):
        if len(title) == 0:
            # Nothing to predict for an empty trailing batch.
            continue
        output = m(title.to(device), content.to(device))
        # Drop the trailing size-1 class axis; this stays a list even for a 1-row batch.
        result += output.squeeze(-1).cpu().tolist()
        del output

87it [00:21,  4.10it/s]


In [17]:
# Write the predictions to CSV (single column 'y_pre') for submission.
pd.DataFrame(result,columns=['y_pre']).to_csv('save/res2.csv',index=False)