In [1]:
import os
import re
import csv
import time
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydotplus
plt.style.use('classic')
%matplotlib inline

from smart_open import smart_open
from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,LSTM,Embedding,Dropout,Activation
from keras.layers.merge import concatenate,dot
from keras.models import Model
from keras.layers import Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import plot_model

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


## 设置数据路径及模型参数

In [2]:
# Root directory holding every input file of this notebook.
BASE_DIR = '../inputs/'

# GloVe word-vector text file.
EMBEDDING_FILE = os.path.join(BASE_DIR, 'glove.6B/glove.6B.50d.txt')
# Training and test question-pair tables.
TRAIN_DATA_FILE = os.path.join(BASE_DIR, 'train.csv')
TEST_DATA_FILE = os.path.join(BASE_DIR, 'test.csv')

In [3]:
# Length passed as `maxlen` to pad_sequences for every question.
MAX_SEQUENCE_LENGTH = 50
# Passed to Tokenizer(num_words=...) and used to cap the embedding matrix rows.
MAX_NB_WORDS = 200000
# Width of one word vector; the embedding matrix is (nb_words, EMBEDDING_DIM).
EMBEDDING_DIM = 50
# Fraction of the sampled training pairs held out for validation.
VALIDATION_SPLIT = 0.2

In [4]:
# Layer widths are drawn at random for each run.
num_lstm = np.random.randint(low=175, high=275)
num_dense = np.random.randint(low=100, high=150)

# Dropout rates are drawn from [0.15, 0.40).
rate_drop_lstm = np.random.rand() * 0.25 + 0.15
rate_drop_dense = np.random.rand() * 0.25 + 0.15

act = 'relu'
# When True, class / sample weights are applied during training and validation.
re_weight = True

# Run identifier built from the drawn hyper-parameters.
STAMP = 'lstm_{}_{}_{:.2f}_{:.2f}'.format(
    num_lstm, num_dense, rate_drop_lstm, rate_drop_dense
)

## 提取glove词向量

In [5]:
print('Indexing word vectors.')
# Maps each word to its float32 coefficient vector.
embeddings_index = {}
# Each non-empty line is parsed as "<word> <coef> <coef> ...".
# The context manager closes the handle even if parsing raises, and iterating
# the handle streams the file instead of materialising every line in memory
# the way readlines() does.
with smart_open(EMBEDDING_FILE,encoding='utf-8',mode='r') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than fail on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        embeddings_index[word] = coefs
print('Found {} word vectors of glove.'.format(len(embeddings_index)))

Indexing word vectors.
Found 400000 word vectors of glove.


## 数据预处理

The function 'text_to_wordlist' is from 

<https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text>

In [6]:
# Lazily-built caches: loading the stop-word list and constructing the stemmer
# once, instead of on every call, matters because this function is applied to
# every question in the train and test frames.
_ENGLISH_STOPWORDS = None
_ENGLISH_STEMMER = None


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Lower-case `text`, optionally drop stop words and stem, and re-join.

    Parameters
    ----------
    text : str
        Raw sentence. It is split on whitespace only, so punctuation stays
        attached to the neighbouring word.
    remove_stopwords : bool, default False
        Drop tokens found in NLTK's English stop-word list.
    stem_words : bool, default False
        Replace every remaining token by its English Snowball stem.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' for empty input).
    """
    global _ENGLISH_STOPWORDS, _ENGLISH_STEMMER

    # Lower-case, then tokenise on whitespace.
    words = text.lower().split()

    # Remove stop words.
    if remove_stopwords:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
        words = [w for w in words if w not in _ENGLISH_STOPWORDS]

    # Stem the remaining words.
    if stem_words:
        if _ENGLISH_STEMMER is None:
            _ENGLISH_STEMMER = SnowballStemmer('english')
        words = [_ENGLISH_STEMMER.stem(w) for w in words]

    return " ".join(words)

#### 这里只选取10%的训练数据和5%的测试数据作为演示，即便如此，计算量也非常大。

In [7]:
# Fractions of the full files kept for this demo run.
TRAIN_SAMPLE_FRAC = 0.1
TEST_SAMPLE_FRAC = 0.05
# Fixed seed so that Restart & Run All draws the same rows; without it every
# downstream count (vocabulary size, null embeddings, ...) changes per run.
SAMPLE_SEED = 42

train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)
print("Found {} pairs texts in train.csv".format(len(train)))
print('Found {} pairs texts in test.csv'.format(len(test)))
train = train.sample(frac=TRAIN_SAMPLE_FRAC, random_state=SAMPLE_SEED)
test = test.sample(frac=TEST_SAMPLE_FRAC, random_state=SAMPLE_SEED)
print('Choose {} samples in the train set'.format(len(train)))
print('Choose {} samples in the test set'.format(len(test)))

Found 404290 pairs texts in train.csv
Found 3563475 pairs texts in test.csv
Choose 40429 samples in the train set
Choose 178174 samples in the test set


In [8]:
# Force both question columns to str in each frame, so later text processing
# never receives a non-string value.
for frame in (train, test):
    frame[['question1','question2']] = frame[['question1','question2']].astype(str)

In [9]:
start_time = time.time()
# pandas may parse numeric-looking strings in read_csv as float or int, so the
# value is explicitly converted to str before cleaning.
for source_col, cleaned_col in (('question1', 'texts_1'), ('question2', 'texts_2')):
    train[cleaned_col] = train[source_col].apply(
        lambda x: text_to_wordlist(text=str(x), remove_stopwords=True, stem_words=True)
    )
elapsed = time.time() - start_time
print('Cost {:.2f} sec for preprocessing train set.'.format(elapsed))

Cost 29.37 sec for preprocessing train set.


In [10]:
start_time = time.time()
# Same cleaning as for the train set: lower-case, drop stop words, stem.
for source_col, cleaned_col in (('question1', 'texts_1'), ('question2', 'texts_2')):
    test[cleaned_col] = test[source_col].apply(
        lambda x: text_to_wordlist(text=str(x), remove_stopwords=True, stem_words=True)
    )
elapsed = time.time() - start_time
print('Cost {:.2f} sec for preprocessing test set.'.format(elapsed))

Cost 136.44 sec for preprocessing test set.


使用keras中text的处理函数进行分词：输入是text句子，通过对所有的句子进行汇总，得到一个分词词典，词典的大小为MAX_NB_WORDS；如果真实词典比这个要大，则根据单词词频排序，选择排名前MAX_NB_WORDS的单词。输出是句子中的单词在分词词典中的id索引。

fit_on_texts需要传入数据集中的全部句子（这里是训练集与测试集的所有问题）。

In [11]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# Fit on every cleaned question, in the order:
# train texts_1, train texts_2, test texts_1, test texts_2.
all_texts = []
for frame in (train, test):
    for column in ('texts_1', 'texts_2'):
        all_texts.extend(frame[column].values.tolist())
tokenizer.fit_on_texts(all_texts)

In [12]:
# Convert each cleaned training question into its list of vocabulary ids.
sequences_1, sequences_2 = (
    tokenizer.texts_to_sequences(train[column].values)
    for column in ('texts_1', 'texts_2')
)

In [13]:
# Convert each cleaned test question into its list of vocabulary ids.
test_sequences_1, test_sequences_2 = (
    tokenizer.texts_to_sequences(test[column].values)
    for column in ('texts_1', 'texts_2')
)

texts_to_sequences的输出是每条句子中各个分词在之前的分词字典的索引id组成的list

In [14]:
# Show the first ten id sequences next to the cleaned texts they came from.
display(sequences_1[:10], train['texts_1'].iloc[:10])

[[232,
  922,
  1214,
  1155,
  30798,
  3100,
  44,
  25523,
  3100,
  68,
  25524,
  3100,
  88,
  30799],
 [16711],
 [345, 10249],
 [131, 25, 1187, 1922, 4733],
 [2, 759, 136, 5208, 62, 9344],
 [2, 6530, 2991, 8198, 780, 362],
 [41, 354, 162],
 [143, 1931, 4064],
 [124, 23, 254, 135, 2388, 3],
 [51, 34, 20]]

113824    idea, prove (no computer) 253*sqrt(2) +874*sqr...
73464                                                babar?
325387                                      popular majors?
262015                           busi start five ten lakhs?
4739               get twitter give suspend account handle?
70007                   get flexibl hors fenc solut sydney?
333457                                   take stock market?
296092                              human depend computers?
218276                  earn money music video album india?
203259                                   happen think much?
Name: texts_1, dtype: object

In [15]:
# Mapping from token to integer id, as built by the tokenizer.
word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
print('Found {} unique tokens.'.format(n_unique_tokens))

Found 65908 unique tokens.


在我们sample出来的数据中，总共有65908个唯一的分词（具体数值会随抽样结果变化）。其中一部分出现在glove的词典中，另一部分则不在，例如因拼写错误而未被glove收录的词。

In [16]:
# Bring every training id sequence to MAX_SEQUENCE_LENGTH.
data_1, data_2 = (
    pad_sequences(sequences=id_lists, maxlen=MAX_SEQUENCE_LENGTH)
    for id_lists in (sequences_1, sequences_2)
)
# Target column: the `is_duplicate` value of each training pair.
labels = np.array(train['is_duplicate'])

In [17]:
# Sanity check: both arrays must have one row per sampled training pair.
data_shape, label_shape = data_1.shape, labels.shape
print('Shape of data tensor: {}'.format(data_shape))
print('Shape of label tensor: {}'.format(label_shape))

Shape of data tensor: (40429, 50)
Shape of label tensor: (40429,)


In [18]:
# Bring every test id sequence to MAX_SEQUENCE_LENGTH.
test_data_1, test_data_2 = (
    pad_sequences(sequences=id_lists, maxlen=MAX_SEQUENCE_LENGTH)
    for id_lists in (test_sequences_1, test_sequences_2)
)
# Identifiers of the test pairs, in the same row order as the padded arrays.
test_ids = np.array(test['test_id'])

## generate leaky features
统计特征：问题搜索热度

In [19]:
# Stack the raw question pairs of train and test into one frame with a fresh
# 0..n-1 index.
question_cols = ['question1', 'question2']
ques = pd.concat(
    [frame[question_cols] for frame in (train, test)],
    axis=0,
    ignore_index=True,
)
display(ques.head())

Unnamed: 0,question1,question2
0,"Is there any idea, how to prove (no computer) ...","Is there any idea, how to prove (no computer) ..."
1,Who was Babar?,Do you know Babar?
2,What are the most popular majors?,What is the most popular major in America?
3,Which business can I start with five and ten l...,Which business can I start with $5000?
4,How can I get Twitter to give me a suspended a...,Why was the Twitter account @Bill_Nye_tho susp...


**q_dict**是一个`defaultdict(set)`：`key`为问题，对应的`value`为与该`key`成对出现过的问题集合（set，自动去重）。

In [20]:
# q_dict[q] is the set of all questions that appear paired with q anywhere in
# `ques`; each pair is recorded in both directions.
q_dict = defaultdict(set)
# Iterating the two columns in lockstep avoids one label-based Series lookup
# per cell and does not depend on `ques` having a 0..n-1 index.
for first_question, second_question in zip(ques['question1'].values,
                                           ques['question2'].values):
    q_dict[first_question].add(second_question)
    q_dict[second_question].add(first_question)

以下是计算问题热度的函数：`q1_freq`计算与`question1`匹配的问题有多少个，`q2_freq`计算与`question2`匹配的问题有多少个，`q1_q2_intersect`计算“与`question1`匹配的问题集合”和“与`question2`匹配的问题集合”的交集大小。

In [21]:
def q1_freq(row):
    """Return how many distinct questions `q_dict` holds for the row's question1."""
    partners = q_dict[row['question1']]
    return len(partners)
    
def q2_freq(row):
    """Return how many distinct questions `q_dict` holds for the row's question2."""
    partners = q_dict[row['question2']]
    return len(partners)
    
def q1_q2_intersect(row):
    """Return the number of questions paired with both question1 and question2."""
    partners_of_q1 = q_dict[row['question1']]
    partners_of_q2 = q_dict[row['question2']]
    # q_dict values are sets, so `&` gives their intersection.
    return len(partners_of_q1 & partners_of_q2)

In [22]:
# The feature functions index the row by column name (row['question1']), so
# they need a labelled row. raw=True asks pandas to pass a bare ndarray
# instead, which cannot be indexed that way; it is dropped here, matching the
# calls used for the test frame.
train['q1_q2_intersect'] = train.apply(q1_q2_intersect, axis=1)
train['q1_freq'] = train.apply(q1_freq, axis=1)
train['q2_freq'] = train.apply(q2_freq, axis=1)

In [23]:
# Add the three leak features to the test frame, one row-wise pass per feature.
for feature_name, feature_fn in (('q1_q2_intersect', q1_q2_intersect),
                                 ('q1_freq', q1_freq),
                                 ('q2_freq', q2_freq)):
    test[feature_name] = test.apply(feature_fn, axis=1)

In [24]:
# Mean `is_duplicate` per intersection count, sorted ascending.
# Stored under its own name: binding this Series to `q1_q2_intersect` would
# replace the feature function of the same name, so re-running the
# feature-building cells afterwards would fail.
dup_rate_by_intersect = train.groupby('q1_q2_intersect')['is_duplicate'].mean().sort_values()

In [25]:
# The three leak features, selected in the same column order for both frames.
leak_columns = ['q1_q2_intersect','q1_freq','q2_freq']
leaks = train[leak_columns]
test_leaks = test[leak_columns]

In [26]:
# Standardise the leak features using statistics fitted on train and test
# rows together, then transform each set with that shared scaler.
scal = StandardScaler()
all_leaks = np.concatenate((leaks, test_leaks), axis=0)
scal.fit(all_leaks)
leaks, test_leaks = scal.transform(leaks), scal.transform(test_leaks)

## 构建Glove词向量矩阵

In [27]:
print('Preparing embedding matrix.')
# Row 0 is never assigned and stays all-zero; hence the +1.
nb_words = min(MAX_NB_WORDS,len(word_index)) + 1
embedding_matrix = np.zeros((nb_words,EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # word_index may hold more entries than MAX_NB_WORDS; ids beyond the
        # matrix would raise IndexError, so they are skipped.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their all-zero row.
        embedding_matrix[i] = embedding_vector
# Count rows that are entirely zero. Testing every component avoids
# miscounting a real vector whose components merely sum to zero.
print('Null word embeddings: {}'.format(np.count_nonzero(~embedding_matrix.any(axis=1))))

Preparing embedding matrix.
Null word embeddings: 24365


### split train/validation data

In [28]:
# Shuffle the row positions once and cut at the (1 - VALIDATION_SPLIT) mark.
n_pairs = len(data_1)
split_at = int(n_pairs * (1 - VALIDATION_SPLIT))
perm = np.random.permutation(n_pairs)
idx_train, idx_valid = perm[:split_at], perm[split_at:]

# Every pair is added twice, as (q1, q2) and as (q2, q1), so leak features
# and labels are repeated to match.
q1_train, q2_train = data_1[idx_train], data_2[idx_train]
data_1_train = np.concatenate((q1_train, q2_train), axis=0)
data_2_train = np.concatenate((q2_train, q1_train), axis=0)
leaks_train = np.concatenate((leaks[idx_train],) * 2, axis=0)
labels_train = np.concatenate((labels[idx_train],) * 2)

# The validation rows are mirrored in the same way.
q1_valid, q2_valid = data_1[idx_valid], data_2[idx_valid]
data_1_valid = np.concatenate((q1_valid, q2_valid), axis=0)
data_2_valid = np.concatenate((q2_valid, q1_valid), axis=0)
leaks_valid = np.concatenate((leaks[idx_valid],) * 2, axis=0)
labels_valid = np.concatenate((labels[idx_valid],) * 2)

In [29]:
# Per-sample validation weights: label 0 gets 1.309028344 and every other row
# gets 0.472001959 when re-weighting is enabled; otherwise all weights are 1.
if re_weight:
    weight_val = np.where(labels_valid == 0, 1.309028344, 0.472001959)
else:
    weight_val = np.ones(len(labels_valid))

## 定义模型
模型包括两个输入和一个输出，输入为两个需要对比的text，经embedding层后，各自经过一个双向LSTM，两个的输出结果与leaky feature经过dense后的输出结果，拼接成一个新的特征向量，输入到最终的分类全连接层，全连接层的输出为两个句子是否匹配的概率。

In [30]:
# Embedding lookup initialised from `embedding_matrix` and frozen
# (trainable=False). The same layer object is applied to both question inputs.
embedding_layer = Embedding(input_dim=nb_words,
                            input_length=MAX_SEQUENCE_LENGTH,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)

In [31]:
# Bidirectional LSTM encoder with `num_lstm` units; `rate_drop_lstm` is used
# for both the input dropout and the recurrent dropout.
lstm_layer = Bidirectional(LSTM(units=num_lstm,
                                dropout=rate_drop_lstm,
                                recurrent_dropout=rate_drop_lstm))

In [32]:
# Branch 1: question-1 ids -> shared embedding -> shared BiLSTM.
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32',name='sequences_input1')
embedded_sequnces_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequnces_1)

# Branch 2: question-2 ids through the very same embedding and BiLSTM objects,
# so both questions are encoded with shared weights.
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32',name='sequences_input2')
embedded_sequnces_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequnces_2)

# Branch 3: the leak features go through one dense layer of num_dense/2 units.
leaks_input = Input(shape=(leaks.shape[1],),dtype='float32',name='leaks_input')
leaks_dense = Dense(units=int(num_dense/2),activation=act)(leaks_input)

# Concatenate the three branches, then batch-normalise and apply dropout.
merged = concatenate([x1,y1,leaks_dense])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

# Hidden dense layer followed by the same normalisation + dropout pattern.
merged = Dense(num_dense,activation=act)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

# Single sigmoid unit producing the model's prediction.
preds = Dense(units=1,activation='sigmoid',name='prediction')(merged)

### 通过损失权重平衡样本

In [33]:
# Loss weight per class when re-weighting is on; None leaves the loss unweighted.
# NOTE(review): the validation sample weights use the same two constants with
# more digits (1.309028344 / 0.472001959) — confirm the truncation is intended.
class_weight = {0:1.3090, 1:0.4720} if re_weight else None

In [34]:
# Assemble the three-input model and compile it for binary classification.
model = Model(inputs=[sequence_1_input,sequence_2_input,leaks_input],outputs=preds)
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
model.summary()
#plot_model(model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequences_input1 (InputLayer)   (None, 50)           0                                            
__________________________________________________________________________________________________
sequences_input2 (InputLayer)   (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 50)       3295450     sequences_input1[0][0]           
                                                                 sequences_input2[0][0]           
__________________________________________________________________________________________________
leaks_input (InputLayer)        (None, 3)            0                                            
__________

In [35]:
# Show the run identifier; the checkpoint file name is derived from it.
print(STAMP)

lstm_272_149_0.22_0.25


In [36]:
# The checkpoint file is named after the run stamp.
bst_model_path = STAMP + '.h5'
# Stop once val_loss has failed to improve for three consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
# Keep only the weights of the best epoch on disk.
model_checkpoint = ModelCheckpoint(
    filepath=bst_model_path,
    save_weights_only=True,
    save_best_only=True)

In [37]:
# Sanity check: the row count is twice the training split, because every pair
# was added in both (q1, q2) and (q2, q1) order.
data_2_train.shape

(64686, 50)

In [None]:
# Train on the mirrored training pairs. The validation tuple carries
# `weight_val` as per-sample weights, and `class_weight` re-weights the
# training loss. Early stopping and best-weights checkpointing are attached
# as callbacks. `hist.history` is read later to get the best val_loss.
hist = model.fit(x=[data_1_train,data_2_train,leaks_train],
                 y=labels_train,
                 batch_size=128,
                 epochs=20,
                 shuffle=True,
                 validation_data=([data_1_valid,data_2_valid,leaks_valid],
                                  labels_valid,weight_val),
                 class_weight=class_weight,
                 callbacks=[early_stopping,model_checkpoint],
                 verbose=1)

Train on 64686 samples, validate on 16172 samples
Epoch 1/20

笔记本的GPU计算性能有限，这里就不继续跑了。在Kaggle官方提供的计算平台上，经过30个epoch，最终loss大致在0.19，acc为0.9。

In [None]:
# Lowest validation loss seen during training.
val_losses = hist.history['val_loss']
bst_val_score = min(val_losses)
# Restore the weights saved by the checkpoint callback.
model.load_weights(bst_model_path)