In [1]:
import os
import sys
import numpy as np
import pandas as pd
import json

# Load data

In [2]:
stacked_clusters_train_file = "./data/stacked_clusters_train_df.csv"
stacked_clusters_train_df = pd.read_csv(stacked_clusters_train_file)

In [3]:
stacked_clusters_train_df.head()

Unnamed: 0,q_class,qid,question
0,0,11,Astrology: I am a Capricorn Sun Cap moon and c...
1,0,12,"I'm a triple Capricorn (Sun, Moon and ascendan..."
2,1,16,What should I do to be a great geologist?
3,1,15,How can I be a good geologist?
4,2,24,How can I see all my Youtube comments?


In [4]:
data_df = stacked_clusters_train_df[['q_class', 'question']]

In [5]:
data_df.head()

Unnamed: 0,q_class,question
0,0,Astrology: I am a Capricorn Sun Cap moon and c...
1,0,"I'm a triple Capricorn (Sun, Moon and ascendan..."
2,1,What should I do to be a great geologist?
3,1,How can I be a good geologist?
4,2,How can I see all my Youtube comments?


In [6]:
print(data_df.info())
print("Total question classes: {}".format(data_df.q_class.unique().shape[0]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149650 entries, 0 to 149649
Data columns (total 2 columns):
q_class     149650 non-null int64
question    149650 non-null object
dtypes: int64(1), object(1)
memory usage: 2.3+ MB
None
Total question classes: 60460


In [7]:

maxlen = 64
batch_size = 128
epochs = 25 # amsoftmax需要25个epoch，其它需要20个epoch

In [8]:
num_train_groups = int(np.floor(data_df.q_class.unique().shape[0] * 0.8)) #

train_data = data_df[data_df.q_class <= num_train_groups].copy()
valid_data = data_df[data_df.q_class > num_train_groups].copy()

In [9]:
print(train_data.shape)
print(valid_data.shape)

(125243, 2)
(24407, 2)


In [10]:
# Load spacy
# !conda list

In [11]:
import spacy

# nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en_core_web_md')
# nlp = spacy.load('en')

In [12]:
def get_embeddings(vocab):
    return vocab.vectors.data

def get_embeddings_loop(vocab):
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    vectors = np.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return vectors

def get_features_from_list_docs(docs, max_length):
    docs = list(docs)
    Xs = np.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            # Get the vector id from the vocab of embeddings.
            # token.orth return total vector id.
            vector_id = token.vocab.vectors.find(key=token.orth)
            if vector_id >= 0:
                Xs[i, j] = vector_id
            else:
                Xs[i, j] = 0
            j += 1
            if j >= max_length:
                break
    return Xs
# Testing
# np.shape(get_features_from_list_docs(nlp.pipe(x_data[:2]), maxlen))

In [13]:
spacy_embeddings = get_embeddings(nlp.vocab)

In [14]:
print(spacy_embeddings.shape)
# print(spacy_embeddings_loop.shape)

(20000, 300)


In [15]:
x_train_data = get_features_from_list_docs(nlp.pipe(train_data.question.values), maxlen)
y_train_data = train_data.q_class.values

In [16]:
x_valid_data = get_features_from_list_docs(nlp.pipe(valid_data.question.values), maxlen)
y_valid_data = valid_data.q_class.values

In [17]:
y_train_data = y_train_data.reshape((-1,1))
y_valid_data = y_valid_data.reshape((-1,1))

In [18]:
from keras.models import Sequential, Model
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import K
from keras.layers import LSTM, CuDNNGRU, GRU
from keras.layers import Lambda
from keras.constraints import unit_norm
from keras.callbacks import ModelCheckpoint, Callback
from keras.utils import np_utils
from keras_tqdm import TQDMNotebookCallback
from margin_softmax import sparse_amsoftmax_loss

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
from keras.models import load_model

model = load_model('model_25epoch.h5')
encoder = load_model('encoder_25epoch.h5')

In [19]:
# 正式模型，基于GRU的分类器
x_in = Input(shape=(maxlen,))
x_embedded = Embedding(
        spacy_embeddings.shape[0],
        spacy_embeddings.shape[1],
        input_length=maxlen,
        trainable=False,
        weights=[spacy_embeddings])(x_in)
x = CuDNNGRU(spacy_embeddings.shape[1])(x_embedded)
# x = GRU(spacy_embeddings.shape[1])(x_embedded)
x = Lambda(lambda x: K.l2_normalize(x, 1))(x)

pred = Dense(num_train_groups,
             use_bias=False,
             kernel_constraint=unit_norm())(x)

encoder = Model(x_in, x) # 最终的目的是要得到一个编码器
model = Model(x_in, pred) # 用分类问题做训练

model.compile(loss=sparse_amsoftmax_loss,
              optimizer='adam',
              metrics=['sparse_categorical_accuracy'])

In [20]:
# 为验证集的排序准备
# 实际上用numpy写也没有问题，但是用Keras写能借助GPU加速
x_in = Input(shape=(spacy_embeddings.shape[1],))
x = Dense(len(valid_data), use_bias=False)(x_in) # 计算相似度
x = Lambda(lambda x: K.tf.nn.top_k(x, 11)[1])(x) # 取出topk的下标
model_sort = Model(x_in, x)


In [21]:
id2g = dict(zip(valid_data.index-valid_data.index[0], valid_data.q_class))

def evaluate(): # 评测函数
    print('validing...')
    valid_vec = encoder.predict(
        # np.array(list(x_valid_data)),
        x_valid_data,
                                verbose=True,
                                batch_size=1000) # encoder计算句向量
    model_sort.set_weights([valid_vec.T]) # 载入句向量为权重
    sorted_result = model_sort.predict(valid_vec,
                                       verbose=True,
                                       batch_size=1000) # 计算topk
    new_result = np.vectorize(lambda s: id2g[s])(sorted_result)
    _ = new_result[:, 0] != new_result[:, 0] # 生成一个全为False的向量
    for i in range(10): # 注意按照相似度排序的话，第一个就是输入句子（全匹配）
        _ = _ + (new_result[:, 0] == new_result[:, i+1])
        if i+1 == 1:
            top1_acc = 1. * _.sum() / len(_)
        elif i+1 == 5:
            top5_acc = 1. * _.sum() / len(_)
        elif i+1 == 10:
            top10_acc = 1. * _.sum() / len(_)

    return top1_acc, top5_acc, top10_acc


In [22]:
# 定义Callback器，计算验证集的acc，并保存最优模型
class Evaluate(Callback):
    def __init__(self):
        self.accs = {'top1': [], 'top5': [], 'top10': []}
        self.highest = 0.
    def on_epoch_end(self, epoch, logs=None):
        top1_acc, top5_acc, top10_acc = evaluate()
        self.accs['top1'].append(top1_acc)
        self.accs['top5'].append(top5_acc)
        self.accs['top10'].append(top10_acc)
        if top1_acc >= self.highest: # 保存最优模型权重
            self.highest = top1_acc
            model.save_weights('sent_sim_amsoftmax.model')
        json.dump({'accs': self.accs, 'highest_top1': self.highest},
                  open('valid_amsoftmax.log', 'w'), indent=4)
        print('top1_acc: %s, top5_acc: %s, top10_acc: %s' % (top1_acc, top5_acc, top10_acc))

In [23]:
evaluator = Evaluate()

history = model.fit(x_train_data,
                    y_train_data,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[evaluator])


valid_vec = encoder.predict(x_valid_data,
                            verbose=True,
                            batch_size=1000) # encoder计算句向量

Epoch 1/25
validing...
top1_acc: 0.03834965378784775, top5_acc: 0.05060023763674356, top10_acc: 0.055680747326586635
Epoch 2/25
validing...
top1_acc: 0.1668373827180727, top5_acc: 0.2163723521940427, top10_acc: 0.23837423689925022
Epoch 3/25
validing...
top1_acc: 0.12365305035440652, top5_acc: 0.16925472200598188, top10_acc: 0.19326422747572417
Epoch 4/25
validing...
top1_acc: 0.46130208546728396, top5_acc: 0.5949522677920269, top10_acc: 0.648010816569017
Epoch 5/25
validing...
top1_acc: 0.6504691277092637, top5_acc: 0.7816200270414225, top10_acc: 0.8250911623714509
Epoch 6/25
validing...
top1_acc: 0.7313065923710411, top5_acc: 0.8481173433850945, top10_acc: 0.8812226000737493
Epoch 7/25
validing...
top1_acc: 0.771786782480436, top5_acc: 0.8816323185971238, top10_acc: 0.9096160937435982
Epoch 8/25
validing...
top1_acc: 0.7949768509034294, top5_acc: 0.8984307780554759, top10_acc: 0.9248576228131273
Epoch 9/25
validing...
top1_acc: 0.8100135207112713, top5_acc: 0.9084279100258122, top10_

validing...
top1_acc: 0.835989675093211, top5_acc: 0.9212930716597697, top10_acc: 0.9416560822714795
Epoch 25/25
validing...
top1_acc: 0.8376695210390461, top5_acc: 0.921047240545745, top10_acc: 0.941246363748105


In [26]:
model.save('model_25epoch.h5')
encoder.save('encoder_25epoch.h5')

In [24]:
def most_similar(s):
    v = encoder.predict(get_features_from_list_docs(nlp.pipe([s]), max_length=maxlen))[0]
    sims = np.dot(valid_vec, v)
    for i in sims.argsort()[-10:][::-1]:
        if sims[i] > 0.66:
            print(valid_data.iloc[i][1],sims[i])
    print("=============================")

# most_similar(u'What is the concept of Cisco GPL 2016?')
# most_similar(u'What are the best Web Hosting discount deals for New Year 2017?')
most_similar(u'How can I know that I\'m gay if I never had sex?')

How can I know that I'm gay if I never had sex? 0.99999994


In [25]:
for s in valid_data.sample(n=5).question.values:
    most_similar(s)

What is the best way to prepare for TCS Aptitude Test? 1.0
How do I crack the TCS aptitude test? 0.7359774
What does it really mean to be married? 0.99999994
What does it mean to be married? 0.96669066
What is a statement sentence? What are some examples? 0.9999999
What are statement sentences? What are some examples? 0.7649162
What should my rank be in JEE Main to get CSE in IIITD if I'm OBC of outside Delhi region? 1.0
What should my rank be in JEE Main to get CSE in IIITD if I'm OBC of Delhi region? 0.9444523
How are table salt and cooking salt different? 0.99999994
How does table salt differ from cooking salt? 0.7755568
