# Question duplicates
Implementación de red siamesa aplicada a lenguaje natural

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/andrea/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import os
import nltk
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import numpy as np
import pandas as pd
import random as rnd

#### Cargando datos
Se usará un conjunto de datos con preguntas de Quora. Este modelo resulta muy útil para evitar que una pregunta similar sea posteada en el blog en repetidas ocasiones.

In [3]:
data = pd.read_csv("questions.csv")
N=len(data)
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


Es necesario dividir el conjunto de datos en entrenamiento, validación y prueba

In [4]:
N_train = 300000
N_test  = 10*1024
print("bloque")

bloque


In [5]:
data_train = data[:N_train]
data_test  = data[N_train:N_train+N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del(data) # remove to free memory

Train set: 300000 Test set: 10240


Para este problema sólo se seleccionarán las preguntas duplicadas para entrenar el modelo.
Se construyen dos batches como entrada a la red siamesa y se asume que la pregunta $q1_i$ (pregunta $i$ en el primer batch) es el duplicado de $q2_i$, pero que las preguntas restantes del batch no son duplicadas de $q1_i$.

Para el conjunto de prueba, se usa el par original de preguntas y estatus que describe si son o no duplicadas

In [6]:
# Row positions of the training pairs labelled as duplicates, as a plain list of ints.
td_index = np.flatnonzero((data_train['is_duplicate'] == 1).to_numpy()).tolist()
print('number of duplicate questions: ', len(td_index))
print('indexes of first ten duplicate questions:', td_index[:10])


number of duplicate questions:  111486
indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [7]:
print(data_train['question1'][5])  #  Example of question duplicates (first one in data)
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


Para el entrenamiento solo se toman las preguntas duplicadas; de esta forma el generador puede regresar batches en los que $q1_i$ y $q2_k$ son duplicados si y solo si $i=k$.

In [8]:
Q1_train_words = np.array(data_train['question1'][td_index])
Q2_train_words = np.array(data_train['question2'][td_index])

Q1_test_words = np.array(data_test['question1'])
Q2_test_words = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

In [9]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train_words[0])
print('Question 2: ', Q2_train_words[0], '\n')
print('Question 1: ', Q1_train_words[5])
print('Question 2: ', Q2_train_words[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test_words[0])
print('Question 2: ', Q2_test_words[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



El siguiente paso es codificar cada palabra de entre los pares duplicados con un índice.
Dada una pregunta, se codifica como una lista de números
1. Tokenizar las preguntas con ```nltk.word_tokenize```
2. Crear un diccionario que asigne 0 a las palabras que no se encuentren en el vocabulario
3. Codificar cada palabra de las duplicadas con un índice

In [11]:
#create arrays
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)

Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)

In [17]:
# Building the vocabulary with the train set         (this might take a minute)
from collections import defaultdict

# Index scheme: 0 is the defaultdict fallback for out-of-vocabulary words,
# 1 is the padding token, and real words are numbered from 2 upwards.
# NOTE: reading a missing key with vocab[word] also *inserts* that key (with
# value 0), so any later lookup of an unseen word makes len(vocab) grow.
vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

for idx in range(len(Q1_train_words)):
    # Tokenize both questions of the pair and store the token lists in place.
    Q1_train[idx] = nltk.word_tokenize(Q1_train_words[idx])
    Q2_train[idx] = nltk.word_tokenize(Q2_train_words[idx])
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        # `in` does not trigger the default factory, so this test has no side effect.
        if word not in vocab:
            # While only '<PAD>' and training words are stored, this hands out 2, 3, 4, ...
            vocab[word] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

The length of the vocabulary is:  36268


In [44]:
print(vocab['<PAD>'])
print(vocab['Recognition'])
print(vocab['Economy'])

1
0
6518


In [19]:
for idx in range(len(Q1_test_words)):
    Q1_test[idx] = nltk.word_tokenize(Q1_test_words[idx])
    Q2_test[idx] = nltk.word_tokenize(Q2_test_words[idx])

In [20]:
print('Train set has reduced to: ', len(Q1_train) )
print('Test set length: ', len(Q1_test) )

Train set has reduced to:  111486
Test set length:  10240


#### Convertir pregunta a tensor

In [21]:
# Replace every token list, in place, by the list of its vocabulary indices.
# Not idempotent: running this cell a second time would look up the integer
# ids themselves in `vocab` instead of the original tokens.
for i in range(len(Q1_train)):
    Q1_train[i] = [vocab[word] for word in Q1_train[i]]
    Q2_train[i] = [vocab[word] for word in Q2_train[i]]


# NOTE(review): `vocab` is a defaultdict, so every test-set word that is not in
# the vocabulary is inserted here with value 0. len(vocab) therefore grows, and
# Siamese() uses len(vocab) as its default embedding size - confirm that this
# coupling is intended before changing the lookups to vocab.get(word, 0).
for i in range(len(Q1_test)):
    Q1_test[i] = [vocab[word] for word in Q1_test[i]]
    Q2_test[i] = [vocab[word] for word in Q2_test[i]]

In [22]:
print('first question in the train set:\n')
print(Q1_train_words[0], '\n')
print('encoded version:')
print(Q1_train[0],'\n')

first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 



#### Dividir datos Train/Val/Test


In [23]:
cut_off = int(len(Q1_train)*.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off: ], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298


El generador de datos produce batches en los que solo las preguntas que ocupan la misma posición ($q1_i$ y $q2_i$) son duplicadas; se asume que las demás combinaciones dentro del batch no lo son.

In [None]:
def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
    """Endlessly yield padded batches of aligned question pairs.

    Row ``i`` of the first yielded array and row ``i`` of the second one always
    come from the same position of ``Q1`` / ``Q2``.

    Args:
        Q1: sequence of encoded questions (each a list of int ids).
        Q2: sequence of encoded questions, aligned with ``Q1``.
        batch_size: number of question pairs per yielded batch.
        pad: id appended to the right of each question to fill the batch width.
        shuffle: if True, visit the pairs in random order (reshuffled after
            every full pass); if False, visit them in their original order.

    Yields:
        Tuple ``(b1, b2)`` of numpy arrays of shape ``(batch_size, width)``,
        where ``width`` is the longest question of the batch rounded up to the
        next power of two (at least 1).

    Notes:
        The generator never stops: when the data is exhausted it wraps around
        to the beginning, so a dataset shorter than ``batch_size`` yields
        batches containing repeated pairs.
        An empty ``Q1`` raises IndexError on the first ``next()`` call.
    """
    n_questions = len(Q1)
    order = [*range(n_questions)]
    if shuffle:
        rnd.shuffle(order)

    batch1, batch2 = [], []
    cursor = 0
    while True:
        if cursor >= n_questions:
            # A full pass is done: start over, reshuffling if requested.
            cursor = 0
            if shuffle:
                rnd.shuffle(order)

        batch1.append(Q1[order[cursor]])
        batch2.append(Q2[order[cursor]])
        cursor += 1

        if len(batch1) == batch_size:
            longest = max(max(len(q) for q in batch1),
                          max(len(q) for q in batch2))
            # Round up to the next power of two using integer arithmetic.
            # Clamping to 1 keeps a batch made only of empty questions valid
            # (the float log2 formulation raised OverflowError on length 0).
            width = 1 << (max(longest, 1) - 1).bit_length()
            b1 = [q + [pad] * (width - len(q)) for q in batch1]
            b2 = [q + [pad] * (width - len(q)) for q in batch2]
            yield np.array(b1), np.array(b2)
            batch1, batch2 = [], []  # start collecting the next batch

In [25]:
batch_size = 2
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size))
print("First questions  : ",'\n', res1, '\n')
print("Second questions : ",'\n', res2)

First questions  :  
 [[   30    87   116   131    78   216   761    72   207    81    28  1871
     72  2753  3327  2809 10102    21     1     1     1     1     1     1
      1     1     1     1     1     1     1     1]
 [   30    87   116    35  1099   292    21     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1]] 

Second questions :  
 [[   30    87   116   131    78   216   761    72   207  2809 10102    21
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1]
 [   30    87   116    35  1099 20097    21 10849   292     4    64    78
    111    17 13784 31860 22229   363    78  4717 25035  3616     1     1
      1     1     1     1     1     1     1     1]]


#### Modelo Siamés
Es una red neuronal que usa los mismos pesos mientras trabaja a la par sobre dos vectores de entrada diferentes.

El proceso es el siguiente:
1. Llegan los embebidos de las preguntas duplicadas
2. Pasan por una  capa LSTM
3. Se normalizan
4. Se usa un error triple para obtener la similitud coseno entre el par de preguntas

El error triple utiliza una base (ancla) que es comparada con la entrada positiva (real) y con la negativa (falsa). La distancia de la base a la entrada positiva es minimizada y la distancia de la base a la negativa es maximizada.


$$\mathcal{L}(A, P, N)=\max \left(\|\mathrm{f}(A)-\mathrm{f}(P)\|^{2}-\|\mathrm{f}(A)-\mathrm{f}(N)\|^{2}+\alpha, 0\right)$$

$A$ es la entrada base, por ejemplo $q1_1$, $P$ la entrada duplicada,  $q2_1$, y $N$ la entrada negativa (la pregunta no duplicada), por ejemplo $q2_2$.<br>
$\alpha$ es un margen de seguridad: indica qué tanto se quiere separar los duplicados de los no duplicados.


<br>

El modelo ocupa las siguientes capas
* ```tl.Serial```: Secuencia de capas
* ```tl.Embedding```: Mapeo de tokens discretos a vectores
* ```tl.LSTM```: Capa LSTM
* ```tl.Mean```: Calcula promedio a lo largo de un eje
* ```tl.Fn```: Capa sin pesos y aplica función $f$
* ```tl.Parallel```: Aplica una lista de capas en paralelo a las entradas

In [None]:
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):
    """Build the two-branch question encoder.

    Each branch is Embedding -> LSTM -> mean over axis 1 -> L2 normalisation.
    The very same layer object is used for both branches of the Parallel
    combinator, so both questions go through one shared encoder.

    Args:
        vocab_size: number of rows of the embedding table. The default,
            ``len(vocab)``, is evaluated once, when this ``def`` is executed,
            not each time the function is called.
        d_model: size of the embedding vectors and of the LSTM.
        mode: accepted for interface compatibility; not used in the body.

    Returns:
        A ``tl.Parallel`` layer taking two inputs and producing two outputs.
    """

    def _l2_normalize(vectors):
        # Scale along the last axis so that every vector has L2 norm 1.
        squared_norm = fastnp.sum(vectors * vectors, axis=-1, keepdims=True)
        return vectors / fastnp.sqrt(squared_norm)

    encoder = tl.Serial(
        tl.Embedding(vocab_size, d_model),
        tl.LSTM(d_model),
        tl.Mean(axis=1),
        tl.Fn('Normalize', _l2_normalize),
    )

    return tl.Parallel(encoder, encoder)


In [27]:
model = Siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_41699_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_41699_128
    LSTM_128
    Mean
    Normalize
  ]
]


#### Error Triple
El error está compuesto por dos términos: uno utiliza la media de todos los NO duplicados y el otro utiliza el negativo más cercano. El error queda de la siguiente forma:
\begin{align}
 \mathcal{Loss_1(A,P,N)} &=\max \left( -cos(A,P)  + mean_{neg} +\alpha, 0\right) \\
 \mathcal{Loss_2(A,P,N)} &=\max \left( -cos(A,P)  + closest_{neg} +\alpha, 0\right) \\
\mathcal{Loss(A,P,N)} &= mean(Loss_1 + Loss_2) \\
\end{align}


In [None]:
def TripletLossFn(v1, v2, margin=0.25):
    """Triplet loss with in-batch negatives (mean negative + closest negative).

    Row ``i`` of ``v1`` and row ``i`` of ``v2`` form the positive pair; every
    other row of ``v2`` acts as a negative for row ``i`` of ``v1``.

    Args:
        v1: 2-D array with one vector per row (first questions).
        v2: 2-D array with the same shape as ``v1`` (second questions).
            When the rows have unit L2 norm, the dot products computed here
            are cosine similarities; the "-2" masking below relies on the
            scores staying within [-1, 1].
        margin: margin added inside both hinge terms.

    Returns:
        Scalar: mean over the batch of the sum of the two hinge terms.
    """
    scores = fastnp.dot(v1, v2.T)  # pairwise similarities, row i vs column k
    batch_size = len(scores)
    identity = fastnp.eye(batch_size)

    positive = fastnp.diag(scores)  # similarity of each true duplicate pair

    # Lower the diagonal by 2 so the row-wise max can only pick a negative.
    closest_negative = (scores - 2 * identity).max(axis=1)

    # Zero out the diagonal and average the remaining entries of each row.
    # With a single-row batch there are no negatives; dividing by 1 instead of
    # 0 keeps the result finite (the mean negative is then 0) rather than NaN.
    n_negatives = max(batch_size - 1, 1)
    off_diagonal = fastnp.multiply((1.0 - identity), scores)
    mean_negative = fastnp.sum(off_diagonal, axis=1) / n_negatives

    loss_mean = fastnp.maximum(0, mean_negative - positive + margin)
    loss_closest = fastnp.maximum(closest_negative - positive + margin, 0)

    return fastnp.mean(loss_closest + loss_mean)


In [29]:
from functools import partial
def TripletLoss(margin=0.25):
    """Wrap TripletLossFn, with `margin` bound, as a layer named 'TripletLoss'."""
    return tl.Fn('TripletLoss', partial(TripletLossFn, margin=margin))

#### Entrenamiento
Se define la función de costo y el optimizador

In [30]:
batch_size = 256
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab['<PAD>'])
print('train_Q1.shape ', train_Q1.shape)
print('val_Q1.shape   ', val_Q1.shape)

train_Q1.shape  (89188,)
val_Q1.shape    (22298,)


#### Entrenando modelo


In [33]:
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)

# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: train_model
def train_model(Siamese, TripletLoss, lr_schedule,
                train_generator=train_generator, val_generator=val_generator, output_dir='model/'):
    """Assemble (but do not run) a training loop for the Siamese model.

    Args:
        Siamese: zero-argument callable that builds the model.
        TripletLoss: zero-argument callable that builds the loss layer; it is
            called once for the training loss and once for the eval metric.
        lr_schedule: learning-rate schedule handed to the training task.
        train_generator: generator of training batches. The default is the
            module-level generator that exists when this function is defined.
        val_generator: generator of validation batches (same remark).
        output_dir: directory given to the loop as its output directory;
            a leading ``~`` is expanded.

    Returns:
        The configured ``training.Loop``; call ``.run(n_steps)`` on it to train.
    """
    checkpoint_dir = os.path.expanduser(output_dir)

    fit_task = training.TrainTask(
        labeled_data=train_generator,
        loss_layer=TripletLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule,
    )

    validation_task = training.EvalTask(
        labeled_data=val_generator,
        metrics=[TripletLoss()],
    )

    return training.Loop(
        Siamese(),
        fit_task,
        eval_tasks=[validation_task],
        output_dir=checkpoint_dir,
    )

In [34]:
train_steps = 5
training_loop = train_model(Siamese, TripletLoss, lr_schedule)
training_loop.run(train_steps)





Step      1: Total number of trainable weights: 5469056
Step      1: Ran 1 train steps in 4.89 secs
Step      1: train TripletLoss |  0.49999815
Step      1: eval  TripletLoss |  0.49999839


#### Evaluación

In [None]:
# Loading in the saved model
# NOTE(review): this is an absolute, machine-specific path, and it is not the
# 'model/' directory that train_model() writes to by default - presumably a
# separately trained checkpoint. Confirm and make the location configurable
# before sharing the notebook.
model = Siamese()
model.init_from_file('/media/andrea/Baba Yaga/BIGFOOT/CIC/PycharmProjects/PycharmProjects/semester work/Proy6/model.pkl.gz')

#### Clasificación

Determinar la precisión del modelo. En el entrenamiento solo se usaron ejemplos positivos; en el caso del conjunto de prueba, tenemos parejas de preguntas de las cuales algunas son duplicadas y otras no. Pasarán por el modelo, se calculará la similitud coseno entre cada par y se fija un umbral para decidir si es o no duplicado. Los aciertos se acumulan y al final se dividen entre el número total de pares para obtener la precisión.

In [None]:
def classify(test_Q1, test_Q2, y, threshold, model, vocab, data_generator=data_generator, batch_size=64):
    """Compute the accuracy of thresholded similarity as a duplicate detector.

    Args:
        test_Q1: encoded first questions (sequence of lists of int ids).
        test_Q2: encoded second questions, aligned with ``test_Q1``.
        y: labels aligned with the questions (1 = duplicate, 0 = not).
        threshold: a pair is predicted duplicate when its score is strictly
            greater than this value.
        model: callable taking ``(q1_batch, q2_batch)`` and returning the two
            batches of vectors ``(v1, v2)``.
        vocab: mapping that provides the padding id under the key '<PAD>'.
        data_generator: batch generator with the signature of the notebook's
            ``data_generator``.
        batch_size: number of pairs sent through the model at once.

    Returns:
        Fraction of pairs whose prediction equals the label.
    """
    correct = 0
    for start in range(0, len(test_Q1), batch_size):
        stop = start + batch_size
        q1_slice = test_Q1[start:stop]
        q2_slice = test_Q2[start:stop]
        y_batch = y[start:stop]
        # The last slice may hold fewer than batch_size pairs. With
        # shuffle=False the notebook's generator emits the slice in order and
        # then wraps around to fill the batch, so only the first n_real rows
        # are real pairs. Scoring exactly those avoids indexing past y_batch.
        n_real = len(q1_slice)

        q1, q2 = next(data_generator(q1_slice, q2_slice, batch_size,
                                     pad=vocab['<PAD>'], shuffle=False))
        v1, v2 = model((q1, q2))

        for j in range(n_real):
            d = fastnp.dot(v1[j], v2[j].T)  # similarity of pair j
            res = d > threshold
            correct += int(y_batch[j] == res)

    return correct / len(test_Q1)


In [37]:
accuracy = classify(Q1_test,Q2_test, y_test, 0.7, model, vocab, batch_size = 512)
print("Accuracy", accuracy)


Accuracy 0.69091796875


#### Probando con preguntas nuevas

El modelo puede regresar 1 o 0 dependiendo si el par es una pregunta duplicada



In [38]:
def predict(question1, question2, threshold, model, vocab, data_generator=data_generator, verbose=False):
    """Decide whether two raw question strings are duplicates.

    Both questions are tokenized with nltk, encoded with ``vocab``, padded
    into a batch of one by ``data_generator`` and passed through ``model``.

    Args:
        question1: first question, as plain text.
        question2: second question, as plain text.
        threshold: the pair is predicted duplicate when the dot product of
            the two output vectors is strictly greater than this value.
        model: callable taking ``(q1_batch, q2_batch)`` and returning ``(v1, v2)``.
        vocab: mapping from token to id; also provides the '<PAD>' id.
        data_generator: batch generator used to pad the single pair.
        verbose: if True, print the encoded inputs, the score and the result.

    Returns:
        The boolean array produced by ``dot(v1, v2.T) > threshold``.
    """
    tokens1 = nltk.word_tokenize(question1)
    tokens2 = nltk.word_tokenize(question2)
    ids1 = [vocab[token] for token in tokens1]
    ids2 = [vocab[token] for token in tokens2]

    single_pair = data_generator([ids1], [ids2], batch_size=1, pad=vocab['<PAD>'], shuffle=False)
    Q1, Q2 = next(single_pair)

    v1, v2 = model((Q1, Q2))
    d = fastnp.dot(v1, v2.T)
    res = d > threshold

    if verbose:
        print("Q1  = ", Q1, "\nQ2  = ", Q2)
        print("d   = ", d)
        print("res = ", res)

    return res


In [50]:
question1 = "When will I see you?"
question2 = "Where can I see you again?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.6, model, vocab, verbose = True)


Q1  =  [[585  76   4  46  53  21   1   1]] 
Q2  =  [[ 676   33    4   46   53 7280   21    1]]
d   =  [[0.6093261]]
res =  [[ True]]


DeviceArray([[ True]], dtype=bool)

In [41]:
# Feel free to try with your own questions
question1 = "Do they enjoy eating the dessert?"
question2 = "Do they like hiking in the desert?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose=True)

Q1  =  [[  443  1145  3159  1169    78 29017    21     1]] 
Q2  =  [[  443  1145    60 15302    28    78  7431    21]]
d   =  [[0.47753587]]
res =  [[False]]


DeviceArray([[False]], dtype=bool)