In [1]:
import re
import os
import keras.backend as K
import numpy as np
import pandas as pd
from keras import layers, models, utils
import json

In [2]:
def reset_everything():
    import tensorflow as tf
    %reset -f in out dhist
    tf.reset_default_graph()
    K.set_session(tf.InteractiveSession())

In [3]:
# Constants for our networks.  We keep these deliberately small to reduce training time.

VOCAB_SIZE = 250000
EMBEDDING_SIZE = 100
MAX_DOC_LEN = 128
MIN_DOC_LEN = 12

In [4]:
def extract_stackexchange(filename, limit=1000000):
    json_file = filename + 'limit=%s.json' % limit

    rows = []
    for i, line in enumerate(os.popen('7z x -so "%s" Posts.xml' % filename)):
        line = str(line)
        if not line.startswith('  <row'):
            continue
            
        if i % 1000 == 0:
            print('\r%05d/%05d' % (i, limit), end='', flush=True)

        parts = line[6:-5].split('"')
        record = {}
        for i in range(0, len(parts), 2):
            k = parts[i].replace('=', '').strip()
            v = parts[i+1].strip()
            record[k] = v
        rows.append(record)
        
        if len(rows) > limit:
            break
    
    with open(json_file, 'w') as fout:
        json.dump(rows, fout)
    
    return rows


xml_7z = utils.get_file(
    fname='travel.stackexchange.com.7z',
    origin='https://ia800107.us.archive.org/27/items/stackexchange/travel.stackexchange.com.7z',
)
print()

rows = extract_stackexchange(xml_7z)

Downloading data from https://ia800107.us.archive.org/27/items/stackexchange/travel.stackexchange.com.7z

113000/1000000

# Data Exploration

Now that we have extracted our data, let's clean it up and take a look at what we have to work with.

In [5]:
df = pd.DataFrame.from_records(rows)    
df = df.set_index('Id', drop=False)
df['Title'] = df['Title'].fillna('').astype('str')
df['Tags'] = df['Tags'].fillna('').astype('str')
df['Body'] = df['Body'].fillna('').astype('str')
df['Id'] = df['Id'].astype('int')
df['PostTypeId'] = df['PostTypeId'].astype('int')
df['ViewCount'] = df['ViewCount'].astype('float')

df.head()

Unnamed: 0_level_0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ClosedDate,ContentLicense,FavoriteCount,ParentId,LastEditorDisplayName,CommunityOwnedDate,OwnerDisplayName
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,393.0,2011-06-21T20:19:34.730,8,517.0,&lt;p&gt;My fiancée and I are looking for a go...,9,101.0,2011-12-28T21:36:43.910,...,&lt;caribbean&gt;&lt;cruising&gt;&lt;vacations...,4.0,4,2013-02-25T23:52:47.953,CC BY-SA 3.0,,,,,
2,2,1,,2011-06-21T20:22:33.760,39,2847.0,&lt;p&gt;This was one of our definition questi...,13,50282.0,2019-11-14T13:06:23.997,...,&lt;guides&gt;&lt;extreme-tourism&gt;&lt;amazo...,8.0,4,,CC BY-SA 4.0,5.0,,,,
3,3,2,,2011-06-21T20:24:28.080,15,,&lt;p&gt;One way would be to go through an Adv...,9,,,...,,,2,,CC BY-SA 3.0,,2.0,,,
4,4,1,,2011-06-21T20:24:57.160,8,288.0,&lt;p&gt;Singapore Airlines has an all-busines...,24,693.0,2013-01-09T09:55:22.743,...,&lt;loyalty-programs&gt;&lt;routes&gt;&lt;ewr&...,1.0,1,,CC BY-SA 3.0,,,,,
5,5,1,770.0,2011-06-21T20:25:56.787,14,470.0,&lt;p&gt;Another definition question that inte...,13,101.0,2011-12-28T21:36:18.230,...,&lt;romania&gt;&lt;transportation&gt;,5.0,0,,CC BY-SA 3.0,2.0,,,,


In [6]:
list(df[df['ViewCount'] > 250000]['Title'])

['Do I need a US visa to transit (or layover) through an American airport?',
 'How much electronics and other valuables can I bring duty-free when going to India?',
 'How to get from Nice to Monaco by public transport?',
 'Should my first trip be to the country which issued my Schengen Visa?',
 "What harm can be done with a copy of one's passport?",
 'Can I cross the USA-Canada border with a birth certificate and a passport locator number?',
 'Can I bring my desktop computer as check-in baggage on a flight?',
 "What's the difference between 'Redress Number' and 'Known Traveler Number'? Do I need both for TSA PreCheck?",
 'Can I use Google Maps traffic information to estimate driving time for a specific date/time?',
 'Is there a way to find out if I need a transit visa for a layover in the UK?',
 'Are aerosol cans allowed and safe, in checked luggage?',
 'How to track my UK Visa Application Status?',
 "When applying for an Indian Passport, how do I know if I'm in the ECR or non-ECR cate

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] + df['Title'])

In [8]:
# Compute TF/IDF Values

total_count = sum(tokenizer.word_counts.values())
idf = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }

In [9]:
# Download pre-trained word2vec embeddings

import gensim

glove_100d = utils.get_file(
    fname='glove.6B.100d.txt',
    origin='https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt',
)

w2v_100d = glove_100d + '.w2v'
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_100d, w2v_100d)
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_100d)

w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.syn0.shape[1]))
idf_weights = np.zeros((VOCAB_SIZE, 1))

for k, v in tokenizer.word_index.items():
    if v >= VOCAB_SIZE:
        continue
    
    if k in w2v_model:
        w2v_weights[v] = w2v_model[k]
    
    idf_weights[v] = idf[k]
    
del w2v_model

Downloading data from https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt


  w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.syn0.shape[1]))


In [10]:
df['title_tokens'] = tokenizer.texts_to_sequences(df['Title'])
df['body_tokens'] = tokenizer.texts_to_sequences(df['Body'])

In [11]:
import random

# We can create a data generator that will randomly title and body tokens for questions.  We'll use random text
# from other questions as a negative example when necessary.
def data_generator(batch_size, negative_samples=1):
    questions = df[df['PostTypeId'] == 1]
    all_q_ids = list(questions.index)
        
    batch_x_a = []
    batch_x_b = []
    batch_y = []
    
    def _add(x_a, x_b, y):
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)
    
    while True:
        questions = questions.sample(frac=1.0)
        
        for i, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)
            
            negative_q = random.sample(all_q_ids, negative_samples)
            for nq_id in negative_q:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0)            
            
            if len(batch_y) >= batch_size:
                yield ({
                    'title': pad_sequences(batch_x_a, maxlen=None),
                    'body': pad_sequences(batch_x_b, maxlen=None),
                }, np.asarray(batch_y))
                
                batch_x_a = []
                batch_x_b = []
                batch_y = []

# dg = data_generator(1, 2)
# next(dg)
# next(dg)

# Embedding Lookups

Let's define a helper class for looking up our embedding results.  We'll use it
to verify our models.

In [12]:
questions = df[df['PostTypeId'] == 1]['Title'].reset_index(drop=True)
question_tokens = pad_sequences(tokenizer.texts_to_sequences(questions))

class EmbeddingWrapper(object):
    def __init__(self, model):
        self._r = questions
        self._i = {i:s for (i, s) in enumerate(questions)}
        self._w = model.predict({'title': question_tokens}, verbose=1, batch_size=1024)
        self._model = model
        self._norm = np.sqrt(np.sum(self._w * self._w + 1e-5, axis=1))

    def nearest(self, sentence, n=10):
        x = tokenizer.texts_to_sequences([sentence])
        if len(x[0]) < MIN_DOC_LEN:
            x[0] += [0] * (MIN_DOC_LEN - len(x))
        e = self._model.predict(np.asarray(x))[0]
        norm_e = np.sqrt(np.dot(e, e))
        dist = np.dot(self._w, e) / (norm_e * self._norm)

        top_idx = np.argsort(dist)[-n:]
        return pd.DataFrame.from_records([
            {'question': self._r[i], 'dist': float(dist[i])}
            for i in top_idx
        ])

In [13]:
# Our first model will just sum up the embeddings of each token.
# The similarity between documents will be the dot product of the final embedding.

import tensorflow as tf

def sum_model(embedding_size, vocab_size, embedding_weights=None, idf_weights=None):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    def make_embedding(name):
        if embedding_weights is not None:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=w2v_weights.shape[1], 
                                         weights=[w2v_weights], trainable=False, 
                                         name='%s/embedding' % name)
        else:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=embedding_size,
                                        name='%s/embedding' % name)

        if idf_weights is not None:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1, 
                                   weights=[idf_weights], trainable=False,
                                   name='%s/idf' % name)
        else:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1,
                                   name='%s/idf' % name)
            
        return embedding, idf
    
    embedding_a, idf_a = make_embedding('a')
    embedding_b, idf_b = embedding_a, idf_a
#     embedding_b, idf_b = make_embedding('b')

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        [embedding, idf] = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')

    sum_a = sum_layer([mask(embedding_a(title)), idf_a(title)])
    sum_b = sum_layer([mask(embedding_b(body)), idf_b(body)])

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    sim_model.summary()

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [14]:
# Try using our model with pretrained weights from word2vec

sum_model_precomputed, sum_embedding_precomputed = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE,
    embedding_weights=w2v_weights, idf_weights=idf_weights
)

x, y = next(data_generator(batch_size=4096))
sum_model_precomputed.evaluate(x, y)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
body (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking (Masking)               (None, None, 100)    0           a/embedding[0][0]            

[0.958429753780365, 0.509765625]

In [15]:
SAMPLE_QUESTIONS = [
    'Roundtrip ticket versus one way',
    'Shinkansen from Kyoto to Hiroshima',
    'Bus tour of Germany',
]

def evaluate_sample(lookup):
    pd.set_option('display.max_colwidth', 100)
    results = []
    for q in SAMPLE_QUESTIONS:
        print(q)
        q_res = lookup.nearest(q, n=4)
        q_res['result'] = q_res['question']
        q_res['question'] = q
        results.append(q_res)

    return pd.concat(results)

lookup = EmbeddingWrapper(model=sum_embedding_precomputed)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,question,dist,result
0,Roundtrip ticket versus one way,0.814901,What is cheapest way to fly around SE Asia in a circuit - hub with roundtrip tickets or sequence...
1,Roundtrip ticket versus one way,0.815658,Fare hack: risk of booking a same-day multi-city ticket vs. through one-way ticket
2,Roundtrip ticket versus one way,0.826483,The penalty for changing an airline ticket is per leg or per ticket?
3,Roundtrip ticket versus one way,0.832242,Price of self-transfer tickets versus separate tickets
0,Shinkansen from Kyoto to Hiroshima,0.757703,Where does the Tokaido Shinkansen stop in Tokyo?
1,Shinkansen from Kyoto to Hiroshima,0.766435,From Hiroshima to Fukuoka via local trains
2,Shinkansen from Kyoto to Hiroshima,0.775135,Best connection Tokyo - Kyoto
3,Shinkansen from Kyoto to Hiroshima,0.813679,Travel from Tokyo to Sendai with Shinkansen
0,Bus tour of Germany,0.890238,Trip in the south of Germany
1,Bus tour of Germany,0.894592,Travelling outside of Germany on a German Working Holiday visa (Australian)


# Training our own network

The results are okay but not great... instead of using the word2vec embeddings, what happens if we train our network end-to-end?

In [18]:
sum_model_trained, sum_embedding_trained = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE, 
    embedding_weights=None,
    idf_weights=None
)
sum_model_trained.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000
)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
body (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 100)    0           a/embedding[0][0]          

<tensorflow.python.keras.callbacks.History at 0x7f49a05a3550>

In [19]:
lookup = EmbeddingWrapper(model=sum_embedding_trained)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,question,dist,result
0,Roundtrip ticket versus one way,0.764395,How to book this complex multi carrier roundtrip flight I found on ITA?
1,Roundtrip ticket versus one way,0.788245,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?
2,Roundtrip ticket versus one way,0.793011,Is it okay to board only the second flight of a return ticket? The return ticket is much cheaper...
3,Roundtrip ticket versus one way,0.827135,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
0,Shinkansen from Kyoto to Hiroshima,0.967517,Hokkaido Shinkansen - Sendai with JR Pass
1,Shinkansen from Kyoto to Hiroshima,0.969129,Where does the Tokaido Shinkansen stop in Tokyo?
2,Shinkansen from Kyoto to Hiroshima,0.971026,"Cheap way to get around Tokyo, Kyoto and Osaka - alternatives to the JR Pass?"
3,Shinkansen from Kyoto to Hiroshima,0.975549,Can I converse with Japanese travellers on the Shinkansen or is it rude?
0,Bus tour of Germany,0.644724,"Bus ticket transfer in Ingolstadt, Germany?"
1,Bus tour of Germany,0.695948,Bus search engines for Germany without place of arrival?


## CNN Model

Using a sum-of-embeddings model works well. What happens if we try to make a simple CNN model?

In [20]:
def cnn_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=False,
        input_dim=vocab_size,
        output_dim=embedding_size,
    )


    def _combine_sum(v):
        return K.sum(v, axis=1)

    cnn_1 = layers.Convolution1D(256, 3)
    cnn_2 = layers.Convolution1D(256, 3)
    cnn_3 = layers.Convolution1D(256, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=2, pool_size=3)

    def forward(input):
        embed = embedding(input)
        return global_pool(
            cnn_2(local_pool(cnn_1(embed))))

    sum_a = forward(title)
    sum_b = forward(body)

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=False)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [21]:
cnn, cnn_embedding = cnn_model(embedding_size=25, vocab_size=VOCAB_SIZE)
cnn.summary()
cnn.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000,
)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
body (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 25)     6250000     title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, None, 256)    19456       embedding[0][0]            

<tensorflow.python.keras.callbacks.History at 0x7f499fdc7520>

In [22]:
lookup = EmbeddingWrapper(model=cnn_embedding)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,question,dist,result
0,Roundtrip ticket versus one way,0.988642,Compassionate circumstances
1,Roundtrip ticket versus one way,0.989091,Tegelbergbahn Tickets
2,Roundtrip ticket versus one way,0.989348,Helium balloons
3,Roundtrip ticket versus one way,0.989934,Mugging 'Etiquette'?
0,Shinkansen from Kyoto to Hiroshima,0.993169,Compassionate circumstances
1,Shinkansen from Kyoto to Hiroshima,0.993772,Tegelbergbahn Tickets
2,Shinkansen from Kyoto to Hiroshima,0.994184,Helium balloons
3,Shinkansen from Kyoto to Hiroshima,0.994599,Mugging 'Etiquette'?
0,Bus tour of Germany,0.96822,jigsaw puzzle carrier
1,Bus tour of Germany,0.968379,Tegelbergbahn Tickets


## LSTM Model

We can also make an LSTM model.  Warning, this will be very slow to train and evaluate unless you have a relatively fast GPU to run it on!

In [23]:
def lstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
#         weights=[w2v_weights],
#         trainable=False
    )

    lstm_1 = layers.LSTM(units=512, return_sequences=True)
    lstm_2 = layers.LSTM(units=512, return_sequences=False)
    
    sum_a = lstm_2(lstm_1(embedding(title)))
    sum_b = lstm_2(lstm_1(embedding(body)))

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
#     sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [24]:
lstm, lstm_embedding = lstm_model(embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE)
lstm.summary()
lstm.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=100,
)

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
body (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 512)    1255424     embedding_1[0][0]          

<tensorflow.python.keras.callbacks.History at 0x7f499027b7c0>

In [25]:
lookup = EmbeddingWrapper(model=lstm_embedding)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,question,dist,result
0,Roundtrip ticket versus one way,0.996062,"EHIC, applying for a refund"
1,Roundtrip ticket versus one way,0.996072,Potential gastric problems in China
2,Roundtrip ticket versus one way,0.996233,Adding Wizz Flex after purchase
3,Roundtrip ticket versus one way,0.996433,Current Local Address Malaysia eVisa
0,Shinkansen from Kyoto to Hiroshima,0.995533,Cat Tourist Attractions In Japan
1,Shinkansen from Kyoto to Hiroshima,0.995565,Train from Krakow to Berlin
2,Shinkansen from Kyoto to Hiroshima,0.995755,Train from Brussels to Bruges
3,Shinkansen from Kyoto to Hiroshima,0.99664,Tramway from airport to Casablanca
0,Bus tour of Germany,0.993296,Multi entry Schengen visa
1,Bus tour of Germany,0.993333,Driving time around Iceland
