In [1]:
import keras

Using TensorFlow backend.


In [2]:
from kaggle_quora_question_pairs_common import *

dataset.hdf
sample_submission.csv
sample_submission.csv.zip
test.csv
test.csv.zip
train.csv
train.csv.zip





In [3]:
train_df, test_df = load_train_test()
train_df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
%%time

include_test = False
unique_questions = get_unique_questions(train_df, test_df, include_test=include_test)
char_tfidf, word_tfidf = train_char_word_tfidf(unique_questions, include_test=include_test)

log_max_mem_usage()

Current all-time max memory: 969 MB
Current all-time max memory: 1214 MB
Current all-time max memory: 1214 MB
CPU times: user 1.14 s, sys: 84 ms, total: 1.22 s
Wall time: 1.26 s


In [5]:
%%time
char_counts = Counter()

for uq in unique_questions:
    char_counts.update(uq)

char_counts.most_common()

CPU times: user 5.77 s, sys: 100 ms, total: 5.87 s
Wall time: 5.68 s


In [6]:
char2id = {c: i + 1 for i, (c, _) in enumerate(char_counts.most_common())}
char2id['UNK'] = 0
id2char = {i: c for c, i in char2id.items()}

In [7]:
def build_branch(input_layer):
    x = input_layer  # keras.layers.Embedding(input_dim=len(char2id), output_dim=256, input_length=None)(input_layer)
    x = keras.layers.Bidirectional(keras.layers.LSTM(64))(x)
#     x = keras.layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)(x)
    x = keras.layers.Dense(512, activation='relu')(x)
    x = keras.layers.Dropout(0.5)(x)
    return x

input1 = keras.layers.Input(shape=(None, len(char2id)), name='input1')
input2 = keras.layers.Input(shape=(None, len(char2id)), name='input2')
input3 = keras.layers.Input(shape=(11,), name='input3')

x1 = build_branch(input1)
x2 = build_branch(input2)
x3 = keras.layers.Dense(64, activation='tanh')(input3)
x3 = keras.layers.Dropout(0.1)(x3)

dot_layer = keras.layers.dot([x1, x2], axes=1, normalize=True)
concat_layer = keras.layers.concatenate([x1, x2, x3, dot_layer])
dense_layer = keras.layers.Dense(300, activation='relu')(concat_layer)
dense_layer = keras.layers.Dropout(0.3)(dense_layer)
output = keras.layers.Dense(1, activation='sigmoid')(dense_layer)

model = keras.models.Model(inputs=[input1, input2, input3], outputs=output)

In [8]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metric=['accuracy']
)

In [9]:
def transform_data(df, col, max_len=150):
    x = np.zeros((df.shape[0], max_len, len(char2id)))
    for ix, v in enumerate(df[col]):
        for i, vv in enumerate(v):
            if i < max_len:
                x[ix][i][char2id[vv]] = 1

    return x

In [10]:
# N = 50000
# r1 = transform_data(train_df.head(N), 'question1')
# r2 = transform_data(train_df.head(N), 'question2')
# t = train_df.head(N).is_duplicate

In [11]:
# d = word_tfidf.transform(train_df.head(10))
# d.todense()

In [12]:
%%time

def get_tfidf_features(samp):
#     w1 = word_tfidf.transform(samp.question1)
#     w2 = word_tfidf.transform(samp.question2)

    c1 = char_tfidf.transform(samp.question1)
    c2 = char_tfidf.transform(samp.question2)

    word_res = None
#     word_res = np.dot(
#         w1,
#         w2.T
#     ).diagonal()

    char_res = np.dot(
        c1,
        c2.T
    ).diagonal()

    return word_res, char_res

x = get_tfidf_features(train_df[10000: 11000])

CPU times: user 136 ms, sys: 12 ms, total: 148 ms
Wall time: 142 ms


In [13]:
log_max_mem_usage()
fasttext_model = fasttext.load_model('model_full_data.bin')
log_max_mem_usage()

stops = load_stopwords()
num_pattern = re.compile('[0-9]+')
math_pattern = re.compile('\[math\](.*)\[\/math\]')

nums = '01234567890'

def transform_fasttext_vec(qs):
    return np.array(
        [fasttext_model[q.decode('utf-8')] for q in qs]
    )

_fs_cache = {}
def cache_fasttext(j):
    w = None
    if j in _fs_cache:
        w = _fs_cache[j]
    else:
        w = fasttext_model[j]
        _fs_cache[j] = w
    
    return w
    
    
def transform_fasttext_word_vec(qs, op='mean'):
    qs_vec = []
    for q in qs:
        qd = []
        q = q.decode('utf-8')
        for j in q.split():
            qd.append(cache_fasttext(j))
        
        if op == 'mean':
            qs_vec.append(np.mean(qd, axis=0))
        elif op == 'sum':
            qs_vec.append(np.sum(qd, axis=0))
        else:
            raise ValueError('Unknown operation! Supported ops: [sum, mean].')
    
    return np.array(qs_vec)


def get_fasttext_features(samp, is_fast=True):
    fs_q1 = transform_fasttext_vec(samp.question1)
    fs_q2 = transform_fasttext_vec(samp.question2)

    fs_word_mean_q1 = transform_fasttext_word_vec(samp.question1, op='mean')
    fs_word_mean_q2 = transform_fasttext_word_vec(samp.question2, op='mean')

    if not is_fast:
        fs_cos = cosine_similarity(fs_q1, fs_q2).diagonal()
        fs_word_mean_cos = cosine_similarity(fs_word_mean_q1, fs_word_mean_q2).diagonal()    
        fs_cos_q1_x_word_mean_q2_cos = cosine_similarity(fs_q1, fs_word_mean_q2).diagonal()
        fs_cos_q2_x_word_mean_q1_cos = cosine_similarity(fs_word_mean_q1, fs_q2).diagonal()
    
    else:
        fs_cos = fast_pairwise_cos_sim(fs_q1, fs_q2)
        fs_word_mean_cos = fast_pairwise_cos_sim(fs_word_mean_q1, fs_word_mean_q2)
        fs_cos_q1_x_word_mean_q2_cos = fast_pairwise_cos_sim(fs_q1, fs_word_mean_q2)
        fs_cos_q2_x_word_mean_q1_cos = fast_pairwise_cos_sim(fs_word_mean_q1, fs_q2)

    return fs_cos, fs_word_mean_cos, fs_cos_q1_x_word_mean_q2_cos, fs_cos_q2_x_word_mean_q1_cos


def get_vector_based_features(data_df, local_batch=1000):
    # data_df size should be about 40000 when used in parallel to observe effects of optimization.
    i = 0

#     word_dataset = np.array([])
    char_dataset = np.array([])

    fs_cos_dataset = np.array([])
    fs_word_mean_cos_dataset = np.array([])
    fs_cos_q1_x_word_mean_q2_cos_dataset = np.array([])
    fs_cos_q2_x_word_mean_q1_cos_dataset = np.array([])

    while True:
        samp = data_df[i * local_batch: (i + 1) * local_batch]
        i += 1

        if samp.empty:
            break
            
        word_res, char_res = get_tfidf_features(samp)
        (
            fs_cos, fs_word_mean_cos,
            fs_cos_q1_x_word_mean_q2_cos, fs_cos_q2_x_word_mean_q1_cos
        ) = get_fasttext_features(samp)

#         word_dataset = np.concatenate([word_dataset, word_res])
        char_dataset = np.concatenate([char_dataset, char_res])

        fs_cos_dataset = np.concatenate([fs_cos_dataset, fs_cos])
        fs_word_mean_cos_dataset = np.concatenate([fs_word_mean_cos_dataset, fs_word_mean_cos])
        fs_cos_q1_x_word_mean_q2_cos_dataset = np.concatenate(
            [fs_cos_q1_x_word_mean_q2_cos_dataset, fs_cos_q1_x_word_mean_q2_cos]
        )
        fs_cos_q2_x_word_mean_q1_cos_dataset = np.concatenate(
            [fs_cos_q2_x_word_mean_q1_cos_dataset, fs_cos_q2_x_word_mean_q1_cos]
        )

    return pd.DataFrame(
        dict(
#             wv=word_dataset, 
            cv=char_dataset,
            fs_cos=fs_cos_dataset,
            fs_word_mean_cos=fs_word_mean_cos_dataset,
            fs_cos_q1_x_word_mean_q2_cos=fs_cos_q1_x_word_mean_q2_cos_dataset,
            fs_cos_q2_x_word_mean_q1_cos=fs_cos_q2_x_word_mean_q1_cos_dataset,
        ), index=data_df.index
    )

def get_basic_features(df):
    _df = pd.DataFrame()
    _df['len_q1'] = df.question1.map(len)
    _df['len_q2'] = df.question2.map(len)
    _df['len_diff'] = (_df['len_q1'] - _df['len_q2']).abs()
    
    _df['num_terms_q1'] = df.question1.str.split().map(len)
    _df['num_terms_q2'] = df.question2.str.split().map(len)
    _df['num_terms_diff'] = (_df['num_terms_q1'] - _df['num_terms_q2']).abs()
    
    return _df

Current all-time max memory: 1563 MB
Current all-time max memory: 4751 MB


In [14]:
def vector_based_score_parallel_interface(t_df, is_train):
    return delayed(get_vector_based_features)(t_df)


def parallel_scorer(samp, scorer_interface, is_train, batch, num_proc):
    # Consumes 1.5G for batch=1000 and num_proc=4 for tfidf interface
    # Use vector_based_features::batch=10000, heuristic_features::batch=20000
    # scorer_interface::[heuristic_score_parallel_interface, vector_based_score_parallel_interface]
    # Adjust batch depending on the interface used since the memory is dependent on the batch used.

    with Parallel(n_jobs=num_proc) as parallel:
        dataset = []
        is_break = False
        i = 0

        while not is_break:
            payload = []

            for j in xrange(num_proc):
                t_df = samp[(i + j) * batch: (i + 1 + j) * batch]

                if t_df.empty:
                    is_break = True
                    continue

                payload.append(scorer_interface(t_df, is_train))
            print('Current batch in main thread: {}'.format((i + j) * batch))

            if payload:
                results = parallel(payload)
                dataset.extend(results)
                i += num_proc

    return pd.concat(dataset)


def parallel_get_vector_based_scores(samp, is_train, batch=40000, num_proc=4):
    # The batch size for the size of the dataset should be large to maximize effect of parallelization.
    # The batch here is different from the batch used in the method `get_vector_based_features`
    return parallel_scorer(samp, vector_based_score_parallel_interface, is_train, batch, num_proc)


In [15]:
%%time
vector_based_features = parallel_get_vector_based_scores(train_df.head(10000), is_train=True, batch=1000, num_proc=5)

Current batch in main thread: 4000
Current batch in main thread: 9000
Current batch in main thread: 14000
CPU times: user 104 ms, sys: 56 ms, total: 160 ms
Wall time: 2.16 s


In [16]:
%%time
x = get_vector_based_features(train_df.head(1000), local_batch=1000)
# y = get_basic_features(train_df.head(1000))

CPU times: user 580 ms, sys: 16 ms, total: 596 ms
Wall time: 595 ms


In [17]:
mnmx_scaler = StandardScaler()
y = get_basic_features(train_df.sample(n=100000))
mnmx_scaler.fit(y)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
def data_generator(df, batch_size, shuffle=True, is_train=True, other_feats=True):
    while True:
        for ix in xrange(0, (df.shape[0] + batch_size) // batch_size, batch_size):
            vb = pd.DataFrame()
            d = df[ix: ix + batch_size]
            r1 = transform_data(d, 'question1')
            r2 = transform_data(d, 'question2')
            if other_feats:
                basic_feats = get_basic_features(d)
                bf_cols = basic_feats.columns
                bf_inds = basic_feats.index
                basic_feats = pd.DataFrame(mnmx_scaler.transform(basic_feats), columns=bf_cols, index=bf_inds)
                
                vector_feats = get_vector_based_features(d, local_batch=batch_size)
                vb = pd.concat([basic_feats, vector_feats], axis=1)
                        
            res = [r1, r2]
            
            if is_train:
                t = d.is_duplicate
                yield (res + [vb] if not vb.empty else res, t)
                yield (res[::-1] + [vb] if not vb.empty else res[::-1], t)
            else:
                yield res + [vb] if not vb.empty else res

batch_size = 1000
train_generator = data_generator(train_df, batch_size=batch_size, is_train=True)

In [19]:
# %%time
# d = train_generator.next()

In [20]:
%%time
model.fit_generator(train_generator, steps_per_epoch=(train_df.shape[0] * 2) // batch_size, epochs=5, initial_epoch=0)

kwargs passed to function are ignored with Tensorflow backend


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1h 24min 9s, sys: 8min 6s, total: 1h 32min 15s
Wall time: 53min 35s


<keras.callbacks.History at 0x7f1c9e01fd50>

In [None]:
%%time
batch_size = 2000
test_generator = data_generator(test_df, batch_size=batch_size, is_train=False)
preds = model.predict_generator(test_generator, steps=(test_df.shape[0] + batch_size) // batch_size, verbose=1)

CPU times: user 32min 28s, sys: 4min, total: 36min 29s
Wall time: 24min 37s


In [25]:
sub = pd.DataFrame()

sub['test_id'] = test_df['test_id']
sub['is_duplicate'] = preds[:test_df.shape[0]]

sub.to_csv('deep_learning_submission_with_bidirectional_lstm_char_level_input_and_other_feats_{}.csv'.format(datetime.now()), index=False)

In [26]:
sub.mean()

test_id         1.173124e+06
is_duplicate    2.795509e-01
dtype: float32

In [70]:
sub.mean()

test_id         1.173124e+06
is_duplicate    3.026665e-01
dtype: float32

In [51]:
sub.mean()

test_id         1.173124e+06
is_duplicate    2.987477e-01
dtype: float32

In [32]:
for i in preds[:10]:
    print i

[  8.60815020e-16]
[ 0.5662784]
[  6.57736495e-12]
[  5.27566371e-16]
[ 0.13594559]
[ 0.00520697]
[ 0.99999928]
[  4.59082194e-08]
[ 0.96046102]
[  3.14881774e-13]


In [42]:
preds[90:100]

array([[  6.18356317e-02],
       [  9.99907732e-01],
       [  9.04752553e-01],
       [  9.98574853e-01],
       [  3.05210961e-38],
       [  8.54544103e-01],
       [  9.62314703e-38],
       [  9.99891400e-01],
       [  2.11539710e-08],
       [  1.06082760e-01]], dtype=float32)

In [31]:
sub

Unnamed: 0,test_id,is_duplicate
0,0,3.270573e-07
1,1,9.999444e-01
2,2,9.999992e-01
3,3,2.759588e-19
4,4,1.021651e-05
5,5,5.453658e-01
6,6,9.999400e-01
7,7,4.364194e-13
8,8,9.999988e-01
9,9,6.618966e-05


In [27]:
preds[90:100]

array([[  5.47786731e-05],
       [  1.12011716e-04],
       [  9.90686601e-15],
       [  9.91899729e-01],
       [  3.73673187e-21],
       [  3.89634988e-05],
       [  2.12161261e-11],
       [  9.99967575e-01],
       [  1.51059606e-26],
       [  1.69954717e-10]], dtype=float32)

In [28]:
test_df[90:100]

Unnamed: 0,test_id,question1,question2
90,90,Did my Adolf Hitler kill his dog Blondi to tes...,Did Trump land the DC post office project by f...
91,91,How will scrapping currency notes of INR 500 a...,What will happen to corruption money deposited...
92,92,What are exactly?,How does akamai great money?
93,93,Who first masturbation experience?,What is your initial masturbation experience?
94,94,I want may Amazon pay balance back to my bank ...,How do you perform top hat magic tricks?
95,95,What does it mean when my husband looks at oth...,What should I do when my husband looks for oth...
96,96,For which exam a graduate electrical student s...,What are some criteria to be called ILLEGAL im...
97,97,How we can earn not easily?,How can I get genuine money easily?
98,98,What are the to different symbols used by The ...,What does the nothing symbol mean ➰?
99,99,What are which cannot be tamed by humans?,How did hal humans tame wild animals?


In [93]:
test_df.ix[37].question1

'Is it christians to create synthetic gold?'

In [94]:
test_df.ix[37].question2

'Is give it possible to turn lead into gold?'