In [1]:
from collections import Counter
import pickle

from keras.models import load_model
from keras.preprocessing import sequence, text

import pandas as pd
import numpy as np
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
data = pd.read_csv('../data/eval1_unlabelled.tsv', sep='\t', names=['query_id', 'query_text', 'passage_text', 'passage_id'])
data.head()

Unnamed: 0,query_id,query_text,passage_text,passage_id
0,1135787,distance between erie in buffalo new york,Erie Canal Distance Tables The Erie Canal is t...,0
1,1135787,distance between erie in buffalo new york,What is the distance between Erie AND Buffalo?...,1
2,1135787,distance between erie in buffalo new york,The distance between Erie and Buffalo in a str...,2
3,1135787,distance between erie in buffalo new york,Erie Canal Distances. Erie Canal Distance Tabl...,3
4,1135787,distance between erie in buffalo new york,Erie's Metropolitan Area consists of approxima...,4


In [3]:
with open('../data/tokenizer.pkl', 'rb') as f:
    tk = pickle.load(f)
    
max_len_q = 12
max_len_p = 20

print('Tokenizing...')

tk.fit_on_texts(list(data.query_text.values.astype(str)) + list(data.passage_text.values.astype(str)))

x1 = tk.texts_to_sequences(data.query_text.values.astype(str))
x1 = sequence.pad_sequences(x1, maxlen=max_len_q)

x2 = tk.texts_to_sequences(data.passage_text.values.astype(str))
x2 = sequence.pad_sequences(x2, maxlen=max_len_p)
x1.shape

Tokenizing...


(104170, 12)

In [5]:
model = load_model('../data/siamese-model1to9.h5')

In [6]:
data['cs'] = model.predict([x1, x2, x1, x2, x1, x2])
data.head()

Unnamed: 0,query_id,query_text,passage_text,passage_id,cs
0,1135787,distance between erie in buffalo new york,Erie Canal Distance Tables The Erie Canal is t...,0,0.613683
1,1135787,distance between erie in buffalo new york,What is the distance between Erie AND Buffalo?...,1,0.524932
2,1135787,distance between erie in buffalo new york,The distance between Erie and Buffalo in a str...,2,0.524932
3,1135787,distance between erie in buffalo new york,Erie Canal Distances. Erie Canal Distance Tabl...,3,0.521216
4,1135787,distance between erie in buffalo new york,Erie's Metropolitan Area consists of approxima...,4,0.506737


In [7]:
uniq, index = np.unique(data['query_id'], return_index=True)
query_id = uniq[index.argsort()]
query_id[:5]

array([1135787,  281922,  120233,  319757,  193633])

In [8]:
scores = data['cs'].values.reshape(-1,10)
print(scores.shape)
scores

(10417, 10)


array([[0.6136826 , 0.52493227, 0.52493227, ..., 0.5608517 , 0.4810743 ,
        0.52493227],
       [0.510451  , 0.5049818 , 0.52731055, ..., 0.49929017, 0.41922548,
        0.5117725 ],
       [0.55310667, 0.55000854, 0.49361286, ..., 0.61038864, 0.56476027,
        0.5584213 ],
       ...,
       [0.53198993, 0.4686342 , 0.4757852 , ..., 0.4829717 , 0.49911857,
        0.49006635],
       [0.44709045, 0.44709045, 0.51587933, ..., 0.466191  , 0.46058592,
        0.42228428],
       [0.4182393 , 0.43140814, 0.5063771 , ..., 0.50422466, 0.530922  ,
        0.44670886]], dtype=float32)

In [9]:
print(query_id.shape)
scores.shape

(10417,)


(10417, 10)

In [10]:
answer = np.column_stack((query_id,scores))

In [12]:
answer = pd.DataFrame(answer)
answer.iloc[:,0] = answer.iloc[:,0].astype('int')
answer.to_csv('../data/answer.tsv', sep='\t', header=None, index=False)
answer.head(n=50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1135787,0.613683,0.524932,0.524932,0.521216,0.506737,0.549126,0.440564,0.560852,0.481074,0.524932
1,281922,0.510451,0.504982,0.527311,0.497864,0.526705,0.549481,0.505996,0.49929,0.419225,0.511773
2,120233,0.553107,0.550009,0.493613,0.558451,0.539314,0.538486,0.528042,0.610389,0.56476,0.558421
3,319757,0.532865,0.510803,0.613385,0.611891,0.497117,0.526984,0.535258,0.425675,0.483732,0.462086
4,193633,0.536562,0.642732,0.509769,0.381755,0.430087,0.542797,0.523444,0.64924,0.630465,0.55032
5,50229,0.508803,0.604548,0.469625,0.511439,0.464433,0.597303,0.460846,0.521541,0.537547,0.503917
6,130810,0.495425,0.511974,0.472412,0.416648,0.557038,0.537513,0.439909,0.551819,0.537513,0.384728
7,138208,0.627094,0.488853,0.532591,0.488402,0.38845,0.637515,0.542618,0.54014,0.617484,0.549791
8,334554,0.486581,0.561839,0.569028,0.465878,0.54432,0.514431,0.461728,0.490855,0.472327,0.515331
9,204203,0.43361,0.328428,0.441432,0.455252,0.417146,0.756391,0.676281,0.271085,0.248427,0.294386


In [14]:
answer.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64