In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import tensorflow as tf

from model import SiameseNet
from data_utils import QuoraDataset, DataIterator
from config import Config
from embeddings import load_embeddings

import itertools

import nltk
nltk.data.path.append("nltk_data")

In [51]:
class DataIteratorAE(object):
    """Mini-batch iterator over the unique question rows of a paired dataset.

    Both question columns are stacked and de-duplicated *together*, so every
    distinct question row is served exactly once per pass, paired with the
    length recorded for its first occurrence.

    Rows come out in the lexicographic order produced by ``np.unique``, not in
    dataset order.

    Parameters
    ----------
    data : sequence of five items ``(q1, q2, l1, l2, y)``
        ``q1``/``q2`` are 2-D array-likes of equal width (one row per
        question) and ``l1``/``l2`` the matching per-row lengths. ``y`` is
        stored on the instance but not used for batching.
    batch : int, default 1
        Maximum number of rows per batch; must be at least 1.
    strict : truthy/falsy, default 0
        When truthy, trailing rows that do not fill a whole batch are dropped,
        so every batch has exactly ``batch`` rows.

    Raises
    ------
    ValueError
        If ``batch`` is smaller than 1.
    """

    def __init__(self, data, batch=1, strict=0):
        # A zero batch would make __next__ return empty slices forever (and
        # divide by zero under ``strict``), so reject it up front.
        if batch < 1:
            raise ValueError("batch must be >= 1, got {!r}".format(batch))

        self.q1, self.q2, self.l1, self.l2, self.y = data
        self.batch = batch
        self.i = 0

        # De-duplicate across BOTH columns at once: a question that appears in
        # q1 and in q2 must be kept only once.
        questions = np.concatenate([np.asarray(self.q1), np.asarray(self.q2)], axis=0)
        lengths = np.concatenate([np.asarray(self.l1), np.asarray(self.l2)], axis=0)
        self.q, first_ids = np.unique(questions, return_index=True, axis=0)
        self.l = lengths[first_ids]

        self.max = len(self.q)
        if strict:
            # Drop the incomplete final batch.
            self.max -= self.max % self.batch

    def __iter__(self):
        # Rewind so the same object can drive several passes (epochs); without
        # this a second ``for`` loop over the iterator would end immediately.
        self.i = 0
        return self

    def __next__(self):
        """Return the next ``(questions, lengths)`` batch or raise StopIteration."""
        if self.i >= self.max:
            raise StopIteration
        start, stop = self.i, min(self.i + self.batch, self.max)
        self.i += self.batch
        return self.q[start:stop], self.l[start:stop]

In [4]:
### Loading config and pretrained GloVe embeddings
config = Config()
# load_embeddings returns a 2-tuple: an embedding table and a (w2idx, idx2w) pair.
# Later cells look words up with w2idx.get(word) and index loaded_embeddings with the result.
# NOTE(review): binary=False presumably selects the text-format file -- confirm in embeddings.py.
loaded_embeddings, (w2idx, idx2w) = load_embeddings(config.glove_filename, binary=False)

Loading from saved word_embeddings
Loading vocab


In [5]:
### Loading Quora Datasets
qd_train = QuoraDataset(config.train_filename, save_path=config.train_save)
w2idx_train, idx2w_train = qd_train.w2idx, qd_train.idx2w

# Small-variance random initialisation, one row per training-vocabulary entry.
# A dedicated seeded generator keeps these rows identical across
# "Restart Kernel -> Run All" without touching NumPy's global random state.
EMBEDDING_SEED = 42
embedding_rng = np.random.RandomState(EMBEDDING_SEED)
embeddings = embedding_rng.normal(scale=0.001, size=(len(w2idx_train), config.we_dim))

In [6]:
# Replace the random row of every training-vocabulary word that the pretrained
# vocabulary also contains; all other rows keep their random initialisation.
for w, i in w2idx_train.items():
    idx = w2idx.get(w)
    if idx is None:
        # Word has no pretrained vector.
        continue
    embeddings[i] = loaded_embeddings[idx]

In [7]:
# Dev and test splits receive w2idx=w2idx_train, i.e. the training vocabulary
# is passed in instead of each split deriving its own.
qd_dev  = QuoraDataset(config.dev_filename, w2idx=w2idx_train, save_path=config.dev_save)
qd_test = QuoraDataset(config.test_filename, w2idx=w2idx_train, save_path=config.test_save)

In [8]:
# Materialise each split with the same padlen. DataIteratorAE unpacks the
# result as (q1, q2, l1, l2, y).
# NOTE(review): presumably rows are padded/truncated to config.padlen -- confirm in data_utils.
train_data = qd_train.data(padlen=config.padlen)
dev_data = qd_dev.data(padlen=config.padlen)
test_data = qd_test.data(padlen=config.padlen)

In [9]:
# Sanity check: first 10 columns of the first 10 rows of dev_data[0]
# (the item DataIteratorAE unpacks as q1).
np.array(dev_data[0])[:10, :10]

array([[21121,   358,     2,   534,   835,    13,     0,     0,     0,
            0],
       [    1,   133,   134,   108,   488,   489,     7,  9534,     7,
           13],
       [    1,   110,   112,   320,   827,   828,    13,     0,     0,
            0],
       [    1,   133,   134,     0,  6122,    13,     0,     0,     0,
            0],
       [   88,  1762,   146,     3,  3673,   430,  6222,  1291,    74,
         2966],
       [    1,     2,     3,   214, 17417,    15,  1216,    30,    29,
          323],
       [   28,   110,    30, 11353,   127,   244,     7,   230,  2000,
           13],
       [  327,   133,  1465,  1466,   307,  1017,    15,   627,  1467,
            3],
       [  351,  1546,  1547,   462,  1548,  1549,   186,  1550,   290,
          483],
       [   45,    74,    28,   351,   112,  1745,   429,   194,  1746,
           13]], dtype=int32)

In [52]:
# One iterator per split, using DataIteratorAE's defaults (batch=1, strict=0).
train_ae = DataIteratorAE(train_data)
dev_ae = DataIteratorAE(dev_data)
test_ae = DataIteratorAE(test_data)

In [53]:
# Pull one batch to inspect it; note this advances the iterator's position.
next(train_ae)

(array([[ 256, 8265,   13,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0]]), array([3]))

In [55]:
# NOTE(review): scratch cell -- the value is neither assigned nor used anywhere
# in the visible notebook; candidate for removal.
np.random.random()

0.34613329443467633