In [2]:
!pip install --upgrade torchtext

Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/79/ef/54b8da26f37787f5c670ae2199329e7dccf195c060b25628d99e587dac51/torchtext-0.5.0-py3-none-any.whl (73kB)
[K     |████████████████████████████████| 81kB 378kB/s eta 0:00:01
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/11/e0/1264990c559fb945cfb6664742001608e1ed8359eeec6722830ae085062b/sentencepiece-0.1.85-cp37-cp37m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 1.6MB/s eta 0:00:01
Installing collected packages: sentencepiece, torchtext
Successfully installed sentencepiece-0.1.85 torchtext-0.5.0


In [83]:
!pip install tqdm



In [45]:
!pipenv shell

[39m[22mShell for[39m[22m [32m[1m/home/belerico/.local/share/virtualenvs/aml-project-EQl709OG[39m[22m [39m[1malready activated.[39m[22m
No action taken to avoid nested environments.
[0m

In [115]:
import re
import json
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

english_stopwords = set(stopwords.words("english"))
non_alphanum_regex = re.compile(r'\W+')

def preprocess(doc, method='nltk', dataset=True):
    if method == 'spacy':
        tokens = " ".join(
            [
                token.lower_
                for token in doc
                if token
                and not (token.lower_ == "null" or token.is_stop or token.is_punct)
            ]
        )
    elif method == 'nltk':
        # doc = non_alphanum_regex.sub(' ', doc).lower()
        tokens = [
                token
                for token in word_tokenize(doc.lower())
                if not (token == "null" or token in english_stopwords or token in string.punctuation)
            ]
    elif method == 'keras':
        tokens = " ".join(
            [
                token
                for token in text_to_word_sequence(doc)
                if not (token == "null" or token in english_stopwords or token in string.punctuation)
            ]
        )
    if dataset or tokens != "":
        return tokens


def parse_content_line(x, attributes=None, label=True):
    if attributes is None:
        attributes = ["title_left", "title_right"]
    item = json.loads(x)
    elements = [item[attr] if item[attr] is not None else '' for attr in attributes]
    if label:
        elements.append(int(item["label"]))
    item = np.array(elements)
    return item[np.newaxis, :]

In [126]:
from torchtext.data import Field

TEXT = Field(sequential=True, tokenize=preprocess, lower=True, fix_length=20, batch_first=True, pad_token='0')
LABEL = Field(sequential=False, use_vocab=False, is_target=True, batch_first=True)

In [127]:
import numpy as np
from pandas import pandas

contents = []
for i, x in enumerate(open('./dataset/computers/train/computers_splitted_train_medium.json', "r").readlines()):
    try:
        item = parse_content_line(x, attributes=None, label=True)
        contents.append(item)
    except:
        print("Lost data at line {}".format(i))

contents = np.concatenate(contents, axis=0)
train = pandas.DataFrame(data=contents, columns=['title_left', 'title_right', 'label'])

In [128]:
import numpy as np
from pandas import pandas

contents = []
for i, x in enumerate(open('./dataset/computers/valid/computers_splitted_valid_medium.json', "r").readlines()):
    try:
        item = parse_content_line(x, attributes=None, label=True)
        contents.append(item)
    except:
        print("Lost data at line {}".format(i))

contents = np.concatenate(contents, axis=0)
valid = pandas.DataFrame(data=contents, columns=['title_left', 'title_right', 'label'])

In [129]:
from torchtext.data import Field, Dataset, Example
import pandas as pd

class DataFrameDataset(Dataset):
    """Class for using pandas DataFrames as a datasource"""
    def __init__(self, examples, fields, filter_pred=None):
        """
        Create a dataset from a pandas dataframe of examples and Fields
        Arguments:
            examples pd.DataFrame: DataFrame of examples
            fields {str: Field}: The Fields to use in this tuple. The
                string is a field name, and the Field is the associated field.
            filter_pred (callable or None): use only exanples for which
                filter_pred(example) is true, or use all examples if None.
                Default is None
        """
        self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
        if filter_pred is not None:
            self.examples = filter(filter_pred, self.examples)
        self.fields = dict(fields)
        # Unpack field tuples
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]
        
                
class SeriesExample(Example):
    """Class to convert a pandas Series to an Example"""
    @classmethod
    def fromSeries(cls, data, fields):
        return cls.fromdict(data.to_dict(), fields)
    
    @classmethod
    def fromdict(cls, data, fields):
        ex = cls()
        for key, field in fields.items():
            if key not in data:
                raise ValueError("Specified key {} was not found in "
                "the input data".format(key))
            if field is not None:
                setattr(ex, key, field.preprocess(data[key]))
            else:
                setattr(ex, key, data[key])
        return ex

In [130]:
fields = {"title_left": TEXT, 'title_right': TEXT, 'label': LABEL}
train_ds = DataFrameDataset(train, fields)
valid_ds = DataFrameDataset(valid, fields)

In [131]:
print(train_ds[0].title_left)
print(valid_ds[0].title_left)

['asus', 'prime', 'x299', 'deluxe', 'prijzen', 'tweakers']
['495906', 'b21', 'hp', 'x5560', '2', '80ghz', 'ml350', 'g6', 'new', 'wholesale', 'price']


In [132]:
TEXT.build_vocab(train_ds, valid_ds)

In [139]:
TEXT.vocab.freqs.most_common(10)

[('hp', 4006),
 ('2', 3293),
 ('3', 3033),
 ('gb', 2584),
 ('5', 2269),
 ('b21', 2149),
 ('com', 2030),
 ('price', 1979),
 ('wholesale', 1912),
 ('core', 1809)]

In [199]:
from torchtext.data import Iterator, BucketIterator

train_iter, val_iter = BucketIterator.splits(
     (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
     batch_sizes=(32, 32),
     device=torch.device('cpu'), # if you want to use the GPU, specify the GPU number here
     sort_key=lambda x: min(max(len(x.title_left), len(x.title_right)), 20), # the BucketIterator needs to be told what function it should use to group the data.
     sort_within_batch=True,
     repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [222]:
class BatchWrapper:
    def __init__(self, dl, x_vars, y_var):
        self.dl, self.x_vars, self.y_var = dl, x_vars, y_var # we pass in the list of attributes for x 

    def __iter__(self):
        for batch in self.dl:
            left = getattr(batch, self.x_vars[0]) # we assume only one input in this wrapper
            right = getattr(batch, self.x_vars[1]) # we assume only one input in this wrapper
            y = torch.Tensor(list(map(float, getattr(batch, self.y_var))))

            yield (left, right, y)

    def __len__(self):
        return len(self.dl)

train_dl = BatchWrapper(train_iter, ['title_left', 'title_right'], 'label')
valid_dl = BatchWrapper(val_iter, ['title_left', 'title_right'], 'label')

In [223]:
next(train_dl.__iter__())

(tensor([[  12,  304,  699,  432,    5, 1283,  606,  300, 2030,  408,   38,  757,
           480,    3,    6,  211,   30, 2243,    1,    1],
         [2603,   49,    2,  252,    5,  180,   62,    3,    6,  382,   71,   13,
            43,   10,    9,  208,    1,    1,    1,    1],
         [  64,  186,  376, 1288,  926,  410,   74,   29,   40,    5,  299, 2085,
           569,  326,  173,  285,  101,    8,    1,    1],
         [  19,   84,   26,  107,    4,   11,   24,   36,    5,   81,  222,   42,
           790,  795,  157,  153,  101,    8,    1,    1],
         [  96,   23,  378,  253,  395,  197,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1],
         [1434, 2306,  187, 3154,  291,  358,  232,  128,  318,   50, 1853, 3782,
           218, 3536, 3660, 3587,  321,  153,    1,    1],
         [  42,   23,  237,    4,    6,   15,  729,  216, 2060,  181,  362,  361,
           223,  307,  392,  317,  354,  352,    1,    1],
         [  1

In [252]:
import gensim
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=150, num_linear=3):
        super().__init__() # don't forget to call this!
        model = gensim.models.KeyedVectors.load_word2vec_format('./dataset/embeddings/w2v/w2v_title_300Epochs_1MinCount_9ContextWindow_150d.bin', binary=True)
        weights = torch.FloatTensor(model.vectors)
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.encoder_left = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.encoder_right = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.conv1 = nn.Conv2d(1, 16, 3)
        self.max_pool1 = nn.MaxPool2d(2)
        self.mlp1 = nn.Linear(1296, 32)
        self.predictor = nn.Linear(32, 1)

    def forward(self, seq):
        hdn_left, _ = self.encoder_left(self.embedding(seq[0]))
        hdn_right, _ = self.encoder_right(self.embedding(seq[1]))
        similarity = torch.matmul(hdn_left, torch.transpose(hdn_right, 1, 2))
        similarity = torch.unsqueeze(similarity, 1)
        x = self.conv1(similarity)
        x = F.relu(x)
        x = self.max_pool1(x)
        x = torch.flatten(x, start_dim=1)
        x = self.mlp1(x)
        x = F.relu(x)
        x = self.predictor(x)
        x = F.relu(x)
        return x

In [253]:
em_sz = 150
nh = 150
nl = 3
model = SimpleLSTMBaseline(nh, emb_dim=em_sz)

In [254]:
import tqdm

opt = optim.Adam(model.parameters(), lr=1e-4)
loss_func = nn.BCEWithLogitsLoss()

epochs = 5

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for left, right, y in train_dl: # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model([left, right])
        loss = loss_func(torch.unsqueeze(y, 1), preds)
        loss.backward()
        opt.step()
        running_loss += loss.data.item() * left.size(0)
        
    epoch_loss = running_loss / len(train_dl)

    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for left, right, y in valid_dl:
        preds = model([left, right])
        loss = loss_func(torch.unsqueeze(y, 1), preds)
        val_loss += loss.data.item() * left.size(0)

    val_loss /= len(valid_dl)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

Epoch: 1, Training Loss: -6759.4239, Validation Loss: -24579.5024
Epoch: 2, Training Loss: -57594.1496, Validation Loss: -99182.1950
Epoch: 3, Training Loss: -163141.1378, Validation Loss: -237726.1536


KeyboardInterrupt: 

In [None]:
As = torch.randn(32,20,50)
Bs = torch.randn(32,20,50)
Cs = torch.matmul(As, torch.transpose(Bs, 1, 2))
print(Cs.shape)
print(torch.unsqueeze(Cs, 1).shape)