In [3]:
!pip install --upgrade torchtext

Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/79/ef/54b8da26f37787f5c670ae2199329e7dccf195c060b25628d99e587dac51/torchtext-0.5.0-py3-none-any.whl (73kB)
[K     |████▌                           | 10kB 771kB/s eta 0:00:01[K     |█████████                       | 20kB 1.2MB/s eta 0:00:01[K     |█████████████▍                  | 30kB 1.6MB/s eta 0:00:01[K     |██████████████████              | 40kB 1.3MB/s eta 0:00:01[K     |██████████████████████▍         | 51kB 1.4MB/s eta 0:00:01[K     |██████████████████████████▉     | 61kB 1.6MB/s eta 0:00:01[K     |███████████████████████████████▍| 71kB 1.8MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 1.8MB/s 
Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/4a/72/0282449efe6e8a7ab6354ac990b8275bd8c881dcbf95b3ef0a041da3897b/torch-1.4.0-cp37-none-macosx_10_9_x86_64.whl (81.1MB)
[K     |                                | 10kB 1.9MB/s eta 0:00:44[K   

In [1]:
!pip install tqdm



In [2]:
# NOTE(review): the recorded output shows `pipenv` was not found, so this cell
# had no effect. Presumably the environment has to be activated before the
# kernel is started -- TODO confirm, or drop this cell.
!pipenv shell

/bin/sh: pipenv: command not found


In [304]:
import json
import re
import string

import gensim
import numpy as np
import torch
from nltk import word_tokenize
from nltk.corpus import stopwords

# English stop words, materialised as a set; consulted by the 'nltk' and
# 'keras' branches of preprocess() to drop tokens.
english_stopwords = set(stopwords.words("english"))
# Matches runs of non-word characters. Not referenced by any active code in
# this cell.
non_alphanum_regex = re.compile(r'\W+')

def preprocess(doc, method='nltk', dataset=True):
    """Tokenise ``doc`` and drop "null", stop words and punctuation tokens.

    Arguments:
        doc: The input to clean. For ``method='spacy'`` an iterable of
            token objects exposing ``lower_``, ``is_stop`` and ``is_punct``;
            for ``'nltk'`` and ``'keras'`` a string.
        method (str): One of ``'spacy'``, ``'nltk'`` or ``'keras'``.
        dataset (bool): When False, an empty-string result is reported as
            ``None`` instead of ``""``.

    Returns:
        A list of tokens for ``'nltk'``; a single space-joined string for
        ``'spacy'`` and ``'keras'``; ``None`` when ``dataset`` is False and
        the result is the empty string.

    Raises:
        ValueError: If ``method`` is not one of the supported names
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if method == 'spacy':
        tokens = " ".join(
            token.lower_
            for token in doc
            if token
            and not (token.lower_ == "null" or token.is_stop or token.is_punct)
        )
    elif method == 'nltk':
        # Stripping non-alphanumerics with non_alphanum_regex before
        # tokenising is currently disabled:
        # doc = non_alphanum_regex.sub(' ', doc).lower()
        tokens = [
            token
            for token in word_tokenize(doc.lower())
            if not (token == "null" or token in english_stopwords or token in string.punctuation)
        ]
    elif method == 'keras':
        # NOTE(review): text_to_word_sequence is not imported anywhere in the
        # visible notebook; presumably it is Keras' text helper -- confirm and
        # import it before using this branch.
        tokens = " ".join(
            token
            for token in text_to_word_sequence(doc)
            if not (token == "null" or token in english_stopwords or token in string.punctuation)
        )
    else:
        raise ValueError(
            "Unknown preprocessing method {!r}; expected 'spacy', 'nltk' or 'keras'".format(method)
        )
    # The 'nltk' branch yields a list, which never equals "", so it is always
    # returned; only the string-producing branches can fall through to None.
    if dataset or tokens != "":
        return tokens
    return None


def parse_content_line(x, attributes=None, label=True):
    """Turn one JSON-encoded record into a single-row NumPy array.

    Arguments:
        x (str): A JSON document describing one record.
        attributes (list of str or None): Keys to extract, in order.
            Defaults to ``["title_left", "title_right"]``.
        label (bool): When True, ``int(record["label"])`` is appended as the
            last column.

    Returns:
        numpy.ndarray with a leading axis of length 1 holding the extracted
        values; ``None`` attribute values are replaced by ``''``.

    Raises:
        KeyError: If a requested key (or ``"label"``) is absent.
        ValueError: If ``x`` is not valid JSON or the label is not an integer.
    """
    wanted = ["title_left", "title_right"] if attributes is None else attributes
    record = json.loads(x)

    row = []
    for name in wanted:
        value = record[name]
        row.append('' if value is None else value)
    if label:
        row.append(int(record["label"]))

    # Add a leading axis so rows can be concatenated along axis 0 later.
    return np.expand_dims(np.array(row), axis=0)

In [305]:
from torchtext.data import Field

# Shared text field for both title columns: tokenised with preprocess()
# (default 'nltk' method) and given a fixed length of 20 -- the sample batch
# shown further down is padded to 20 columns.
TEXT = Field(sequential=True, tokenize=preprocess, lower=True, fix_length=20, batch_first=True)
# Target field: a single non-sequential value per example that bypasses the
# vocabulary (use_vocab=False).
LABEL = Field(sequential=False, use_vocab=False, is_target=True, batch_first=True)

In [306]:
import numpy as np
import pandas

# Parse the training split line by line (one JSON record per line).
contents = []
# `with` guarantees the file handle is closed even if parsing blows up.
with open('./dataset/computers/train/computers_splitted_train_medium.json', "r") as train_file:
    for i, x in enumerate(train_file):
        try:
            item = parse_content_line(x, attributes=None, label=True)
            contents.append(item)
        # Only data problems are skipped (bad JSON, missing key, non-integer
        # label); programming errors and Ctrl-C are no longer swallowed.
        except (ValueError, KeyError, TypeError):
            print("Lost data at line {}".format(i))

# Stack the single-row arrays into one (n_rows, 3) array.
contents = np.concatenate(contents, axis=0)
train = pandas.DataFrame(data=contents, columns=['title_left', 'title_right', 'label'])

In [307]:
import numpy as np
import pandas

# Parse the validation split line by line (one JSON record per line).
contents = []
# `with` guarantees the file handle is closed even if parsing blows up.
with open('./dataset/computers/valid/computers_splitted_valid_medium.json', "r") as valid_file:
    for i, x in enumerate(valid_file):
        try:
            item = parse_content_line(x, attributes=None, label=True)
            contents.append(item)
        # Only data problems are skipped (bad JSON, missing key, non-integer
        # label); programming errors and Ctrl-C are no longer swallowed.
        except (ValueError, KeyError, TypeError):
            print("Lost data at line {}".format(i))

# Stack the single-row arrays into one (n_rows, 3) array.
contents = np.concatenate(contents, axis=0)
valid = pandas.DataFrame(data=contents, columns=['title_left', 'title_right', 'label'])

In [386]:
import numpy as np
import pandas

# Parse the test split line by line (one JSON record per line).
contents = []
# `with` guarantees the file handle is closed even if parsing blows up.
with open('./dataset/computers/test/computers_gs.json', "r") as test_file:
    for i, x in enumerate(test_file):
        try:
            item = parse_content_line(x, attributes=None, label=True)
            contents.append(item)
        # Only data problems are skipped (bad JSON, missing key, non-integer
        # label); programming errors and Ctrl-C are no longer swallowed.
        except (ValueError, KeyError, TypeError):
            print("Lost data at line {}".format(i))

# Stack the single-row arrays into one (n_rows, 3) array.
contents = np.concatenate(contents, axis=0)
test = pandas.DataFrame(data=contents, columns=['title_left', 'title_right', 'label'])

In [387]:
from torchtext.data import Field, Dataset, Example
import pandas as pd

class DataFrameDataset(Dataset):
    """Class for using pandas DataFrames as a datasource"""
    def __init__(self, examples, fields, filter_pred=None):
        """
        Create a dataset from a pandas dataframe of examples and Fields
        Arguments:
            examples pd.DataFrame: DataFrame of examples
            fields {str: Field}: The Fields to use in this tuple. The
                string is a field name, and the Field is the associated field.
            filter_pred (callable or None): use only examples for which
                filter_pred(example) is true, or use all examples if None.
                Default is None
        """
        # Row-wise iteration also works for an empty frame, where
        # DataFrame.apply(..., axis=1) gives back a DataFrame with no .tolist().
        self.examples = [
            SeriesExample.fromSeries(row, fields) for _, row in examples.iterrows()
        ]
        if filter_pred is not None:
            # Materialise the result: a bare filter() is a one-shot iterator in
            # Python 3 and supports neither len() nor indexing.
            self.examples = list(filter(filter_pred, self.examples))
        self.fields = dict(fields)
        # Unpack field tuples
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]
        
                
class SeriesExample(Example):
    """Example subclass that can be built from a pandas Series."""

    @classmethod
    def fromSeries(cls, data, fields):
        """Build an example from a Series by converting it to a dict first."""
        return cls.fromdict(data.to_dict(), fields)

    @classmethod
    def fromdict(cls, data, fields):
        """Build an example from a mapping of column name to raw value.

        Arguments:
            data (dict): Raw values keyed by column name.
            fields (dict): Maps each key to a Field, or to None to store the
                raw value without preprocessing.

        Raises:
            ValueError: If a key listed in ``fields`` is missing from ``data``.
        """
        example = cls()
        for name, fld in fields.items():
            if name not in data:
                raise ValueError("Specified key {} was not found in "
                "the input data".format(name))
            raw_value = data[name]
            stored = raw_value if fld is None else fld.preprocess(raw_value)
            setattr(example, name, stored)
        return example

In [388]:
# Both title columns share the TEXT field; the label column uses LABEL.
fields = dict(title_left=TEXT, title_right=TEXT, label=LABEL)

# One dataset per split, built in train / valid / test order.
train_ds, valid_ds, test_ds = (
    DataFrameDataset(frame, fields) for frame in (train, valid, test)
)

In [389]:
# Sanity check: show the tokenised left title of the first example per split.
print(
    *(split[0].title_left for split in (train_ds, valid_ds, test_ds)),
    sep="\n",
)

['asus', 'prime', 'x299', 'deluxe', 'prijzen', 'tweakers']
['495906', 'b21', 'hp', 'x5560', '2', '80ghz', 'ml350', 'g6', 'new', 'wholesale', 'price']
['417772', 'b21', 'hp', 'xeon', '5130', '2', '0ghz', 'dl140', 'g3', 'new', 'wholesale', 'price']


In [390]:
# The vocabulary is built over all three splits, so tokens that occur only in
# the validation or test data also receive an index.
TEXT.build_vocab(train_ds, valid_ds, test_ds)

In [391]:
from torchtext.vocab import Vectors

# torchtext's Vectors loader was not usable with this binary word2vec file (see
# the original note "needed vectors not in binary form"), so the embeddings
# are read with gensim and handed to the vocabulary directly.
# NOTE: `model` is rebound to the neural network in a later cell.
model = gensim.models.KeyedVectors.load_word2vec_format('./dataset/embeddings/w2v/w2v_title_300Epochs_1MinCount_9ContextWindow_150d.bin', binary=True)

# gensim >= 4 exposes the vocabulary as `index_to_key`; older releases call it
# `index2word`. Either way position i corresponds to row i of `model.vectors`.
w2v_words = model.index_to_key if hasattr(model, 'index_to_key') else model.index2word
w2v_stoi = {word: idx for idx, word in enumerate(w2v_words)}

# set_vectors() looks every vocabulary token up in `w2v_stoi` and copies the
# matching row; it needs the token->row map, the row store and the dimension.
TEXT.vocab.set_vectors(
    w2v_stoi,
    torch.tensor(model.vectors, dtype=torch.float),
    model.vector_size,
)

In [392]:
from torchtext.data import Iterator, BucketIterator

def _longest_title_length(example):
    """Bucketing key: the longer of the two title lengths, capped at 20."""
    longest = max(len(example.title_left), len(example.title_right))
    return min(longest, 20)


train_iter, val_iter, test_iter = BucketIterator.splits(
    (train_ds, valid_ds, test_ds),      # datasets the iterators draw from, in order
    batch_sizes=(32, 64, 64),           # one batch size per dataset above
    device=torch.device('cpu'),         # swap in a CUDA device to batch on the GPU
    sort_key=_longest_title_length,     # groups examples of similar length together
    sort_within_batch=True,
    shuffle=True,
    repeat=False,                       # single pass per epoch; BatchWrapper re-iterates
)

In [394]:
class BatchWrapper:
    """Adapt a batch iterator to yield plain ``(left, right, y)`` tuples.

    Arguments:
        dl: Iterable of batch objects exposing the named attributes.
        x_vars (sequence of str): Attribute names of the two inputs; index 0
            is yielded as ``left`` and index 1 as ``right``.
        y_var (str): Attribute name of the label.
    """

    def __init__(self, dl, x_vars, y_var):
        self.dl, self.x_vars, self.y_var = dl, x_vars, y_var

    def __iter__(self):
        for batch in self.dl:
            left = getattr(batch, self.x_vars[0])   # first input, passed through untouched
            right = getattr(batch, self.x_vars[1])  # second input, passed through untouched
            # Cast the labels to float for the loss. as_tensor keeps a tensor
            # on the device it already lives on instead of rebuilding it on
            # the CPU from a Python list.
            y = torch.as_tensor(getattr(batch, self.y_var), dtype=torch.float)

            yield (left, right, y)

    def __len__(self):
        """Number of batches in the wrapped iterable."""
        return len(self.dl)

# Wrap each split's iterator so it yields (left, right, label) tuples.
train_dl, valid_dl, test_dl = (
    BatchWrapper(split_iter, ['title_left', 'title_right'], 'label')
    for split_iter in (train_iter, val_iter, test_iter)
)

In [395]:
# Peek at the first training batch.
next(iter(train_dl))

(tensor([[   2, 1610,    7, 1199,  825,    5, 2600, 2593,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1],
         [1296,    7,    2,  104,  806,  143,  175,   86,   38,   14,   10,    9,
             1,    1,    1,    1,    1,    1,    1,    1],
         [ 441,  115,  313,  759,  120,   66,   26,  279,   53,  132,  169,  238,
           239,  240,  230,    1,    1,    1,    1,    1],
         [1556,  428,   12,   11,   41, 1157,    4,  205,   56,  185,  355,  304,
            31,   96, 2516,   84,    1,    1,    1,    1],
         [1415,    7,    2,   65, 1951,  140,  444,  412,   14,    3,   51,   10,
             9,  183,    1,    1,    1,    1,    1,    1],
         [   2,  104,  140,  444,  161, 1140,  859,   99,  399,  989, 1196, 1222,
            22,   38,  188,  262,    1,    1,    1,    1],
         [1299,  408,    2,  202,    5,  190,   67,    3,    6,  120,   81,   14,
            51,   10,    9,  223,    1,    1,    1,    1],
         [155

In [403]:
import gensim
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class SimpleLSTMBaseline(nn.Module):
    """Pairwise matcher: two LSTM encoders, a similarity map, a small CNN and an MLP.

    Each title is embedded and encoded by its own LSTM. The per-timestep
    hidden states of both sides are multiplied into a (seq_len x seq_len)
    similarity map, which goes through conv -> ReLU -> batch norm -> max pool
    and a three-layer MLP ending in a sigmoid.

    Arguments:
        hidden_dim (int): Hidden size of both LSTM encoders.
        emb_dim (int): Embedding width fed to the LSTMs; must match the
            second dimension of the pretrained vectors.
        pretrained_vectors (Tensor or None): Embedding matrix. Defaults to
            ``TEXT.vocab.vectors`` (the notebook-level field).
        seq_len (int): Token length of each input sequence; used to size the
            first linear layer. The default of 20 yields the 1296 input
            features (16 * 9 * 9) that were previously hard-coded.
    """

    def __init__(self, hidden_dim, emb_dim=150, pretrained_vectors=None, seq_len=20):
        super().__init__() # don't forget to call this!
        if pretrained_vectors is None:
            pretrained_vectors = TEXT.vocab.vectors
        # from_pretrained freezes the embedding weights by default.
        self.embedding = nn.Embedding.from_pretrained(pretrained_vectors)
        self.encoder_left = nn.LSTM(emb_dim, hidden_dim, num_layers=1, bidirectional=False, batch_first=True)
        self.encoder_right = nn.LSTM(emb_dim, hidden_dim, num_layers=1, bidirectional=False, batch_first=True)
        self.conv1 = nn.Conv2d(1, 16, 3)
        self.batch_norm1 = nn.BatchNorm2d(16)
        self.max_pool1 = nn.MaxPool2d(2)
        # 3x3 conv without padding shrinks each side by 2; 2x2 pooling halves it.
        pooled_side = (seq_len - 2) // 2
        self.mlp1 = nn.Linear(16 * pooled_side * pooled_side, 32)
        self.mlp2 = nn.Linear(32, 16)
        self.out = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, seq):
        """Score a batch of title pairs.

        Arguments:
            seq: Pair ``(left, right)`` of index tensors; ``seq[0]`` feeds the
                left encoder and ``seq[1]`` the right one.

        Returns:
            Tensor with one sigmoid output per example in the last dimension.
        """
        hdn_left, _ = self.encoder_left(self.embedding(seq[0]))
        hdn_right, _ = self.encoder_right(self.embedding(seq[1]))
        # Dot product between every left and every right timestep.
        similarity = torch.matmul(hdn_left, torch.transpose(hdn_right, 1, 2))
        # Add the channel axis expected by Conv2d.
        similarity = torch.unsqueeze(similarity, 1)
        x = self.conv1(similarity)
        x = F.relu(x)
        # Fixed: batch norm was previously called without its input.
        x = self.batch_norm1(x)
        x = self.max_pool1(x)
        x = torch.flatten(x, start_dim=1)
        x = torch.tanh(x)
        x = self.mlp1(x)
        x = F.relu(x)
        # Tie dropout to the module mode so model.eval() really disables it;
        # the functional form defaults to training=True.
        x = F.dropout(x, 0.3, training=self.training)
        x = self.mlp2(x)
        x = F.relu(x)
        x = F.dropout(x, 0.3, training=self.training)
        x = self.out(x)
        x = self.sigmoid(x)
        return x

In [404]:
# emb_dim=150 presumably mirrors the "150d" word2vec file loaded earlier --
# TODO confirm against the embedding matrix width.
model = SimpleLSTMBaseline(hidden_dim=150, emb_dim=150)

In [405]:
import tqdm

opt = optim.Adam(model.parameters(), lr=1e-4)
# The model already ends in a sigmoid, so plain BCE (not BCE-with-logits) applies.
loss_func = nn.BCELoss()

epochs = 15

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    model.train() # turn on training mode
    for left, right, y in train_dl: # the wrapper yields (left, right, label) tuples
        opt.zero_grad()
        preds = model([left, right])
        # Labels arrive as (batch,); add an axis to match the (batch, 1) predictions.
        loss = loss_func(preds, torch.unsqueeze(y, 1))
        loss.backward()
        opt.step()
        running_loss += loss.item()

    # Mean of the per-batch losses.
    epoch_loss = running_loss / len(train_dl)

    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    # No gradients are needed for validation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for left, right, y in valid_dl:
            preds = model([left, right])
            loss = loss_func(preds, torch.unsqueeze(y, 1))
            val_loss += loss.item()

    val_loss /= len(valid_dl)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

RuntimeError: bool value of Tensor with more than one value is ambiguous

In [402]:
import functools
import operator
from sklearn.metrics import classification_report

# Collect labels and predictions in the SAME pass over the test iterator so
# they stay aligned regardless of the order in which batches are produced.
y_true = []
predictions = []
model.eval() # turn on evaluation mode
with torch.no_grad(): # inference only: no autograd graph needed
    for left, right, y in test_dl:
        preds = model([left, right])
        y_true.extend(y.tolist())
        # Threshold the (batch, 1) probabilities at 0.5 and flatten them to
        # plain floats so both lists hold the same kind of value.
        predictions.extend((preds.view(-1) > .5).float().tolist())
print(classification_report(y_true, predictions))

              precision    recall  f1-score   support

         0.0       0.82      0.80      0.81       800
         1.0       0.49      0.52      0.51       300

    accuracy                           0.72      1100
   macro avg       0.65      0.66      0.66      1100
weighted avg       0.73      0.72      0.73      1100

