In [1]:
#!pip install torchtext

In [2]:
from torchtext.data import Dataset, BucketIterator, Field, TabularDataset, Iterator
from torchtext.vocab import Vocab
import pandas as pd
import numpy as np
import spacy

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch import nn
from random import shuffle

In [3]:
from torch.nn import Linear
from torch.nn.functional import softmax, relu

from sklearn.manifold import TSNE

# we'll use the bokeh library to create beautiful plotstttfttftftft
# *_notebook functions are needed for correct use in jupyter
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, show, push_notebook
output_notebook()

In [4]:
use_cuda = torch.cuda.is_available()

def get_variable(x):
    """ Converts tensors to cuda, if available. """
    if use_cuda:
        return x.cuda()
    return x

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

In [5]:
def condence(nparr):
    uniq = np.unique(nparr)
    name2idx = {o:i for i,o in enumerate(uniq)}
    return np.array([name2idx[o] for o in nparr]), uniq

## Read set
data='raw-data.csv'
interactions = 'user-info-small.csv'
rawtext = pd.read_csv(data)
interactions = pd.read_csv(interactions)
interactions["user.id"], uniq = condence(interactions["user.id"].values)
sizes = [0.7, 0.2]

iteractions = np.random.shuffle(interactions.values) # Shuffle
interactions["doc.id"] = [rawtext.iloc[int(idx),1] for idx in interactions["doc.id"]]

n = len(interactions)
train_size = int(sizes[0] * n)
val_size = int(sizes[1] * n)
test_size = n - train_size - val_size

train = interactions[:train_size]
val = interactions[train_size:train_size+val_size]
test = interactions[train_size+val_size:]

n = len(train)

uniq_items = np.unique(rawtext["doc.id"])[:-1]
uniq_users = np.unique(train["user.id"])[:-1]
items = set((x[0],x[1]) for x in train[["user.id","doc.id"]].values)

pairs = []
i = 0
while(i < n):
    
    item = np.random.choice(uniq_items, size = 1)[0]
    user = np.random.choice(uniq_users, size = 1)[0]
    if (user,item) not in items:
        i += 1
        pairs += [(user,item,0)]
        items.add((user,item))

interactionsNegatives = np.vstack((train, pairs))

train = pd.DataFrame(data = interactionsNegatives,
                                    columns = ["user.id","doc.id", "rating"]
                                    )


train["doc.id"].loc[n:] = [rawtext.iloc[int(idx),1] for idx in train["doc.id"].loc[n:]]

train = train.sample(frac=1) #shuffle panda style

train.to_csv('citeulike/train.csv', header = False, index = False)
val.to_csv('citeulike/val.csv', header = False, index = False)
test.to_csv('citeulike/test.csv', header = False, index = False)

In [6]:
len(train),len(val)

(2988, 427)

In [7]:
spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = Field(sequential=True, lower=True)
LABEL = Field(sequential=False, use_vocab=False)
ID = Field(sequential=False, use_vocab=False)

train, val, test = TabularDataset.splits(
        path='citeulike', train='train.csv',
        validation='val.csv', test='test.csv', format='csv',
        fields=[('ID', ID), ('Text', TEXT), ('Label', LABEL)])

TEXT.build_vocab(train, vectors="glove.6B.100d")
LABEL.build_vocab(train)
ID.build_vocab(train)

In [8]:
print('Text fields:')
#print('keys of TEXT.vocab:', list(TEXT.vocab.__dict__.keys()))
print(' size of vocabulary:', len(TEXT.vocab))
print(" vocabulary's embedding dimension:", TEXT.vocab.vectors.size())
print(' no. times the "fun" appear in the dataset:', TEXT.vocab.freqs['fun'])

print('\nLabel fields:')
#print('keys of LABEL.vocab:', list(LABEL.vocab.__dict__.keys()))
print(" list of vocabulary (int-to-str):", LABEL.vocab.itos)
print(" list of vocabulary (str-to-int):", dict(LABEL.vocab.stoi))

Text fields:
 size of vocabulary: 5038
 vocabulary's embedding dimension: torch.Size([5038, 100])
 no. times the "fun" appear in the dataset: 0

Label fields:
 list of vocabulary (int-to-str): ['<unk>', '0', '1']
 list of vocabulary (str-to-int): {'<unk>': 0, '0': 1, '1': 2}


In [9]:
batch_size = (30, 30, 30)

train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test), batch_sizes=batch_size, sort_key=lambda x: len(x.Text))

In [None]:
# print batch information
batch = next(iter(train_iter))
print("dimension of batch's text:", batch.Text.size())
print("first sequence in text:", batch.Text[:,0])
print("correct label index:", batch.Label[0])
print("the actual label:", LABEL.vocab.itos[get_numpy(batch.Label[0])])

dimension of batch's text: torch.Size([24, 30])
first sequence in text: tensor([  115,  1457,   556,    18,   149,     2,   960,  1120,   809,
          339,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1])
correct label index: tensor(0)
the actual label: <unk>


Bag of Words Model


In [None]:
# size of embeddings
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)
        # use pretrained embeddings
        self.embeddings.weight.data.copy_(TEXT.vocab.vectors)
        
        # add hidden layers
        # YOUR CODE HERE!
        self.l_1 = Linear(in_features=embedding_dim,
                           out_features=30,
                           bias=True)
        self.l_2 = Linear(in_features=30,
                           out_features=30,
                           bias=True)
        
        self.dropout1 = nn.Dropout(p=0.5)
        self.dropout2 = nn.Dropout(p=0.5)
        self.dropout3 = nn.Dropout(p=0.5)
        
        # output layer
        self.l_out = Linear(in_features=30,
                            out_features=num_classes,
                            bias=False)
        
    def forward(self, x):
        out = {}
        # get embeddings
        x = self.embeddings(x)
        
        # mean embeddings, this is the bag of words trick
        out['bow'] = x = torch.mean(x, dim=0)
        
        # add hidden layers
        # YOUR CODE HERE!
        out['l1_activations'] = x = self.dropout1(relu(self.l_1(x)))
        out['l2_activations'] = x = self.dropout2(relu(self.l_2(x)))


        # Softmax
        out['out'] = softmax(self.l_out(x), dim=1)
        return out

net = Net()

print(net)

Net(
  (embeddings): Embedding(5038, 100)
  (l_1): Linear(in_features=100, out_features=30, bias=True)
  (l_2): Linear(in_features=30, out_features=30, bias=True)
  (dropout1): Dropout(p=0.5)
  (dropout2): Dropout(p=0.5)
  (dropout3): Dropout(p=0.5)
  (l_out): Linear(in_features=30, out_features=3, bias=False)
)


In [None]:
print(LABEL.vocab.itos)

['<unk>', '0', '1']


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay=1e-5)

def accuracy(ys, ts):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

In [None]:
def construct_sentences(batch):
    """    
    Parameters
    ----------
    batch: torchtext.data.batch.Batch
    
    Returns
    -------
    [str]
    """
    return [" ".join([TEXT.vocab.itos[elm] 
                      for elm in get_numpy(batch.Text[:,i])])
            for i in range(batch.Text.size()[1])]

def get_labels(batch):
    """
    Parameters
    ----------
    batch: torchtext.data.batch.Batch
    
    Returns
    -------
    [str]
    """
    return [LABEL.vocab.itos[get_numpy(batch.Label[i])] for i in range(len(batch.Label))]


In [None]:
# to project our hidden embeddings to a visualizable space
tsne = TSNE(perplexity=10.0, learning_rate=5.0, n_iter=2000)

# index for each label
colormap = {1: 'DodgerBlue', 2: 'FireBrick'}

# create a tmp source to be updated later
validation_set_size = len(val)
source = ColumnDataSource(data={'x': np.random.randn(validation_set_size),
                                'y': np.random.randn(validation_set_size),
                                'colors': ['green']*validation_set_size,
                                'sentences': ["tmp"]*validation_set_size,
                                'labels': ["unk"]*validation_set_size})

# instance to define hover logic in plot
hover = HoverTool(tooltips=[("Sentence", "@sentences"), ("Label", "@labels")])

# set up the bokeh figure for later visualizations
p = figure(tools=[hover])
p.circle(x='x', y='y', fill_color='colors', size=5, line_color=None, source=source)

def update_plot(meta, layer, handle):
    """ 
    Update existing plot
    
    Parameters
    ----------
    meta: dict
    layer: str
    """
    tsne_acts = tsne.fit_transform(meta[layer])
    source.data['x'] = tsne_acts[:,0]
    source.data['y'] = tsne_acts[:,1]
    source.data['colors'] = [colormap[l] for l in meta['label_idx']]
    
    source.data['sentences'] = meta['sentences']
    source.data['labels'] = meta['labels']
    
    # this updates the given plot
    push_notebook(handle=handle)

In [None]:
max_iter = 3000
eval_every = 1000
log_every = 200

# will be updated while iterating
tsne_plot = show(p, notebook_handle=True)

train_loss, train_accs = [], []

net.train()
for i, batch in enumerate(train_iter):
    if i % eval_every == 0:
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
        val_meta = {'label_idx': [], 'sentences': [], 'labels': []}
        for val_batch in val_iter:
            output = net(val_batch.Text)
            # batches sizes might vary, which is why we cannot just mean the batch's loss
            # we multiply the loss and accuracies with the batch's size,
            # to later divide by the total size
            val_losses += criterion(output['out'], val_batch.Label) * val_batch.batch_size
            val_accs += accuracy(output['out'], val_batch.Label) * val_batch.batch_size
            val_lengths += val_batch.batch_size
            
            for key, _val in output.items():
                if key not in val_meta:
                    val_meta[key] = []
                val_meta[key].append(get_numpy(_val)) 
            val_meta['label_idx'].append(get_numpy(val_batch.Label))
            val_meta['sentences'].append(construct_sentences(val_batch))
            val_meta['labels'].append(get_labels(val_batch))
        
        for key, _val in val_meta.items():
            val_meta[key] = np.concatenate(_val)
        
        # divide by the total accumulated batch sizes
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        print("valid, it: {} loss: {:.2f} accs: {:.2f}\n".format(i, get_numpy(val_losses), get_numpy(val_accs)))
        update_plot(val_meta, 'bow', tsne_plot)
        
        net.train()
    
    output = net(batch.Text)
    batch_loss = criterion(output['out'], batch.Label)
    
    train_loss.append(get_numpy(batch_loss))
    train_accs.append(get_numpy(accuracy(output['out'], batch.Label)))
    
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    
    if i % log_every == 0:        
        print("train, it: {} loss: {:.2f} accs: {:.2f}".format(i, 
                                                               np.mean(train_loss), 
                                                               np.mean(train_accs)))
        # reset
        train_loss, train_accs = [], []
        
    if max_iter < i:
        break

  return Variable(arr, volatile=not train)


valid, it: 0 loss: 1.09 accs: 0.66

train, it: 0 loss: 1.09 accs: 0.53
train, it: 200 loss: 1.00 accs: 0.49
train, it: 400 loss: 0.96 accs: 0.49
train, it: 600 loss: 0.96 accs: 0.51
train, it: 800 loss: 0.96 accs: 0.50


  return Variable(arr, volatile=not train)


valid, it: 1000 loss: 0.96 accs: 0.54

train, it: 1000 loss: 0.96 accs: 0.51
train, it: 1200 loss: 0.96 accs: 0.51
train, it: 1400 loss: 0.96 accs: 0.50
train, it: 1600 loss: 0.96 accs: 0.52
train, it: 1800 loss: 0.95 accs: 0.54


  return Variable(arr, volatile=not train)


valid, it: 2000 loss: 0.97 accs: 0.57

train, it: 2000 loss: 0.89 accs: 0.66
train, it: 2200 loss: 0.75 accs: 0.82
train, it: 2400 loss: 0.69 accs: 0.87
train, it: 2600 loss: 0.66 accs: 0.90
train, it: 2800 loss: 0.65 accs: 0.91


  return Variable(arr, volatile=not train)


valid, it: 3000 loss: 1.00 accs: 0.55



In [None]:
# Vi har prøvet at bruge dropout og weight deca for at forhindre ovefitting.... Det gik ikke så godt, har vi andre ideer?
