### Dataloader 


In this script we will perform a LSTM neural network on the CiteULike dataset. 

In [42]:
#!pip install torchtext

In [43]:
from torchtext.data import Dataset, BucketIterator, Field, TabularDataset, Iterator
from torchtext.vocab import Vocab
import pandas as pd
import numpy as np
import spacy

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch import nn
import torch.nn.utils.rnn as rnn_utils


from torchtext import data
from torchtext import datasets

from torch.autograd import Variable

from torch.nn import Linear, RNN, LSTM

from sklearn.manifold import TSNE

# we'll use the bokeh library to create beautiful plots
# *_notebook functions are needed for correct use in jupyter
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, show, push_notebook
output_notebook()


In [44]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

Defining our dataloader
Splitting the dataset up in a training, validation and test set. 

In [45]:
## Read datasets 
CiteULike_data=pd.read_csv('raw-data.csv')
#Loading small dataset for speed 
User_info = pd.read_csv('user-info-small.csv')


def zero_indexing(nparr):
    uniq = np.unique(nparr)
    name2idx = {o:i for i,o in enumerate(uniq)}
    return np.array([name2idx[o] for o in nparr]), uniq

User_info["user.id"], uniq = zero_indexing(User_info["user.id"].values)

User_info = User_info.sample(frac=1) # Shuffle
User_info["doc.id"] = [CiteULike_data.iloc[int(idx),1] for idx in User_info["doc.id"]]

max_length = max((len(s.split(' ')) for s in CiteULike_data.iloc[:,1]))


# split train, validation and test in respectively 70%,20% and 10% of the data size

n=len(User_info)
trn_len=int(n*0.7)
val_len=int(n*0.2)
test_len=n-trn_len-val_len


train = User_info[:trn_len]
val = User_info[trn_len:-test_len]
test = User_info[trn_len:-val_len]

#Defining unique coloumns and userId and removing the header 
uniq_items = np.unique(CiteULike_data["doc.id"])[:-1]
uniq_users = np.unique(train["user.id"])[:-1]
items = set((x[0],x[1]) for x in train[["user.id","doc.id"]].values)

pairs = []
i = 0
while(i < len(train)):
    
    item = np.random.choice(uniq_items, size = 1)[0]
    user = np.random.choice(uniq_users, size = 1)[0]
    if (user,item) not in items:
        i += 1
        pairs += [(user,item,0)]
        items.add((user,item))

        
        
User_info_Negatives = np.vstack((train, pairs))

train = pd.DataFrame(data = User_info_Negatives,columns = ["user.id","doc.id", "rating"])

val.to_csv('val.csv', header = False, index = False)
test.to_csv('test.csv', header = False, index = False)

del User_info_Negatives, items, uniq_items, uniq_users, pairs, User_info, val, test

for i,val in enumerate(train["doc.id"][n:]):
    train["doc.id"].loc[n+i] = CiteULike_data.iloc[int(val),1]

train = train.sample(frac=1) #shuffle panda style

train.to_csv('train.csv', header = False, index = False)


Tokenizer: ID is the User_ID, Text: is the article's abstract and Label 

In [46]:
spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

#text field is sequential data while label and ID is not. Setting Text data to lower case and setting the fixed length
#to the maximum length so all examples will be padded according to this.
#For Label and ID we set use_vocab=False because it is numerical. 
#include_lengths – Whether to return a tuple of a padded minibatch 
#and a list containing the lengths of each examples, or just a padded minibatch. Default: False.

TEXT = Field(sequential=True, lower=True, include_lengths=True, fix_length=max_length)
LABEL = Field(sequential=False, use_vocab=False)
ID = Field(sequential=False, use_vocab=False)

#Loading data
train, val, test = TabularDataset.splits(
        path='', train='train.csv',
        validation='val.csv', test='test.csv', format='csv',
        fields=[('ID', ID), ('Text', TEXT), ('Label', LABEL)])

#Building vocab
TEXT.build_vocab(train, vectors="glove.6B.100d")
LABEL.build_vocab(train)
ID.build_vocab(train)

In [47]:
print('Text fields:')
#print('keys of TEXT.vocab:', list(TEXT.vocab.__dict__.keys()))
print(' size of vocabulary:', len(TEXT.vocab))
print(" vocabulary's embedding dimension:", TEXT.vocab.vectors.size())
print(' no. times the "fun" appear in the dataset:', TEXT.vocab.freqs['fun'])

print('\nLabel fields:')
#print('keys of LABEL.vocab:', list(LABEL.vocab.__dict__.keys()))
print(" list of vocabulary (int-to-str):", LABEL.vocab.itos)
print(" list of vocabulary (str-to-int):", dict(LABEL.vocab.stoi))

Text fields:
 size of vocabulary: 5025
 vocabulary's embedding dimension: torch.Size([5025, 100])
 no. times the "fun" appear in the dataset: 1

Label fields:
 list of vocabulary (int-to-str): ['<unk>', '0', '1']
 list of vocabulary (str-to-int): {'<unk>': 0, '0': 1, '1': 2}


Choosing batchsize and defining the three iterators and setting sort_within_batch = True so the lengths in the bathsize are sorted in decreasing order. 

In [48]:
batch_size = (45, 45, 45)

train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test), batch_sizes=batch_size, sort_key=lambda x: len(x.Text), sort_within_batch = True)

### LSTM  - Building the model


In [49]:
# size of embeddings
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)
num_users = len(ID.vocab.itos)

n_hidden = 100
l1_hidden = 110
attention_size=100

import sys 

class CFNN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=embedding_dim, n_hidden=n_hidden, l1_hidden=l1_hidden, maxlength = max_length, attention_size=attention_size):
        super(CFNN, self).__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.item_emb = nn.Embedding(num_items, embedding_dim)
        
        # use pretrained embeddings
        #self.item_emb.weight.data.copy_(TEXT.vocab.vectors)
        #self.item_emb.weight.detach_()
        
        self.lin1 = nn.Linear(embedding_dim + n_hidden, l1_hidden)
        self.lin2 = nn.Linear(l1_hidden, 1)
        self.drop0 = nn.Dropout(0.4)
        self.drop1 = nn.Dropout(0.5)
                
        # RNN decoding
        self.rnn = nn.LSTM(embedding_dim, n_hidden, batch_first = False)
        self.rnnlin = nn.Linear(n_hidden, n_hidden)
        
        self.sigmoid = nn.Sigmoid()
        
        # ATTENTION
        self.linA1=nn.Linear(n_hidden, attention_size)
        self.linU1=nn.Linear(n_hidden,attention_size)
                
    
        self.linA2=nn.Linear(attention_size,1)
        self.softmax = nn.Softmax()
        
    def forward(self, u, v, hidden):
        U = self.user_emb(u)
        V = self.item_emb(v)
        
        #V, hidden = self.rnn(V, hidden)
        #hn, cn = hidden
        #V = F.relu(self.rnnlin(hn[-1]))
      
    #_____________________________________________________________________________
        #ATTENTION
        # Steps in self attention
        # Take the LSTM output which has a dimension (len, n_hidden), each column in this output represents a location in the Title sequence.
        # The LSTM output is put through a nn.Linear function
        # It is then put through a tanh-function which gives the dimension (len, attention_size)
        # Again through a linear function
        # Then through a softmax and the dimension is updated to dim(len,1) --> called the alpha weights (sould sum to 1)
        # We then take the alpha weights and the output from LSTM and calculate: ((1/len)*sum_i(alpha_i * LSTMout_i))
        ## Packing, Encoding, Padding
        packed = rnn_utils.pack_padded_sequence(V,seq_lengths)
        rnnOut, (hn, cn) = self.rnn(packed, hidden)
        padded, seq_lengths = rnn_utils.pad_packed_sequence(rnnOut, padding_value=-1e10, total_length=max_length)
        

        itemx1 = self.linA1(padded)
        userx1 = self.linU1(U).unsqueeze(0)
        x2 = torch.tanh(itemx1+userx1)
        x3 = self.linA2(x2)
        alpha = self.softmax(x3,dim=0)
        score= padded*alpha
            
        x = torch.sum(score,dim=0)
        
        
    
        #batchnorm testet here
        
        
        x = self.drop0(x)
        x = F.relu(self.lin1(x))
        x = self.drop1(x)
        #batchnorm testet here
        
        x = self.lin2(x)
        
        x = self.sigmoid(x)
        return x
    
    def init_hidden(self, batch_size):
        init = torch.zeros(1, batch_size, n_hidden)
        return (init,init)

In [50]:
## Training loop
def train(model, train_loader, optimizer, criterion, epoch):
    model.train()
    for batch_idx, (data, _) in enumerate(train_loader):
        users,(items, seq_lengths),ratings = data
        
        batch_size = len(users)
        users = users.long()
        items = items.long()
        ratings = ratings.float()      
        ratings = ratings.view(-1)
        
        optimizer.zero_grad()
        hidden_size = model.init_hidden(batch_size)
        output = model(users, items, hidden_size)
        output = output.view(-1)

        #print(output)
        
        loss = criterion(output.float(), ratings)
        loss.backward()
        optimizer.step()
        #if batch_idx % len(train_loader.dataset)/10 == 0:
        #    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        #        epoch, batch_idx * len(users), len(train_loader.dataset),
        #        100. * batch_idx / len(train_loader), loss.item()))

def validate(model, val_loader, criterion, epoch):
    model.eval()
    outputlist = []
    val_loss = 0
    TP,FP,TN,FN = 0,0,0,0
    with torch.no_grad():
        for batch_idx, (data, _) in enumerate(val_loader):
            users, (items, seq_lengths), ratings = data
            batch_size = len(users)
            users = users.long()
            items = items.long()
            ratings = ratings.float()
            
            ratings = ratings.unsqueeze(1)
            hidden_size = model.init_hidden(batch_size)
            output = model(users, items, hidden_size)
            output_flat = [0 if o < 0.5 else 1 for o in output.data]
            p = output_flat.count(1)
            TP += p
            FN += len(output_flat) - p
            outputlist += [output]
            val_loss += criterion(output.float(), ratings).item() # sum up batch loss

    #print(TP, FN)
    acc = (TP + TN)/(TP + TN + FP + FN)
    #sens = TP/(TP + FN)
    val_loss /= len(val_loader)
    
    
    print(f'Epoch {epoch}: Validation average loss: {val_loss:.2f} | Accucary: {acc:.2f}')
    return acc, val_loss

def trainLoop(epochs, lr=0.001, wd = 1e-6):
    # Define model    
    model = CFNN(num_users, num_embeddings, embedding_dim)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay = wd)
    
    accs = []
    losses = []
    
    for epoch in range(1, epochs + 1):
        train(model, train_iter, optimizer, criterion, epoch)
        acc, val_loss = validate(model, val_iter, criterion, epoch)
        accs += [acc]
        losses += [val_loss]
    
    
    plt.plot(range(1,epochs+1),accs)
    plt.show()
    plt.plot(range(1,epochs+1),losses)
    plt.show()

In [51]:
trainLoop(epochs=10, lr=0.0001, wd=1e-6)

UnboundLocalError: local variable 'seq_lengths' referenced before assignment

In [None]:
TEXT