# Model 0:

In [None]:
# import all required libraries

import torch
import torchtext
from torch import nn
import time
import copy
import random
random.seed(1)
torch.manual_seed(1)
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict, Counter, OrderedDict
# plotting
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
# plotting cosmetics
%config InlineBackend.figure_format = 'svg' 
#%config InlineBackend.figure_format = 'retina' 
plt.style.use('seaborn')

In [None]:
def train(infile):
    """Train the most-frequent-sense baseline.

    Every non-blank line of *infile* must start with two whitespace-separated
    fields, the sense label and the lemma; whatever follows is ignored.

    Returns a dict mapping each lemma to the sense that occurred most often
    with it in the file.

    Raises ValueError if a non-blank line has fewer than three fields.
    """
    stats = defaultdict(Counter)
    with open(infile, encoding='utf-8') as f:
        for line in f:
            # A trailing empty line would otherwise fail the unpacking below.
            if not line.strip():
                continue
            sense, lemma, _ = line.split(maxsplit=2)
            stats[lemma][sense] += 1

    return {lemma: senses.most_common(1)[0][0] for lemma, senses in stats.items()}




def run(model, infile, outfile):
    """Apply the baseline *model* to *infile* and write predictions to *outfile*.

    *model* maps a lemma to its predicted sense (as returned by `train`).
    For every input line the lemma is taken from the second whitespace-separated
    field, and the predicted sense is written on its own line of *outfile*,
    in input order.

    Raises KeyError if a lemma is not in *model*, and ValueError if a line has
    fewer than three fields.
    """
    # Both files are managed by the `with` block so that the output handle is
    # closed (and flushed) even when a lookup or parse error interrupts the loop.
    with open(outfile, 'w') as f_out, open(infile, encoding='utf-8') as f_in:
        for line in f_in:
            _, lemma, _ = line.split(maxsplit=2)
            print(model[lemma], file=f_out)

# File names used by the most-frequent-sense baseline.
TRAIN_DATA = 'wsd_train.txt'
TEST_DATA = 'wsd_test_blind.txt'
OUTPUT = 'dummy_baseline.txt'

# Fit the baseline on TRAIN_DATA, then write one prediction per TEST_DATA line to OUTPUT.
model = train(TRAIN_DATA)
run(model, TEST_DATA, OUTPUT)

In [None]:
!python3 evaluate.py wsd_test.txt dummy_baseline.txt

# Model 1: Continuous Bag-of-Words approach
Representing documents for neural networks: 

In [None]:
class CBoWDocumentRepresentation(nn.Module):
    """Continuous bag-of-words encoder: a document is the mean of its word embeddings."""

    def __init__(self, voc_size, emb_dim):
        """Create an embedding table with `voc_size` rows of width `emb_dim`."""
        super().__init__()
        self.embedding = nn.Embedding(voc_size, emb_dim)

    def forward(self, X):
        """Embed the word indices in `X` and average the vectors over dimension 1.

        Every position along dimension 1 contributes to the mean; no position
        is masked out.
        """
        return self.embedding(X).mean(dim=1)



def read_data(corpus_file):
    """Read a tab-separated corpus with four columns per line.

    The columns are sense, lemma, position and text. Returns `(X, Y)` where
    `X` is the list of texts (the line terminator is kept) and `Y` the list of
    sense labels, both in file order.
    """
    with open(corpus_file, encoding='utf-8') as f:
        rows = [line.split("\t") for line in f]

    X, Y = [], []
    for sense, _lemma, _position, text in rows:
        X.append(text)
        Y.append(sense)
    return X, Y
    


# Encoding documents for neural networks

# Dummy symbols that always occupy the first two vocabulary slots.
PAD = '___PAD___'
UNKNOWN = '___UNKNOWN___'


class Vocabulary:
    """Manages the numerical encoding of the vocabulary."""

    def __init__(self, tokenizer=None, max_voc_size=None):
        """Create an empty vocabulary.

        tokenizer:    callable splitting a document string into words;
                      defaults to whitespace splitting.
        max_voc_size: maximally allowed vocabulary size, counting the two
                      dummy symbols; `None` (or 0) means unlimited.
        """
        # String-to-integer and integer-to-string mappings, filled in by build().
        self.stoi = None
        self.itos = None

        self.tokenizer = tokenizer if tokenizer else (lambda s: s.split())
        self.max_voc_size = max_voc_size

    def build(self, docs):
        """Builds the vocabulary, based on a set of documents."""
        # Sort all words by frequency; ties are broken by the word itself
        # (both in descending order).
        counts = Counter(w for doc in docs for w in self.tokenizer(doc))
        ranked = sorted(((f, w) for w, f in counts.items()), reverse=True)

        if self.max_voc_size:
            # Two slots are reserved for the dummy symbols. Clamp at zero so a
            # limit below 2 keeps no real words, rather than turning into a
            # negative slice bound that would drop only the rarest ones.
            ranked = ranked[:max(self.max_voc_size - 2, 0)]

        self.itos = [PAD, UNKNOWN] + [w for _, w in ranked]

        # The string-to-integer map is just the inverse of itos.
        self.stoi = {w: i for i, w in enumerate(self.itos)}

    def encode(self, docs):
        """Encodes a set of documents as lists of word indices.

        Words that are not in the vocabulary get the index of UNKNOWN.
        """
        unkn_index = self.stoi[UNKNOWN]
        return [[self.stoi.get(w, unkn_index) for w in self.tokenizer(doc)] for doc in docs]

    def get_unknown_idx(self):
        """Returns the integer index of the special dummy word representing unknown words."""
        return self.stoi[UNKNOWN]

    def get_pad_idx(self):
        """Returns the integer index of the special padding dummy word."""
        return self.stoi[PAD]

    def __len__(self):
        return len(self.itos)
  


# Managing batches
class DocumentDataset(Dataset):
    """Pairs a list of documents with the list of their category labels."""

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        """Number of documents."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the `(document, label)` pair stored at `idx`."""
        return (self.X[idx], self.Y[idx])

class DocumentBatcher:
    """Collate function that turns a list of (document, label) pairs into tensors."""

    def __init__(self, voc):
        # Remember the integer index of the dummy padding word.
        self.pad = voc.get_pad_idx()

    def __call__(self, XY):
        """Build a batch from a number of encoded documents.

        Returns `(X, Y)`: `X` is the document tensor of shape
        [n_docs, max_doc_length], where shorter documents are right-padded with
        the padding index; `Y` is the label tensor of shape [n_docs].
        """
        docs = [doc for doc, _ in XY]
        labels = [lbl for _, lbl in XY]

        # All rows are padded to the length of the longest document in the batch.
        width = max(map(len, docs))
        rows = []
        for doc in docs:
            rows.append(doc + [self.pad] * (width - len(doc)))

        return torch.as_tensor(rows), torch.as_tensor(labels)
    



# Implementing the document classifier infrastructure
class TextClassifier:
    """A text classifier based on a neural network.

    `params` supplies the hyperparameters (device, max_voc_size, batch_size,
    n_epochs, eta, decay, plus whatever the factory reads) and
    `model_factory(clf, params)` builds the network once the vocabulary and
    label set are known.
    """

    def __init__(self, params, model_factory):
        self.params = params
        self.model_factory = model_factory

    def epoch(self, batches, optimizer=None):
        """Runs the neural network for one epoch, using the given batches.
        If an optimizer is provided, this is training data and we will update the model
        after each batch. Otherwise, this is assumed to be validation data.

        Returns the loss (averaged over batches) and accuracy over the epoch."""
        training = optimizer is not None
        n_correct = 0
        n_instances = 0
        total_loss = 0

        # Gradients are only needed when we are going to call backward();
        # switching autograd off for validation avoids building unused graphs.
        with torch.set_grad_enabled(training):
            for Xbatch, Ybatch in batches:

                # If we're using the GPU, move the batch there.
                Xbatch = Xbatch.to(self.params.device)
                Ybatch = Ybatch.to(self.params.device)

                # Compute the predictions and the loss for this batch.
                scores = self.model(Xbatch)
                loss = self.loss(scores, Ybatch)

                total_loss += loss.item()
                n_instances += Ybatch.shape[0]

                # Compute the number of correct predictions, for the accuracy.
                guesses = scores.argmax(dim=1)
                n_correct += (guesses == Ybatch).sum().item()

                # If this is training data, update the model.
                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

        return total_loss/len(batches), n_correct/n_instances

    def preprocess(self, X, Y):
        """Carry out the document preprocessing, then build `DataLoader`s for the training and validation sets."""
        Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size=0.2, random_state=0)

        # Vocabulary and label set are derived from the training part only.
        self.voc = Vocabulary(max_voc_size=self.params.max_voc_size)
        self.voc.build(Xtrain)
        self.lbl_enc = LabelEncoder()
        self.lbl_enc.fit(Ytrain)

        self.voc_size = len(self.voc)
        self.n_classes = len(self.lbl_enc.classes_)

        batcher = DocumentBatcher(self.voc)

        train_dataset = DocumentDataset(self.voc.encode(Xtrain), self.lbl_enc.transform(Ytrain))
        self.train_loader = DataLoader(train_dataset, self.params.batch_size, shuffle=True,
                                  collate_fn=batcher)
        val_dataset = DocumentDataset(self.voc.encode(Xval), self.lbl_enc.transform(Yval))
        self.val_loader = DataLoader(val_dataset, self.params.batch_size, shuffle=True,
                                collate_fn=batcher)

    def fit(self, X, Y):
        """Train the model on documents `X` with labels `Y`.

        Per-epoch losses, accuracies and timings are logged in `self.history`."""
        par = self.params

        self.preprocess(X, Y)

        # Call the factory to create the model.
        self.model = self.model_factory(self, par)

        # If we're using a GPU, put the model there.
        self.model.to(par.device)

        # Declare a loss function, in this case the cross-entropy.
        self.loss = torch.nn.CrossEntropyLoss()

        # An optimizer for updating the neural network. We use the Adam optimizer.
        optimizer = torch.optim.Adam(self.model.parameters(), lr=par.eta, weight_decay=par.decay)

        # We'll log the loss and accuracy scores encountered during training.
        self.history = defaultdict(list)

        for epoch in range(1, par.n_epochs+1):

            t0 = time.time()

            # Set the model in training mode, enabling dropout modules.
            self.model.train()

            # Run the model on the training data.
            train_loss, train_acc = self.epoch(self.train_loader, optimizer)

            # Set the model in evaluation mode, disabling dropout modules.
            self.model.eval()

            # Run the model on the validation data.
            val_loss, val_acc = self.epoch(self.val_loader)

            t1 = time.time()

            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_loss'].append(val_loss)
            self.history['val_acc'].append(val_acc)
            self.history['time'].append(t1-t0)

            if epoch % 5 == 0:
                print(f'Epoch {epoch}: train loss:{train_loss:.4f}, train acc: {train_acc:.4f}, '
                      + f'val loss: {val_loss:.4f}, val acc: {val_acc:.4f}, time: {t1-t0:.4f}')

    def predict(self, X):
        """Run a trained document classifier on a set of documents and return the predictions."""
        batcher = DocumentBatcher(self.voc)

        # The dataset and batcher expect a label per document; the labels are
        # never read here, so a constant placeholder is enough.
        encoded = self.voc.encode(X)
        dataset = DocumentDataset(encoded, [0] * len(encoded))
        loader = DataLoader(dataset, self.params.batch_size, collate_fn=batcher)

        # Apply the model to all the batches and aggregate the predictions.
        self.model.eval()
        output = []
        with torch.no_grad():
            for Xbatch, _ in loader:
                Xbatch = Xbatch.to(self.params.device)
                scores = self.model(Xbatch)
                guesses = scores.argmax(dim=1)
                output.extend(self.lbl_enc.inverse_transform(guesses.cpu().numpy()))
        return output


# Training the model
class TextClassifierParameters:
    """Hyperparameters for the CBoW classifier, read by `TextClassifier` and `factory`."""
    # Use the GPU when CUDA is available, otherwise the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Vocabulary size limit passed to Vocabulary; None means unlimited.
    max_voc_size = None    
    # Number of passes over the training data.
    n_epochs = 50
    # Number of documents per batch in the DataLoaders.
    batch_size = 64    
    # Learning rate given to the Adam optimizer.
    eta = 3e-3
    # Weight decay given to the Adam optimizer.
    decay = 1e-6
    # Width of the word embeddings.
    emb_dim = 64
    # Dropout probability applied to the document representation.
    dropout = 0.5
    

def factory(clf, params):
    """Build the CBoW network: averaged embeddings, dropout, then a linear output layer.

    `clf` provides `voc_size` and `n_classes`; `params` provides `emb_dim`
    and `dropout`.
    """
    encoder = CBoWDocumentRepresentation(clf.voc_size, params.emb_dim)
    regularizer = nn.Dropout(params.dropout)
    output_layer = nn.Linear(in_features=params.emb_dim, out_features=clf.n_classes)
    return nn.Sequential(encoder, regularizer, output_layer)


def main_cbow():
    """Train the CBoW classifier on 'wsd_train.txt' and plot its learning curves.

    Two figures are drawn, one with the training and validation loss and one
    with the training and validation accuracy. Returns the fitted classifier.
    """
    torch.manual_seed(0)

    X, Y = read_data('wsd_train.txt')
    clf = TextClassifier(TextClassifierParameters(), factory)
    clf.fit(X, Y)

    # One figure per metric, each with a training and a validation curve.
    for train_key, val_key in (('train_loss', 'val_loss'), ('train_acc', 'val_acc')):
        plt.figure()
        plt.plot(clf.history[train_key])
        plt.plot(clf.history[val_key])

    return clf

# Train the CBoW classifier; `clf` is reused by the cells below.
clf = main_cbow()    

In [None]:
# Try the trained classifier on two hand-written documents.
clf.predict(['i watch movie', 'I have spare hour to spend'])

In [None]:
def nearest_neighbors(emb, voc, word, n_neighbors=5):
    """Return the words whose embeddings are most cosine-similar to that of `word`.

    emb:  embedding module; `emb.weight` holds one row per vocabulary entry.
    voc:  object exposing the `stoi` and `itos` mappings.

    Returns a list of `(word, cosine)` pairs, most similar first. The single
    best match is dropped — presumably because it is the query word itself.
    Raises KeyError if `word` is not in `voc.stoi`.
    """
    query_ix = torch.as_tensor([voc.stoi[word]]).to(emb.weight.device)
    query_vec = emb(query_ix)

    # Cosine between the query row and every row of the embedding table.
    similarities = nn.functional.cosine_similarity(query_vec, emb.weight, dim=1)

    best = similarities.topk(n_neighbors + 1)
    neighbour_ixs = best.indices[1:]
    neighbour_cos = best.values[1:]
    return [(voc.itos[ix.item()], cos.item()) for ix, cos in zip(neighbour_ixs, neighbour_cos)]

In [None]:
# Display the trained network (the last expression of a notebook cell is echoed).
clf.model

In [None]:
# Ten nearest neighbours, by cosine similarity, of 'author' in the learned embedding table.
nearest_neighbors(clf.model[0].embedding, clf.voc, 'author', n_neighbors=10)

In [None]:
def max_magnitude(emb, voc, n_max=5):
    """Return the `n_max` words whose embedding vectors have the largest squared norm.

    `emb.weight` is indexed as (voc_size, emb_dim). The result is a list of
    `(word, squared_norm)` pairs, largest first.
    """
    # One squared L2 norm per vocabulary row.
    squared_norms = emb.weight.pow(2).sum(dim=1)

    top = squared_norms.topk(n_max)
    return [(voc.itos[ix], sq) for ix, sq in zip(top.indices.tolist(), top.values.tolist())]

In [None]:
# The 25 words whose embedding vectors have the largest squared norm.
max_magnitude(clf.model[0].embedding, clf.voc, n_max=25)

In [None]:
def plot_embeddings_pca(emb, voc, words):
    """Scatter-plot the embeddings of `words` after projecting them to two dimensions.

    The selected vectors are mean-centred and reduced with a 2-component
    TruncatedSVD; each point is labelled with its word.
    Raises KeyError if a word is not in `voc.stoi`.
    """
    indices = [voc.stoi[w] for w in words]

    # Indexing with a list copies the rows, so centring in place below does
    # not touch the model's weights.
    vectors = emb.weight[indices].detach().cpu().numpy()
    vectors -= vectors.mean(axis=0)

    projector = TruncatedSVD(n_components=2)
    coords = projector.fit_transform(vectors)

    plt.figure(figsize=(5,5))
    xs, ys = coords[:, 0], coords[:, 1]
    plt.scatter(xs, ys, edgecolors='k', c='r', s=5)
    for label, point in zip(words, coords):
        # Nudge the label to the right of its marker.
        plt.text(point[0] + 0.02, point[1], label)
    plt.axis('off')

# Words to visualise. Each word is listed once: a repeated entry would be drawn
# twice at the same spot and would count double when the vectors are centred.
words = ['positive', 'worst', 'serve', 'holds', 'regular', 'sees', 'followed', 'job',
         'driving', 'see', 'follow', 'extend', 'build', 'bad', 'kept']

plot_embeddings_pca(clf.model[0].embedding, clf.voc, words)


# Model 2: Unidirectional LSTM

In [None]:
class LSTMClassifier(nn.Module):
    """Unidirectional LSTM classifier over word-index sequences.

    num_embeddings: text field; only its `.vocab` is used (for the vocabulary
                    size and, when present, pretrained vectors).
    num_classes:    label field; only `len(.vocab)` is used.
    embedding_dim:  width of the word embeddings.
    hidden_size:    width of the LSTM state.
    """

    def __init__(self, num_embeddings, num_classes, embedding_dim, hidden_size):
        super().__init__()

        text_vocab = num_embeddings.vocab
        self.embedding = nn.Embedding(len(text_vocab), embedding_dim)

        # If the vocabulary was built with pretrained vectors (e.g. GloVe),
        # start from them instead of the random initialisation. Vectors whose
        # shape does not match the embedding table are left unused, as before.
        pretrained = getattr(text_vocab, 'vectors', None)
        if pretrained is not None and tuple(pretrained.shape) == tuple(self.embedding.weight.shape):
            with torch.no_grad():
                self.embedding.weight.copy_(pretrained)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)

        self.dropout = nn.Dropout(0.5) # Dropout layer

        # Kept so existing checkpoints still load; forward() does not use it.
        self.a = nn.Linear(hidden_size, hidden_size)
        self.y = nn.Linear(hidden_size, len(num_classes.vocab))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape (batch, n_classes).

        `x` holds word indices laid out as (seq_len, batch), the default
        layout of `nn.LSTM`. The final hidden state of the last layer is
        classified.

        NOTE(review): the training script feeds these probabilities to
        CrossEntropyLoss, which applies log-softmax itself; returning the raw
        `self.y(...)` scores may train better — confirm together with the
        learning rate before changing.
        """
        x = self.embedding(x)
        x = self.dropout(x)
        _, (h_n, _) = self.lstm(x)
        return self.softmax(self.y(h_n[-1]))


def read_data(corpus_file, doc_start, with_padding = True):
    """Read the training corpus and split it into one torchtext dataset per lemma.

    corpus_file:  path to the corpus; each line holds whitespace-separated
                  columns. Column 0 is the sense label, column 2 the position
                  of the target word, and the last column the document.
    doc_start:    passed as `maxsplit` when splitting a line, so the document
                  is whatever remains after that many splits.
    with_padding: when true, each document is replaced by a fixed window of
                  160 tokens: the token list is padded with 80 '<pad>' tokens
                  on both sides and the window starts at the target position
                  within the padded list.

    Returns an OrderedDict mapping each lemma (the part of the sense label
    before the first '%') to `(dataset, text_field, label_field)`, in order of
    first appearance in the file. Every lemma gets its own pair of fields, so
    vocabularies can be built per lemma.

    Raises RuntimeError if a window cannot be filled to its full length.

    NOTE(review): if the file has exactly four columns, `doc_start=4` puts the
    first token of the text in a column of its own and leaves it out of the
    document — confirm against the data format.
    """
    legacy = torchtext.legacy.data
    pad_string = '<pad>'
    half_window = 80
    window = 2 * half_window
    label_column = 0

    def new_fields():
        text_field = legacy.Field(sequential=True, tokenize=lambda x: x.split())
        label_field = legacy.LabelField(is_target=True)
        return text_field, label_field

    text, label = new_fields()
    datafields = [('text', text), ('label', label)]

    examples = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            columns = line.strip().split(maxsplit=doc_start)
            doc = columns[-1]
            if with_padding:
                position = int(columns[2])
                padding = [pad_string] * half_window
                padded = padding + doc.split() + padding
                tokens = padded[position:position + window]
                if len(tokens) != window:
                    print(tokens)
                    raise RuntimeError(
                        f'could not extract a {window}-token window at position {position}')
                doc = " ".join(tokens)
            # The non-padded branch used to call torchtext.data.Example; the
            # legacy namespace is the one used everywhere else in this file.
            examples.append(legacy.Example.fromlist([doc, columns[label_column]], datafields))

    # Group the examples by word-type (e.g. 'keep', 'line') in a single pass.
    # A dict keeps the lemmas in order of first appearance, so the result does
    # not depend on set iteration order.
    by_lemma = OrderedDict()
    for example in examples:
        by_lemma.setdefault(example.label.split("%", 1)[0], []).append(example)

    # Create a dataset with fresh fields for each word-type.
    cleaned_datasets = OrderedDict()
    for a_lemma, lemma_examples in by_lemma.items():
        text, label = new_fields()
        datafields = [('text', text), ('label', label)]
        cleaned_data_set = legacy.Dataset(lemma_examples, datafields)
        cleaned_datasets[a_lemma] = (cleaned_data_set, text, label)
    return cleaned_datasets


def evaluate_validation(scores, loss_function, gold):
    """Score one validation batch.

    Returns `(n_correct, loss)`: the number of rows of `scores` whose argmax
    equals the gold label, and `loss_function(scores, gold)` as a Python number.
    """
    predicted = torch.argmax(scores, dim=1)
    hits = predicted.eq(gold).sum().item()
    batch_loss = loss_function(scores, gold).item()
    return hits, batch_loss


# Train one unidirectional LSTM classifier per lemma and keep, for each lemma,
# the epoch snapshot with the best validation accuracy.
use_pretrained = True
cleaned_datasets = read_data('wsd_train.txt', doc_start=4)

# Per-lemma results; the test cell looks models and vocabularies up by lemma.
models = OrderedDict()
max_verifications = OrderedDict()
model_vocabs = OrderedDict()
model_label_vocabs = OrderedDict()

# Chosen once, outside the loop, so `device` exists even if there is no lemma to train.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for lemma, cleaned_dataset in cleaned_datasets.items():
    dataset, text, label = cleaned_dataset

    # Named *_split so the baseline's train() function is not overwritten.
    train_split, valid_split = dataset.split([0.7, 0.3])

    if use_pretrained:
        text.build_vocab(train_split, vectors="glove.6B.100d")
    else:
        text.build_vocab(train_split, max_size=10000)

    model_vocabs[lemma] = text.vocab
    label.build_vocab(train_split)
    model_label_vocabs[lemma] = label.vocab

    model = LSTMClassifier(text, label, embedding_dim=100, hidden_size=100)
    model.to(device)

    training_execution = torchtext.legacy.data.Iterator(
        train_split, device=device, batch_size=128, repeat=False, train=True, sort=False)
    validation_execution = torchtext.legacy.data.Iterator(
        valid_split, device=device, batch_size=128, repeat=False, train=False, sort=False)

    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=2, momentum=0.1)
    # The learning rate is multiplied by 0.96 after every epoch.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.96)

    # The batches are materialised once and reused in every epoch.
    train_batches = list(training_execution)
    valid_batches = list(validation_execution)

    history = defaultdict(list)
    max_val_acc = -1

    print(f"Training lemma for: {lemma}")
    for i in range(50):
        t0 = time.time()
        loss_sum = 0
        n_batches = 0
        model.train()

        for batch in train_batches:
            scores = model(batch.text)
            loss = loss_function(scores, batch.label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            n_batches += 1

        train_loss = loss_sum / n_batches
        history['train_loss'].append(train_loss)

        n_correct = 0
        n_valid = len(valid_split)
        loss_sum = 0
        n_batches = 0

        model.eval()

        # No parameter update happens here, so autograd bookkeeping is switched off.
        with torch.no_grad():
            for batch in valid_batches:
                scores = model(batch.text)
                n_corr_batch, loss_batch = evaluate_validation(scores, loss_function, batch.label)
                loss_sum += loss_batch
                n_correct += n_corr_batch
                n_batches += 1
        val_acc = n_correct / n_valid
        val_loss = loss_sum / n_batches

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Snapshot the model whenever the validation accuracy improves. The copy
        # is taken while the model is in eval mode.
        if val_acc > max_val_acc:
            max_val_acc = val_acc
            models[lemma] = copy.deepcopy(model)
            max_verifications[lemma] = max_val_acc

        scheduler.step()

        t1 = time.time()

        if (i+1) % 10 == 0:
            print(f'Epoch {i+1}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}, val acc: {val_acc:.4f}, time = {t1-t0:.4f}, lr = {optimizer.param_groups[0]["lr"]}')

    plt.plot(history['train_loss'])
    plt.plot(history['val_loss'])
    plt.plot(history['val_acc'])
    plt.legend(['training loss', 'validation loss', 'validation accuracy'])
    plt.show()
    

## Test Unidirectional LSTM

In [None]:
def read_test_data(corpus_file, doc_start):
    """Read the blind test corpus into a single torchtext dataset.

    Each line is split with `maxsplit=doc_start`. Column 2 is the position of
    the target word and the last column the document. The document's tokens
    are padded with 80 '<pad>' tokens on both sides and a 160-token window
    starting at the target position (within the padded list) is kept. The
    word-type is column 1 up to its first '.'.

    Returns `(dataset, TEXT, WORDTYPE)`, the dataset and its two fields.
    Raises RuntimeError (after printing the window) if the window is not
    exactly 160 tokens long.
    """
    TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=lambda x: x.split())
    WORDTYPE = torchtext.legacy.data.Field()
    datafields = [('text', TEXT), ('wordtype', WORDTYPE)]

    pad_string = '<pad>'
    side_padding = [pad_string] * 80

    examples = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            columns = line.strip().split(maxsplit=doc_start)
            start = int(columns[2])

            # Pad on both sides so a full window exists for every position.
            padded = side_padding + columns[-1].split() + side_padding
            window = padded[start:start + 160]
            if len(window) != 160:
                print(window)
                raise RuntimeError

            wordtype = columns[1].split('.')[0]
            example = torchtext.legacy.data.Example.fromlist([" ".join(window), wordtype], datafields)
            examples.append(example)

    dataset = torchtext.legacy.data.Dataset(examples, datafields)
    return (dataset, TEXT, WORDTYPE)

In [None]:
# Predict a sense for every test example with the model trained for its lemma,
# printing one predicted label per line.
test_dataset = read_test_data('wsd_test_blind.txt', doc_start=4)

# Pure inference: no gradients are needed.
with torch.no_grad():
    for example in test_dataset[0].examples:
        test_example_string = example.text
        # The wordtype field is tokenised, so the lemma is its first token.
        example_wordtype = example.wordtype[0]

        # Load vocabs
        example_vocab = model_vocabs[example_wordtype]
        example_label_vocabs = model_label_vocabs[example_wordtype]

        # Encode string
        encoded_example = torch.tensor([example_vocab.stoi[x] for x in test_example_string], device=device)

        # Load correct model and make sure dropout is disabled.
        model = models[example_wordtype]
        model.eval()
        model.lstm.flatten_parameters()

        # Get prediction; unsqueeze(1) adds a batch dimension of size one.
        scores = model(encoded_example.unsqueeze(1))
        prediction = scores.argmax(dim=1).item()

        # Decode prediction
        decoded_prediction = example_label_vocabs.itos[prediction]
        print(decoded_prediction)

# Model 3: Bidirectional LSTM

In [None]:
class BILSTMModel(nn.Module):
    """Bidirectional LSTM classifier that reads the context around a fixed target position.

    num_embeddings:    text field; only its `.vocab` is used (for the vocabulary
                       size and, when present, pretrained vectors).
    num_classes:       label field; only `len(.vocab)` is used.
    embedding_dim:     width of the word embeddings.
    hidden_dim:        width of the LSTM state per direction.
    update_pretrained: whether pretrained embeddings are fine-tuned. It only
                       has an effect when pretrained vectors were loaded;
                       randomly initialised embeddings are always trained.
    """

    def __init__(self, num_embeddings, num_classes, embedding_dim, hidden_dim, update_pretrained=False):
        super().__init__()

        text_vocab = num_embeddings.vocab
        self.embedding = nn.Embedding(len(text_vocab), embedding_dim)

        # If the vocabulary was built with pretrained vectors (e.g. GloVe),
        # start from them; vectors of a non-matching shape are left unused.
        pretrained = getattr(text_vocab, 'vectors', None)
        if pretrained is not None and tuple(pretrained.shape) == tuple(self.embedding.weight.shape):
            with torch.no_grad():
                self.embedding.weight.copy_(pretrained)
            self.embedding.weight.requires_grad = bool(update_pretrained)

        # Bidirectional LSTM in the default sequence-first layout. forward()
        # indexes the output by position on dimension 0 and by example on
        # dimension 1; with batch_first=True the recurrence would run across
        # the examples of a batch rather than across the words of a document.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        self.dropout = nn.Dropout(0.5) # Dropout layer

        self.a = nn.Linear(2 * hidden_dim, 2 * hidden_dim)  # hidden layer
        # classification layers
        self.y = nn.Linear(2 * hidden_dim, len(num_classes.vocab))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape (batch, n_classes).

        `x` holds word indices laid out as (seq_len, batch) with seq_len > 80.
        The forward state after position 79 and the backward state at
        position 80 are concatenated and classified; 80 is the number of pad
        tokens the reader functions put in front of each document.

        NOTE(review): the training script feeds these probabilities to
        CrossEntropyLoss, which applies log-softmax itself; returning the raw
        `self.y(...)` scores may train better — confirm together with the
        learning rate before changing.
        """
        target_index = 80

        x = self.embedding(x)
        output, _ = self.lstm(x)

        seq_len = output.size(0)
        self.batch_size = output.size(1)
        self.num_directions = 2
        # Separate the two directions: (seq_len, batch, direction, hidden).
        output = output.view(seq_len, self.batch_size, self.num_directions, self.lstm.hidden_size)
        forward = output[target_index - 1, :, 0, :]
        backward = output[target_index, :, 1, :]
        output = torch.cat([forward, backward], dim=1)
        dropped_output = self.dropout(output)
        hidden = self.a(dropped_output)
        dropped_hidden = self.dropout(hidden)
        return self.softmax(self.y(dropped_hidden))

def read_data(corpus_file, doc_start, with_padding = True):
    """Read the training corpus, pad each document, and split it into one dataset per lemma.

    corpus_file:  path to the corpus; each line holds whitespace-separated
                  columns. Column 0 is the sense label, column 2 the position
                  of the target word, and the last column the document.
    doc_start:    passed as `maxsplit` when splitting a line, so the document
                  is whatever remains after that many splits.
    with_padding: when true, each document is replaced by a fixed window of
                  160 tokens: the token list is padded with 80 '<pad>' tokens
                  on both the left and the right, and the window starts at the
                  target position within the padded list.

    Returns an OrderedDict mapping each lemma (the part of the sense label
    before the first '%') to `(dataset, text_field, label_field)`, in order of
    first appearance in the file. Every lemma gets its own pair of fields, so
    vocabularies can be built per lemma.

    Raises RuntimeError if a window cannot be filled to its full length.

    NOTE(review): if the file has exactly four columns, `doc_start=4` puts the
    first token of the text in a column of its own and leaves it out of the
    document — confirm against the data format.
    """
    legacy = torchtext.legacy.data
    pad_string = '<pad>'
    half_window = 80
    window = 2 * half_window
    label_column = 0

    def new_fields():
        text_field = legacy.Field(sequential=True, tokenize=lambda x: x.split())
        label_field = legacy.LabelField(is_target=True)
        return text_field, label_field

    text, label = new_fields()
    datafields = [('text', text), ('label', label)]

    examples = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            columns = line.strip().split(maxsplit=doc_start)
            doc = columns[-1]
            if with_padding:
                position = int(columns[2])
                padding = [pad_string] * half_window
                padded = padding + doc.split() + padding
                tokens = padded[position:position + window]
                if len(tokens) != window:
                    print(tokens)
                    raise RuntimeError(
                        f'could not extract a {window}-token window at position {position}')
                doc = " ".join(tokens)
            # The non-padded branch used to call torchtext.data.Example; the
            # legacy namespace is the one used everywhere else in this file.
            examples.append(legacy.Example.fromlist([doc, columns[label_column]], datafields))

    # Group the examples by word-type (e.g. 'keep', 'line') in a single pass.
    # A dict keeps the lemmas in order of first appearance, so the result does
    # not depend on set iteration order.
    by_lemma = OrderedDict()
    for example in examples:
        by_lemma.setdefault(example.label.split("%", 1)[0], []).append(example)

    # Create a clean dataset with fresh fields for each word-type.
    cleaned_datasets = OrderedDict()
    for a_lemma, lemma_examples in by_lemma.items():
        text, label = new_fields()
        datafields = [('text', text), ('label', label)]
        cleaned_data_set = legacy.Dataset(lemma_examples, datafields)
        cleaned_datasets[a_lemma] = (cleaned_data_set, text, label)
    return cleaned_datasets


def evaluate_validation(scores, loss_function, gold):
    """Count the correct predictions in a validation batch and compute its loss.

    A row of `scores` counts as correct when its highest-scoring column equals
    the corresponding entry of `gold`. Returns `(n_correct, loss)` as plain
    Python numbers.
    """
    is_hit = scores.argmax(dim=1) == gold
    n_correct = is_hit.sum().item()
    loss_value = loss_function(scores, gold)
    return n_correct, loss_value.item()


# Train one bidirectional LSTM classifier per lemma and keep, for each lemma,
# the epoch snapshot with the best validation accuracy.
use_pretrained = True
cleaned_datasets = read_data('wsd_train.txt', doc_start=4)

# Per-lemma results; the test cell looks models and vocabularies up by lemma.
models = OrderedDict()
max_verifications = OrderedDict()
model_vocabs = OrderedDict()
model_label_vocabs = OrderedDict()

# Check once whether CUDA is available. Doing it outside the loop means
# `device` exists even if there is no lemma to train.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for lemma, cleaned_dataset in cleaned_datasets.items():
    dataset, text, label = cleaned_dataset

    # Named *_split so the baseline's train() function is not overwritten.
    train_split, valid_split = dataset.split([0.7, 0.3])

    # Vectorizing the data: build the word and label vocabularies from the training part.
    if use_pretrained:
        text.build_vocab(train_split, vectors="glove.6B.100d")
    else:
        text.build_vocab(train_split, max_size=10000)

    model_vocabs[lemma] = text.vocab
    label.build_vocab(train_split)
    model_label_vocabs[lemma] = label.vocab

    model = BILSTMModel(text, label, embedding_dim=100, hidden_dim=100, update_pretrained=True)
    model.to(device)

    training_execution = torchtext.legacy.data.Iterator(
        train_split, device=device, batch_size=128, repeat=False, train=True, sort=False)
    validation_execution = torchtext.legacy.data.Iterator(
        valid_split, device=device, batch_size=128, repeat=False, train=False, sort=False)

    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=2, momentum=0.1)
    # The learning rate is multiplied by 0.96 after every epoch.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.96)

    # The batches are materialised once and reused in every epoch.
    train_batches = list(training_execution)
    valid_batches = list(validation_execution)

    history = defaultdict(list)
    max_val_acc = -1

    print(f"Training lemma for:  {lemma}")
    for i in range(50):
        t0 = time.time()
        loss_sum = 0
        n_batches = 0
        model.train()

        for batch in train_batches:
            scores = model(batch.text)
            loss = loss_function(scores, batch.label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            n_batches += 1

        train_loss = loss_sum / n_batches
        history['train_loss'].append(train_loss)

        n_correct = 0
        n_valid = len(valid_split)
        loss_sum = 0
        n_batches = 0

        model.eval()

        # Start evaluation. No parameter update happens here, so autograd
        # bookkeeping is switched off.
        with torch.no_grad():
            for batch in valid_batches:
                scores = model(batch.text)
                n_corr_batch, loss_batch = evaluate_validation(scores, loss_function, batch.label)
                loss_sum += loss_batch
                n_correct += n_corr_batch
                n_batches += 1
        val_acc = n_correct / n_valid
        val_loss = loss_sum / n_batches

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Snapshot the model whenever the validation accuracy improves. The copy
        # is taken while the model is in eval mode.
        if val_acc > max_val_acc:
            max_val_acc = val_acc
            models[lemma] = copy.deepcopy(model)
            max_verifications[lemma] = max_val_acc

        scheduler.step()

        t1 = time.time()

        if (i+1) % 10 == 0:
            print(f'Epoch {i+1}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}, val acc: {val_acc:.4f}, time = {t1-t0:.4f}, lr = {optimizer.param_groups[0]["lr"]}')

    # Plot the losses and accuracies seen during training for this lemma.
    plt.plot(history['train_loss'])
    plt.plot(history['val_loss'])
    plt.plot(history['val_acc'])
    plt.legend(['training loss', 'validation loss', 'validation accuracy'])
    plt.show()

## Test Bidirectional LSTM

In [None]:
def read_test_data(corpus_file, doc_start):
    """Load the blind test corpus as one torchtext dataset of padded windows.

    Lines are split with `maxsplit=doc_start`; column 1 carries the word-type
    (the part before its first '.'), column 2 the target position, and the
    last column the document. Documents are padded with 80 '<pad>' tokens on
    each side and cut to the 160 tokens starting at the target position within
    the padded list.

    Returns the tuple `(dataset, TEXT, WORDTYPE)`.
    Raises RuntimeError (after printing the window) when a window does not
    contain exactly 160 tokens.
    """
    TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=lambda x: x.split())
    WORDTYPE = torchtext.legacy.data.Field()
    datafields = [('text', TEXT), ('wordtype', WORDTYPE)]

    pad_string = '<pad>'
    make_example = torchtext.legacy.data.Example.fromlist

    examples = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            columns = line.strip().split(maxsplit=doc_start)
            first = int(columns[2])
            last = first + 160

            # Same amount of padding on the left and on the right.
            tokens = [pad_string] * 80
            tokens.extend(columns[-1].split())
            tokens.extend([pad_string] * 80)

            sliced = tokens[first:last]
            if len(sliced) != 160:
                print(sliced)
                raise RuntimeError

            lemma = columns[1].split('.')[0]
            examples.append(make_example([" ".join(sliced), lemma], datafields))

    return (torchtext.legacy.data.Dataset(examples, datafields), TEXT, WORDTYPE)


In [None]:
# Predict a sense for every test example with the BiLSTM trained for its lemma,
# printing one predicted label per line.
test_dataset = read_test_data('wsd_test_blind.txt', doc_start=4)

# Pure inference: no gradients are needed.
with torch.no_grad():
    for example in test_dataset[0].examples:
        test_example_string = example.text
        # The wordtype field is tokenised, so the lemma is its first token.
        example_wordtype = example.wordtype[0]

        # Vocabularies that were built for this lemma during training.
        example_vocab = model_vocabs[example_wordtype]
        example_label_vocabs = model_label_vocabs[example_wordtype]

        encoded_example = torch.tensor([example_vocab.stoi[x] for x in test_example_string], device=device)

        # Pick the model for this lemma and make sure dropout is disabled.
        model = models[example_wordtype]
        model.eval()
        model.lstm.flatten_parameters()

        # unsqueeze(1) adds a batch dimension of size one.
        scores = model(encoded_example.unsqueeze(1))
        prediction = scores.argmax(dim=1).item()

        decoded_prediction = example_label_vocabs.itos[prediction]
        print(decoded_prediction)

# Report: 


In this assignment, we implemented three different models: a simple neural network using the continuous bag-of-words (CBoW) approach, a unidirectional LSTM, and a bidirectional LSTM.

The CBoW, LSTM, and BiLSTM models performed well in terms of loss and accuracy, and we achieved higher accuracy than the baseline model.

We used the following external Python libraries:
PyTorch to implement the neural network and some of the data management
scikit-learn for a couple of simple utilities (LabelEncoder and train_test_split).
matplotlib for plotting

To run the program, these libraries need to be installed separately using pip or conda.


It is worth mentioning that we used some functions provided during the lectures. We also took some functions from other online resources, such as GitHub, in order to tokenize the dataset, implement the padding, and create the vocabulary.

..
.
