In [146]:
#Libraries

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import re
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from tqdm.autonotebook import tqdm
from functools import partial
import torch
import random
from sklearn.model_selection import train_test_split
#!pip install transformers
#from transformers import BertTokenizer, BertModel
#import spacy

In [147]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

print(f'GPU available: {torch.cuda.is_available()}')
random.seed(10)

Sun May  2 06:10:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    28W /  70W |   2708MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [148]:
# Report CUDA availability, then pick the GPU when there is one and the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


## Vocabulary
This is useful only for the decoder; we get the vocab from the complete data

In [243]:
df = pd.read_csv("data.csv")
# df = df.iloc[0:10,:]

# (source, target) pairs used to build the vocabulary: column 6 is the source
# text and column 5 the target, both addressed by position.
# `.iloc` is used instead of `df.loc[i][6]`: integer `[]` on a row Series relies
# on a deprecated positional fallback, and building one row Series per
# iteration is needlessly slow.
text = list(zip(df.iloc[:, 6], df.iloc[:, 5]))

In [244]:
# Special tokens and their reserved ids (ids 0-3 are never assigned to real words).
pad_word, bos_word, eos_word, unk_word = "<pad>", "<s>", "</s>", "<unk>"
pad_id, bos_id, eos_id, unk_id = range(4)
    
def normalize_sentence(s):
    """Normalize a sentence for whitespace tokenization.

    Applies three substitutions in order:
      1. put a space in front of every '.', '!' or '?';
      2. replace every run of characters other than ASCII letters and
         '.', '!', '?' (digits, other punctuation, whitespace) with one space;
      3. collapse runs of whitespace to a single space.
    The result is stripped of leading/trailing whitespace.
    """
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    return s.strip()

class Vocabulary:
    """Word-level vocabulary mapping normalized tokens to integer ids and back.

    The four special tokens (pad, bos, eos, unk) occupy ids 0-3; every new
    word seen by `add_words_from_sentence` receives the next free id.
    """

    def __init__(self):
        specials = ((pad_word, pad_id), (bos_word, bos_id), (eos_word, eos_id), (unk_word, unk_id))
        self.word_to_id = {word: idx for word, idx in specials}
        # Occurrence counts for regular (non-special) words only.
        self.word_count = {}
        self.id_to_word = {idx: word for word, idx in specials}
        self.num_words = 4

    def get_ids_from_sentence(self, sentence):
        """Return `[bos] + word ids + [eos]` for the normalized sentence.

        Words missing from the vocabulary are mapped to the unk id.
        """
        tokens = normalize_sentence(sentence).split()
        body = [self.word_to_id.get(token, unk_id) for token in tokens]
        return [bos_id] + body + [eos_id]

    def tokenized_sentence(self, sentence):
        """Return the token strings (including bos/eos/unk markers) for a sentence."""
        return [self.id_to_word[word_id] for word_id in self.get_ids_from_sentence(sentence)]

    def decode_sentence_from_ids(self, sent_ids):
        """Join the words for `sent_ids` with spaces, skipping bos, eos and pad ids."""
        skipped = (bos_id, eos_id, pad_id)
        return ' '.join(self.id_to_word[word_id] for word_id in sent_ids if word_id not in skipped)

    def add_words_from_sentence(self, sentence):
        """Add every word of the normalized sentence to the vocabulary and count it."""
        for word in normalize_sentence(sentence).split():
            if word in self.word_to_id:
                # Known word: just bump its count.
                self.word_count[word] += 1
                continue
            # New word: give it the next free id.
            new_id = self.num_words
            self.word_to_id[word] = new_id
            self.id_to_word[new_id] = word
            self.word_count[word] = 1
            self.num_words = new_id + 1

# Build the vocabulary from both sides of every (source, target) pair.
vocab = Vocabulary()
for src, tgt in text:
    for sentence in (src, tgt):
        vocab.add_words_from_sentence(sentence)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 1696


## Create chunks for each publication

In [245]:
# Every publication is mapped to a variable number of chunks (split on sentence
# boundaries), each holding at most about chunk_max_len space-separated words.
# Chunks stay as strings so they can be encoded and padded per batch later.
chunk_max_len = 512
publication_ids = df['Id']
dataset_label = df['dataset_label']


def split_into_chunks(publication_text, label, max_len):
    """Split one publication into sentence-aligned chunks.

    Args:
        publication_text: full text of the publication.
        label: dataset label string searched for in each chunk.
        max_len: word budget per chunk, measured as `len(chunk.split(' '))`.

    Returns:
        (chunks, labels): `chunks` is a list of strings in which every sentence
        is terminated by '. '; `labels[j]` is 1 if `label` occurs verbatim in
        `chunks[j]`, else 0. There is always at least one chunk.

    NOTE(review): sentences are split on every '.', then re-joined with '. ',
    so a label that itself contains a period not followed by a space (e.g. an
    abbreviation) can never match a chunk. A label straddling two chunks is
    missed as well.
    """
    chunks, labels = [], []

    def flush(chunk):
        chunks.append(chunk)
        labels.append(1 if label in chunk else 0)

    chunk = ''
    for raw_sentence in publication_text.split('.'):
        sentence = raw_sentence.strip()
        if not sentence:
            # Empty piece (e.g. after the final period): nothing to add.
            continue
        sentence += '. '
        candidate = chunk + sentence
        # Flush only a non-empty chunk; a single oversized sentence becomes a
        # chunk of its own instead of producing an empty chunk.
        if chunk and len(candidate.split(' ')) > max_len:
            flush(chunk)
            # Restart from the terminated sentence so it is not glued to the next one.
            chunk = sentence
        else:
            chunk = candidate
    if chunk or not chunks:
        flush(chunk)
    return chunks, labels


chunked_text = []  # publication x chunks - left in string format for flexibility in encoding
chunk_labels = []  # publication x chunks - 1 if the label is in the chunk, else 0
for publication_text, label in zip(df['text'], df['dataset_label']):
    chunks, labels_for_chunks = split_into_chunks(publication_text, label, chunk_max_len)
    chunked_text.append(chunks)
    chunk_labels.append(labels_for_chunks)

print(len(chunked_text[0]), chunked_text[0])
print(chunk_labels[0])

1 ["This article investigates an important factor in student achievement-parental involvement. Using data from the National Education Longitudinal Study (NELS), we estimate a value-added education production function that includes parental effort as an input. Parental effort equations are also estimated as a function of child, parent, household, and school characteristics. Our results suggest that parental effort has a strong positive effect on achievement that is large relative to the effect of school resources and is not captured by family background variables. Parents appear to reduce their effort in response to increased school resources, suggesting potential ''crowding out'' of school resources. "]
[1]


## Create dataset 
For each publication, the dataset returns all of that publication's chunks; the collate function then turns the chunks of a batch into tensors.
Therefore, each pass of our bidirectional GRU encoder works on the chunks of the publications in one batch.

In [246]:
from transformers import BertModel, BertTokenizerFast
# Pretrained tokenizer plus BERT encoder; the encoder is moved to `device` and
# put in eval mode (it is only used for inference here).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device).eval()


In [247]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class ChunkedDataset(Dataset):
    """Dataset with one item per publication.

        @author: Alexander Rodriguez
    """

    def __init__(self, publication_ids, chunked_text, chunk_labels, dataset_label, device, tokenizer, bert_model):
        """
        Args:
            publication_ids: indexable collection of publication ids
            chunked_text: per publication, the list of its chunks (str)
            chunk_labels: per publication, one flag per chunk telling whether
                the label is in that chunk
            dataset_label: per publication, the label string (same label for
                all chunks of the publication)
            device: cpu or cuda (stored only)
            tokenizer, bert_model: stored only; not used by this class
        """
        self.publication_ids, self.chunked_text = publication_ids, chunked_text
        self.chunk_labels, self.dataset_label = chunk_labels, dataset_label
        self.tokenizer, self.device, self.bert_model = tokenizer, device, bert_model

    def __len__(self):
        """Number of publications."""
        return len(self.publication_ids)

    def __getitem__(self, idx):
        """Return the id, chunks, chunk labels and dataset label of publication `idx`."""
        index = idx.tolist() if torch.is_tensor(idx) else idx
        fields = ("publication_ids", "chunked_text", "chunk_labels", "dataset_label")
        return {field: getattr(self, field)[index] for field in fields}

def collate_fn(data):
    """Build one mini-batch of chunks from several publications.

    Every chunk of every publication in `data` becomes one training
    observation. Uses the notebook-level `tokenizer`, `bert_model`, `device`,
    `vocab` and `pad_id`.

    Returns a dictionary with:

          input_ids: the word ids from the Bert tokenizer
                tensor shape (max_input_sequence_length,batch_size)

          input_tensor: the Bert word embeddings for the sequence (chunk)
                tensor shape (max_input_sequence_length,batch_size,bert_dim)

          attention_mask: tokenizer attention mask, as returned by the
                tokenizer (not transposed); tells where each sequence ends

          chunk_labels: (list of 0/1) whether or not the chunk contains the label

          output_ids: the ids that have to be predicted for the target sequence
                tensor shape (max_output_sequence_length,batch_size)

    Sequences are padded to the maximum length of mini-batch sequences (dynamic padding).
    """
    chunked_text, chunk_labels, dataset_label = [], [], []
    for publication in data:
        labels = publication["chunk_labels"]
        chunked_text.extend(publication["chunked_text"])
        chunk_labels.extend(labels)
        # The publication-level label is repeated once per chunk.
        dataset_label.extend([publication["dataset_label"]] * len(labels))

    # BERT is only a feature extractor here: no gradients (saves memory).
    with torch.no_grad():
        encoded = tokenizer(chunked_text, padding=True, truncation=True, return_tensors="pt").to(device)
        bert_outputs = bert_model(**encoded)
        # First output, transposed to sequence-first layout.
        bert_input_word_embeddings = bert_outputs[0].permute(1, 0, 2)
        del bert_outputs
        torch.cuda.empty_cache()

    # Encode the target labels with the word-level vocabulary, then pad them
    # into one sequence-first tensor.
    target_tensors = [torch.LongTensor(vocab.get_ids_from_sentence(tgt)) for tgt in dataset_label]
    output_ids = pad_sequence(target_tensors, padding_value=pad_id).to(device)

    return {
        "input_ids": encoded['input_ids'].permute(1, 0),
        "chunk_labels": chunk_labels,
        "output_ids": output_ids,
        "input_tensor": bert_input_word_embeddings,
        'attention_mask': encoded['attention_mask'],
    }

In [248]:
# do not use, this is only for debugging
# data = pd.read_csv("data.csv")
# with torch.no_grad():
#   t = tokenizer(data['text'].tolist()[0:16], padding=True, truncation=True, return_tensors="pt").to(device)
#   outputs = bert_model(**t)
#   encoded_layers = outputs[0]
#   del outputs
# torch.cuda.empty_cache()


In [249]:
# Create the DataLoader for all publications
batch_size = 4  # this means it's 4 publications per batch ---too large may not fit in GPU memory
dataset = ChunkedDataset(
    publication_ids=publication_ids,
    chunked_text=chunked_text,
    chunk_labels=chunk_labels,
    dataset_label=dataset_label,
    device=device,
    tokenizer=tokenizer,
    bert_model=bert_model,
)
# Publications are shuffled every epoch; collate_fn flattens them into chunks.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

## Seq2seq model 
Uses Bert word embeddings
Makes two predictions for each chunk

In [250]:

import torch.nn as nn
class Seq2seq(nn.Module):
    """GRU encoder-decoder over Bert embeddings with dot-product attention.

    For each chunk the model makes two predictions: the target label sequence
    (decoder + `output_layer`) and a binary "chunk contains the label" logit
    (`classifier`).

      @author: Alexander Rodriguez
    """

    def __init__(self, vocab, bert_dim = 300, emb_dim = 300, hidden_dim = 300, num_layers = 2, dropout=0.1):
        """
        Args:
            vocab: object exposing `num_words` (size of the output vocabulary)
            bert_dim: dimension of Bert embeddings
            emb_dim: dimension of our word embedding (used in decoder)
            hidden_dim: dimension of our GRU hidden states
            num_layers: number of stacked GRU layers in encoder and decoder
            dropout: dropout passed to both GRUs
        """
        super().__init__()

        self.bert_dim = bert_dim
        self.num_words = vocab.num_words
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # neural layers
        # NOTE(review): the decoder input "embedding" is a Linear(1, emb_dim)
        # applied to the token id as a scalar float, not an nn.Embedding lookup.
        # Kept unchanged so that existing checkpoints still load.
        self.embedding_layer = nn.Linear(1,self.emb_dim)
        self.encoder = nn.GRU(
            self.bert_dim,self.hidden_dim,self.num_layers,bidirectional=True,dropout=dropout
            )
        self.linear_hidden = nn.Linear(self.hidden_dim,self.hidden_dim)
        self.decoder = nn.GRU(
            self.emb_dim,self.hidden_dim,self.num_layers,bidirectional=False,dropout=dropout
            )
        self.output_layer = nn.Linear(self.hidden_dim,self.num_words)
        self.classifier = nn.Linear(self.hidden_dim, 1)
        self.attn_softmax = nn.Softmax(1)

    def encode(self, input_embeddings, attention_mask):
        """Encode the source batch using a bidirectional GRU encoder.

        Args:
            input_embeddings: Bert embeddings with shape (max_input_sequence_length,
                  batch_size,bert_dim), e.g. torch.Size([512, 16, 768])

            attention_mask: attention mask obtained from the Bert tokenizer,
                  shape (batch_size, max_input_sequence_length), non-zero at
                  real tokens and 0 at padding.

        Returns:
            A tuple with three elements:
                encoder_output: sum of the forward and backward GRU outputs,
                    shape (max_input_sequence_length, batch_size, hidden_size).
                encoder_mask: boolean tensor with shape
                    (max_input_sequence_length, batch_size); True at positions
                    corresponding to padding tokens and False elsewhere.
                encoder_hidden: final hidden states summed over the two
                    directions and projected by `linear_hidden`, shape
                    (num_layers, batch_size, hidden_size); used to initialize
                    the decoder.
        """
        batch_size = input_embeddings.shape[1]

        # gru pass (sequence-first layout)
        encoder_output, encoder_hidden = self.encoder(input_embeddings)

        # sum the outputs of the two directions
        encoder_output = encoder_output[:,:,:self.hidden_dim] + encoder_output[:,:,self.hidden_dim:]

        # h_n is laid out as (num_layers * 2, batch, hidden): separate the
        # direction axis, then sum over it while keeping the layer axis.
        encoder_hidden = encoder_hidden.view(self.num_layers, 2, batch_size, self.hidden_dim).sum(1)
        encoder_hidden = self.linear_hidden(encoder_hidden)

        # The tokenizer mask marks real tokens; the decoder needs the opposite
        # (True at padding), sequence-first.
        encoder_mask = (attention_mask == 0).permute(1,0)

        return encoder_output, encoder_mask, encoder_hidden

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask, use_classifier=False):
        """Run the decoder GRU for one decoding step from the last hidden state.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing
                the word indices for the current decoder input. A 1-D tensor of
                shape (batch_size,) is also accepted.
            last_hidden: tensor h_{t-1} with shape (num_layers, batch_size,
                hidden_size). For the first decoding step this is the
                encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: boolean tensor with shape (max_src_sequence_length,
                batch_size), True at padding positions; those positions get
                zero attention weight. May be None to attend everywhere.
            use_classifier: (boolean) Whether or not we should classify

        Returns:
            A tuple with four elements:
                logits: shape (batch_size, vocab_size), unnormalized scores for
                    the next-word prediction.
                decoder_hidden: tensor h_n with the same shape as last_hidden.
                attention_weights: shape (batch_size, max_src_sequence_length),
                    rows sum to 1.
                out_classifier: shape (batch_size, 1) logits when
                    `use_classifier` is True, otherwise a scalar zero tensor.
        """
        if decoder_input.dim() == 1:
            decoder_input = decoder_input.unsqueeze(0)

        # (1, batch) ids -> (1, batch, 1) floats -> (1, batch, emb_dim)
        embedded = self.embedding_layer(decoder_input.to(torch.float).unsqueeze(2))
        decoder_output, decoder_hidden = self.decoder(embedded, last_hidden)
        query = decoder_output.squeeze(0)  # (batch, hidden)

        # dot-product attention scores for all source positions at once
        scores = torch.einsum('bh,sbh->bs', query, encoder_output)
        if encoder_mask is not None:
            # padding positions must not receive attention
            scores = scores.masked_fill(encoder_mask.permute(1,0).to(torch.bool), float('-inf'))
        attention_weights = self.attn_softmax(scores)

        # context vector: attention-weighted sum of encoder outputs
        context = (encoder_output.permute(1,0,2) * attention_weights.unsqueeze(2)).sum(1)

        logits = self.output_layer(query + context)

        # The classifier sees the previous hidden state summed over all layers
        # plus the attention context.
        if use_classifier:
            out_classifier = self.classifier(last_hidden.sum(0) + context)
        else:
            out_classifier = logits.new_zeros(())

        return logits, decoder_hidden, attention_weights, out_classifier

    def compute_loss(self, input_tensor, attention_mask, target_seq, target_binary):
        """Run the model on the source and compute the loss on the target.

        Args:
            input_tensor & attention_mask:
                  Coming from Bert, directly go to encoder
                  See encoder documentation for details

            target_seq: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing word indices for the target sentences.

            target_binary: Binary indicator per chunk (list or tensor), 1 if
                the label is in that chunk.
                NOTE: this is also used as a mask for the sequence loss.

        Returns:
            A scalar float tensor: the sequence cross-entropy (pad targets
            ignored, chunks with target_binary == 0 masked out) divided by the
            number of non-pad entries of `target_seq`, plus the summed binary
            cross-entropy of the chunk classifier.
        """
        # loss criterion, ignoring pad id tokens
        criterion = nn.CrossEntropyLoss(ignore_index=pad_id,reduction='none')
        criterion_classification = nn.BCEWithLogitsLoss(reduction='sum')

        encoder_output, encoder_mask, encoder_hidden = self.encode(input_tensor, attention_mask)

        # Work on the device of the inputs instead of a notebook-level global.
        run_device = input_tensor.device
        target_binary = torch.as_tensor(target_binary, dtype=torch.float, device=run_device)
        total_loss = torch.zeros((), device=run_device)
        # Defined up front so a single-row target cannot leave it unbound.
        class_loss = torch.zeros((), device=run_device)

        last_hidden = encoder_hidden
        for i in range(target_seq.shape[0] - 1):
            # the classifier output is only meaningful on the first step
            use_classifier = (i == 0)
            logits, last_hidden, _, out_classifier = self.decode(
                target_seq[i:i+1], last_hidden, encoder_output, encoder_mask, use_classifier)
            # target_binary masks the sequence loss of chunks without the label
            total_loss = total_loss + (criterion(logits, target_seq[i+1]) * target_binary).sum()
            if use_classifier:
                class_loss = criterion_classification(out_classifier.view(-1), target_binary)

        # denominator of the sequence loss (clamped to avoid division by zero)
        total_target_tokens = torch.sum(target_seq != pad_id).clamp(min=1)
        return total_loss/total_target_tokens + class_loss


In [251]:
import tqdm
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for the given number of epochs and save it to `model_file`.

    Args:
        model: module exposing `compute_loss(input_tensor, attention_mask,
            output_ids, target_binary)` that returns a scalar loss tensor.
        data_loader: sized iterable of batches; each batch is a dict with the
            keys "input_tensor", "attention_mask", "output_ids", "chunk_labels".
        num_epochs: number of passes over `data_loader`.
        model_file: path where `model.state_dict()` is written after training.
        learning_rate: base learning rate; parameters outside the encoder
            group use `learning_rate * 5`.
    """
    # Imported here explicitly: a bare `import tqdm` does not guarantee that the
    # `tqdm.notebook` submodule is loaded. `tqdm.auto` picks the widget bar in
    # notebooks and the console bar elsewhere.
    from tqdm.auto import tqdm as progress_bar, trange

    decoder_learning_ratio = 5.0
    clip = 50.0  # max gradient norm

    # Parameters whose name contains one of these keys use the base learning
    # rate; everything else uses the higher decoder rate.
    encoder_parameter_names = ('embedding_layer', 'encoder', 'linear_hidden')
    encoder_params, decoder_params = [], []
    for name, param in model.named_parameters():
        if any(key in name for key in encoder_parameter_names):
            encoder_params.append(param)
        else:
            decoder_params.append(param)

    optimizer = torch.optim.AdamW(
        [{'params': encoder_params},
         {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}],
        lr=learning_rate)

    for epoch in trange(num_epochs, desc="training", unit="epoch"):
        with progress_bar(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                optimizer.zero_grad()
                loss = model.compute_loss(
                    batch_data["input_tensor"],
                    batch_data["attention_mask"],
                    batch_data["output_ids"],
                    batch_data["chunk_labels"])
                loss_value = loss.item()
                total_loss += loss_value
                loss.backward()
                # Gradient clipping before taking the step
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss_value)
    # Save the model after training
    torch.save(model.state_dict(), model_file)

In [253]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 20

# bert_dim=768 is the input size given to the encoder GRU.
model = Seq2seq(vocab, bert_dim=768, emb_dim=256, hidden_dim=256, num_layers=2)
model = model.to(device)
train(model=model, data_loader=data_loader, num_epochs=num_epochs, model_file="seq2seq_model.pt")
# Download the trained model to local for future use


HBox(children=(FloatProgress(value=0.0, description='training', max=20.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='epoch 1', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 2', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 3', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 4', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 5', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 6', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 7', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 8', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 9', max=3.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='epoch 10', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 11', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 12', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 13', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 14', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 15', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 16', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 17', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 18', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 19', max=3.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='epoch 20', max=3.0, style=ProgressStyle(description_width…





## Evaluation
This code is from Alex Wang; I haven't checked it.

Load model

In [None]:
model = Seq2seq(vocab,bert_dim=768,emb_dim=256,hidden_dim=256,num_layers=2).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime too.
model.load_state_dict(torch.load("seq2seq_model.pt", map_location=device))

<All keys matched successfully>

In [None]:
def predict_greedy(model, sentence, max_length=100):
    """Make predictions for the given input using greedy inference.

    Args:
        model: A sequence-to-sequence module exposing `encode(source)` and
            `decode(decoder_input, hidden, encoder_output, encoder_mask)`;
            only the first two values returned by `decode` (logits, hidden)
            are used.
            NOTE(review): the `Seq2seq.encode` defined in this notebook also
            requires an attention mask, so this helper appears to target a
            different model - confirm before using it with `Seq2seq`.
        sentence: The encoded input, anything `torch.as_tensor` accepts
            (it is converted to a tensor and given a batch axis of size 1).
        max_length: The maximum number of decoding steps, to avoid
            non-terminating inference.

    Returns:
        Model's predicted greedy response for the input, represented as string.
    """
    model.eval()
    # Run on the model's own device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():  # inference only
        source = torch.unsqueeze(torch.as_tensor(sentence, device=run_device), 1)
        # One encode call, then one decode call per generated token.
        encoder_output, encoder_mask, hidden = model.encode(source)
        token = bos_id
        generated = [token]
        for _ in range(max_length):
            if token == eos_id:
                break
            decoder_input = torch.unsqueeze(torch.tensor(token, device=run_device), 0)
            logits, hidden, *_ = model.decode(decoder_input, hidden, encoder_output, encoder_mask)
            token = torch.argmax(logits[0], 0).item()
            generated.append(token)

    return vocab.decode_sentence_from_ids(generated)
score = 0  # running counter; the scoring cell further down resets it before use

def jaccard(str1, str2):
    """Jaccard similarity between the lower-cased, whitespace-split word sets.

    Returns |A & B| / |A | B| as a float in [0, 1]. Two inputs without any
    token (empty or whitespace-only strings) are treated as identical and
    score 1.0 instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size



In [None]:
def predict_beam(model, sentence, k=3, max_length=100, thresh=-9999):
    """Make predictions for the given input using beam search.

    Args:
        model: A sequence-to-sequence module exposing `encode(source)` and
            `decode(decoder_input, hidden, encoder_output, encoder_mask)`;
            only the first two values returned by `decode` (logits, hidden)
            are used.
            NOTE(review): the `Seq2seq.encode` defined in this notebook also
            requires an attention mask, so this helper appears to target a
            different model - confirm before using it with `Seq2seq`.
        sentence: The encoded input, anything `torch.as_tensor` accepts
            (it is converted to a tensor and given a batch axis of size 1).
        k: The size of the beam.
        max_length: The maximum number of generated tokens per hypothesis;
            a hypothesis reaching it is finished even without an eos token.
        thresh: Finished hypotheses whose summed log-probability is not
            greater than this value are dropped from the result.

    Returns:
        A list of at most k decoded strings, sorted in descending order by
        length-normalized score (summed log-probability divided by
        number_of_tokens ** alpha).
    """
    # You are welcome to tweak alpha
    alpha = 0.9
    model.eval()
    # Run on the model's own device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    def top_continuations(token, hidden):
        """One decoder step: the k best (log_prob, token) pairs and the new hidden state."""
        decoder_input = torch.unsqueeze(torch.tensor(token, device=run_device), 0)
        logits, new_hidden, *_ = model.decode(decoder_input, hidden, encoder_output, encoder_mask)
        values, indices = torch.topk(torch.log_softmax(logits[0], 0), k, 0)
        return list(zip(values.tolist(), indices.tolist())), new_hidden

    def is_finished(hypothesis):
        # hypothesis layout: [score, token_1, ..., token_n, hidden]
        return hypothesis[-2] == eos_id or len(hypothesis) - 2 >= max_length

    finished = []  # completed hypotheses, stored without the hidden state
    beams = []     # live hypotheses

    def keep(hypotheses):
        """Route each hypothesis to `finished` or to the new list of live beams."""
        alive = []
        for hypothesis in hypotheses:
            if is_finished(hypothesis):
                finished.append(hypothesis[:-1])
            else:
                alive.append(hypothesis)
        return alive

    with torch.no_grad():  # inference only; scores are kept as plain floats
        source = torch.unsqueeze(torch.as_tensor(sentence, device=run_device), 1)
        encoder_output, encoder_mask, hidden = model.encode(source)

        # First step is taken from a single <s> hypothesis so the beam does not
        # fill up with k identical candidates. A hypothesis that already ends
        # in eos here is finished immediately instead of being expanded.
        first_choices, hidden = top_continuations(bos_id, hidden)
        beams = keep([[log_prob, token, hidden] for log_prob, token in first_choices])

        while beams and len(finished) < k:
            candidates = []
            for beam in beams:
                choices, new_hidden = top_continuations(beam[-2], beam[-1])
                for log_prob, token in choices:
                    candidates.append([beam[0] + log_prob] + beam[1:-1] + [token, new_hidden])
            candidates.sort(key=lambda hypothesis: hypothesis[0], reverse=True)
            # Finished hypotheses keep their slot, so only k - len(finished) survive.
            beams = keep(candidates[:k - len(finished)])

    finished.sort(key=lambda hypothesis: hypothesis[0] / (len(hypothesis) - 1) ** alpha, reverse=True)
    return [vocab.decode_sentence_from_ids(hypothesis[1:])
            for hypothesis in finished if hypothesis[0] > thresh]



[[tensor(-0.9381, device='cuda:0', grad_fn=<AddBackward0>), 13935, 9059, 1806, 2535, 2], [tensor(-1.5582, device='cuda:0', grad_fn=<AddBackward0>), 6536, 52236, 14165, 51055, 2535, 2], [tensor(-1.7241, device='cuda:0', grad_fn=<AddBackward0>), 13329, 43, 4023, 2]]


In [None]:
# A publication counts as correct when at least one of its beams, once decoded
# (the leading score entry is dropped), has a Jaccard score above 0.5 with the
# reference label.
score = sum(
    any(jaccard(test_y[idx], vocab.decode_sentence_from_ids(beam[1:])) > 0.5 for beam in beams)
    for idx, beams in enumerate(predictions)
)
print("max accuracy")
print(score/len(predictions))

max accuracy
0.9286287089013633


In [None]:
# Group the labels of identical inputs so each distinct input is decoded once.
# The full content of the input is the key: keying on np.sum(...) merged
# different inputs that merely share the same sum.
# NOTE(review): assumes each sent_emb[i] converts with np.asarray - confirm.
testing = {}
for i in range(0, len(sent_emb)):
  key = tuple(np.asarray(sent_emb[i]).ravel().tolist())
  if key not in testing:
    predictions = predict_beam(baseline_model, sent_emb[i], thresh=-2.5)
    testing[key] = (predictions, [labels[i]])
  else:
    testing[key][1].append(labels[i])

In [None]:
# Number of distinct inputs that were decoded.
print(len(testing))

2392


In [None]:
# Count true positives, false positives and false negatives over all inputs.
tp = fp = fn = 0
for predicted, gold in testing.values():
  unmatched_predictions = predicted.copy()
  remaining_gold = gold.copy()
  # 1) exact matches: each one consumes a gold label
  for candidate in predicted:
    if candidate in remaining_gold:
      tp += 1
      remaining_gold.remove(candidate)
      unmatched_predictions.remove(candidate)
  # 2) remaining predictions: match the first gold label with Jaccard >= 0.5
  for candidate in unmatched_predictions:
    for gold_label in remaining_gold:
      if jaccard(candidate, gold_label) >= 0.5:
        tp += 1
        remaining_gold.remove(gold_label)
        break
    else:
      fp += 1
  # gold labels nobody matched are misses
  fn += len(remaining_gold)

TRAINING PERFORMANCE

In [None]:
# Report the counts accumulated in tp / fp / fn by the preceding cell.
# This cell belongs to the "TRAINING PERFORMANCE" section, so the banner says
# "training" (the held-out split has its own report further down).
print("training performance")
print("micro F score")
# Raw false-positive and false-negative counts, printed unlabeled in that order.
print(fp)
print(fn)
# tp / (tp + (fp + fn) / 2) is algebraically 2*tp / (2*tp + fp + fn), i.e. F1
# computed from the pooled counts.
print(tp/(tp + 1/2*(fp+fn)))
# The value printed as "accuracy" is tp / (tp + fn): the share of labels that
# were matched by some prediction (recall).
print("accuracy")
print(tp/(tp+fn))

testing performance
micro F score
860
277
0.8223159868729489
accuracy
0.9047455295735901


In [None]:
# Group test labels by input: key -> (beam predictions, [labels sharing that key]).
# The key is the scalar sum of the input array, so predict_beam runs once per
# distinct key.
# NOTE(review): inputs that differ but sum to the same value would be merged.
testing = {}
for i in range(len(test_x)):
  key = np.sum(test_x[i])
  if key in testing:
    # Key already seen: just attach this label to the existing entry.
    testing[key][1].append(test_y[i])
  else:
    predictions = predict_beam(baseline_model, test_x[i], thresh=-2.5)
    testing[key] = (predictions, [test_y[i]])

In [None]:
# Tally true positives, false positives and false negatives across all
# (predictions, labels) groups currently in `testing`.
tp = 0
fp = 0
fn = 0
for prediction, gold_labels in testing.values():
  true_pred = gold_labels.copy()  # shrinks as labels get matched
  check = False  # not read anywhere in this cell
  # Exact pass: a prediction equal to a still-unmatched label scores a true
  # positive; the rest are collected for the similarity pass.
  cop = []
  for candidate in prediction:
    if candidate in true_pred:
      tp += 1
      true_pred.remove(candidate)
    else:
      cop.append(candidate)
  # Similarity pass: take the first unmatched label with Jaccard >= 0.5
  # (greedy); if none qualifies the prediction is a false positive.
  for candidate in cop:
    for gold in true_pred:
      if jaccard(candidate, gold) >= 0.5:
        tp += 1
        true_pred.remove(gold)
        break
    else:
      fp += 1
  # Whatever labels remain were never predicted.
  fn += len(true_pred)

Testing Performance

In [None]:
# Print the summary for the tp / fp / fn totals left by the preceding cell,
# one item per line: banner, heading, raw fp, raw fn, F score from pooled
# counts, then tp / (tp + fn) under the "accuracy" heading.
print(
    "testing performance",
    "micro F score",
    fp,
    fn,
    tp/(tp + 1/2*(fp+fn)),
    "accuracy",
    tp/(tp+fn),
    sep="\n",
)

testing performance
micro F score
792
249
0.6572275271649655
accuracy
0.8003207698476343



Testing on other data whose text does not contain the exact label

In [None]:
# Load the second evaluation set and keep only rows that have text.
# reset_index() is called without drop=True, so the previous index is kept as
# a leading "index" column.
data = (
    pd.read_csv("data_false.csv")
    .loc[lambda frame: frame["text"].notna()]
    .reset_index()
)

In [None]:
# Quick look at the first rows of the frame after rows without text were dropped.
data.head()

Unnamed: 0.1,index,Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,mask
0,3,3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Federal Reserve Bank of Richmond S1. Accounti...,False
1,4,7,1b21f60a-4022-4b19-95ce-6fd7157d4aa9,Examining Latinos Involvement in the Workforce...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,"In this article, the authors report the resul...",False
2,8,14,7a2d20d9-fe83-4d24-b4ce-992f92f21bd2,Immigrant educators and students’ academic ach...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,ABSTRACT\nUsing a dataset which allows studen...,False
3,9,16,a8c7306f-908c-4d44-92b1-2afb2e066808,Demographic Differences in Patterns of Youth O...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Abstract: Participation in structured out-of-...,False
4,13,21,0ebdaf88-543a-4d88-9185-a24f8dc1e4cf,Grade Inflation,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Although much speculation has been devoted to...,False


" In this article, the authors report the results of two studies examining the participation rates of Latino students in postsecondary technical education (CTE) programs in community colleges and two-year proprietary institutions in the United States in 1994 and 2000. It is believed that the quality of the future U.S. Labor market will depend, to a great extent, on this group's education and job skills. Although Latinos are the fastest growing minority group in the United States, they are also the poorest and most undereducated when compared to other minority groups. Results of both studies show that few Latino students enroll in and graduate from postsecondary CTE programs. Of those students that do enroll in and complete CTE programs at the postsecondary level, very few complete programs that are considered high-skill, high-wage."

In [None]:
# Build one entry per document: the text is split into sentences with
# `sent_sep`, and the list of sentence strings is encoded with `sent.encode`.
# The column is addressed by name and sliced positionally instead of using
# `data.loc[j][7]`, which created a full row Series per document and relied on
# integer-position lookup in a label-indexed Series.
# Only the first 4000 rows are used; a shorter frame is processed in full.
sent_emb = []
for doc_text in data["text"].iloc[:4000]:
  sentences = [span.text for span in sent_sep(doc_text).sents]
  sent_emb.append(sent.encode(sentences))

In [None]:
# Reference labels for the first 4000 rows. The column is selected by name
# rather than by position 6 of a per-row Series, which depended on column order
# and on integer-position lookup in a label-indexed Series.
# A frame shorter than 4000 rows simply yields fewer labels.
labels = data["cleaned_label"].iloc[:4000].tolist()

In [None]:
# Count documents for which at least one beam prediction has Jaccard
# similarity > 0.5 with the document's label.
score = 0
for i, embedding in enumerate(sent_emb):
  predictions = predict_beam(baseline_model, embedding)
  # any() short-circuits on the first qualifying prediction.
  if any(jaccard(labels[i], guess) > 0.5 for guess in predictions):
    score += 1

In [None]:
# Fraction of entries in `sent_emb` that were counted in `score`.
hit_rate = score/len(sent_emb)
print(hit_rate)

0.837


In [None]:
# Rebuild `testing` for this data set: key -> (beam predictions, [labels]).
# Documents are keyed by the scalar sum of their embedding array, and
# predict_beam is called only for keys not seen before.
# NOTE(review): distinct embeddings with an identical sum share one entry.
testing = {}
for i in range(len(sent_emb)):
  key = np.sum(sent_emb[i])
  if key in testing:
    # Known key: append the label to the existing group.
    testing[key][1].append(labels[i])
  else:
    predictions = predict_beam(baseline_model, sent_emb[i], thresh=-2.5)
    testing[key] = (predictions, [labels[i]])

In [None]:
# Count true positives, false positives and false negatives over each
# (predictions, labels) pair in `testing`.
tp = 0
fp = 0
fn = 0
for prediction, gold_labels in testing.values():
  true_pred = gold_labels.copy()  # unmatched labels for this group
  check = False  # not read anywhere in this cell
  # First, predictions that equal an unmatched label are true positives; the
  # others are kept for the Jaccard comparison.
  cop = []
  for candidate in prediction:
    if candidate in true_pred:
      tp += 1
      true_pred.remove(candidate)
    else:
      cop.append(candidate)
  # Then each remaining prediction claims the first unmatched label with
  # Jaccard >= 0.5 (greedy); a prediction with no such label is a false positive.
  for candidate in cop:
    for gold in true_pred:
      if jaccard(candidate, gold) >= 0.5:
        tp += 1
        true_pred.remove(gold)
        break
    else:
      fp += 1
  # Labels left over at the end count as false negatives.
  fn += len(true_pred)

In [None]:


# Print the summary for the current tp / fp / fn totals, one item per line:
# heading, raw fp, raw fn, F score from pooled counts, then tp / (tp + fn)
# under the "accuracy" heading.
print(
    "micro F score",
    fp,
    fn,
    tp/(tp + 1/2*(fp+fn)),
    "accuracy",
    tp/(tp+fn),
    sep="\n",
)

micro F score
3328
1140
0.5614448370632116
accuracy
0.715
