In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
# from simpletransformers.classification import ClassificationModel
# from simpletransformers.classification import MultiLabelClassificationModel
import pandas as pd
import logging
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import ast
import numpy as np
import re

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# !unzip "/content/drive/My Drive/Colab Notebooks/output.zip"
multi_label_classification = pd.read_csv("/content/threshold_classifier_data.csv", usecols=["text", "label"])
multi_label_classification.head()

Unnamed: 0,text,label
0,"At week 2, 2005, the district of Saponé in Bur...",1
1,In Burkina Faso the district of Batié is in ep...,2
2,"This week, 3 new districts reached the Alert t...",1
3,Burkina Faso: Batié counts 11 cases with 1 dea...,0
4,In Burkina Faso: Batie is always in epidemic (...,2


In [4]:
# summary_state = summary_state.drop(columns=["State", "Count"])
# summary_state = summary_state.rename(columns={"Summary":"text", "Label":"labels"})
# summary_state = summary_state.reset_index(drop=True)

In [5]:
separation_val = int(0.7 * len(multi_label_classification))
separation_test = int(0.85 * len(multi_label_classification))
train_data = multi_label_classification[:separation_val]
val_data = multi_label_classification[separation_val:separation_test]
test_data = multi_label_classification[separation_test:]
print(len(test_data))
train_data

75


Unnamed: 0,text,label
0,"At week 2, 2005, the district of Saponé in Bur...",1
1,In Burkina Faso the district of Batié is in ep...,2
2,"This week, 3 new districts reached the Alert t...",1
3,Burkina Faso: Batié counts 11 cases with 1 dea...,0
4,In Burkina Faso: Batie is always in epidemic (...,2
...,...,...
345,Ethiopia: 5 Woredas reached the alert thresho...,1
346,Ethiopia: The Woreda of Goba Town reached the...,1
347,Kenya: The Suna West Sub County reached the a...,1
348,Niger: The district of Madarounfa after 4 wee...,2


In [6]:
pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-7t2r1o5r
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-7t2r1o5r
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-2.11.0-cp36-none-any.whl size=747880 sha256=edc3451b5dbda572e0cb32242446e7b750395092002e6670d066fc807bea21e7
  Stored in directory: /tmp/pip-ephem-wheel-cache-ca1jalxq/wheels/70/d3/52/b3fa4f8b8ef04167ac62e5bb2accb62ae764db2a378247490e
Successfully built transformers


In [7]:
#This code does language modeling as the first initial step of fine-tuning on a cusotm WHO report of meningitis
#This is so the the model can learn some of the domain specific knowledge necessary to understand some of the indivial countries' reports
!python3 "run_language_modeling.py"  \
    --output_dir=output \
    --model_type=longformer \
    --model_name_or_path="allenai/longformer-base-4096" \
    --per_device_train_batch_size=1\
    --do_train \
    --train_data_file="finetune.txt" \

python3: can't open file 'run_language_modeling.py': [Errno 2] No such file or directory


In [8]:
device = torch.device("cuda")
from transformers import BertModel, BertTokenizer #LongformerModel, LongformerTokenizer, LongformerConfig

In [9]:
# allenai/longformer-base-4096
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
model.cuda()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [10]:
torch.cuda.empty_cache()
#note when encoded the start of every token is added with an <s> for classification purposes
input_ids = tokenizer.encode("Hello, my dog is cute")  # Batch size 1
print(tokenizer.decode(input_ids))
print(tokenizer.decode(input_ids[0]))

[CLS] hello, my dog is cute [SEP]
[ C L S ]


In [11]:
tokenized = multi_label_classification["text"]\
                                    .apply(lambda x: tokenizer.encode(x, add_special_tokens_tokens=True))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
print(max_len)
#padded according to max_length so we can feed the model in batches - padding works as such, add 0s after the length
#of the original sequence l such that the new length is = max_len = 2530
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])




226


In [12]:
np.array(padded.shape)
#Add masking so that know BERT knows where to attend to => so it can ignore padding
#adds a 1 everwhere where the condition of padded != 0 is satisfied, in other words everywhere there isn't padding
attention_mask = np.where(padded != 0, 1, 0)
#add global attention represented with 2 for Longformer- for now just add 1 at the classification <s> token
# attention_mask[:, [0, -1]] = 2
print(attention_mask)
device = torch.device("cuda")

input_ids = torch.tensor(padded).to(device)
attention_mask = torch.tensor(attention_mask).to(device)

def create_save_feature_representation():
  first_time = True
  for i in range(0,len(input_ids),2):
    with torch.no_grad():
      #predictions, hidden_states = model(input_ids[i:i+2], attention_mask=attention_mask[i:i+2])
      predictions = model(input_ids[i:i+2], attention_mask=attention_mask[i:i+2])
      scores = predictions[0][:,0,:].cpu().numpy()
      if not first_time:
        curr = np.load("lang_features.npy")
        scores = np.concatenate((curr, scores))
      else:
        first_time = False
      np.save("lang_features.npy",scores)

create_save_feature_representation()

[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]


In [13]:
features = np.load("lang_features.npy")
print(multi_label_classification.iloc[0]["label"])
#convert the labels into numpy arrays since they are stored as string representations
# multi_label_classification["label"] = multi_label_classification["label"].apply(lambda x: np.fromstring(x[1:-1], dtype=np.float32, sep=','))

1


In [14]:
train_features, val_features, test_features = features[:separation_val], features[separation_val:separation_test], features[separation_test:]
hidden_size = features.shape[1]

#convert to float datatype that can be then cast to a tensor in PyTorch
labels = np.array([label for label in np.array(multi_label_classification["label"].values)])
train_labels, val_labels, test_labels = labels[:separation_val], labels[separation_val:separation_test], labels[separation_test:]
multi_label_classification.iloc[0]["label"].shape

train_features, val_features, test_features = torch.tensor(train_features).to(device).float(), torch.tensor(val_features).to(device).float(),  torch.tensor(test_features).to(device).float()
train_labels, val_labels, test_labels = torch.tensor(train_labels).to(device).long(), torch.tensor(val_labels).to(device).long(), torch.tensor(test_labels).to(device).long()

In [15]:
def test_model(features, labels):
  preds = neural_network(features).cpu().data.numpy()
  correct = labels.cpu().numpy()
  preds = np.argmax(preds, axis=1).astype(np.float32)
  return np.sum(correct==preds)/len(labels)


epochs = 50
neural_network = nn.Sequential(
                    nn.Linear(hidden_size, 1000),
                    nn.ReLU(),
                    nn.BatchNorm1d(1000),
                    nn.Linear(1000, 800),
                    nn.ReLU(),
                    nn.BatchNorm1d(800),
                    nn.Linear(800, 800),
                    nn.ReLU(),
                    nn.BatchNorm1d(800),
                    nn.Linear(800, 600),
                    nn.ReLU(),
                    nn.BatchNorm1d(600),
                    nn.Linear(600, 400),
                    nn.ReLU(),
                    nn.BatchNorm1d(400),
                    nn.Linear(400, 400),
                    nn.ReLU(),
                    nn.BatchNorm1d(400),
                    nn.Linear(400, 200),
                    nn.ReLU(),
                    nn.BatchNorm1d(200),
                    nn.Linear(200, 3),
                    nn.Softmax(),
                    )
neural_network.cuda()
dtype = torch.cuda.FloatTensor
neural_network.type = dtype

criterion = nn.CrossEntropyLoss()
#2e-3
optimizer = optim.Adam(neural_network.parameters(), lr=2.18e-3)

print(train_features.dtype)

for epoch in range(epochs):
  optimizer.zero_grad()
  output = neural_network(train_features)
  loss = criterion(output, train_labels)
  print("Epoch {0} with training loss: {1} val accuracy: {2}".format(epoch, loss, test_model(val_features, val_labels)))
  loss.backward()
  optimizer.step()    

torch.float32
Epoch 0 with training loss: 1.1105479001998901 val accuracy: 0.38666666666666666
Epoch 1 with training loss: 0.8044092655181885 val accuracy: 0.64
Epoch 2 with training loss: 0.6996657252311707 val accuracy: 0.8
Epoch 3 with training loss: 0.6282626390457153 val accuracy: 0.8933333333333333
Epoch 4 with training loss: 0.5923308730125427 val accuracy: 0.9066666666666666
Epoch 5 with training loss: 0.580305814743042 val accuracy: 0.92
Epoch 6 with training loss: 0.5714171528816223 val accuracy: 0.92
Epoch 7 with training loss: 0.5677136778831482 val accuracy: 0.92
Epoch 8 with training loss: 0.5625181198120117 val accuracy: 0.92
Epoch 9 with training loss: 0.5583311915397644 val accuracy: 0.9066666666666666
Epoch 10 with training loss: 0.5676904320716858 val accuracy: 0.88
Epoch 11 with training loss: 0.5558918118476868 val accuracy: 0.88
Epoch 12 with training loss: 0.5556126236915588 val accuracy: 0.9066666666666666


  input = module(input)


Epoch 13 with training loss: 0.5555440187454224 val accuracy: 0.9066666666666666
Epoch 14 with training loss: 0.5553212761878967 val accuracy: 0.8933333333333333
Epoch 15 with training loss: 0.5552805662155151 val accuracy: 0.9066666666666666
Epoch 16 with training loss: 0.5567911863327026 val accuracy: 0.8933333333333333
Epoch 17 with training loss: 0.554970383644104 val accuracy: 0.9066666666666666
Epoch 18 with training loss: 0.5549374222755432 val accuracy: 0.8933333333333333
Epoch 19 with training loss: 0.554884135723114 val accuracy: 0.92
Epoch 20 with training loss: 0.5548152327537537 val accuracy: 0.9333333333333333
Epoch 21 with training loss: 0.5547935366630554 val accuracy: 0.9333333333333333
Epoch 22 with training loss: 0.5547341704368591 val accuracy: 0.9466666666666667
Epoch 23 with training loss: 0.5546978712081909 val accuracy: 0.9466666666666667
Epoch 24 with training loss: 0.554673969745636 val accuracy: 0.9333333333333333
Epoch 25 with training loss: 0.55465906858444

In [16]:
def convert_to_thresholds(data):
  return data>0.5



print(test_model(test_features, test_labels))

0.9866666666666667


  input = module(input)


##Tests on simplified 2 state binary classification problem
**Initial results tested with 20 labels:**
F1 Score: 0.53
ROC-AUC: 0.51

**Tested on only the most present 8 labels** - as done with the forecasting pipeline (to check if this is a signal to noise ratio)
F1 Score: 0.558
ROC-AUC: 0.501

So the issue here isn't due to noise - the Longformer is just not able to learn a meaningful enough feature representation that can map to the labels.

Let's try to finetune our Longformer model first on some domain-specific text - reports from the WHO that outline how they plan to defeat Meningitis by 2030. After finetuning on this data, let's try to regenerate our feature representations of our report text and see if that changes our results with the multilabel classifier

**After training a pre-trained model to do language modeling**
F1 Score: 0.5329
ROC-AUC: 0.566

The results did improve but not significantly

**SIDE TEST** 
Labels were changed to reflect if epidemic was mentioned (i.e. reaching epidemic level) = 2, alert level = 1, otherwise 0 and the model does surprisingly well achieving 85-91% accuracy
Conclusion = BERT is able to capture some semantic, latent meaning but the labels might be otherwise too noisy to do so. Question -> what tools do we extract in order to provide potentially meaningful information to our regression?

Idea rn: symbolic NLU system - looks for epidemic vs. alert - tries to localize location, what else?

In [17]:
#As input to finetune model - need to tokenize and feed in sentences into longformer
#In order to feed text into the longformer - need to pass in 3 things:
#1. input_ids which are the tokenized sentences (with the correct tags e.g. <s>)
#2. input_masks telling model where to pay attention to - i.e. where to ignore padding etc.
#3. token type ids which tell models about relationship of different sentences (e.g. relevant to question and answering datasets) 
# since our input consists of single sentences, we don't need it however

def tokenize_sentences(sentences):
  input_ids, attention_masks = [], []
  for sentence in sentences:
    #encodings is a dictionary with the encoded sequence and additional information like the attention mask
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=4096, pad_to_max_length=True, return_attention_mask=True)
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])
  return input_ids, attention_masks

with open("finetune.txt", "r") as f:
  corpus = f.read().strip()
  corpus = corpus.replace('."', '".')
  #split the corpus into an array of sentences - each sentence will be its own item in the array
  sentences = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", corpus)
  sep = int(0.8*len(sentences))
  #save the last 20% of the data for test
  train_dataset, test_dataset = sentences[:sep], sentences[sep:]

input_ids, attention_masks = tokenize_sentences(train_dataset)
print(len(input_ids)*4096)

train_dataset, test_dataset = map(tokenize_sentences, train_sentences), map(tokenize_sentences, test_sentences)
print(train_dataset)

# config = LongformerConfig(dropout=0.2, attention_dropout=0.2)
# config.output_hidden_states = False
#note we're gonna be using the Longformer model we defined above so we don't need to define a new model here

torch.cuda.empty_cache()

FileNotFoundError: ignored

In [None]:

# Visualizing Attention from Model
!git clone https://github.com/jessevig/bertviz.git

In [18]:
from bertviz.bertviz import head_view

In [19]:
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [20]:
def show_head_view(model, tokenizer, sentence_a, sentence_b=None):
    inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True).to(device)
    input_ids = inputs['input_ids']
    if sentence_b:
        token_type_ids = inputs['token_type_ids']
        attention = model(input_ids, token_type_ids=token_type_ids)[-1]
        sentence_b_start = token_type_ids[0].tolist().index(1)
    else:
        attention = model(input_ids)[-1]
        sentence_b_start = None
    input_id_list = input_ids[0].tolist() # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)    
    head_view(attention, tokens, sentence_b_start)

In [24]:
sen_1, sen_2 = multi_label_classification["text"].iloc[1], multi_label_classification["text"].iloc[2]
sen_42 = multi_label_classification["text"].iloc[85]
print(sen_42)

call_html()
show_head_view(model, tokenizer, sen_42)

 Niger: District in epidemic : Agadez, Tessaoua, Keita, Goure, Magaria, Matameye, Mirriah, Tanout Madarounfa, et Zinder.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>