In [1]:
# Mount Google Drive at /content/drive so that files stored there (see the
# SAVE path used by the later cells) are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Drive folder that holds the model checkpoint, the pickled triples and the
# saved prediction arrays. Later cells build paths as `SAVE + filename`, so the
# trailing slash is required.
SAVE = '/content/drive/My Drive/COLAB/BertClassifier/'

In [3]:
# Install the pinned library versions this notebook was written against.
# Alternative (disabled): install everything from the requirements file.
# !pip install -r colab_requirements.txt
!pip install transformers==3.0.2
!pip install torch==1.5.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
# The runtime may need to be restarted after executing this cell.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.0/769.0 KB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp38-cp38-manylinux1_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 M

In [4]:
import os
import re
import csv
import math
import time
import random
from tqdm import tqdm
import pickle as pkl

import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AutoModel, AutoTokenizer
from transformers import modeling_bert

In [5]:
class PubMedBERT(nn.Module):
  """
  PubMedBERT encoder with a two-class classification head.

  The published PubMedBERT weights do not include a classification head, so
  one is added here in the same manner as the other HuggingFace BERT
  classifiers: dropout on the pooled output followed by a linear layer.

  Attributes:
    weight_path: model identifier of the pretrained encoder (also used by the
      notebook to load the matching tokenizer).
    bert: the pretrained encoder returned by ``AutoModel.from_pretrained``.
    drop: dropout (p=0.1) applied to the pooled output.
    fc_out: linear layer mapping the pooled output to 2 logits.
  """
  def __init__(self):
    super().__init__()
    self.weight_path = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    self.bert = AutoModel.from_pretrained(self.weight_path)
    self.drop = nn.Dropout(0.1, False)
    # Size the head from the encoder's own config instead of hard-coding 768,
    # so the layer always matches the width of the pooled output.
    self.fc_out = nn.Linear(self.bert.config.hidden_size, 2, True)

  def forward(self, input_ids, token_type_ids, attention_masks,
              return_dict=None):
    """
    Classify a batch of tokenized inputs.

    Args:
      input_ids: token ids, passed to the encoder as its first argument.
      token_type_ids: forwarded to the encoder as ``token_type_ids``.
      attention_masks: forwarded to the encoder as ``attention_mask``.
      return_dict: when truthy, return a ``SequenceClassifierOutput``;
        otherwise (the default) return a tuple.

    Returns:
      If ``return_dict`` is falsy: ``(logits,) + outputs[2:]`` where
      ``outputs`` is whatever the encoder returned.
      Otherwise: a ``SequenceClassifierOutput`` with ``loss=None``.
      No loss is ever computed by this module.
    """
    outputs = self.bert(input_ids,
                        attention_mask=attention_masks,
                        token_type_ids=token_type_ids)
    # Index 1 of the encoder output is the pooled representation.
    pooled_output = self.drop(outputs[1])
    logits = self.fc_out(pooled_output)

    if not return_dict:
      return (logits,) + outputs[2:]

    # The encoder may hand back a plain tuple, which has no named fields;
    # fall back to None instead of raising AttributeError in that case.
    return modeling_bert.SequenceClassifierOutput(
        loss=None,
        logits=logits,
        hidden_states=getattr(outputs, "hidden_states", None),
        attentions=getattr(outputs, "attentions", None))

In [6]:
# Build the classifier and restore the fine-tuned weights from Drive.
bert = PubMedBERT()  # Best performing model
# Another checkpoint that can be loaded the same way:
#   'bert_semmeddbds_semantic_predication_filter.pth'
# map_location pins every tensor in the checkpoint to the CPU while loading, so
# the file also loads on a runtime without a GPU. The model is moved to the
# selected device in a later cell.
bert.load_state_dict(torch.load(
    SAVE+'bert_cord_semmeddb2020_semantic_predication_filter.pth',
    map_location=torch.device('cpu')))
# Switch to evaluation mode (disables dropout) before running inference.
bert.eval()

Downloading:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

PubMedBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [7]:
# Prefer the GPU when the runtime has one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model's parameters to the chosen device.
bert.to(device)

PubMedBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [8]:
# Read the pickled triples from Drive.
# NOTE: unpickling can execute arbitrary code, so only load files you produced
# yourself or otherwise trust.
with open(SAVE+"filtered_triples_final.pkl", mode='rb') as handle:
    triples = pkl.load(handle)

In [9]:
# Collect element 14 of every triple record, in the mapping's iteration order.
# NOTE(review): index 14 is presumably the sentence text - confirm against the
# code that produced filtered_triples_final.pkl.
sentences = [record[14] for record in triples.values()]

In [10]:
# Load the tokenizer for the same pretrained model identifier the encoder was
# created from (bert.weight_path), so tokenization matches the model vocabulary.
tokenizer = AutoTokenizer.from_pretrained(bert.weight_path)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [11]:
# Run the classifier over every sentence in fixed-size batches and checkpoint
# the predictions to Drive along the way.
import datetime

# set batch size
test_batch = 10
# number of sentences between two checkpoint saves
checkpoint_every = 10000

preds = []  # one list of predicted class ids per batch
i = 0
n = len(sentences)
next_checkpoint = checkpoint_every

# Inference only: without no_grad() every forward pass would record autograd
# state that is never used, costing memory and time.
with torch.no_grad():
  while i < n:
    # The last batch may be shorter than test_batch.
    end = min(i + test_batch, n)

    encoding = tokenizer(sentences[i:end], return_tensors='pt', padding=True, truncation=True, verbose=False)

    input_ids = encoding['input_ids'].to(device)
    token_type_ids = encoding['token_type_ids'].to(device)
    attn_mask = encoding['attention_mask'].to(device)

    # output[0] holds the logits; the predicted class is the most probable one.
    output = bert(input_ids, token_type_ids, attn_mask)
    preds.append(torch.argmax(F.softmax(output[0], dim=1), dim=1).cpu().tolist())

    i += test_batch

    # Checkpoint each time another `checkpoint_every` sentences are done (this
    # also works when test_batch does not divide checkpoint_every) and once
    # more after the final batch.
    if i >= next_checkpoint or end == n:
      next_checkpoint += checkpoint_every

      # A short final batch makes the list ragged. Recent NumPy refuses to
      # build an array from ragged input unless dtype=object is requested,
      # so ask for it explicitly in that case only; the regular case still
      # produces the same 2-D integer array as before.
      ragged = len(preds[-1]) != len(preds[0])
      preds_np = np.array(preds, dtype=object) if ragged else np.array(preds)
      np.save(SAVE+"number.npy", preds_np)

      current_time = datetime.datetime.now()
      print(str(current_time))
      print(i)


2023-01-14 13:47:46.906466
10000
2023-01-14 13:48:36.439073
20000
2023-01-14 13:49:27.892182
30000
2023-01-14 13:50:18.285844
40000
2023-01-14 13:51:10.257700
50000
2023-01-14 13:52:04.210845
60000
2023-01-14 13:52:54.886751
70000
2023-01-14 13:53:44.976833
80000
2023-01-14 13:54:34.943585
90000
2023-01-14 13:55:25.724415
100000
2023-01-14 13:56:16.610854
110000
2023-01-14 13:57:07.199727
120000
2023-01-14 13:57:59.235310
130000
2023-01-14 13:58:49.466735
140000
2023-01-14 13:59:41.094059
150000
2023-01-14 14:00:31.939923
160000
2023-01-14 14:01:22.234770
170000
2023-01-14 14:02:11.108854
180000
2023-01-14 14:03:02.482800
190000
2023-01-14 14:03:53.251073
200000
2023-01-14 14:04:43.875367
210000
2023-01-14 14:05:35.236348
220000
2023-01-14 14:06:25.543962
230000
2023-01-14 14:07:16.452380
240000
2023-01-14 14:08:07.154072
250000
2023-01-14 14:08:58.641164
260000
2023-01-14 14:09:49.672728
270000
2023-01-14 14:10:39.828168
280000
2023-01-14 14:11:31.370336
290000
2023-01-14 14:12:23.005

In [None]:
# Resume support: reload the most recent checkpoint written by the inference
# loop and restore the loop state from it.
# NOTE: the inference cell re-initialises `preds` and `i` at its top, so those
# two assignments have to be skipped when continuing from a checkpoint.
breakfile = np.load(SAVE+"number.npy")

# The checkpoint stores one prediction per processed sentence, so the total
# element count is the index of the next sentence to process. (The previous
# `len(breakfile)*10000` counted batches and multiplied by the checkpoint
# interval, which overshoots.)
i = int(breakfile.size)

# Restore under the name the inference loop appends to. `pred` is kept as an
# alias because that is the name this cell used to define.
preds = breakfile.tolist()
pred = preds

In [16]:
# Save
# Write the final predictions (one entry per batch) to Drive.
# If the batches do not all have the same length (e.g. a short final batch)
# the list is ragged; recent NumPy only builds an array from ragged input when
# dtype=object is requested explicitly, so do that in the ragged case only.
ragged = len({len(batch) for batch in preds}) > 1
preds_np = np.array(preds, dtype=object) if ragged else np.array(preds)
np.save(SAVE+"cls_results.npy", preds_np)