## Compulsion using BioBERT Model with expert test

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
df = pd.read_csv("/content/drive/MyDrive/OCD/dataset/posts_with_ontology_labe_and_negative.csv")
df.head()

Unnamed: 0,post,prefLabel,compulsion,obsession,expand1
0,Hi -\nSo I haven't been on here since December...,"['depression', 'weight gain', 'Medication', 'a...",0,1,"Aggressive obsession, Contamination obsession,..."
1,"Hi all, hope you're all having a wonderful ban...","['compulsion', 'anger', 'symptom', 'compassion...",1,1,"Aggressive intrusive thoughts, Contamination i..."
2,"Hi, \nFirst, I hope everyone managed to have s...","['hope', 'happiness', 'guilt', 'fear', 'obsess...",0,1,"Aggressive obsession, Contamination obsession,..."
3,Hello everyone. I could really use your help r...,"['Treatment', 'hope', 'Thought', 'obsession', ...",0,1,"Aggressive obsession, Contamination obsession,..."
4,"Though it comes in many flavors, one of the mo...","['quality', 'Intrusive thoughts', 'fall', 'beh...",1,1,"Aggressive obsession, Contamination obsession,..."


In [5]:
import nltk
nltk.download('stopwords')
sw = stopwords.words('english')

def clean_text(text):

    text = str(text).lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if word.lower() not in sw]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
df['post'] = df['post'].apply(lambda x: clean_text(x))

In [7]:
#######Considering obsession#######################
posts = df.post.values
labels = df.compulsion.values

In [8]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1') #"emilyalsentzer/Bio_ClinicalBERT")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [9]:
max_len = 0
count = 0
# For every sentence...
for sent in posts:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
    if len(input_ids) < 128:
      count +=1

print('Max sentence length: ', max_len, count)


Max sentence length:  1459 6119


In [10]:
input_ids = []
attention_masks = []
max_len = 128 # experiment with 150 and 512
# For every post...
for post in posts:
    # `encode_plus` will: Tokenize the sentence, Prepend the `[CLS]` token to the start, Append the `[SEP]` token to the end.
    #   Map tokens to their IDs, Pad or truncate the sentence to `max_length, Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        post,                    # Sentence to encode.
                        truncation = True,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', posts[0])
print('Token IDs:', input_ids[0])



Original:  hi since december since recovering prozac since depression diagnosed ocd january psych bumped higher dosage prozac mg great job wanted go making gain weight freaking ocd revolves around cheating like knowand recent obsession put quotes around cuz question whether obsession real well let start beginning went fianc friends place drinks played card games fiance whole time except went bathroom remember everything night despite alcohol system came home continued drinking fiance eventually blacked ended dream cheating partner someone seen earlier night well really cheating taking guys hands placing odd place image head nothing else really remember waking realize dream, think stirred went right back sleep dont even remember keep thinking dream actually happened fiance keeps telling whole time nothing happened think dream image day driving nuts always question whether really ocd doc diagnosed make feel better everything true real appt psych tomorrow think going write script zoloftap

In [11]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

#Training dataset split for training and validation based on 80-20.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
#val_size = int(0.2 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

7,597 training samples
1,900 validation samples


In [12]:
# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 16 #recommended in BioBERT is 32 but this coudl result in memory full

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )
# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
            )

In [13]:
from transformers import BertForSequenceClassification
# Load BertForSequenceClassification, the pretrained BioBERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', num_labels = 2, output_attentions = False, output_hidden_states = False, )
# if device == "cuda:0":
# # Tell pytorch to run this model on the GPU.
#     model = model.cuda()
model = model.to(device)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 0.1e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
                  weight_decay=0.01
                )

In [15]:
# Number of training epochs. The BioBERT authors recommend between 2 and 4.
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4 #recommended 2 to 5

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [16]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [17]:
def format_time(elapsed):
    #Takes a time in seconds and returns a string hh:mm:ss
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [18]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader.
        # As we unpack the batch, we'll also copy each tensor to the device using  `to` method.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output.loss
        total_train_loss += loss
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            output= model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...

  Average training loss: 0.56
  Training epcoh took: 0:02:50

Running Validation...
  Accuracy: 0.74

Training...

  Average training loss: 0.47
  Training epcoh took: 0:02:48

Running Validation...
  Accuracy: 0.85

Training...

  Average training loss: 0.25
  Training epcoh took: 0:02:47

Running Validation...
  Accuracy: 0.95

Training...

  Average training loss: 0.18
  Training epcoh took: 0:02:47

Running Validation...
  Accuracy: 0.95

Training complete!
Total training took 0:12:19 (h:mm:ss)


In [19]:
####saving the model in the drive
model = torch.load('bert_model')
torch.save(model, '/content/drive/MyDrive/OCD/model/BioBERT_model_compulsion_expert.h5')

In [20]:
#######loading the model from the drive
model = torch.load('/content/drive/MyDrive/OCD/model/BioBERT_model_compulsion_expert.h5', map_location=torch.device('cuda'))

In [21]:
#########Testing the BioBert fine-tune model with expert evaluation
import pandas as pd
df_test = pd.read_csv('/content/drive/MyDrive/OCD/dataset/ontology_labeled_data_all_after_Embedding.csv')
df_test['post'] = df_test['post'].apply(lambda x:clean_text(x))
test_posts = df_test['post'].values

In [22]:
df_test.head()

Unnamed: 0,post,prefLabel,postAnnotations,mergedAnnotations,enriched-ontology-obsession,enriched-ontology-compulsion,obsession-expert,Compulsion-expert,Explanation
0,january year let go job ocd spiked calling sic...,"['OCD', 'depression']",['spike'],"['spike', 'depression', 'OCD']",1.0,,1.0,0.0,My OCD had spiked and I was calling out sick f...
1,hello everyone two year old niece us week actu...,"['dizziness', 'Thought', 'hallway', 'OCD', 'ou...",['Rituals'],"['outside', 'hallway', 'bathroom', 'fear', 'Th...",,1.0,0.0,1.0,"The truth is, I feel really upset and feel I'm..."
2,"hello everyone feeling bit moment yeah, know f...",['drop'],['theme'],"['drop', 'theme']",1.0,,0.0,0.0,
3,wanted share little illustration reassurance s...,"['rash', 'Rituals', 'hope', 'Bullying', 'joy',...",['Rituals'],"['OCD', 'Bullying', 'rash', 'Rituals', 'joy', ...",,1.0,0.0,1.0,I had an explosion of ocd/ I started panickin...
4,"so, admitted might ptsd bpd, apart ocd, admitt...","['anxiety', 'anxiety', 'insomnia', 'depression...",['theme'],"['talking', 'quality', 'anxiety', 'mental diso...",1.0,,0.0,0.0,"I might have PTSD and BPD, apart from OCD, whi..."


In [23]:
test_posts = df_test['post'][:300].values

In [24]:
max_len = 128
test_input_ids = []
test_attention_masks = []
for post in test_posts:
    encoded_dict = tokenizer.encode_plus(
                        post,
                        add_special_tokens = True,
                        truncation = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)



In [25]:
batch_size = 16
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [26]:
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()

            predictions.extend(list(pred_flat))

In [27]:
#predictions
df_exp_mod = pd.DataFrame()
df_exp_mod['post'] = df_test['post'][:300]
df_exp_mod['compulsion-expert'] = df_test['Compulsion-expert'][:300]
df_exp_mod['compulsion-BioBert-model-pred'] = predictions
df_exp_mod.head()

Unnamed: 0,post,compulsion-expert,compulsion-BioBert-model-pred
0,january year let go job ocd spiked calling sic...,0.0,0
1,hello everyone two year old niece us week actu...,1.0,0
2,"hello everyone feeling bit moment yeah, know f...",0.0,0
3,wanted share little illustration reassurance s...,1.0,0
4,"so, admitted might ptsd bpd, apart ocd, admitt...",0.0,0


In [28]:
df_exp_mod[df_exp_mod['compulsion-expert']==df_exp_mod['compulsion-BioBert-model-pred']]

Unnamed: 0,post,compulsion-expert,compulsion-BioBert-model-pred
0,january year let go job ocd spiked calling sic...,0.0,0
2,"hello everyone feeling bit moment yeah, know f...",0.0,0
4,"so, admitted might ptsd bpd, apart ocd, admitt...",0.0,0
5,went rumination bout something always gets thr...,0.0,0
8,morning made resolution try post asking reassu...,0.0,0
...,...,...,...
292,"know others suffer this, started become big pr...",0.0,0
294,yesterday posted bad spike child work fallen b...,0.0,0
296,well cbt months going obviously bring thoughts...,0.0,0
297,somtimes worry something may able dismiss some...,0.0,0


In [29]:
#from expert evalation of 300 only 292 are valid the remaining droped as they contain Na
df_exp_mod_cln = df_exp_mod.dropna()

In [30]:
print(classification_report(df_exp_mod_cln['compulsion-expert'], df_exp_mod_cln['compulsion-BioBert-model-pred']))

              precision    recall  f1-score   support

         0.0       0.60      0.99      0.75       177
         1.0       0.60      0.03      0.05       118

    accuracy                           0.60       295
   macro avg       0.60      0.51      0.40       295
weighted avg       0.60      0.60      0.47       295



In [31]:
##########Confusion matrix########################3
import numpy as np
import matplotlib.pyplot as plt

pred_labels = df_exp_mod_cln['compulsion-BioBert-model-pred']
true_labels = df_exp_mod_cln['compulsion-expert']

#TP is True Positive: predict a label of 1 (positive), and the true label is 1.
TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))
# TN: predict a label of 0 (negative), and the true label is 0.
TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))
# FP: predict a label of 1 (positive), but the true label is 0.
FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))
# FN: predict a label of 0 (negative), but the true label is 1.
FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))
print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP,FP,TN,FN))

TP: 3, FP: 2, TN: 175, FN: 115


In [32]:
training_stats

[{'epoch': 1,
  'Training Loss': tensor(0.5562, device='cuda:0', grad_fn=<DivBackward0>),
  'Valid. Loss': 0.5058504793824268,
  'Valid. Accur.': 0.7391456582633052,
  'Training Time': '0:02:50',
  'Validation Time': '0:00:14'},
 {'epoch': 2,
  'Training Loss': tensor(0.4742, device='cuda:0', grad_fn=<DivBackward0>),
  'Valid. Loss': 0.3610073538387523,
  'Valid. Accur.': 0.8532913165266107,
  'Training Time': '0:02:48',
  'Validation Time': '0:00:14'},
 {'epoch': 3,
  'Training Loss': tensor(0.2492, device='cuda:0', grad_fn=<DivBackward0>),
  'Valid. Loss': 0.16399202862081408,
  'Valid. Accur.': 0.9501050420168067,
  'Training Time': '0:02:47',
  'Validation Time': '0:00:14'},
 {'epoch': 4,
  'Training Loss': tensor(0.1810, device='cuda:0', grad_fn=<DivBackward0>),
  'Valid. Loss': 0.15753201589363963,
  'Valid. Accur.': 0.9530812324929973,
  'Training Time': '0:02:47',
  'Validation Time': '0:00:14'}]

In [None]:
########################BioBert model fine-tune for compulsion ################################################################################