<a href="https://colab.research.google.com/github/alexander-fichtl/diversifying_KELMs/blob/main/diversifying_KELMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the rest of the notebook can reach files under the mount point.

from google.colab import drive

# Location inside the Colab VM where the Drive contents are exposed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Report the attached GPU (via nvidia-smi) and the amount of system RAM on this runtime.

!nvidia-smi

from psutil import virtual_memory

# virtual_memory().total is reported in bytes; dividing by 1e9 expresses it in gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to tell a high-RAM runtime from a standard one.
runtime_kind = 'You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime'
print(runtime_kind)

In [None]:
# clone MoP code base
# NOTE(review): the clone is created in whatever the working directory is at
# this point, i.e. before the %cd below runs -- confirm this is the intended
# location for the checkout.

!git clone https://github.com/cambridgeltl/mop.git

# move to directory
# `your_directory` is a placeholder to be replaced by the user. Later cells use
# paths relative to this location (e.g. `src/metis-5.1.0`), so presumably it
# should be the root of the MoP checkout -- TODO confirm.
%cd drive/MyDrive/your_directory

In [None]:
# Install Python 3.8 next to the runtime's default interpreter to make the MoP code base compatible.

!sudo apt-get update -y
# -y answers apt's confirmation prompt automatically (same as the update above),
# so the cell cannot stall waiting for keyboard input.
!sudo apt-get install -y python3.8 python3.8-dev python3.8-distutils libpython3.8-dev

# Register python3.8 as an alternative for /usr/bin/python3 (priority 1).
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1

# Download the pip bootstrap script and (re)install pip with python3.
!curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!python3 get-pip.py --force-reinstall

# Install the Jupyter/IPython support packages for that interpreter.
!python3 -m pip install ipython ipython_genutils ipykernel jupyter_console prompt_toolkit httplib2 astor

# link to google package
# NOTE(review): everything after the `#` on the next line is a shell comment, so
# `ln -s` receives a single argument and creates a symlink named `google` in the
# current working directory -- confirm this is what is wanted.
!ln -s /usr/local/lib/python3.8/dist-packages/google # \ /usr/local/lib/python3.7/dist-packages/google
!python --version

In [None]:
# Install and import torch first to avoid problems with metis.
!pip3 install torch==1.7.0 # torchvision torchaudio torchtext

import torch
!ls
!pip3 install pandas
!pip3 install scipy
!pip3 install tabulate
!pip3 install protobuf==3.20.*
!pip3 install wandb
!pip3 install cmake
!pip3 install scikit-learn
!pip3 install nltk

# Build METIS from the bundled sources as a shared library.

%cd src/metis-5.1.0

!make config shared=1 prefix=~/.local/
!make install
!cp ~/.local/lib/libmetis.so /usr/lib/libmetis.so
# Each `!` line runs in its own subshell, so a variable set with `!export` is
# lost as soon as that line finishes. %env sets it in the kernel's environment
# instead, where it is visible to the kernel and inherited by later `!` commands.
%env METIS_DLL=/usr/lib/libmetis.so
!pip3 install metis-python


# Back to the parent directory (src) to install the bundled adapter-transformers in editable mode.
%cd ..

!ls
!pip3 install -e adapter-transformers

In [None]:
# Run run_pretrain.py from knowledge_infusion/entity_prediction/ with the settings below.
#
# %pushd / %popd bracket the run so that the cell finishes in the directory it
# started from. With a plain %cd the kernel stayed inside entity_prediction/,
# and the relative `evaluate_tasks/...` paths used by the following cells could
# no longer be resolved.
%pushd knowledge_infusion/entity_prediction/

# Base model and tokenizer passed to --model / --tokenizer; alternatives are kept commented out.
MODEL="dmis-lab/biobert-v1.1"
TOKENIZER="dmis-lab/biobert-v1.1"

# MODEL="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# TOKENIZER="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# MODEL="michiyasunaga/BioLinkBERT-base"
# TOKENIZER="michiyasunaga/BioLinkBERT-base"

# INPUT_DIR and KG_NAME are concatenated to form the --input_dir argument;
# both paths are relative to entity_prediction/.
INPUT_DIR="../../../kg_dir/"
OUTPUT_DIR="model_dir"
KG_NAME="ontochem_full_rel_data_cropped"
ADAPTER_NAMES="entity_predict"
PARTITION=20

!python3 run_pretrain.py \
--model $MODEL \
--tokenizer $TOKENIZER \
--input_dir $INPUT_DIR$KG_NAME \
--output_dir $OUTPUT_DIR \
--n_partition $PARTITION \
--use_adapter \
--non_sequential \
--adapter_names  $ADAPTER_NAMES \
--amp \
--cuda \
--num_workers 32 \
--max_seq_length 64 \
--batch_size 256 \
--lr 1e-04 \
--epochs 1 \
--save_step 2000

# Return to the directory the cell was started from.
%popd

In [None]:
# Run eval_pubmedqa.py from evaluate_tasks/med_qa/pubmedqa with the settings below.
#
# %pushd / %popd bracket the run so that the cell finishes in the directory it
# started from; with a plain %cd the kernel stayed in the task directory and the
# relative %cd of any cell run afterwards failed.
%pushd evaluate_tasks/med_qa/pubmedqa

# Named DATASET to match the other evaluation cells; it is not passed to the script.
DATASET="PubMedQA"
# Both paths are relative to the task directory entered above.
MODEL_DIR="../../../../model_dir/"
DATA_DIR="../../../../data/BLURB/data/pubmedqa_new"

BASE_MODEL="dmis-lab/biobert-v1.1"
# Not passed to the script: --tokenizer below reuses $BASE_MODEL.
TOKENIZER="dmis-lab/biobert-v1.1"
# BASE_MODEL="michiyasunaga/BioLinkBERT-base"
# BASE_MODEL= "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# MODEL="PubMedBERT-base_pure_S20Rel"
# BioLinkBERT-base_pubmedqa_S20Rel_new
MODEL="onto3_BioBERT"
LR=0.5e-5 # 1e-5
PRE_TRAIN_EPOCH=0
T=1
BATCH_SIZE=4 #4
SEQ_LENGTH=512
EPOCHS=30 # 25

!python eval_pubmedqa.py \
    --data_dir $DATA_DIR   \
    --model_dir $MODEL_DIR \
    --tokenizer $BASE_MODEL   \
    --base_model $BASE_MODEL \
    --cuda \
    --temperature $T \
    --model $MODEL   \
    --pretrain_epoch $PRE_TRAIN_EPOCH \
    --max_seq_length $SEQ_LENGTH   \
    --batch_size $BATCH_SIZE \
    --lr $LR   \
    --epochs $EPOCHS \
    --repeat_runs 10 \
    --patience 4

# Return to the directory the cell was started from.
%popd

In [None]:
# Run predict.py from evaluate_tasks/med_qa/bioasq7b with the settings below.
#
# %pushd / %popd bracket the run so that the cell finishes in the directory it
# started from; with a plain %cd the kernel stayed in the task directory and the
# relative %cd of any cell run afterwards failed.
%pushd evaluate_tasks/med_qa/bioasq7b

# Label only; it is not passed to the script.
DATASET="BioAsq"
# Both paths are relative to the task directory entered above.
MODEL_DIR="../../../../model_dir/"
DATA_DIR="../../../../data/BLURB/data/BioASQ/"
# BASE_MODEL= "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# BASE_MODEL="dmis-lab/biobert-v1.1"
BASE_MODEL="michiyasunaga/BioLinkBERT-base"
MODEL="ontochem_15_full_rel_adapters"
T=1
LR=0.5e-5
TRAIN_MODE="fusion"

# $BASE_MODEL is used for both --base_model and --tokenizer.
!python predict.py \
--train_mode $TRAIN_MODE \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR  \
--base_model $BASE_MODEL \
--tokenizer $BASE_MODEL  \
--model $MODEL  \
--max_seq_length 512   \
--batch_size 4 \
--lr $LR   \
--pretrain_epoch 0 \
--epochs 25 \
--temperature $T \
--cuda \
--repeat_runs 2

# Return to the directory the cell was started from.
%popd

In [None]:
# Run run_medqa.py from evaluate_tasks/med_qa/medqa with the settings below.
#
# %pushd / %popd bracket the run so that the cell finishes in the directory it
# started from; with a plain %cd the kernel stayed in the task directory and the
# relative %cd of any cell run afterwards failed.
%pushd evaluate_tasks/med_qa/medqa

# Label only; it is not passed to the script.
DATASET="MEDQA"
# Both paths are relative to the task directory entered above.
MODEL_DIR="../../../../model_dir/"
DATA_DIR="../../../../data/BLURB/data/medqa/data_clean/questions/US/4_options/"

BASE_MODEL="michiyasunaga/BioLinkBERT-base"
MODEL="BioLinkBERT-base_pubmedqa_S20Rel_new"
T=1
LR=1e-5
BATCH_SIZE=4
TRAIN_MODE="fusion"

# --repeat_runs is passed exactly once (it used to appear twice with the same value).
# $BASE_MODEL is used for both --base_model and --tokenizer.
!python run_medqa.py \
    --train_mode $TRAIN_MODE \
    --model_dir $MODEL_DIR \
    --data_dir $DATA_DIR  \
    --base_model $BASE_MODEL \
    --tokenizer $BASE_MODEL  \
    --model $MODEL  \
    --max_seq_length 512   \
    --batch_size $BATCH_SIZE \
    --lr $LR   \
    --pretrain_epoch 0 \
    --epochs 25 \
    --temperature $T \
    --cuda \
    --repeat_runs 2 \
    --patience 3 \
    --gradient_accumulation_steps 3

# Return to the directory the cell was started from.
%popd

In [None]:
# Run eval_hoc.py from evaluate_tasks/med_doc_cls/hoc with the settings below.
#
# %pushd / %popd bracket the run so that the cell finishes in the directory it
# started from; with a plain %cd the kernel stayed in the task directory and the
# relative %cd of any cell run afterwards failed.
%pushd evaluate_tasks/med_doc_cls/hoc

# Label only; it is not passed to the script.
DATASET="Hoc"
# Both paths are relative to the task directory entered above.
MODEL_DIR="../../../../model_dir/"
DATA_DIR="../../../../data/BLURB/data/HoC/"
# BASE_MODEL="michiyasunaga/BioLinkBERT-base"
# BASE_MODEL= "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
BASE_MODEL="dmis-lab/biobert-v1.1"
MODEL="onto3_BioBERT"

T=1
LR=1e-5
BATCH_SIZE=16
TRAIN_MODE="fusion"

# $BASE_MODEL is used for both --base_model and --tokenizer.
!python eval_hoc.py \
--train_mode $TRAIN_MODE \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR  \
--base_model $BASE_MODEL \
--tokenizer $BASE_MODEL  \
--model $MODEL  \
--max_seq_length 128   \
--batch_size $BATCH_SIZE \
--lr $LR   \
--pretrain_epoch 0 \
--epochs 20 \
--repeat_runs 5 \
--temperature $T \
--cuda

# Return to the directory the cell was started from.
%popd


In [None]:
# Run eval_nli.py from evaluate_tasks/med_nli with the settings below.
#
# %pushd / %popd bracket the run so that the cell finishes in the directory it
# started from; with a plain %cd the kernel stayed in the task directory and the
# relative %cd of any cell run afterwards failed.
%pushd evaluate_tasks/med_nli

# Label only; it is not passed to the script.
DATASET="MEDNLI"
# Both paths are relative to the task directory entered above.
MODEL_DIR="../../../model_dir/"
DATA_DIR="../../../data/mednli/"

BASE_MODEL="michiyasunaga/BioLinkBERT-base"

# MODEL="BioLinkBERT-base_S20Rel_256bs_1e"
# BASE_MODEL= "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# MODEL="PubMedBERT-base_pure_S20Rel"
# BASE_MODEL="michiyasunaga/BioLinkBERT-base"

MODEL="ontochem_15_full_rel_adapters"

# Alternative configuration, kept for reference and disabled. It used to live in
# a string literal opened with four quotes; it is now plain comments.
#   LR=1e-5
#   TRAIN_MODE="fusion"
#   python eval_nli.py with the same flags as the command below, except
#   --batch_size 16 and no --patience.

LR=0.5e-5
TRAIN_MODE="fusion"
# $BASE_MODEL is used for both --base_model and --tokenizer.
!python eval_nli.py \
    --train_mode $TRAIN_MODE \
    --model_dir $MODEL_DIR \
    --data_dir $DATA_DIR  \
    --base_model $BASE_MODEL \
    --tokenizer $BASE_MODEL  \
    --model $MODEL  \
    --max_seq_length 256   \
    --batch_size 8 \
    --lr $LR   \
    --pretrain_epoch 0 \
    --epochs 25 \
    --cuda \
    --repeat_runs 3 \
    --patience 3
# Disabled option; append it to the command above (with a line continuation) to enable it:
#   --gradient_accumulation_steps 3

# Return to the directory the cell was started from.
%popd


In [None]:
# Release the Colab runtime once all previous cells have finished.
# NOTE(review): presumably this disconnects the VM and discards its local state,
# so anything not saved outside the VM (e.g. on the mounted Drive) would be lost -- confirm.
from google.colab import runtime
runtime.unassign()