## Medical Question-Answering - PubMed HuggingFace Dataset

In [32]:
!pip install transformers trl datasets peft accelerate bitsandbytes sentencepiece

Collecting trl
  Downloading trl-0.7.10-py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m2.0 MB/s[0m eta [36m

In [1]:
import os
import gc
from datasets import load_dataset
import torch
import torch.nn as nn
import bitsandbytes as bnb
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TrainingArguments,BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_config
from trl import SFTTrainer


## Load PubMed QA data

In [3]:
data = load_dataset("pubmed_qa","pqa_labeled")

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})

In [5]:
data['train']['question'][0]

'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?'

In [6]:
data['train']['long_answer'][0]

'Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.'

In [7]:
data['train']['context'][0]

{'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
  'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoT

In [8]:
#hyperparameters
LORA_ALPHA = 32
LORA_DROPOUT = 0.2
LORA_R = 4

LEARNING_RATE = 2e-4
NUM_EPOCHS = 1
BATCH_SIZE = 1
WEIGHT_DECAY = 0.001
MAX_GRAD_NORM = 0.3
gradient_accumulation_steps = 16
STEPS = 1
OPTIM = "adam"
MAX_STEPS = 512
OUTPUT_DIR = "./results"

## Quantization configuration using Bitsandbytes

In [9]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
torch.cuda.get_device_capability()
device_map = "cuda:0"

## Define model and tokenization

In [10]:
model_name = "nousresearch/llama-2-7b-chat-hf"

In [11]:
model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map=device_map,
        )
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [13]:
torch.cuda.empty_cache()

## Define LoRA adapter

In [14]:
peft_config = LoraConfig(
      lora_alpha= LORA_ALPHA,
      lora_dropout= LORA_DROPOUT,
      r= LORA_R,
      bias="none",
      task_type="CAUSAL_LM",
  )

## Setup training parameters

In [15]:
training_args = TrainingArguments(
      output_dir= OUTPUT_DIR,
      per_device_train_batch_size=BATCH_SIZE,
      gradient_accumulation_steps= gradient_accumulation_steps,
      learning_rate= LEARNING_RATE,
      logging_steps= STEPS,
      num_train_epochs= NUM_EPOCHS,
      max_steps= MAX_STEPS,
)

In [16]:
torch.cuda.empty_cache()

In [17]:
trainer = SFTTrainer(
        model=model,
        train_dataset=data['train'],
        peft_config=peft_config,
        dataset_text_field= "question",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_args,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
trainer.train()

Step,Training Loss
1,3.2842
2,3.4235
3,3.0604
4,2.8402
5,3.1441
6,3.2934
7,2.9667
8,2.7637
9,2.6996
10,2.7225


TrainOutput(global_step=512, training_loss=1.6290329191833735, metrics={'train_runtime': 8119.5839, 'train_samples_per_second': 1.009, 'train_steps_per_second': 0.063, 'total_flos': 7997119295004672.0, 'train_loss': 1.6290329191833735, 'epoch': 8.19})

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model # Take care of distributed/parallel training
model_to_save.save_pretrained("outputs")

## Test the model

### Using Inference pipeline

In [24]:
from transformers import pipeline
pipe = pipeline(task="text-generation",model=model,tokenizer=tokenizer,max_length=500)

In [25]:
prompt = "Who is at risk for Prostate Cancer?"

In [26]:
template = f"""<s>[INST] <<SYS>>
You are a honest Medical assistant bot.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.
<</SYS>>

{prompt} [/INST]
"""

In [27]:
result = pipe(template)



In [28]:
response = result[0]['generated_text']
index = response.find("[/INST]")+len("[/INST]")

In [29]:
print(response[index:].strip())

Thank you for your question. Prostate cancer can affect anyone, regardless of gender, race, or ethnicity. However, some factors can increase a person's risk of developing prostate cancer. These include:

1. Age: The risk of prostate cancer increases with age, with most cases diagnosed in men over the age of 50.
2. Family history: Men who have a family history of prostate cancer are at higher risk of developing the disease.
3. Race: Prostate cancer is more common in African American men than in men of other races.
4. Genetic factors: Men with a family history of breast or ovarian cancer, which are linked to genetic mutations, may also be at higher risk of prostate cancer.
5. Diet and lifestyle: A diet high in fat and low in fruits and vegetables may increase the risk of prostate cancer.

It's important to note that not all cases of prostate cancer are preventable, and some men who develop the disease may not have any known risk factors. If you have concerns about your risk of prostate c

## Without using pipeline

In [30]:
from peft import get_peft_model

In [31]:
lora_config = LoraConfig.from_pretrained('outputs')
model = get_peft_model(model, lora_config)

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [33]:
inputs = tokenizer(template, return_tensors="pt").to(device)
model = model.to(device)
outputs = model.generate(**inputs, max_new_tokens=1024)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

[INST] <<SYS>>
You are a honest Medical assistant bot.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.
<</SYS>>

Who is at risk for Prostate Cancer? [/INST]
Thank you for asking! Prostate cancer can affect anyone, regardless of age, gender, or ethnicity. However, some factors can increase a person's risk of developing prostate cancer. These include:

1. Age: The risk of developing prostate cancer increases with age, with most cases occurring in men over the age of 50.
2. Family history: Men who have a family history of prostate cancer are at higher risk of developing the disease. This includes men who have a father or brother who was diagnosed

In [44]:
import os

# Set the locale to UTF-8
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'
os.environ['LANGUAGE'] = 'en_US.UTF-8'


In [59]:
!pip install pyngrok
from google.colab import files
import pandas as pd
from io import StringIO
import streamlit as st
from pyngrok import ngrok

# Setting up the environment variable
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'
os.environ['LANGUAGE'] = 'en_US.UTF-8'

!pip install pyngrok

# Upload file
uploaded_file = files.upload()
file_name = list(uploaded_file.keys())[0]
data = pd.read_csv(StringIO(uploaded_file[file_name].decode('utf-8')))

# Streamlit app
def main():
    st.title("CSV File Viewer and Translator")

    # Display the uploaded data
    st.table(data)

    # Button to translate text
    if st.button('Translate'):
        # Assuming you have a column named 'text' for translation
        translator = Translator(to_lang="id")
        translated_text = translator.translate(data['text'][0])
        st.write(f"Translated Text: {translated_text}")

# Run Streamlit app using bash
!nohup streamlit run PubMed_HF_dataset_Fine_Tune .py &

# Set up ngrok tunnel
public_url = ngrok.connect(port='8501')

# Display ngrok tunnel URL
public_url

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [58]:
pip install pyngrok


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [55]:
import os
from google.colab import files
import pandas as pd
from io import StringIO
from pyngrok import ngrok
import streamlit as st

# Setting up the environment variable
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'
os.environ['LANGUAGE'] = 'en_US.UTF-8'


# Upload file
uploaded_file = files.upload()
file_name = list(uploaded_file.keys())[0]
data = pd.read_csv(StringIO(uploaded_file[file_name].decode('utf-8')))

# Streamlit app
def main():
    st.title("CSV File Viewer and Translator")

    # Display the uploaded data
    st.table(data)

    # Button to translate text
    if st.button('Translate'):
        # Assuming you have a column named 'text' for translation
        translator = Translator(to_lang="id")
        translated_text = translator.translate(data['text'][0])
        st.write(f"Translated Text: {translated_text}")

# Run Streamlit app using bash
!nohup streamlit run PubMed_HF_dataset_Fine_Tune .py &

# Set up ngrok tunnel
public_url = ngrok.connect(port='8501')

# Display ngrok tunnel URL
public_url

ModuleNotFoundError: No module named 'pyngrok'

In [None]:
!pip install streamlit

In [None]:
pip install streamlit translate

In [None]:
pip install streamlit translate

In [None]:
%save -f "PubMed_HF_dataset_Fine_Tune.py" /content/drive/MyDrive/SKRIPSIWET/Fine-Tuning-LLMs/PubMed_HF_dataset_Fine_Tune.py

In [None]:
!streamlit run "drive/My Drive/SKRIPSIWET/Fine-Tuning-LLMs/PubMed_HF_dataset_Fine_Tune.py" --server.port 8502