In [1]:
from google.colab import drive
drive.mount('/content/drive/')

%cd /content/drive/MyDrive/Projects/Medical_chatbot

Mounted at /content/drive/
/content/drive/MyDrive/Projects/Medical_chatbot


In [6]:
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import torch

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [8]:
input_ids = tokenizer.encode('Hey I was good at basketball but  ', return_tensors="pt")
output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)



In [9]:
output[0]

tensor([10814,   314,   373,   922,   379,  9669,   475,   220,   220,  1849,
           40,  1422,   470,   760,   703,   284,   711,  9669,    13,   314])

In [12]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Hey I was good at basketball but   I didn't know how to play basketball. I


In [13]:
start_of = "<startoftext> "
end_of = " <endoftext>"
bot = " <bot>: "

In [14]:
class DocBotDataset(Dataset):
  def __init__(self, path, tokenizer, max_length):
    self.df = pd.read_csv(path)
    self.tokenizer = tokenizer
    self.max_length = max_length

    self.tokenizer.pad_token = self.tokenizer.eos_token

    self.input_ids = []
    self.attn_masks = []

    for i in range(len(self.df)):
      text = start_of + self.df.iloc[i]['Questions'] + bot + self.df.iloc[i]['Answers'] + end_of
      text_encoded = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length)
      self.input_ids.append(torch.tensor(text_encoded['input_ids']))
      self.attn_masks.append(torch.tensor(text_encoded['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return (self.input_ids[idx], self.attn_masks[idx])


In [15]:
tokenizer.add_special_tokens(
    {
        'pad_token': '<pad>',
        'bos_token': '<startoftext>',
        'eos_token': '<endoftext>',
    }
)

tokenizer.add_tokens(["<bot>:"])

1

In [16]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [None]:
dataset = DocBotDataset('final_medical_dataset.csv', tokenizer, 512)

In [None]:
train_size = int(0.85 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

39,230 training samples
6,924 validation samples


In [None]:
len(dataset[0][0])

512

In [None]:
batch_size = 4
num_workers = 2

train_loader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size, # Trains with this batch size.
            num_workers = num_workers
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
val_loader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size, # Evaluate with this batch size.
            num_workers = num_workers
        )

In [None]:
# tokenizer.decode(dataset[0][0], skip_special_tokens=True)

In [None]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU.


In [None]:
model = model.to(device)

In [None]:
from torch.optim import AdamW

epochs = 10
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
check_path = './checkpoints'

# this produces sample output every 100 steps
sample_every = 100

In [None]:
from tqdm.notebook import tqdm

In [None]:
optim = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [None]:
from torch.optim.lr_scheduler import StepLR

# Define your scheduler
scheduler = StepLR(optim, step_size=3, gamma=.5)

In [None]:
total_steps = len(train_loader)
total_steps

9808

In [None]:
train_loader[4]

TypeError: 'DataLoader' object is not subscriptable

In [None]:
# Define your epochs loop
for i in range(epochs):
    total_train_loss = 0
    total_val_loss = 0

    # Training loop
    model.train()
    for step, batch in tqdm(enumerate(train_loader)):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        optim.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs.loss

        loss.backward()
        optim.step()

        batch_loss = loss.item()
        total_train_loss += batch_loss

    # Validation loop
    model.eval()
    for step, batch in tqdm(enumerate(val_loader)):
        v_input_ids = batch[0].to(device)
        v_labels = batch[0].to(device)
        v_masks = batch[1].to(device)

        optim.zero_grad()

        outputs = model(v_input_ids,
                        labels=v_labels,
                        attention_mask=v_masks,
                        token_type_ids=None)

        loss = outputs.loss

        loss.backward()
        optim.step()

        batch_loss = loss.item()
        total_val_loss += batch_loss

    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)

    # Update scheduler
    scheduler.step()

    print('Epoch {}: train loss: {}, val loss: {}'.format(i, avg_train_loss, avg_val_loss))


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 0: train loss: 0.9270634705209907, val loss: 0.8737342224119028


0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0300b4d5a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0300b4d5a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

0it [00:00, ?it/s]

Epoch 1: train loss: 0.8374034157843249, val loss: 0.7606504448296224


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2: train loss: 0.7674747842529616, val loss: 0.6752700139286889


0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0300b4d5a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0300b4d5a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

0it [00:00, ?it/s]

Epoch 3: train loss: 0.6433338052167772, val loss: 0.45858015138967956


0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
model_save_path = 'docbot_GPT.pth'
torch.save(model.state_dict(), model_save_path)

In [17]:
load_model_path = 'docbot_GPT.pth'
model.load_state_dict(torch.load(load_model_path))

<All keys matched successfully>

In [21]:
model = model.cuda()

In [28]:
model.eval()
input_prompt = "what is appendix?"

prompt = f"{start_of} {input_prompt} \n{bot}"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(torch.device("cuda"))

sample_outputs = model.generate(
                                generated,
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,
                                top_k=50,
                                max_length = 100,
                                top_p=0.95,
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:   what is appendix? 
  <bot>:   An appendix is a small, tubelike organ where a small amount of tissue or small sac (micrognathia) attached to the main intestines.  An abdomen is felt as the end of the abdomen above the navel.  The intestines are located between the chest and abdomen.  The intestines are held behind by the large intestines.  The appendix is in the pelvis  the most common organ in the body


1:   what is appendix? 
  <bot>:   appendix is a round firm oval cell that is a tubelike instrument that hangs off the appendix.   The appendix is a long tube about 5 inches long that ends at the top of the neck below the navel where the appendix meets the stomach.   The appendix is a large tubelike organ that hangs off the appendix. The appendix is the tube that carries food and fluid from the small intestine to the large intestine.  The


2:   what is appendix? 
  <bot>:   appendix is an extra bone growth that is present in the intestines of an appendages or small intestine.  th