In [1]:
! pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)


In [2]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
from tqdm import tqdm
import ast
import time
import datasets

In [6]:
# Loading dataset

data_sample = load_dataset("QuyenAnhDE/Diseases_Symptoms")

Repo card metadata block was not found. Setting CardData to empty.


In [7]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [10]:
updated_data = [{'Name': item['Name'], 'Symptoms': item['Symptoms']} for item in data_sample['train']]

In [11]:
df = pd.DataFrame(updated_data)

In [12]:
df.head(5)

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [14]:
df['Symptoms'] = df['Symptoms'].apply(lambda x: ', '.join(x.split(', ')))

In [16]:
df['Symptoms'][0]

'Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness'

In [17]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split

In [19]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    try:
        device = torch.device("mps")
    except Exception:
        device = torch.device("cpu")

In [20]:
device

device(type='cuda')

In [21]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [22]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [24]:
BATCH_SIZE = 8

In [25]:
df.describe()

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


In [29]:
# Data Preparation

class DiseaseSymptomDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        x = self.fittest_max_length(df)
        self.max_length = x

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

    def fittest_max_length(self, df):
      '''
      This function computes maximum sequence length for the dataset.
      '''
      max_length = max(len(max(df[self.labels[0]], key=len)), len(max(df[self.labels[1]], key=len)))
      x = 2
      while x < max_length: x = x * 2
      return x

In [30]:
data_sample = DiseaseSymptomDataset(df, tokenizer)

In [31]:
data_sample

<__main__.DiseaseSymptomDataset at 0x7eda4c59d4d0>

In [32]:
train_size = int(0.8 * len(data_sample))
val_size = len(data_sample) - train_size
train_data, val_data = random_split(data_sample, [train_size, val_size])

In [34]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

In [35]:
num_epochs = 8

In [36]:
batch_size = BATCH_SIZE
model_name = "distilgpt2"
gpu = 0

In [37]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) # Prevents the model from being penalized for predictions made on artificial padding tokens.
optimizer = optim.AdamW(model.parameters(), lr=5e-4)
tokenizer.pad_token = tokenizer.eos_token

In [38]:
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu', 'training_loss', 'validation_loss', 'epoch_duration_sec '])

In [39]:
# Model Training Loop

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    average_epoch_training_loss = epoch_training_loss / len(train_iterator)
    model.eval()
    epoch_validation_loss = 0
    total_loss = 0
    valid_iterator = tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch['input_ids'].squeeze(1).to(device)
            targets = inputs.clone()
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss
            valid_iterator.set_postfix({'Validation Loss': loss.item()})
            epoch_validation_loss += loss.item()
    average_epoch_validation_loss = epoch_validation_loss / len(valid_iterator)
    end_time = time.time()
    epoch_duration_sec = end_time - start_time

    new_row = {
        'transformer': model_name,
        'batch_size': batch_size,
        'gpu': gpu,
        'epoch': epoch + 1,
        'training_loss': average_epoch_training_loss,
        'validation_loss': average_epoch_validation_loss,
        'epoch_duration_sec': epoch_duration_sec
    }

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {total_loss/len(val_loader)}")

Training Epoch 1/8 Batch Size: 8, Transformer: distilgpt2:   0%|          | 0/40 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Training Epoch 1/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.66it/s, Training Loss=0.943]
Validation Epoch 1/8: 100%|██████████| 10/10 [00:00<00:00, 18.36it/s, Validation Loss=0.755]


Epoch: 1, Validation Loss: 0.7456490397453308


Training Epoch 2/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.29it/s, Training Loss=0.465]
Validation Epoch 2/8: 100%|██████████| 10/10 [00:00<00:00, 18.69it/s, Validation Loss=0.738]


Epoch: 2, Validation Loss: 0.7253366112709045


Training Epoch 3/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.17it/s, Training Loss=0.345]
Validation Epoch 3/8: 100%|██████████| 10/10 [00:00<00:00, 18.02it/s, Validation Loss=0.736]


Epoch: 3, Validation Loss: 0.7382476329803467


Training Epoch 4/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.04it/s, Training Loss=0.438]
Validation Epoch 4/8: 100%|██████████| 10/10 [00:00<00:00, 17.90it/s, Validation Loss=0.793]


Epoch: 4, Validation Loss: 0.7701788544654846


Training Epoch 5/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.98it/s, Training Loss=0.283]
Validation Epoch 5/8: 100%|██████████| 10/10 [00:00<00:00, 17.33it/s, Validation Loss=0.833]


Epoch: 5, Validation Loss: 0.8223657011985779


Training Epoch 6/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.01it/s, Training Loss=0.211]
Validation Epoch 6/8: 100%|██████████| 10/10 [00:00<00:00, 18.07it/s, Validation Loss=0.881]


Epoch: 6, Validation Loss: 0.8576126098632812


Training Epoch 7/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.08it/s, Training Loss=0.161]
Validation Epoch 7/8: 100%|██████████| 10/10 [00:00<00:00, 18.25it/s, Validation Loss=0.919]


Epoch: 7, Validation Loss: 0.9018468856811523


Training Epoch 8/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.16it/s, Training Loss=0.152]
Validation Epoch 8/8: 100%|██████████| 10/10 [00:00<00:00, 17.86it/s, Validation Loss=0.965]

Epoch: 8, Validation Loss: 0.9539993405342102





In [40]:
input_str = "Kidney Stones"

In [41]:
input_ids = tokenizer.encode(input_str, return_tensors="pt").to(device)

In [42]:
input_ids

tensor([[48374,  1681, 26596]], device='cuda:0')

In [48]:
attention_mask = (input_ids != tokenizer.pad_token_id).long()

In [49]:
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=20,
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id
)

In [50]:
output

tensor([[48374,  1681, 26596,   930,  1001,  4119, 32692,  2356,    11,  2910,
           287, 18922,    11, 10792,  2956,  1883, 50256]], device='cuda:0')

In [51]:
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

In [52]:
decoded_output

'Kidney Stones | Severe abdominal pain, blood in urine, frequent urination'

In [55]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [56]:
from huggingface_hub import notebook_login
from transformers import AutoModelForCausalLM, AutoTokenizer

notebook_login()

repo_name = "aniketsalunkhe15/SLM-distilgpt2-disease-symptoms-predictor"

# Push model
model.push_to_hub(repo_name)

# Push tokenizer
tokenizer.push_to_hub(repo_name)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aniketsalunkhe15/SLM-distilgpt2-disease-symptoms-predictor/commit/48a3bb5779a2aa6bb4e69f9704c0871d52de15ff', commit_message='Upload tokenizer', commit_description='', oid='48a3bb5779a2aa6bb4e69f9704c0871d52de15ff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aniketsalunkhe15/SLM-distilgpt2-disease-symptoms-predictor', endpoint='https://huggingface.co', repo_type='model', repo_id='aniketsalunkhe15/SLM-distilgpt2-disease-symptoms-predictor'), pr_revision=None, pr_num=None)

In [59]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer from Hugging Face Hub
repo_id = "aniketsalunkhe15/SLM-distilgpt2-disease-symptoms-predictor"

model = AutoModelForCausalLM.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Prepare input
input_str = "Kidney Stones"
input_ids = tokenizer.encode(input_str, return_tensors="pt")

# Optional: attention mask
attention_mask = (input_ids != tokenizer.pad_token_id).long()

# Generate output
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=20,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.7,
    repetition_penalty=1.2
)

# Decode result
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Kidney Stones | Severe abdominal pain, blood in the side or back, blood in urine
