In [1]:
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [None]:
# Inference from Pre-trained model

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained DistilGPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Set the padding token to the end-of-sequence token (common practice for GPT-2-based models)
tokenizer.pad_token = tokenizer.eos_token

# Define the input query
input_query = "What are the symptoms of Chicken pox?"

# Tokenize the input query
input_tokens = tokenizer.encode_plus(
    input_query,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=50  # Adjust max_length if needed
)

# Generate response using the pre-trained model
output_tokens = model.generate(
    input_ids=input_tokens["input_ids"],
    attention_mask=input_tokens["attention_mask"],
    max_length=50,  # Adjust max_length if needed
    num_return_sequences=1,
    do_sample=True,  # Sampling adds randomness for diverse outputs
    top_k=8,  # Keep top 8 most probable tokens at each step
    top_p=0.95,  # Consider tokens with a cumulative probability of 0.95
    temperature=0.7,  # Adjust temperature for response diversity
    repetition_penalty=1.2,  # Penalize repetitive token generations
    pad_token_id=tokenizer.pad_token_id  # Handle padding gracefully
)

# Decode the generated output to human-readable text
decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Print the results
print("Pre-trained DistilGPT-2 Response:")
print(decoded_output)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Pre-trained DistilGPT-2 Response:
What are the symptoms of Chicken pox?
The most common form of chickenpox is a mild, but sometimes fatal infection. The disease can be spread through bites or wounds with no obvious cause: fever; vomiting and diarrhea – all signs of serious


In [None]:
# Fine-tuning SLM on symptoms-disease dataset

In [3]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [4]:
# Load the dataset
dataset = load_dataset("prognosis/symptoms_disease_v1")

combined_disease_prediction_symptom.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/10110 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'reference', 'output', 'instruction'],
        num_rows: 10110
    })
})

In [6]:
# Convert to a pandas dataframe
updated_data = [{'Input': item['instruction'], 'Disease': item['output']} for item in dataset['train']]
df = pd.DataFrame(updated_data)

In [7]:
df.head()

Unnamed: 0,Input,Disease
0,What are the symptoms of hypertensive disease?,The following are the symptoms of hypertensive...
1,I am having the following symptoms: pain ches...,The symptoms listed indicates that the patient...
2,What are the symptoms of diabetes?,The following are the symptoms of diabetes: po...
3,"I am having the following symptoms: polyuria, ...",The symptoms listed indicates that the patient...
4,What are the symptoms of depressive disorder?,The following are the symptoms of depressive d...


In [8]:
# # Just extract the Symptoms
# df['Symptoms'] = df['Symptoms'].apply(lambda x: ', '.join(x.split(', ')))
# display(df.head())

In [9]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [10]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

In [11]:
# The tokenizer turns texts to numbers (and vice-versa)
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# The transformer
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)

In [12]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [13]:
df.describe()

Unnamed: 0,Input,Disease
count,10110,10110
unique,615,615
top,What are the symptoms of Acne?,The symptoms listed indicates that the patient...
freq,120,120


In [14]:
# Dataset Prep
class LanguageDataset(Dataset):
    """
    An extension of the Dataset object to:
      - Make training loop cleaner
      - Make ingestion easier from pandas df's
    """
    def __init__(self, df, tokenizer):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        x = self.fittest_max_length(df)  # Fix here
        self.max_length = x

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

    def fittest_max_length(self, df):  # Fix here
        """
        Smallest power of two larger than the longest term in the data set.
        Important to set up max length to speed training time.
        """
        max_length = max(len(max(df[self.labels[0]], key=len)), len(max(df[self.labels[1]], key=len)))
        x = 2
        while x < max_length: x = x * 2
        return x

# Cast the Huggingface data set as a LanguageDataset we defined above
data_sample = LanguageDataset(df, tokenizer)


In [15]:
data_sample

<__main__.LanguageDataset at 0x791f5aa68e00>

In [16]:
# Create train, valid
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size
train_data, valid_data = random_split(data_sample, [train_size, valid_size])

In [25]:
# Make the iterators
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

In [26]:
# Set the number of epochs
num_epochs = 2
# Model params
BATCH_SIZE = 8

In [27]:
# Training parameters
batch_size = BATCH_SIZE
model_name = 'distilgpt2'
gpu = 0

In [28]:
# Set the learning rate and loss function
## CrossEntropyLoss measures how close answers to the truth.
## More punishing for high confidence wrong answers
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
tokenizer.pad_token = tokenizer.eos_token

In [29]:
# Init a results dataframe
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                'training_loss', 'validation_loss', 'epoch_duration_sec'])

In [30]:
# The training loop
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## This line tells the model we're in 'learning mode'
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

    # Validation
    # Validation
    model.eval()
    epoch_validation_loss = 0
    total_loss = 0
    valid_iterator = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch['input_ids'].squeeze(1).to(device)
            targets = inputs.clone()
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()  # Convert tensor to scalar
            valid_iterator.set_postfix({'Validation Loss': loss.item()})
            epoch_validation_loss += loss.item()

    avg_epoch_validation_loss = epoch_validation_loss / len(valid_loader)

    end_time = time.time()  # End the timer for the epoch
    epoch_duration_sec = end_time - start_time  # Calculate the duration in seconds

    new_row = {'transformer': model_name,
               'batch_size': batch_size,
               'gpu': gpu,
               'epoch': epoch+1,
               'training_loss': avg_epoch_training_loss,
               'validation_loss': avg_epoch_validation_loss,
               'epoch_duration_sec': epoch_duration_sec}  # Add epoch_duration to the dataframe

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {total_loss/len(valid_loader)}")

Training Epoch 1/2 Batch Size: 8, Transformer: distilgpt2:   0%|          | 0/1011 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Training Epoch 1/2 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 1011/1011 [03:05<00:00,  5.45it/s, Training Loss=0.0509]
Validation Epoch 1/2: 100%|██████████| 253/253 [00:14<00:00, 17.98it/s, Validation Loss=0.055]


Epoch: 1, Validation Loss: 0.07707558037794154


Training Epoch 2/2 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 1011/1011 [03:08<00:00,  5.36it/s, Training Loss=0.0491]
Validation Epoch 2/2: 100%|██████████| 253/253 [00:14<00:00, 18.07it/s, Validation Loss=0.0546]

Epoch: 2, Validation Loss: 0.07303201477871582





In [31]:
# Define the input string
input_str = "What are the symptoms of Chicken pox?"

# Encode the input string with padding and attention mask
encoded_input = tokenizer.encode_plus(
    input_str,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=50  # Adjust max_length as needed
)

# Move tensors to the appropriate device
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

# Set the pad_token_id to the tokenizer's eos_token_id
pad_token_id = tokenizer.eos_token_id

# Generate the output
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=50,  # Adjust max_length as needed
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2,
    pad_token_id=pad_token_id
)

# Decode and print the output
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)


What are the symptoms of Chicken pox? | The following are the symptoms of Chicken pox: itching, skin rash, fatigue, lethargy, high fever, headache, loss of appetite, mild fever, swelled lymph nodes, mala


In [32]:
torch.save(model, 'SmallMedLM.pt')