In [None]:
!pip install transformers datasets peft bitsandbytes



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
[0mCollecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive so files under /content/drive (dataset CSV, saved model) are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

# Step 1: DistilBERT tokenizer, shared by the tokenization step further down
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Step 2: Read the CSV stored on Drive as a single "train" split
dataset = load_dataset(
    "csv",
    data_files="/content/drive/MyDrive/cleaned_skindiseasesdataset.csv",
    split="train",
)

# Step 3: Sanity check on the schema -- expected output: ['Disease name', 'Text']
print(dataset.column_names)

# Step 4: Give every distinct disease name an integer id.
# Ids follow the sorted order of the names, so the mapping is deterministic
# for a given set of names.
labels = dataset['Disease name']
distinct_names = sorted(set(labels))
label2id = dict(zip(distinct_names, range(len(distinct_names))))

# Step 5: Row-level label encoder (meant to be applied one example at a time)
def map_labels(example):
    """Return the integer class id for a single dataset row.

    Args:
        example: One row of the dataset; its 'Disease name' entry is read.

    Returns:
        A dict ``{'labels': <int>}`` where the integer is looked up in the
        notebook-level ``label2id`` mapping.

    Raises:
        KeyError: If the row's disease name is not a key of ``label2id``.
    """
    return {'labels': label2id[example['Disease name']]}

# Add the numeric 'labels' column; batched=False means map_labels receives one row per call
dataset = dataset.map(
    function=map_labels,
    batched=False,
)

# Step 6: Tokenizer wrapper for the 'Text' column
def tokenize_function(examples):
    """Tokenize the 'Text' entry of ``examples`` with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping that has a "Text" entry (a batch of rows when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["Text"]
    return tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

# Tokenize the whole dataset in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Seed every RNG *before* the model is built. The classification head is newly
# initialised (transformers prints a "newly initialized" warning for it), so
# without a seed each run starts from different weights.
from transformers import set_seed

SEED = 42
set_seed(SEED)

# Step 7: Load the model for sequence classification, one output per disease name
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
)

# Step 8: LoRA configuration for parameter-efficient training
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    r=8,                         # Low-rank dimension
    lora_alpha=32,               # Scaling factor
    lora_dropout=0.1,            # Dropout rate
    bias="none",                 # Do not train bias
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],  # Attention projections that receive LoRA
    # Both head layers start from random weights, so they must be trained and
    # written out together with the adapter. They are listed explicitly rather
    # than relying on PEFT's implicit handling of the head.
    # NOTE(review): confirm against the installed PEFT version that both layers
    # are reported as trainable (model.print_trainable_parameters()).
    modules_to_save=["pre_classifier", "classifier"],
)

# Step 9: Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Step 10: Training arguments.
# The deprecated `evaluation_strategy="no"` keyword is omitted: no evaluation is
# already the default, and the old keyword name is being phased out upstream.
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save results
    learning_rate=2e-5,              # Learning rate for optimization
    per_device_train_batch_size=16,  # Batch size for training
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Regularization to avoid overfitting
    seed=SEED,                       # Same seed for shuffling / dropout
)

# Step 11: Trainer over the tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Step 12: Train the model (the returned TrainOutput is displayed as the cell result)
trainer.train()


['Disease name', 'Text']


Map:   0%|          | 0/483 [00:00<?, ? examples/s]

Map:   0%|          | 0/483 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=93, training_loss=2.6239240092615925, metrics={'train_runtime': 9.9108, 'train_samples_per_second': 146.205, 'train_steps_per_second': 9.384, 'total_flos': 48993982313472.0, 'train_loss': 2.6239240092615925, 'epoch': 3.0})

In [None]:
# Step 13: Persist the trained model and tokenizer to Google Drive
model_save_path = "/content/drive/MyDrive/saved_model"

# `model` is the PEFT-wrapped model returned by get_peft_model, so this is
# expected to write the adapter files rather than a full DistilBERT checkpoint
# -- TODO confirm by listing the folder.
model.save_pretrained(save_directory=model_save_path)

# Keep the tokenizer files next to the model; the returned tuple of written
# file paths is shown as the cell output.
tokenizer.save_pretrained(save_directory=model_save_path)


('/content/drive/MyDrive/saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/saved_model/vocab.txt',
 '/content/drive/MyDrive/saved_model/added_tokens.json',
 '/content/drive/MyDrive/saved_model/tokenizer.json')

# To continue training, set `checkpoint_dir` and `num_train_epochs` in the next cell: training resumes from the last saved checkpoint and runs up to the given number of epochs.


In [None]:
# Continue fine-tuning from the previously saved LoRA adapter.
#
# Requires `label2id` and `tokenized_dataset` from the training cell above.
import os

from peft import PeftModel

# Folder that the save cell wrote the adapter and tokenizer to
adapter_dir = "/content/drive/MyDrive/saved_model"

# Optional Trainer checkpoint folder ("checkpoint-<step>", written without angle brackets).
# When it exists, optimizer/scheduler/step state are restored from it as well.
checkpoint_dir = "/content/drive/MyDrive/saved_model/checkpoint-200"

# Rebuild the base model with the same number of classes as in training,
# then load the saved adapter on top of it in trainable mode.
base_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
)
model = PeftModel.from_pretrained(base_model, adapter_dir, is_trainable=True)

# Training arguments. `resume_from_checkpoint` is deliberately NOT set here:
# Trainer does not read it from TrainingArguments; it has to be passed to train().
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save results
    learning_rate=2e-5,              # Learning rate for optimization
    per_device_train_batch_size=16,  # Batch size for training
    num_train_epochs=300,            # Number of epochs to train
    weight_decay=0.01,               # Regularization to avoid overfitting
)

# Trainer over the tokenized dataset built in the training cell
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Only resume Trainer state when the checkpoint folder is really there;
# otherwise keep training from the adapter weights loaded above.
resume_path = checkpoint_dir if os.path.isdir(checkpoint_dir) else None
if resume_path is None:
    print(f"No Trainer checkpoint at {checkpoint_dir}; continuing from the adapter in {adapter_dir}.")

# The returned TrainOutput is displayed as the cell result
trainer.train(resume_from_checkpoint=resume_path)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,2.3106
1000,1.3435
1500,0.797
2000,0.5029
2500,0.3171
3000,0.2051
3500,0.1295
4000,0.0818
4500,0.0488
5000,0.0318


TrainOutput(global_step=9300, training_loss=0.31413974697871877, metrics={'train_runtime': 495.6869, 'train_samples_per_second': 292.322, 'train_steps_per_second': 18.762, 'total_flos': 4899398231347200.0, 'train_loss': 0.31413974697871877, 'epoch': 300.0})

# Testing the model on a sample input


In [None]:
from transformers import DistilBertForSequenceClassification, AutoTokenizer
import torch

# Load the tokenizer, the fine-tuned model and the id <-> disease-name mapping.
from peft import PeftModel

# Folder the adapter and tokenizer were saved to
model_dir = "/content/drive/MyDrive/saved_model"

# Disease names of the dataset. They must match the values of the
# 'Disease name' column used for training (replace with the actual names).
disease_names = [
    'Eczema',
    'Psoriasis',
    'Scabies',
    'Acne',
    'Rosacea',
    'Vitiligo',
    'Ringworm (Tinea Corporis)',
    'Hives(Urticaria)',
    'Folliculitis',
    "Athlete's Foot (Tinea Pedis)",
    'Contact Dermatitis',
    'Impetigo',
    'Shingles',
    'Ringworm',
]

# Training numbered the classes with enumerate(sorted(set(labels))), so the ids
# here must follow the same sorted order. A hand-written numbering would decode
# the model's predictions to the wrong disease names.
label2id = {name: idx for idx, name in enumerate(sorted(set(disease_names)))}

# Reverse mapping for id to label
id2label = {idx: name for name, idx in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# The saved folder was produced by a PEFT-wrapped model, so load the base
# checkpoint first and attach the saved adapter explicitly. Loading the folder
# as if it were a full DistilBERT checkpoint triggers a warning that the
# classification head was re-initialised from the base model.
base_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
)
model = PeftModel.from_pretrained(base_model, model_dir)
model.eval()  # inference mode: disables dropout

# Predict the disease name for one piece of input text
def predict_disease(input_line):
    """Classify a free-text description and return the predicted disease name.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        input_line: The text to classify.

    Returns:
        The ``id2label`` entry for the class with the highest logit.
    """
    # Encode the text as PyTorch tensors, truncated/padded to 128 tokens
    encoded = tokenizer(
        input_line,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Forward pass only -- no gradients needed for prediction
    with torch.no_grad():
        scores = model(**encoded).logits

    # Index of the highest score, converted to a plain Python int
    best_idx = torch.argmax(scores, dim=-1).item()

    # Numeric class id -> disease name
    return id2label[best_idx]

# Sample description to classify (adjacent string literals are joined into one sentence pair)
input_line = (
    "I've been feeling extra itchy lately, especially on my inner thighs and abdomen. "
    "I'm on medication for my immune system, so I'm worried it might be a fungal infection or something else."
)

# Run the classifier on the sample
predicted_disease = predict_disease(input_line)

# Report the result
print(f"Predicted Disease: {predicted_disease}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Disease: Contact Dermatitis
