In [None]:
!pip install transformers datasets accelerate pandas scikit-learn -q

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
import os

In [3]:
def set_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        print("CUDA is available")
        torch.cuda.manual_seed_all(seed)
set_seed(42)

CUDA is available


In [4]:
csv_file_path = "/kaggle/input/diseaseandsymptoms/DiseaseAndSymptoms.csv"

In [5]:
try:
    df = pd.read_csv(csv_file_path)
    print(f"Original dataset loaded. Shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: {csv_file_path} not found. Please upload it to your Kaggle input directory or run the cell to create dummy data.")

Original dataset loaded. Shape: (4920, 18)


In [6]:
for col in df.columns:
    if 'Symptom_' in col:
        df[col] = df[col].str.replace('_', ' ').str.strip()

In [7]:
def combine_symptoms(row):
    symptoms = [
        str(row[col]) for col in df.columns
        if 'Symptom_' in col and pd.notna(row[col]) and str(row[col]).strip() not in ['', 'nan']
    ]
    return ", ".join(symptoms)

In [8]:
df['symptom_text'] = df.apply(combine_symptoms, axis=1)

In [9]:
df['text'] = "Symptoms: " + df['symptom_text'] + ". Disease: " + df['Disease'] + "."
df['text'] = df['text'].str.replace('  ', ' ') # Clean up extra spaces

In [10]:
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['Disease'])

In [11]:
id_to_label = {id: label for id, label in enumerate(label_encoder.classes_)}
label_to_id = {label: id for id, label in enumerate(label_encoder.classes_)}

In [12]:
print(f"\nExample of processed data:")
print(df[['symptom_text', 'Disease', 'labels', 'text']].head())
print(f"Number of unique diseases: {len(label_encoder.classes_)}")
print(f"Disease classes: {list(label_encoder.classes_)}")


Example of processed data:
                                        symptom_text           Disease  \
0  itching, skin rash, nodal skin eruptions, disc...  Fungal infection   
1  skin rash, nodal skin eruptions, dischromic  p...  Fungal infection   
2  itching, nodal skin eruptions, dischromic  pat...  Fungal infection   
3            itching, skin rash, dischromic  patches  Fungal infection   
4           itching, skin rash, nodal skin eruptions  Fungal infection   

   labels                                               text  
0      15  Symptoms: itching, skin rash, nodal skin erupt...  
1      15  Symptoms: skin rash, nodal skin eruptions, dis...  
2      15  Symptoms: itching, nodal skin eruptions, disch...  
3      15  Symptoms: itching, skin rash, dischromic patch...  
4      15  Symptoms: itching, skin rash, nodal skin erupt...  
Number of unique diseases: 41
Disease classes: ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthrit

In [13]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Disease'])

In [14]:
train_dataset = Dataset.from_pandas(train_df[['text', 'labels']])
test_dataset = Dataset.from_pandas(test_df[['text', 'labels']])

In [15]:
print(f"\nTrain dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Train dataset size: 3936
Test dataset size: 984


In [16]:
model_name = "distilbert-base-uncased" # A good small model for this
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

In [18]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3936 [00:00<?, ? examples/s]

Map:   0%|          | 0/984 [00:00<?, ? examples/s]

In [19]:
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text", "__index_level_0__"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text", "__index_level_0__"])

In [20]:
print("\nExample of tokenized data:")
print(tokenized_train_dataset[0])


Example of tokenized data:
{'labels': 7, 'input_ids': [101, 8030, 1024, 2067, 3255, 1010, 11251, 1999, 10726, 1010, 3300, 3255, 1010, 4487, 29212, 1010, 3279, 1997, 5703, 1012, 4295, 1024, 28711, 11867, 15422, 8516, 12650, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [21]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_encoder.classes_),
    id2label=id_to_label, # Store mappings for easier inference
    label2id=label_to_id
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model moved to: {device}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to: cuda


In [22]:
output_dir = "/kaggle/working/results" # Directory to save checkpoints and model
logging_dir = "/kaggle/working/logs"    # Directory for logs

In [None]:
!pip install --upgrade transformers


In [25]:
import transformers
print(transformers.__version__)


4.55.4


In [28]:
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    num_train_epochs=5,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    warmup_steps=500,                    # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # Strength of weight decay
    logging_steps=100,                   # Log every 100 steps
    eval_strategy="epoch",         # Evaluate every epoch
    save_strategy="epoch",               # Save model every epoch
    load_best_model_at_end=True,         # Load the best model at the end of training
    metric_for_best_model="accuracy",    # Metric to use for early stopping and best model
    report_to="none"                     # Disable reporting to W&B, MLflow, etc.
)

In [29]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # Using 'weighted' to handle potential class imbalance
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=0)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("\nStarting training...")
trainer.train()
print("Training complete!")

  trainer = Trainer(



Starting training...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,3.6389,2.825759,0.944106,0.927805,0.920635,0.944106
2,2.4656,0.667322,1.0,1.0,1.0,1.0
3,0.8455,0.044486,1.0,1.0,1.0,1.0
4,0.1216,0.012255,1.0,1.0,1.0,1.0
5,0.015,0.007771,1.0,1.0,1.0,1.0




Training complete!


In [31]:
final_model_path = "/kaggle/working/fine_tuned_disease_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"\nFine-tuned model and tokenizer saved to: {final_model_path}")


Fine-tuned model and tokenizer saved to: /kaggle/working/fine_tuned_disease_model


In [32]:
import shutil

shutil.make_archive("fine_tuned_disease_model", 'zip', final_model_path)

'/kaggle/working/fine_tuned_disease_model.zip'

In [37]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "/kaggle/working/fine_tuned_disease_model"  # path to extracted folder

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Example inference
inputs = tokenizer("""Chest pain when you breathe or cough
Confusion or changes in mental awareness (in adults age 65 and older)
Cough, which may produce phlegm
Fatigue
Fever, sweating and shaking chills
Lower than normal body temperature (in adults older than age 65 and people with weak immune systems)
Nausea, vomiting or diarrhea
Shortness of breath""", return_tensors="pt")
outputs = model(**inputs)
# print(outputs.logits)

import torch
import torch.nn.functional as F

logits = outputs.logits
probs = F.softmax(logits, dim=-1)[0]

topk = torch.topk(probs, 10)

print("\nTop 10 predicted diseases:")
for i, (prob, idx) in enumerate(zip(topk.values, topk.indices)):
    disease_name = id_to_label[int(idx)]
    print(f"{i+1}. {disease_name}: {prob.item():.4f}")



Top 10 predicted diseases:
1. Bronchial Asthma: 0.0679
2. Common Cold: 0.0659
3. Typhoid: 0.0418
4. Tuberculosis: 0.0412
5. Hypoglycemia: 0.0408
6. Dengue: 0.0391
7. Gastroenteritis: 0.0369
8. Pneumonia: 0.0343
9. Migraine: 0.0333
10. GERD: 0.0283


In [38]:

import json

with open(f"{final_model_path}/label_mappings.json", "w") as f:
    json.dump({"id_to_label": id_to_label, "label_to_id": label_to_id}, f)
