In [17]:
from pymongo import MongoClient
import pandas as pd

client = MongoClient("mongodb://127.0.0.1:27017/")
db = client["medical_db"]
collection = db["mayo_diseases"]

cursor = collection.find({})

data = []

for doc in cursor:

    sections = doc.get("sections", {})

    data.append({
        "disease_name": doc.get("disease_name", ""),

        "overview": sections.get("overview", ""),
        "symptoms": sections.get("symptoms", ""),
        "causes": sections.get("causes", ""),

        # Sometimes you saved "risk factors" or "risk factors"
        "factors": (
            sections.get("risk factors", "") or
            sections.get("risk_factor", "") or
            sections.get("risk", "")
        )
    })

df = pd.DataFrame(data)

print("Dataset size:", df.shape)
df.head()

Dataset size: (718, 5)


Unnamed: 0,disease_name,overview,symptoms,causes,factors
0,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic si...,Common symptoms of chronic sinusitis include: ...,Nasal polyps Nasal polyps Nasal polyps Nasal p...,The following factors raise the risk of gettin...
1,Pneumothorax,Collapsed and normal lung Collapsed and normal...,The main symptoms of a pneumothorax are sudden...,A pneumothorax can be caused by: Chest injury....,"In general, men are far more likely to have a ..."
2,Male infertility,"Nearly 1 in 7 couples is infertile, which mean...",The main sign of male infertility is the inabi...,Male fertility is a complex process. To get yo...,Risk factors linked to male infertility includ...
3,Body lice,,,,
4,Hurthle cell cancer,,,,


In [20]:
import pandas as pd
import random

# ==============================
# 1️⃣ Clean text safely
# ==============================
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].astype(str).str.strip()
    df = df[df[col] != ""]

df = df.reset_index(drop=True)


# ==============================
# 2️⃣ Text Augmentation Functions
# ==============================

def random_word_dropout(text, drop_prob=0.1):
    words = text.split()
    if len(words) <= 3:
        return text
    new_words = [w for w in words if random.random() > drop_prob]
    return " ".join(new_words) if new_words else text


def random_word_swap(text, swap_prob=0.1):
    words = text.split()
    if len(words) < 2:
        return text
    
    words = words.copy()
    for i in range(len(words) - 1):
        if random.random() < swap_prob:
            words[i], words[i+1] = words[i+1], words[i]
    
    return " ".join(words)


def augment_text(text):
    text = random_word_dropout(text, drop_prob=0.1)
    text = random_word_swap(text, swap_prob=0.1)
    return text


# ==============================
# 3️⃣ Main Augmentation Function
# ==============================

def enlarge_dataset(original_df, samples_per_row=3):
    augmented_rows = []

    for _, row in original_df.iterrows():
        # Keep original
        augmented_rows.append({
            "disease_name": row["disease_name"],
            "factors": row["factors"]
        })

        # Generate augmented samples
        for _ in range(samples_per_row):
            augmented_text = augment_text(row["factors"])

            augmented_rows.append({
                "disease_name": row["disease_name"],
                "factors": augmented_text
            })

    return pd.DataFrame(augmented_rows)


# ==============================
# 4️⃣ Generate Bigger Dataset
# ==============================

new_dataset = enlarge_dataset(df, samples_per_row= 15)
new_dataset.to_csv("data/factors_dataset.csv", index= False)

print("Original size:", df.shape)
print("New size:", new_dataset.shape)
print(new_dataset.head())

Original size: (503, 5)
New size: (8048, 2)
        disease_name                                            factors
0  Chronic sinusitis  The following factors raise the risk of gettin...
1  Chronic sinusitis  following The factors raise the risk of gettin...
2  Chronic sinusitis  The factors following risk the getting of chro...
3  Chronic sinusitis  The factors following raise the risk of gettin...
4  Chronic sinusitis  The following factors raise the risk of gettin...


In [1]:
import pandas as pd

data = pd.read_csv("/content/factors_dataset.csv")
data.head()

Unnamed: 0,disease_name,factors
0,Chronic sinusitis,The following factors raise the risk of gettin...
1,Chronic sinusitis,The following factors raise the risk of gettin...
2,Chronic sinusitis,following factors raise the risk getting A inf...
3,Chronic sinusitis,The following factors raise the risk of gettin...
4,Chronic sinusitis,The factors raise the of risk getting chronic ...


In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

le = LabelEncoder()
labels = le.fit_transform(data["disease_name"])

X_train, X_test, y_train, y_test = train_test_split(data['factors'], labels, test_size= 0.2, random_state= 42)

In [3]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_enc = tokenizer(list(X_train), truncation= True, padding= True, max_length= 128)

test_enc = tokenizer(list(X_test), truncation= True, padding= True, max_length= 128 )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k,v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_ds = Dataset(train_enc, y_train)
test_ds = Dataset(test_enc, y_test)

In [5]:
from transformers import AutoModelForSequenceClassification

num_classes = len(set(labels))

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels= num_classes)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [7]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(output_dir= "out", eval_strategy= "epoch", per_device_train_batch_size= 8,
                         per_device_eval_batch_size= 8, num_train_epochs= 3, logging_steps= 50, optim= "adamw_torch")

trainer = Trainer(model= model, args= args, train_dataset= train_ds, eval_dataset= test_ds)

trainer.train()



Epoch,Training Loss,Validation Loss
1,3.199431,2.852855
2,0.790183,0.641025
3,0.344701,0.290684


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



TrainOutput(global_step=2415, training_loss=2.2920721788584077, metrics={'train_runtime': 291.1803, 'train_samples_per_second': 66.351, 'train_steps_per_second': 8.294, 'total_flos': 645333588048384.0, 'train_loss': 2.2920721788584077, 'epoch': 3.0})

In [8]:
trainer.save_model("factors_model")
tokenizer.save_pretrained("factors_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('factors_model/tokenizer_config.json', 'factors_model/tokenizer.json')

In [13]:
import torch

text = "The following factors raise the risk of getting chronic sinusitis: A infection. dental A fungal infection. Regularly being around cigarette smoke or other pollutants."
inputs = tokenizer(text, return_tensors= "pt", truncation= True, padding= True)

inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

predicted_class = torch.argmax(outputs.logits, dim= 1)
predicted_label = le.inverse_transform([predicted_class.item()])[0]

print("Predicted class index:", predicted_label)

Predicted class index: Chronic sinusitis


In [14]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("models/factors_models")

tokenizer = AutoTokenizer.from_pretrained("models/factors_models")

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

In [15]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [16]:
tokenizer

BertTokenizer(name_or_path='models/factors_models', vocab_size=30522, model_max_length=512, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)