In [15]:
import pandas as pd

data = pd.read_csv("/content/overview_dataset.csv")
data.head()

Unnamed: 0,disease_name,overview
0,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic si...
1,Chronic sinusitis,sinusitis Chronic sinusitis Chronic sinusitis ...
2,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic si...
3,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic si...
4,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic An...


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = data['overview'].astype(str)

le = LabelEncoder()
y = le.fit_transform(data["disease_name"])
num_classes = len(le.classes_)

x_train, x_test, y_train, y_test = train_test_split(
    X.tolist(),
    y,
    test_size=0.2,
    random_state=42
)

In [17]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

MAX_LEN = 32

def encode_texts(texts):
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )

    return encodings["input_ids"], encodings["attention_mask"]

train_inputs, train_masks = encode_texts(x_train)
test_inputs, test_masks = encode_texts(x_test)

In [18]:
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

In [19]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, input_ids, masks, labels):
        self.input_ids = input_ids
        self.masks = masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.masks[idx],
            "labels": self.labels[idx]
        }

train_dataset = TextDataset(train_inputs, train_masks, train_labels)
test_dataset = TextDataset(test_inputs, test_masks, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [20]:
from transformers import BertForSequenceClassification
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes
)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [22]:
epochs = 5

model.train()

for epoch in range(epochs):
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader)}")

Epoch 1 Loss: 4.433969888142853
Epoch 2 Loss: 3.6091303074034506
Epoch 3 Loss: 2.867292837824478
Epoch 4 Loss: 2.1927566167438592
Epoch 5 Loss: 1.6092088982721713


In [23]:
import numpy as np
from sklearn.metrics import classification_report

model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch["labels"].numpy())

print(classification_report(true_labels, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         4
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         3
          13       1.00      1.00      1.00         6
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         2
          16       1.00      1.00      1.00         4
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
import joblib

model_path = "models/overview_model"

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

joblib.dump(le, model_path + "/label_encoder.pkl")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

['overview_model/label_encoder.pkl']

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib

model_path = "models/overview_model"

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

label_encoder = joblib.load(model_path + "/label_encoder.pkl")

model.eval()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [2]:
model 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [3]:
tokenizer 

BertTokenizer(name_or_path='models/overview_model', vocab_size=30522, model_max_length=512, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
label_encoder

In [14]:
import torch

def predict_texts(texts, model, tokenizer, label_encoder, max_len=32):
    model.eval()

    if isinstance(texts, str):
        texts = [texts]

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    predictions = []

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        predictions = label_encoder.inverse_transform(preds)

    return predictions

In [17]:
sample_texts = [
    "Collapsed and normal Collapsed lung and normal lung Collapsed and normal lung In a collapsed lung",
    "own. Products & Services Book: Mayo Clinic Book Health Mayo Newsletter: Clinic Letter â€” Digital Edition Show more products from Mayo Clinic",
    "Dry is a painful dental condition that sometimes happens after you have a tooth removed. a tooth removed is an extraction. ",
    "Transposition of great arteries Transposition of great arteries Transposition the great arteries transposition In of the the main arteries leading away from the heart"
]

results = predict_texts(
    sample_texts,
    model,
    tokenizer,
    label_encoder
)

print("Predictions:", results)

Predictions: ['Pneumothorax' 'Cervical cancer' 'Dry socket'
 'Transposition of the great arteries']
