# An NLP Model for Multilabel Classification of Herbal Plants Based on Symptoms Using ClinicalBERT

In [1]:
#Import Necessary Libraries
import numpy as np
import pandas as pd
import os

from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset = pd.read_csv("dataset/dataset.csv")
# ds_train, ds_valid, ds_test = np.split(dataset.sample(frac=1, random_state=42), [int(.6*len(dataset)), int(.8*len(dataset))])
dataset = load_dataset("csv", data_files="dataset/dataset.csv")

In [3]:
ds_train = dataset['train'].train_test_split(test_size=0.2, seed=42)
ds_test = ds_train['test'].train_test_split(test_size=0.5, seed=42)

In [4]:
dataset = DatasetDict({
    'train': ds_train['train'],
    'valid': ds_test['train'],
    'test': ds_test['test']
})

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'SYMPTOMS', 'JACKFRUIT', 'SAMBONG', 'LEMON', 'JASMINE', 'MANGO', 'MINT', 'AMPALAYA', 'MALUNGGAY', 'GUAVA', 'LAGUNDI'],
        num_rows: 1088
    })
    valid: Dataset({
        features: ['index', 'SYMPTOMS', 'JACKFRUIT', 'SAMBONG', 'LEMON', 'JASMINE', 'MANGO', 'MINT', 'AMPALAYA', 'MALUNGGAY', 'GUAVA', 'LAGUNDI'],
        num_rows: 136
    })
    test: Dataset({
        features: ['index', 'SYMPTOMS', 'JACKFRUIT', 'SAMBONG', 'LEMON', 'JASMINE', 'MANGO', 'MINT', 'AMPALAYA', 'MALUNGGAY', 'GUAVA', 'LAGUNDI'],
        num_rows: 136
    })
})

In [6]:
dataset['train'][0]

{'index': 195,
 'SYMPTOMS': 'I need a herbal for ulcers',
 'JACKFRUIT': 1,
 'SAMBONG': 0,
 'LEMON': 0,
 'JASMINE': 0,
 'MANGO': 0,
 'MINT': 0,
 'AMPALAYA': 0,
 'MALUNGGAY': 0,
 'GUAVA': 0,
 'LAGUNDI': 0}

In [7]:
dataset.shape

{'train': (1088, 12), 'valid': (136, 12), 'test': (136, 12)}

In [8]:
dataset.column_names

{'train': ['index',
  'SYMPTOMS',
  'JACKFRUIT',
  'SAMBONG',
  'LEMON',
  'JASMINE',
  'MANGO',
  'MINT',
  'AMPALAYA',
  'MALUNGGAY',
  'GUAVA',
  'LAGUNDI'],
 'valid': ['index',
  'SYMPTOMS',
  'JACKFRUIT',
  'SAMBONG',
  'LEMON',
  'JASMINE',
  'MANGO',
  'MINT',
  'AMPALAYA',
  'MALUNGGAY',
  'GUAVA',
  'LAGUNDI'],
 'test': ['index',
  'SYMPTOMS',
  'JACKFRUIT',
  'SAMBONG',
  'LEMON',
  'JASMINE',
  'MANGO',
  'MINT',
  'AMPALAYA',
  'MALUNGGAY',
  'GUAVA',
  'LAGUNDI']}

In [9]:
dataset = dataset.remove_columns('index')

In [10]:
dataset['train'][0]

{'SYMPTOMS': 'I need a herbal for ulcers',
 'JACKFRUIT': 1,
 'SAMBONG': 0,
 'LEMON': 0,
 'JASMINE': 0,
 'MANGO': 0,
 'MINT': 0,
 'AMPALAYA': 0,
 'MALUNGGAY': 0,
 'GUAVA': 0,
 'LAGUNDI': 0}

In [11]:
cols = dataset['train'].column_names
dataset = dataset.map(lambda x : {"labels": [x[c] for c in cols if c != "SYMPTOMS"]})
dataset

DatasetDict({
    train: Dataset({
        features: ['SYMPTOMS', 'JACKFRUIT', 'SAMBONG', 'LEMON', 'JASMINE', 'MANGO', 'MINT', 'AMPALAYA', 'MALUNGGAY', 'GUAVA', 'LAGUNDI', 'labels'],
        num_rows: 1088
    })
    valid: Dataset({
        features: ['SYMPTOMS', 'JACKFRUIT', 'SAMBONG', 'LEMON', 'JASMINE', 'MANGO', 'MINT', 'AMPALAYA', 'MALUNGGAY', 'GUAVA', 'LAGUNDI', 'labels'],
        num_rows: 136
    })
    test: Dataset({
        features: ['SYMPTOMS', 'JACKFRUIT', 'SAMBONG', 'LEMON', 'JASMINE', 'MANGO', 'MINT', 'AMPALAYA', 'MALUNGGAY', 'GUAVA', 'LAGUNDI', 'labels'],
        num_rows: 136
    })
})

In [12]:
dataset['train'][2]

{'SYMPTOMS': 'My need concerns with rich in iron',
 'JACKFRUIT': 0,
 'SAMBONG': 0,
 'LEMON': 1,
 'JASMINE': 0,
 'MANGO': 1,
 'MINT': 0,
 'AMPALAYA': 0,
 'MALUNGGAY': 0,
 'GUAVA': 0,
 'LAGUNDI': 0,
 'labels': [0, 0, 1, 0, 1, 0, 0, 0, 0, 0]}

In [13]:
model_ckpt = 'medicalai/ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, problem_type="multi-label-classification")

In [14]:
def tokenize_and_encode(examples):
    return tokenizer(examples["SYMPTOMS"], truncation=True)

In [15]:
cols = dataset["train"].column_names
cols.remove("labels")
ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)
ds_enc

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1088
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 136
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 136
    })
})

In [16]:
ds_enc['train']['labels'][0]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [17]:
ds_enc.set_format("torch")
ds_enc = (ds_enc.map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels"]).rename_column("float_labels", "labels"))

In [18]:
num_labels = 10
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels, problem_type='multi_label_classification').to('cpu')

  return self.fget.__get__(instance, owner)()
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
ds_enc["train"][0]

{'input_ids': tensor([   101,    177,  17367,    169, 108016,  10415,  10142,  16600,  17505,
          10107,    102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}

Edit Epochs Below if Needed

In [20]:
args = TrainingArguments('/kaggle/working/', evaluation_strategy="epoch", num_train_epochs=5)

In [21]:
trainer = Trainer(model=model, args=args, train_dataset=ds_enc["train"], eval_dataset=ds_enc["test"], tokenizer=tokenizer)

In [22]:
trainer.train()

                                                 
 20%|██        | 136/680 [04:20<16:27,  1.81s/it]

{'eval_loss': 0.3053303360939026, 'eval_runtime': 2.258, 'eval_samples_per_second': 60.231, 'eval_steps_per_second': 7.529, 'epoch': 1.0}


                                                 
 40%|████      | 272/680 [09:05<12:57,  1.91s/it]

{'eval_loss': 0.12252853810787201, 'eval_runtime': 2.2839, 'eval_samples_per_second': 59.547, 'eval_steps_per_second': 7.443, 'epoch': 2.0}


                                                 
 60%|██████    | 408/680 [13:34<09:14,  2.04s/it]

{'eval_loss': 0.06379109621047974, 'eval_runtime': 2.4016, 'eval_samples_per_second': 56.629, 'eval_steps_per_second': 7.079, 'epoch': 3.0}


 74%|███████▎  | 500/680 [17:01<07:17,  2.43s/it]Checkpoint destination directory /kaggle/working/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2073, 'learning_rate': 1.323529411764706e-05, 'epoch': 3.68}


                                                 
 80%|████████  | 544/680 [18:54<05:43,  2.52s/it]

{'eval_loss': 0.0442207008600235, 'eval_runtime': 1.6437, 'eval_samples_per_second': 82.741, 'eval_steps_per_second': 10.343, 'epoch': 4.0}


                                                 
100%|██████████| 680/680 [23:25<00:00,  2.07s/it]

{'eval_loss': 0.03903823345899582, 'eval_runtime': 2.2677, 'eval_samples_per_second': 59.974, 'eval_steps_per_second': 7.497, 'epoch': 5.0}
{'train_runtime': 1405.0622, 'train_samples_per_second': 3.872, 'train_steps_per_second': 0.484, 'train_loss': 0.16459316085366643, 'epoch': 5.0}





TrainOutput(global_step=680, training_loss=0.16459316085366643, metrics={'train_runtime': 1405.0622, 'train_samples_per_second': 3.872, 'train_steps_per_second': 0.484, 'train_loss': 0.16459316085366643, 'epoch': 5.0})

In [23]:
trainer.evaluate()

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:02<00:00,  7.04it/s]


{'eval_loss': 0.03903823345899582,
 'eval_runtime': 2.5804,
 'eval_samples_per_second': 52.706,
 'eval_steps_per_second': 6.588,
 'epoch': 5.0}

In [24]:
dataset['train'][420]

{'SYMPTOMS': 'Herbal for bloating',
 'JACKFRUIT': 0,
 'SAMBONG': 0,
 'LEMON': 0,
 'JASMINE': 0,
 'MANGO': 0,
 'MINT': 1,
 'AMPALAYA': 0,
 'MALUNGGAY': 0,
 'GUAVA': 0,
 'LAGUNDI': 0,
 'labels': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]}

FOR PREDICTIONS

In [25]:
input_text = 'Are there herbals I can brew for tea'
encoding = tokenizer(input_text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
outputs = trainer.model(**encoding)
logits = outputs.logits
logits.shape

torch.Size([1, 10])

In [26]:
#Use Sigmoid and RoundUp
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1

#Convert Predicted Values into Actual Herbal Name
ans_cols = cols[1:]
predicted_labels = [ans_cols[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['SAMBONG', 'LEMON', 'JASMINE', 'MALUNGGAY', 'GUAVA']


FORM ANOTHER ONE

In [27]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    print("sigmoid",sigmoid)
    probs = sigmoid(torch.Tensor(predictions))
    print("probs",probs)
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    print("y_pred",y_pred)
    y_pred[np.where(probs >= threshold)] = 1
    print("y_pred_1",y_pred)
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [28]:
ds_enc['train']['input_ids'][0]

tensor([   101,    177,  17367,    169, 108016,  10415,  10142,  16600,  17505,
         10107,    102])

In [29]:
ds_enc['train']['input_ids'][0].unsqueeze(0)

tensor([[   101,    177,  17367,    169, 108016,  10415,  10142,  16600,  17505,
          10107,    102]])

In [30]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [31]:
tokenizer

DistilBertTokenizerFast(name_or_path='medicalai/ClinicalBERT', vocab_size=119547, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [32]:
# Set Forward Pass
outputs = model(input_ids=ds_enc['train']['input_ids'][0].unsqueeze(0).to('cpu'),
                labels=ds_enc['train'][0]['labels'].unsqueeze(0).to('cpu'))
outputs

SequenceClassifierOutput(loss=tensor(0.0256, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 3.1644, -3.5153, -3.8399, -3.3947, -3.4720, -5.2633, -3.3614, -3.1363,
         -4.5567, -4.8522]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [33]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

  0%|          | 0/680 [00:00<?, ?it/s]

 20%|██        | 136/680 [05:24<21:41,  2.39s/it]
 20%|██        | 136/680 [05:26<21:41,  2.39s/it]

sigmoid Sigmoid()
probs tensor([[5.9173e-02, 9.3240e-01, 5.4493e-02,  ..., 7.1827e-01, 8.7019e-01,
         4.1726e-02],
        [1.1489e-03, 4.5832e-03, 2.4644e-03,  ..., 3.0459e-03, 2.5443e-03,
         2.6859e-03],
        [8.7004e-03, 2.6035e-03, 7.2895e-03,  ..., 9.4709e-03, 3.0662e-03,
         1.6471e-03],
        ...,
        [1.3766e-03, 9.9448e-01, 2.6341e-03,  ..., 2.6283e-04, 8.8918e-04,
         2.0239e-03],
        [2.4071e-02, 7.1208e-03, 1.7303e-02,  ..., 1.6787e-02, 1.0361e-02,
         1.4279e-03],
        [4.4966e-03, 8.7232e-03, 3.9154e-03,  ..., 3.1982e-03, 5.4552e-03,
         2.8682e-03]])
y_pred [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_pred_1 [[0. 1. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'eval_loss': 0.022337954491376877, 'eval_f1': 0.983

 40%|████      | 272/680 [09:43<12:28,  1.83s/it]
 40%|████      | 272/680 [09:45<12:28,  1.83s/it]

sigmoid Sigmoid()
probs tensor([[9.2686e-02, 9.1928e-01, 3.5328e-02,  ..., 9.0161e-01, 9.2714e-01,
         3.9109e-02],
        [5.1849e-04, 7.0650e-03, 1.6342e-03,  ..., 9.5279e-04, 1.8164e-03,
         1.9119e-03],
        [4.2330e-03, 2.3978e-03, 3.4701e-03,  ..., 1.8656e-03, 2.4611e-03,
         1.3561e-03],
        ...,
        [9.4216e-04, 9.9890e-01, 1.0678e-03,  ..., 1.5747e-04, 4.3586e-04,
         3.4060e-03],
        [5.2788e-03, 5.6621e-03, 9.9387e-03,  ..., 1.2311e-02, 3.2367e-03,
         1.5755e-03],
        [2.2460e-03, 3.5554e-03, 4.7996e-03,  ..., 5.8995e-03, 5.0400e-03,
         4.7685e-03]])
y_pred [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_pred_1 [[0. 1. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'eval_loss': 0.016285326331853867, 'eval_f1': 0.988

 60%|██████    | 408/680 [14:08<08:56,  1.97s/it]
 60%|██████    | 408/680 [14:10<08:56,  1.97s/it]

sigmoid Sigmoid()
probs tensor([[3.2347e-02, 9.4569e-01, 2.8016e-02,  ..., 9.5058e-01, 9.4128e-01,
         2.8061e-02],
        [4.4083e-04, 2.9096e-03, 8.0028e-04,  ..., 1.0486e-03, 1.8372e-03,
         1.7197e-03],
        [2.8095e-03, 1.0561e-03, 1.2648e-03,  ..., 1.9643e-03, 1.5004e-03,
         5.5536e-04],
        ...,
        [3.5562e-04, 9.9903e-01, 1.2828e-03,  ..., 1.1223e-04, 3.9790e-04,
         9.7926e-04],
        [4.3161e-03, 2.9930e-03, 3.4580e-03,  ..., 4.5541e-03, 2.4901e-03,
         7.4905e-04],
        [1.6517e-03, 2.3489e-03, 2.0419e-03,  ..., 2.8269e-03, 2.0227e-03,
         2.3203e-03]])
y_pred [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_pred_1 [[0. 1. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'eval_loss': 0.013721118681132793, 'eval_f1': 0.983

 74%|███████▎  | 500/680 [17:14<05:51,  1.95s/it]Checkpoint destination directory /kaggle/working/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0216, 'learning_rate': 1.323529411764706e-05, 'epoch': 3.68}


 80%|████████  | 544/680 [18:45<04:35,  2.03s/it]
 80%|████████  | 544/680 [18:47<04:35,  2.03s/it]

sigmoid Sigmoid()
probs tensor([[3.5900e-02, 9.3508e-01, 2.6729e-02,  ..., 9.5932e-01, 9.4892e-01,
         1.9854e-02],
        [3.2704e-04, 2.2199e-03, 8.1387e-04,  ..., 7.4144e-04, 1.2766e-03,
         1.5855e-03],
        [2.2147e-03, 1.5784e-03, 1.8158e-03,  ..., 1.3827e-03, 1.9746e-03,
         7.2728e-04],
        ...,
        [3.1269e-04, 9.9925e-01, 1.4349e-03,  ..., 9.1020e-05, 2.5273e-04,
         5.7125e-04],
        [2.5818e-03, 3.4903e-03, 4.9799e-03,  ..., 3.4977e-03, 2.0146e-03,
         6.6423e-04],
        [1.1704e-03, 2.3162e-03, 2.9334e-03,  ..., 2.1558e-03, 1.7144e-03,
         2.1984e-03]])
y_pred [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_pred_1 [[0. 1. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'eval_loss': 0.012016482651233673, 'eval_f1': 0.983

100%|██████████| 680/680 [23:24<00:00,  1.98s/it]
100%|██████████| 680/680 [23:27<00:00,  2.07s/it]

sigmoid Sigmoid()
probs tensor([[3.3371e-02, 9.5721e-01, 2.3542e-02,  ..., 9.5292e-01, 9.6057e-01,
         2.0241e-02],
        [3.5965e-04, 2.0868e-03, 7.5439e-04,  ..., 7.4150e-04, 1.4139e-03,
         1.3398e-03],
        [3.2450e-03, 1.5768e-03, 1.4693e-03,  ..., 1.5894e-03, 2.0969e-03,
         6.2363e-04],
        ...,
        [3.5651e-04, 9.9952e-01, 1.5661e-03,  ..., 1.0239e-04, 3.0214e-04,
         4.8614e-04],
        [3.1307e-03, 3.7804e-03, 4.7453e-03,  ..., 2.9852e-03, 2.1802e-03,
         5.7075e-04],
        [1.4988e-03, 2.3499e-03, 2.2758e-03,  ..., 2.0129e-03, 1.8623e-03,
         1.9619e-03]])
y_pred [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_pred_1 [[0. 1. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'eval_loss': 0.010787110775709152, 'eval_f1': 0.983




TrainOutput(global_step=680, training_loss=0.018756714989157285, metrics={'train_runtime': 1407.1225, 'train_samples_per_second': 3.866, 'train_steps_per_second': 0.483, 'train_loss': 0.018756714989157285, 'epoch': 5.0})

In [35]:
trainer.evaluate()

100%|██████████| 17/17 [00:01<00:00, 11.08it/s]

sigmoid Sigmoid()
probs tensor([[3.3371e-02, 9.5721e-01, 2.3542e-02,  ..., 9.5292e-01, 9.6057e-01,
         2.0241e-02],
        [3.5965e-04, 2.0868e-03, 7.5439e-04,  ..., 7.4150e-04, 1.4139e-03,
         1.3398e-03],
        [3.2450e-03, 1.5768e-03, 1.4693e-03,  ..., 1.5894e-03, 2.0969e-03,
         6.2363e-04],
        ...,
        [3.5651e-04, 9.9952e-01, 1.5661e-03,  ..., 1.0239e-04, 3.0214e-04,
         4.8614e-04],
        [3.1307e-03, 3.7804e-03, 4.7453e-03,  ..., 2.9852e-03, 2.1802e-03,
         5.7075e-04],
        [1.4988e-03, 2.3499e-03, 2.2758e-03,  ..., 2.0129e-03, 1.8623e-03,
         1.9619e-03]])
y_pred [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_pred_1 [[0. 1. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]





{'eval_loss': 0.010787110775709152,
 'eval_f1': 0.9833729216152018,
 'eval_roc_auc': 0.9930432866781123,
 'eval_accuracy': 0.9852941176470589,
 'eval_runtime': 1.7184,
 'eval_samples_per_second': 79.145,
 'eval_steps_per_second': 9.893,
 'epoch': 5.0}

# TRYING THE MODEL

In [36]:
text = "What herbal can I use that is rich in Vitamin C"
encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
predictions
# turn predicted id's into actual label names
ans_cols = cols[1:]
predicted_labels = [ans_cols[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['JACKFRUIT', 'LEMON', 'MANGO', 'MALUNGGAY']


In [37]:
#SAVE THE NLP MODEL

trainer.save_model("./herbal_nlpMultilabelClassModel")

In [46]:
# UPLOAD MODEL TO MY HUGGINGFACE ACCOUNT
from huggingface_hub import login
login(token="")
trainer.push_to_hub("khygopole/NLP_HerbalMultiLabelClassificationModel")


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\KHYLE MATTHEW\.cache\huggingface\token
Login successful


model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]


[A[A[A

[A[A
[A



model.safetensors:   0%|          | 8.19k/541M [00:00<4:34:14, 32.9kB/s]


model.safetensors:   0%|          | 81.9k/541M [00:00<42:29, 212kB/s]   

[A[A
events.out.tfevents.1706877101.LAPTOP-KMPG041102.944.2: 100%|██████████| 7.38k/7.38k [00:00<00:00, 16.7kB/s]
events.out.tfevents.1706866949.LAPTOP-KMPG041102.8084.0: 100%|██████████| 5.70k/5.70k [00:00<00:00, 10.7kB/s]
events.out.tfevents.1706877100.LAPTOP-KMPG041102.944.1: 100%|██████████| 359/359 [00:00<00:00, 620B/s]  
events.out.tfevents.1706875693.LAPTOP-KMPG041102.944.0: 100%|██████████| 6.59k/6.59k [00:00<00:00, 11.0kB/s]
model.safetensors:   1%|          | 3.24M/541M [00:01<01:46, 5.06MB/s]
events.out.tfevents.1706879275.LAPTOP-KMPG041102.944.3: 100%|██████████| 508/508 [00:00<00:00, 1.76kB/s]
training_args.bin: 100%|██████████| 4.73k/4.73k [00:00<00:00, 14.3kB/s]
model.safetensors: 100%|██████████| 541M/541M [01:08<00:00, 7.90MB/s] 





CommitInfo(commit_url='https://huggingface.co/khygopole/working/commit/7ed27e379a1e768999125bc2e3cb3227aba53d3b', commit_message='khygopole/NLP_HerbalMultiLabelClassificationModel', commit_description='', oid='7ed27e379a1e768999125bc2e3cb3227aba53d3b', pr_url=None, pr_revision=None, pr_num=None)