## Fine Tune the model BERT with my data.
- Takes around 28 minutes with smaller gpt2 version: distilgpt2
- Saves to 'output' file
- Can be very finnicky with torch environment
- Ensure all packages dependencies with transformers are up to date, torch, torch audio etc are up to date.

## 2. Get inputs ready for training

In [1]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments, EvalPrediction
from transformers import TextDataset, DataCollatorForLanguageModeling
import numpy as np

# ! pip install -U accelerate
# ! pip install -U transformers

import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


No GPU available, using the CPU instead.


In [84]:
import pandas as pd
import json

with open("TUSHARKHETE_intents.json", "r") as read_file:
    data = (json.load(read_file))['intents']

labels = []
inputs = []
for item in data:
   labels.append(item['tag'])
   inputs.append(item['patterns'])

print("We have:",len(set(labels)),"labels")

inputs

We have: 54 labels


[['Hi', 'Hey', 'How are you', 'Is anyone there?', 'Hello', 'Good day'],
 ['What to do if Cuts?',
  'How to cure Cuts?',
  'Which medicine to apply for Cuts?',
  'what to apply on cuts?',
  'Cuts'],
 ['how do you treat abrasions?',
  'Do Abrasions cause scars?',
  'Abrasions',
  'what to do if abrasions?',
  'Which medicine to apply for abrasions?',
  'How to cure abrasions?'],
 ['How do you treat Sting?',
  'Stings',
  'What to do if you get a sting?',
  'Which medicine to apply if sting?'],
 ['How to remove Splinters',
  'How to cure Splinters?',
  'What to do if I have splinters?',
  'How do you bring a splinter to the surface?'],
 ['How do you treat a sprain?',
  'what to do if i get a sprain?',
  'Which cream to apply if i get a sprain?',
  'Which medicine to apply if I get a sprain?'],
 ['How do you treat a strain?',
  'what to do if i get a strain?',
  'Which cream to apply if i get a strain?',
  'Which medicine to apply if I get a strain?',
  'How do you diagnose a strain?',
  '

In [92]:

inputs_new = []
labels_new = []

for sentence_list, label in zip(inputs, labels):
  for sentence in sentence_list:
    inputs_new.append(sentence)
    labels_new.append(label)

print(labels_new)
print(inputs_new)

['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'Cuts', 'Cuts', 'Cuts', 'Cuts', 'Cuts', 'Abrasions', 'Abrasions', 'Abrasions', 'Abrasions', 'Abrasions', 'Abrasions', 'stings', 'stings', 'stings', 'stings', 'Splinter', 'Splinter', 'Splinter', 'Splinter', 'Sprains', 'Sprains', 'Sprains', 'Sprains', 'Strains', 'Strains', 'Strains', 'Strains', 'Strains', 'Strains', 'Fever', 'Fever', 'Fever', 'Fever', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Cough', 'Cough', 'Cough', 'Cough', 'Cough', 'Sore Throat', 'Sore Throat', 'Sore Throat', 'Sore Throat', 'Gastrointestinal problems', 'Gastrointestinal problems', 'Gastrointestinal problems', 'Gastrointestinal problems', 'Skin problems', 'Skin problems', 'Skin problems', 'Skin problems', 'Chickenpox', 'Chickenpox', 'Chickenpox', 'Chickenpox', 'Allergies', 'Allergies', 'Allergies', 'Allergies', 'Abdonominal Pain', 'Abdonominal Pain', 'Abdonominal Pain'

#### Encode the labels
So that the model can use them (string not accepted)

In [93]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder
label_encoder = LabelEncoder()
print("Before encoding:",labels_new)
# Encode the labels (text to numbers). NEED THIS FOR evaluation SO REMEMBER TO PICKLE
labels_new = label_encoder.fit_transform(labels_new)
(print("after encoding:", labels_new))

import pickle
# Save the label encoder
try:
    with open("label_encoder.pkl", "wb") as f:
        pickle.dump(label_encoder, f)
except Exception as e:
    print("Error saving pickle file:", e)


Before encoding: ['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'Cuts', 'Cuts', 'Cuts', 'Cuts', 'Cuts', 'Abrasions', 'Abrasions', 'Abrasions', 'Abrasions', 'Abrasions', 'Abrasions', 'stings', 'stings', 'stings', 'stings', 'Splinter', 'Splinter', 'Splinter', 'Splinter', 'Sprains', 'Sprains', 'Sprains', 'Sprains', 'Strains', 'Strains', 'Strains', 'Strains', 'Strains', 'Strains', 'Fever', 'Fever', 'Fever', 'Fever', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Nasal Congestion', 'Cough', 'Cough', 'Cough', 'Cough', 'Cough', 'Sore Throat', 'Sore Throat', 'Sore Throat', 'Sore Throat', 'Gastrointestinal problems', 'Gastrointestinal problems', 'Gastrointestinal problems', 'Gastrointestinal problems', 'Skin problems', 'Skin problems', 'Skin problems', 'Skin problems', 'Chickenpox', 'Chickenpox', 'Chickenpox', 'Chickenpox', 'Allergies', 'Allergies', 'Allergies', 'Allergies', 'Abdonominal Pain', 'Abdonominal Pain', '

In [49]:
# testing_data = ['Hey how are you','How to clean cuts?','how to treat abrasion?',
#                 'what medicine for a sting?','i have splinters', 'what to do if i get a sprain', "what to do if i get a strain",
#                 "what to do if i get a high fever?", "How do you treat nasal Congestio",
#                 "what to do if i get a cough?","what to do if i get a sore throat?",
#                 "I have some gas problems","what to do get a skin allergy?",
#                 "How contagious is chickenpox","what are the symptoms of allergies",
#                 "Which medicine to take if I get a Abdonominal Pain?","what to do if i get a Bruise?",
#                 "what to do if i have a Broken Toe?","How do you treat Choking?","how to treat a wound",
#                 "i've had bad diarrhea",'how to treat frost bite?',
#                 " treat heat exhaustion?", "how do u treat heat stroke?",
#                 "how do you treat insect bites?","how do you treat a nose bleed?",
#                 "how do you treat a pulled muscle?","what is rectal bleeding?",
#                 "sunburn treatment?","what is testicle pain","how to treat vertigo",
#                 "what to treat bleeding","my eye is injured how do i treat it",
#                 "chemical burn treatment?","what is poison treatment","what to do if my teeth broke","i'm having a seizure how to cure","how to treat head injuries?","i keep fainting, how to cure fainting?","how to treat a mild headache?","howto treat a cold?","how should i treat a rash","How should i treat a snake bite","ive got an animal bite","what to do if someone is drowning","How to do CPR","how to treat fracture on someone","anxiety i think i have it","how doi know if im depressed","i want you to diagnose my symptoms for me","what is neutrino about?","Thank you!","Can i make an appointment please","bYE BYE!!"]
# testing_data2 = ['good morning','ive got a bad cut what do i do?','how do you treat abrasion? its painful',
#                 'what medicine should i take for a sting?','i have splinters, help!', 'what to do if i get a ankle sprain', "what to do if i get a strain in my muscle",
#                 "what to do if i get a high fever?", "I've had bad nasal congestions for the past week",
#                 "how do i ease a cough?","I need medicine for a sore throat",
#                 "I have some gas problems","what to do get a skin allergy?",
#                 "How contagious is chickenpox","i have pretty bad allergies",
#                 "I got Pain in my abdomen","what to do if i get a Bruise?",
#                 "what to do if i have Broken my Toe?","How do you treat Choking emergency?","I have a wound",
#                 "what to do if i get diarrhea",'how to treat frost bite?',
#                 " help me treat heat exhaustion?", "I have heat stroke",
#                 "how do you treat insect bites?","how do you treat a nose bleed?",
#                 "how do you treat a pulled muscle?","i have rectal bleeding?",
#                 "sunburn treatment im feeling hot?","I think i have testicular torsion","i've had vertigo",
#                 "how to stop bleeding","my eye is injured how do i treat it",
#                 "chemical burn treatment?","what is poison treatment","i broke a tooth","i always have seizures, how to stop them","how to treat head injuries?","i keep fainting, how to cure fainting?","how to treat a mild headache?","howto treat a cold?","how should i treat a rash","How should i treat a snake bite","ive got an animal bite","what to do if someone is drowning","How to do CPR","how to treat fracture on someone","anxiety i think i have it","how doi know if im depressed","i want you to diagnose my symptoms for me","what is neutrino about?","Thank you!","Can i make an appointment please","bYE BYE!!"]

# testing_data = np.asarray(testing_data)
# # with open('label_encoder.pkl', 'rb') as f:
# #     label_encoder = pickle.load(f)

# testing_labels = label_encoder.transform(labels)
# testing_data = np.append(testing_data,testing_data2)
# testing_labels = np.append(testing_labels,testing_labels)

# testing_labels

EOFError: Ran out of input

In [8]:
from transformers import BertTokenizer, AdamW
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in inputs_new:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', inputs_new[2])
print('Token IDs:', input_ids[2])
print('Max sentence length: ', max([len(sen) for sen in input_ids]))


Original:  How are you
Token IDs: [101, 2129, 2024, 2017, 102]
Max sentence length:  16


In [9]:

from keras.preprocessing.sequence import pad_sequences


MAX_LEN = 16

#Padding the input to the max length that is 16
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

In [10]:
# Creating the attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:

    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

len(input_ids)

239

#### Split data and assign to trainer

In [12]:

from sklearn.model_selection import train_test_split

# Split the dataset into training and validation sets (adjust test_size as needed)
train_inputs, input_val,validation_inputs, validation_labels = train_test_split(input_ids,labels_new, test_size=0.25, random_state=42)
train_masks, validation_masks,_, _ = train_test_split(attention_masks, labels_new,random_state=42, test_size=0.25)


In [13]:
#Converting the input data to the tensor , which can be feeded to the model
train_inputs = torch.tensor(input_train)
validation_inputs = torch.tensor(input_val)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


In [14]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [16]:

from transformers import BertTokenizer, AdamW
#Creating the DataLoader which will help us to load data into the CPU in my case

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 54,
    output_attentions = False,
    output_hidden_states = False, )

# AdamW is an optimizer which is a Adam Optimzier with weight-decay-fix
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8 )


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 15

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 100, # Default value in run_glue.py
                                            num_training_steps = total_steps)




Define a helper code for accuracy

In [18]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## 3. Train model

In [19]:
#Let's start the training process

import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 20 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # The call to `model` always returns a tuple, so we need to pull the
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================


    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to CPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1


    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))


print("")
print("Training complete!")


Training...

  Average training loss: 4.06
  Training epoch took: 0:01:00

Running Validation...
  Accuracy: 0.02
  Validation took: 0:00:03

Training...

  Average training loss: 4.01
  Training epoch took: 0:00:56

Running Validation...
  Accuracy: 0.02
  Validation took: 0:00:03

Training...

  Average training loss: 4.01
  Training epoch took: 0:00:54

Running Validation...
  Accuracy: 0.00
  Validation took: 0:00:04

Training...

  Average training loss: 3.98
  Training epoch took: 0:00:56

Running Validation...
  Accuracy: 0.05
  Validation took: 0:00:04

Training...

  Average training loss: 3.94
  Training epoch took: 0:00:55

Running Validation...
  Accuracy: 0.05
  Validation took: 0:00:03

Training...

  Average training loss: 3.87
  Training epoch took: 0:00:56

Running Validation...
  Accuracy: 0.04
  Validation took: 0:00:03

Training...

  Average training loss: 3.75
  Training epoch took: 0:00:56

Running Validation...
  Accuracy: 0.09
  Validation took: 0:00:03

Train

In [21]:
model_path = "./gdrive/MyDrive/bert_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# from google.colab import drive
# drive.mount('/gdrive')
# !cp -r output /gdrive/MyDrive

Mounted at /gdrive
cp: cannot stat 'output': No such file or directory


In [None]:
from transformers import BertForSequenceClassification
from google.colab import drive
drive.mount('/content/drive')
model = BertForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer alongside the model (if applicable)
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(model_path)

def compute_metrics(p: EvalPrediction):
    logits = p.predictions
    labels = p.label_ids
    probabilities = softmax(logits, axis=-1)
    loss = log_loss(labels.flatten(), probabilities.reshape(-1, probabilities.shape[-1]), labels=[i for i in range(logits.shape[-1])])
    perplexity = np.exp(loss)
    return {"perplexity": perplexity}

## Evaluation


In [None]:
sentences = testing_data
labels = testing_labels


# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )

    input_ids.append(encoded_sent)

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.
batch_size = 16

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
#Evaluating our model on the test set

# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables
predictions , true_labels = [], []

# Predict
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  # Telling the model not to compute or store gradients, saving memory and
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

In [None]:
from sklearn.metrics import matthews_corrcoef
# put labels into array (they stored in arrays of batches)
predslabels = []
matthews_set = []

# Evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

# For each input batch...
for i in range(len(true_labels)):

  # The predictions for this batch are a 2-column ndarray (one column for "0"
  # and one column for "1"). Pick the label with the highest value and turn this
  # in to a list of 0s and 1s.
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  preslabels = predslabels.append(pred_labels_i)
  # Calculate and store the coef for this batch.
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
  matthews_set.append(matthews)

In [None]:
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]

# Calculate the MCC
flat_true_labels = np.asarray(flat_true_labels)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

# print(multilabel_confusion_matrix(flat_true_labels, flat_predictions))
# print(accuracy_score(flat_true_labels, flat_predictions))

print(flat_true_labels)
print(flat_predictions)
# cm = confusion_matrix(flat_true_labels, flat_predictions)

# cm_disp = ConfusionMatrixDisplay(cm, display_labels=clf.classes_)
#     # Needs matplotlib installed to run
# cm_disp.plot()
# plt.show()
# cm_disp.plot()

## 2. Use model

In [22]:
def preprocess_generation(generated_response):
  #Find the position of A in assistant and add 10 characters
  assistant_index = generated_response.find("Assistant:") + 10

  # Check if 'ASSISTANT:' is found in the text and the text starts with "Assistant"
  if assistant_index != -1:
      # Remove everything before ':', also removes repetition of user input
      generated_text = generated_response[assistant_index+1:]
      # do recursively until no assistant index is found? if it keeps printing multiple

  # given that string 'User:' comes after the desired response
  user_index = generated_text.find("User:")
  if user_index != -1:
      generated_text = generated_text[:user_index]


  return generated_text

'TESTING COOL BEANS NOT SO COOL. '

In [90]:
def get_prediction(text,tokenizer,model):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Your text for classification
  text = "What to do if Cuts?"

  # Encode the sentence with special tokens and padding
  encoded_dict = tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=64,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors="pt",
  )

  # Extract input IDs and attention mask
  input_ids = encoded_dict["input_ids"]
  attention_mask = encoded_dict["attention_mask"]

  # Pass the input through the model
  with torch.no_grad():
    outputs = model(**{"input_ids": input_ids, "attention_mask": attention_mask})

  # Get logits (prediction scores) for each class
  logits = outputs.logits.squeeze(0)  # Remove batch dimension

  # Get the predicted class label (argmax)
  predicted_class = torch.argmax(logits).item()
  pred = [predicted_class]
  prediction = label_encoder.inverse_transform(pred)[0]
  return prediction




'Cuts'

In [94]:
with open('label_encoder.pkl', 'rb') as file:
  try:
    # Attempt to load the label encoder
    loaded_encoder = pickle.load(file)
    print("Label encoder loaded successfully!")
  except EOFError:
    print("Error: Pickle file seems to be empty.")
  except pickle.UnpicklingError as e:
    print("Error loading pickle file:", e)

# Load the model
model = BertForSequenceClassification.from_pretrained("bert_model")
tokenizer = BertTokenizer.from_pretrained("bert_model")
print(get_prediction(user_input,tokenizer,model))


Label encoder loaded successfully!


'Symptoms'

In [96]:
!cp label_encoder.pkl /gdrive/MyDrive