<a href="https://colab.research.google.com/github/adamhecktman/ML/blob/master/Outcomes_BERTforSequenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch


# Load file and convert pandas series to lists.

In [None]:
# Read the descriptions/outcomes table and pull the two columns out as plain
# Python lists.
# NOTE(review): "descirptions" looks like a typo for "descriptions" -- confirm
# the actual file name on disk before changing the path.
myFile = "/content/descirptions_outcomes.csv"
df = pd.read_csv(myFile)

texts, labels = (df[column].tolist() for column in ('Descriptions', 'Outcomes'))
# Separate copy of the raw outcome values, kept apart from `labels`, which is
# re-encoded further down.
labels_forAccuracy = list(labels)

In [None]:
# Sanity check: number of label entries read from the CSV.
len(labels)

32417

In [None]:
import numpy as np

# NumPy view of the raw (un-encoded) outcome values.
# NOTE(review): `my_array` is not referenced again in the visible notebook.
my_array = np.asarray(labels_forAccuracy)

# Map labels to contiguous integer class indices (the raw labels are already integers, so this step was originally skipped)

In [None]:
# Build the label -> class-index mapping from the *sorted* distinct labels.
# Iteration order of a plain set is not guaranteed, so sorting keeps the
# mapping (and therefore the meaning of each model output index) identical
# from run to run.
unique_labels = sorted(set(labels))
label_map = {label: idx for idx, label in enumerate(unique_labels)}

# Replace every raw label with its class index (0 .. len(unique_labels) - 1).
labels = [label_map[label] for label in labels]

In [None]:
# Count the distinct class indices present after the remapping above.
setLabels = set(labels)
len(setLabels)

293

# Initialize the BERT tokenizer

In [None]:
# Load the tokenizer for 'bert-base-uncased', the same checkpoint name the
# classification model is loaded from later in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Tokenize the Texts


In [None]:
# Tokenize every description in a single call: truncate to max_length=128,
# pad, and return PyTorch tensors ('pt').
encodings = tokenizer(
    texts,
    add_special_tokens = True,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
# len() of the returned object -- the recorded run shows 3, i.e. this is not
# the number of texts.
len(encodings)

3

In [None]:
# Inspect the container type returned by the tokenizer call.
type(encodings)

# Convert labels to tensor

In [None]:
# Convert the list of class indices into a tensor for the dataset.
# NOTE(review): this rebinds `labels` from a list to a tensor; re-running
# earlier cells that expect the list afterwards may behave differently.
labels = torch.tensor(labels)

In [None]:
# Sanity check: shape of the label tensor (one entry per sample).
labels.shape

torch.Size([32417])

# Create the dataset class


In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name
            to an indexable container whose first dimension is the sample
            index.
        labels: Indexable container with one label per sample.

    Raises:
        ValueError: If any encoding field does not hold exactly
            ``len(labels)`` rows.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs instead of returning wrong
        # (or out-of-range) rows later during training.
        n_samples = len(labels)
        for key, val in encodings.items():
            if len(val) != n_samples:
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} rows but labels has {n_samples}"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a ``'labels'`` entry."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.labels)

# Create the dataset using encodings and labels

In [None]:
# Wrap the tokenized inputs and the label tensor in one indexable dataset.
dataset = TextDataset(encodings, labels)


In [None]:
# Sanity check: dataset length (TextDataset reports the number of labels).
len(dataset)

32417

# Split train/validation data

In [None]:
# Hold out 20% of the samples for validation.
train_size = 0.8
n_train = int(train_size * len(dataset))
n_val = len(dataset) - n_train  # remainder, so the two sizes always sum to len(dataset)

# Seed the split so the same samples land in train/validation on every run;
# otherwise metrics are not comparable across kernel restarts.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(42),
)


In [None]:
# Inspect the type random_split returned for the training portion.
type(train_dataset)

In [None]:
# Inspect the type random_split returned for the validation portion.
type(val_dataset)

# Load BertForSequenceClassification

In [None]:
# Pretrained BERT with a classification head sized to the number of distinct
# labels. The recorded output notes that the classifier weights are newly
# initialized, so the model must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(unique_labels),
)

# Training configuration, grouped as: output locations, schedule, batch sizes,
# regularization.
# NOTE(review): the recorded run prompted interactively for a W&B API key when
# training started -- consider configuring experiment reporting explicitly.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
)

# Trainer ties together the model, its configuration and the two splits.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train the model

In [None]:
# Fine-tune the model with the settings in `training_args`.
# NOTE: in the recorded run this cell paused to ask for a W&B API key.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,4.3437
1000,3.1931
1500,2.9495
2000,2.8507
2500,2.7439
3000,2.6262
3500,2.4103
4000,2.36
4500,2.2475
5000,2.2545


TrainOutput(global_step=9726, training_loss=2.37552427103214, metrics={'train_runtime': 2189.8055, 'train_samples_per_second': 35.528, 'train_steps_per_second': 4.441, 'total_flos': 5130814961887488.0, 'train_loss': 2.37552427103214, 'epoch': 3.0})

In [None]:
# Evaluate on the Trainer's eval_dataset (val_dataset). The recorded output
# contains loss and runtime figures only -- no accuracy metric.
results = trainer.evaluate()
print(results)

{'eval_loss': 2.3872010707855225, 'eval_runtime': 44.2533, 'eval_samples_per_second': 146.52, 'eval_steps_per_second': 18.326, 'epoch': 3.0}


In [None]:
# NOTE(review): precision_recall_fscore_support is imported but not used in
# the visible notebook.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get predictions from the model
# (run on the validation split; the result is consumed by the accuracy cell below)
predictions = trainer.predict(val_dataset)


# Evaluate and print results

In [None]:
# prompt: how do i get an accuracy score from transformers.trainer_utils.PredictionOutput

from transformers.trainer_utils import PredictionOutput
import numpy as np

# `predictions` is the object returned by trainer.predict(val_dataset).
# Ground-truth class indices for the validation samples:
true_labels = predictions.label_ids
# Predicted class = index of the highest score along axis 1 of the model outputs.
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Fraction of validation samples whose predicted class matches the true class.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.4487970388648982
