In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from subjects import label2id, id2label
from sklearn.metrics import precision_score, recall_score


2024-03-28 19:57:31.771790: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-28 19:57:31.814385: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Base checkpoint that gets a fresh classification head for the subject labels.
model_checkpoint = 'distilbert/distilbert-base-uncased'

# Size the head from the label map rather than a hard-coded 8, so the number of
# output classes can never drift out of sync with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Collator used by the Trainer to pad each batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# CSV files for each split. Train and validation are loaded together; the
# held-out test split is loaded into its own object.
train_val_files = {
    "train": "train_dataset_1.csv",
    "validation": "validation_dataset_1.csv",
}
test_files = {"test": "test_dataset_1.csv"}

dataset = load_dataset("csv", data_files=train_val_files)
test_dataset = load_dataset("csv", data_files=test_files)

In [4]:
def tokenize(data_to_tokenize):
    """Tokenize the "text" entry of a batch for sequence classification.

    Args:
        data_to_tokenize: Mapping with a "text" entry. It is called through
            ``dataset.map(tokenize, batched=True)``, so "text" holds a batch.

    Returns:
        The tokenizer's output for the batch, truncated to at most 512 tokens
        and left as plain (unpadded) sequences.
    """
    text = data_to_tokenize["text"]
    # Truncate from the left so the END of an over-long text is what survives.
    tokenizer.truncation_side = "left"
    # Deliberately no return_tensors here: the sequences in a batch have
    # different lengths, so converting them to a NumPy array would produce a
    # ragged array. Padding and tensor conversion happen later, per batch, in
    # the data collator.
    return tokenizer(text, truncation=True, max_length=512)
    
# Batch padding needs a pad token: add one, and resize the embedding matrix to
# the new vocabulary size, only when the tokenizer does not already define it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Tokenize both dataset objects in batches (train/validation first, then test).
tokenized_dataset, tester_tokenized_dataset = (
    split_group.map(tokenize, batched=True)
    for split_group in (dataset, test_dataset)
)

In [5]:
# Metric objects from `evaluate`, used by compute_metrics.
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))
def compute_metrics(p):
    """Score one evaluation pass for the Trainer.

    Args:
        p: Pair of (model outputs, reference labels). The outputs are reduced
            to one class id per row with an argmax over axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1". Each value is exactly what the
        corresponding metric's ``compute`` returns, which is itself a dict
        (e.g. ``{'accuracy': 0.95}``), so consumers index twice:
        ``metrics['accuracy']['accuracy']``. F1 uses weighted averaging.
    """
    logits, references = p
    predicted_ids = np.argmax(logits, axis=1)

    accuracy_result = accuracy.compute(
        predictions=predicted_ids, references=references
    )
    f1_result = f1.compute(
        predictions=predicted_ids, references=references, average="weighted"
    )
    return {"accuracy": accuracy_result, "f1": f1_result}
 

In [6]:
# Optimisation hyperparameters.
lr = 1e-3
batch_size = 4
num_epochs = 8

# LoRA adapter configuration: only the query projections are adapted.
peft_config = LoraConfig(
    task_type="SEQ_CLS",       # sequence classification
    r=4,                       # rank of the low-rank update matrices
    lora_alpha=32,             # scaling factor applied to the LoRA update
    lora_dropout=0.1,          # dropout probability on the LoRA path
    target_modules=['q_lin'],  # apply LoRA to the query layers only
)
model = get_peft_model(model, peft_config)

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded at the end.
training_args = TrainingArguments(
    output_dir="distilbert",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.025,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [7]:
# Create the Trainer that fine-tunes and evaluates the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,      # pads each batch when it is assembled
    compute_metrics=compute_metrics,  # accuracy + weighted F1 at every evaluation
)

# NOTE: no separate DataLoaderConfiguration object is built here. That name is
# not imported anywhere in this notebook (it raised NameError on a fresh kernel)
# and the resulting `dataloader_config` was never read by any cell.


In [8]:

def run_test_set(trainer_obj):
    """Evaluate on the held-out test split and print a per-class report.

    Prints throughput, overall accuracy, and then precision and recall for
    every class in ``id2label``.

    Args:
        trainer_obj: Object exposing ``predict(dataset)`` whose result carries
            ``predictions``, ``label_ids`` and ``metrics`` (a Trainer here).
    """
    test_res = trainer_obj.predict(tester_tokenized_dataset["test"])
    test_preds = np.argmax(test_res.predictions, axis=1)
    test_labels = test_res.label_ids

    # Score every known class id explicitly and in a fixed order. Without
    # `labels=`, the score arrays only cover classes that actually occur, so
    # pairing them with id2label by position can misalign or run out of range.
    class_ids = sorted(id2label)
    # zero_division=0 keeps the value (0.0) for never-predicted classes but
    # silences the undefined-metric warning.
    precision = precision_score(
        test_labels, test_preds, labels=class_ids, average=None, zero_division=0
    )
    recall = recall_score(
        test_labels, test_preds, labels=class_ids, average=None, zero_division=0
    )

    # compute_metrics may report accuracy as a nested dict or as a plain number.
    accuracy_value = test_res.metrics['test_accuracy']
    if isinstance(accuracy_value, dict):
        accuracy_value = accuracy_value['accuracy']

    print(f"Queries Per Second: : {test_res.metrics['test_samples_per_second']}")
    print(f"Accuracy: {accuracy_value}")
    print("\nPercision per class")
    for class_id, score in zip(class_ids, precision):
        print(f"{id2label[class_id]}: {score}")
    print("\nRecall per class")
    for class_id, score in zip(class_ids, recall):
        print(f"{id2label[class_id]}: {score}")
# Baseline report: this runs before trainer.train(), so it scores the model
# before any fine-tuning.
print("Base Model Test Metrics", "-----------------------", sep="\n")

run_test_set(trainer)

Base Model Test Metrics
-----------------------


Queries Per Second: : 532.519
Accuracy: 0.1399548532731377

Percision per class
Math: 0.0
Science: 0.36363636363636365
Language: 0.13707451701931922
Physical Education: 0.0
Social Studies: 0.0
Health: 0.0
Computers: 0.0
Leadership: 0.0

Recall per class
Math: 0.0
Science: 0.04067796610169491
Language: 1.0
Physical Education: 0.0
Social Studies: 0.0
Health: 0.0
Computers: 0.0
Leadership: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Fine-tune the model. Per training_args, evaluation and checkpointing run once
# per epoch and the best checkpoint is loaded back when training finishes.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2669,0.221021,{'accuracy': 0.95483288166215},{'f1': 0.954845659695672}
2,0.2784,0.234417,{'accuracy': 0.9561878952122854},{'f1': 0.9563375370779424}
3,0.2195,0.185548,{'accuracy': 0.9697380307136405},{'f1': 0.9697921271737132}
4,0.1739,0.161759,{'accuracy': 0.9751580849141824},{'f1': 0.9751018872206434}
5,0.0995,0.174551,{'accuracy': 0.975609756097561},{'f1': 0.9755642047287072}
6,0.0975,0.151179,{'accuracy': 0.979223125564589},{'f1': 0.9792226762516094}
7,0.0426,0.145422,{'accuracy': 0.9805781391147245},{'f1': 0.9805558656038241}
8,0.0277,0.14287,{'accuracy': 0.981029810298103},{'f1': 0.9810119008459938}


Trainer is attempting to log a value of "{'accuracy': 0.95483288166215}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.954845659695672}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9561878952122854}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.9563375370779424}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9697380307136405}" of type <class 'dict'> for k

TrainOutput(global_step=35432, training_loss=0.1648490309419199, metrics={'train_runtime': 552.3501, 'train_samples_per_second': 256.576, 'train_steps_per_second': 64.148, 'total_flos': 4265303822740800.0, 'train_loss': 0.1648490309419199, 'epoch': 8.0})

In [11]:
# Same test-set report as the baseline, now on the fine-tuned model.
print("Post Fine Tuning Test Metrics", "-----------------------------", sep="\n")
run_test_set(trainer)

Post Fine Tuning Test Metrics
-----------------------------


Queries Per Second: : 608.236
Accuracy: 0.9769751693002258

Percision per class
Math: 0.9938650306748467
Science: 0.9795918367346939
Language: 0.9736842105263158
Physical Education: 0.9856459330143541
Social Studies: 0.9935691318327974
Health: 0.9642857142857143
Computers: 0.9495798319327731
Leadership: 0.9688888888888889

Recall per class
Math: 0.9938650306748467
Science: 0.976271186440678
Language: 0.9932885906040269
Physical Education: 0.9809523809523809
Social Studies: 0.9809523809523809
Health: 0.9737704918032787
Computers: 0.9783549783549783
Leadership: 0.9276595744680851


In [None]:
# Spot-check the fine-tuned model on a few hand-written texts, on CPU.
model.to('cpu')
model.eval()  # inference mode: turns dropout off so predictions are deterministic

text_list = [
    "how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges",
    " how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges",
    "Who is Alexander the great?",
    "Preventing heart attacks can be done by many methods",
    "is a student-centered cultural anthropology mini textbook built with an equity lens. We are excited to share this with you all. This book attempts to address the lack of current, reliable, and relevant resources for introductory anthropology courses that center equity and anti-racism.",
    "How would you describe ?",
    "What is the importance of cells",
    "In what year was Armenia invdaded?",
    "Matter can change its state under different conditions. We have solids, like ice and rocks, which have a definite shape and volume. Liquids, such as water and juice, take the shape of their container but maintain a constant volume. Gases, like the air we breathe, have neither a definite shape nor volume and fill the space they occupy.",
    "The cradle of civilization was in the fertile valleys of rivers like the Tigris and Euphrates in Mesopotamia, the Nile in Egypt, the Indus in the Indian subcontinent, and the Yellow River in China. These early civilizations developed complex societies, writing systems, and technologies that shaped the course of history.",
]

print("Trained model predictions:")
print("--------------------------")
# No gradients are needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    for text in text_list:
        # Same truncation limit as in training, so an over-long text cannot
        # exceed the model's maximum input length.
        inputs = tokenizer.encode(
            text, return_tensors="pt", truncation=True, max_length=512
        )

        logits = model(inputs).logits
        predictions = logits.argmax(dim=1)

        print(text + " - " + id2label[predictions.item()])


In [None]:
# Write the trainer's current model to the "saved-model1" directory.
# NOTE(review): `model` is PEFT-wrapped at this point, so this presumably stores
# the LoRA adapter rather than full model weights — confirm before loading elsewhere.
trainer.save_model("saved-model1")

# 