In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from subjects import label2id, id2label

2024-03-28 08:18:51.308298: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-28 08:18:51.350427: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_checkpoint = 'distilbert/distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=8, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset = load_dataset("csv", data_files={"train": "train_dataset_1.csv", "validation": "validation_dataset_1.csv"})

In [4]:
def tokenize(data_to_tokenize):
    text = data_to_tokenize["text"]
    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    return tokenized_inputs
    
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    
tokenized_dataset = dataset.map(tokenize, batched=True)

In [5]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions,
                                          references=labels)}

In [6]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4, # intrinsic rank of trainable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.1, # probablity of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer only
model = get_peft_model(model, peft_config)

lr = 1e-3
batch_size = 4
num_epochs = 8

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.025,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [7]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3139,0.215787,{'accuracy': 0.9625112917795845}
2,0.3162,0.207179,{'accuracy': 0.9561878952122854}
3,0.1723,0.176222,{'accuracy': 0.9715447154471545}
4,0.182,0.173681,{'accuracy': 0.9683830171635049}
5,0.0982,0.165955,{'accuracy': 0.973803071364047}
6,0.0808,0.139311,{'accuracy': 0.9796747967479674}
7,0.046,0.146433,{'accuracy': 0.980126467931346}
8,0.0283,0.143333,{'accuracy': 0.980126467931346}


Trainer is attempting to log a value of "{'accuracy': 0.9625112917795845}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9561878952122854}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9715447154471545}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9683830171635049}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.973803071364047}" of

TrainOutput(global_step=35432, training_loss=0.16083369660953498, metrics={'train_runtime': 552.9524, 'train_samples_per_second': 256.297, 'train_steps_per_second': 64.078, 'total_flos': 4265303822740800.0, 'train_loss': 0.16083369660953498, 'epoch': 8.0})

In [8]:
test_dataset = load_dataset("csv", data_files={"test":"test_dataset_1.csv"})
tester_tokenized_dataset = test_dataset.map(tokenize, batched=True)
trainer.predict(tester_tokenized_dataset["test"])

PredictionOutput(predictions=array([[-20.199799 , -20.917608 ,  26.548565 , ..., -17.77113  ,
        -23.250957 , -27.736086 ],
       [-10.285996 , -13.426138 , -15.403074 , ...,  -5.3500843,
          8.068838 ,  -0.2420347],
       [-14.182167 , -15.964982 , -16.923204 , ...,  17.386711 ,
        -30.5944   , -34.101276 ],
       ...,
       [-21.258785 , -21.433754 , -24.272427 , ..., -15.490054 ,
        -17.568901 , -13.650479 ],
       [ 24.024141 , -18.163757 , -34.81647  , ...,  -9.982034 ,
        -25.596466 , -18.084906 ],
       [-10.436564 ,  -5.757683 , -23.97744  , ..., -11.999895 ,
         12.9152565, -11.71101  ]], dtype=float32), label_ids=array([2, 7, 5, ..., 3, 0, 6]), metrics={'test_loss': 0.22537368535995483, 'test_accuracy': {'accuracy': 0.9724604966139955}, 'test_runtime': 3.7449, 'test_samples_per_second': 591.466, 'test_steps_per_second': 147.933})

In [9]:
model.to('cpu')
text_list = ["how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges"," how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges",
"Who is Alexander the great?", "Preventing heart attacks can be done by many methods",
"Feet are prone to fungal infections", "is a student-centered cultural anthropology mini textbook built with an equity lens. We are excited to share this with you all. This book attempts to address the lack of current, reliable, and relevant resources for introductory anthropology courses that center equity and anti-racism.",
"How would you describe ?", "What is the importance of cells", "In what year was Armenia invdaded?", "Matter can change its state under different conditions. We have solids, like ice and rocks, which have a definite shape and volume. Liquids, such as water and juice, take the shape of their container but maintain a constant volume. Gases, like the air we breathe, have neither a definite shape nor volume and fill the space they occupy.", "The cradle of civilization was in the fertile valleys of rivers like the Tigris and Euphrates in Mesopotamia, the Nile in Egypt, the Indus in the Indian subcontinent, and the Yellow River in China. These early civilizations developed complex societies, writing systems, and technologies that shaped the course of history.", "feet can get fungal infections if not washed"]
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])


Trained model predictions:
--------------------------
how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges - Math
 how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges - Math
Who is Alexander the great? - Social Studies
Preventing heart attacks can be done by many methods - Health
Feet are prone to fungal infections - Science
is a student-centered cultural anthropology mini textbook built with an equity lens. We are excited to share this with you all. This book attempts to address the lack of current, reliable, and relevant resources for introductory anthropology courses that center equity and anti-racism. - Social Studies
How would you describe ? - Social Studies
What is the importance of cells - Science
In what year was Armenia invdaded? - Social Studies
Matter can change its state under different conditions. We have

In [13]:
model.to("cpu")
trainer.save_model("saved-model1")


RuntimeError: [enforce fail at inline_container.cc:365] . invalid file name: ./model/

# 

In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from subjects import label2id, id2label

2024-03-28 08:18:51.308298: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-28 08:18:51.350427: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_checkpoint = 'distilbert/distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=8, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset = load_dataset("csv", data_files={"train": "train_dataset_1.csv", "validation": "validation_dataset_1.csv"})

In [4]:
def tokenize(data_to_tokenize):
    text = data_to_tokenize["text"]
    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    return tokenized_inputs
    
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    
tokenized_dataset = dataset.map(tokenize, batched=True)

In [5]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions,
                                          references=labels)}

In [6]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4, # intrinsic rank of trainable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.1, # probablity of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer only
model = get_peft_model(model, peft_config)

lr = 1e-3
batch_size = 4
num_epochs = 8

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.025,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [7]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3139,0.215787,{'accuracy': 0.9625112917795845}
2,0.3162,0.207179,{'accuracy': 0.9561878952122854}
3,0.1723,0.176222,{'accuracy': 0.9715447154471545}
4,0.182,0.173681,{'accuracy': 0.9683830171635049}
5,0.0982,0.165955,{'accuracy': 0.973803071364047}
6,0.0808,0.139311,{'accuracy': 0.9796747967479674}
7,0.046,0.146433,{'accuracy': 0.980126467931346}
8,0.0283,0.143333,{'accuracy': 0.980126467931346}


Trainer is attempting to log a value of "{'accuracy': 0.9625112917795845}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9561878952122854}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9715447154471545}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9683830171635049}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.973803071364047}" of

TrainOutput(global_step=35432, training_loss=0.16083369660953498, metrics={'train_runtime': 552.9524, 'train_samples_per_second': 256.297, 'train_steps_per_second': 64.078, 'total_flos': 4265303822740800.0, 'train_loss': 0.16083369660953498, 'epoch': 8.0})

In [8]:
test_dataset = load_dataset("csv", data_files={"test":"test_dataset_1.csv"})
tester_tokenized_dataset = test_dataset.map(tokenize, batched=True)
trainer.predict(tester_tokenized_dataset["test"])

PredictionOutput(predictions=array([[-20.199799 , -20.917608 ,  26.548565 , ..., -17.77113  ,
        -23.250957 , -27.736086 ],
       [-10.285996 , -13.426138 , -15.403074 , ...,  -5.3500843,
          8.068838 ,  -0.2420347],
       [-14.182167 , -15.964982 , -16.923204 , ...,  17.386711 ,
        -30.5944   , -34.101276 ],
       ...,
       [-21.258785 , -21.433754 , -24.272427 , ..., -15.490054 ,
        -17.568901 , -13.650479 ],
       [ 24.024141 , -18.163757 , -34.81647  , ...,  -9.982034 ,
        -25.596466 , -18.084906 ],
       [-10.436564 ,  -5.757683 , -23.97744  , ..., -11.999895 ,
         12.9152565, -11.71101  ]], dtype=float32), label_ids=array([2, 7, 5, ..., 3, 0, 6]), metrics={'test_loss': 0.22537368535995483, 'test_accuracy': {'accuracy': 0.9724604966139955}, 'test_runtime': 3.7449, 'test_samples_per_second': 591.466, 'test_steps_per_second': 147.933})

In [9]:
model.to('cpu')
text_list = ["how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges"," how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges",
"Who is Alexander the great?", "Preventing heart attacks can be done by many methods",
"Feet are prone to fungal infections", "is a student-centered cultural anthropology mini textbook built with an equity lens. We are excited to share this with you all. This book attempts to address the lack of current, reliable, and relevant resources for introductory anthropology courses that center equity and anti-racism.",
"How would you describe ?", "What is the importance of cells", "In what year was Armenia invdaded?", "Matter can change its state under different conditions. We have solids, like ice and rocks, which have a definite shape and volume. Liquids, such as water and juice, take the shape of their container but maintain a constant volume. Gases, like the air we breathe, have neither a definite shape nor volume and fill the space they occupy.", "The cradle of civilization was in the fertile valleys of rivers like the Tigris and Euphrates in Mesopotamia, the Nile in Egypt, the Indus in the Indian subcontinent, and the Yellow River in China. These early civilizations developed complex societies, writing systems, and technologies that shaped the course of history.", "feet can get fungal infections if not washed"]
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])


Trained model predictions:
--------------------------
how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges - Math
 how much space the oranges need by themselves, what is  the remaining space, determine how many equal spaces you have between the oranges - Math
Who is Alexander the great? - Social Studies
Preventing heart attacks can be done by many methods - Health
Feet are prone to fungal infections - Science
is a student-centered cultural anthropology mini textbook built with an equity lens. We are excited to share this with you all. This book attempts to address the lack of current, reliable, and relevant resources for introductory anthropology courses that center equity and anti-racism. - Social Studies
How would you describe ? - Social Studies
What is the importance of cells - Science
In what year was Armenia invdaded? - Social Studies
Matter can change its state under different conditions. We have

In [13]:
model.to("cpu")
trainer.save_model("saved-model1")


RuntimeError: [enforce fail at inline_container.cc:365] . invalid file name: ./model/

# 