In [1]:
print('Hello Google')
!pip install evaluate
#!pip install accelerate -U
#!pip install transformers[torch]
#!pip install torchinfo
##import torch

##torch.__version__

Hello Google


In [None]:
import warnings

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

import os

import torch
import torch._dynamo

# Tell TorchDynamo to suppress compilation errors instead of raising them.
# Relevant because train_classifier() enables torch_compile=True.
torch._dynamo.config.suppress_errors = True

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torchinfo import summary

import evaluate

# Metric objects loaded once at module level.
metric_acc = evaluate.load("accuracy")  # default `metric` argument of compute_metrics1()
metric_f1 = evaluate.load("f1")
import numpy as np

# Module-level tokenizer read by tokenize() and train_classifier().
# It is a placeholder here and is assigned in a later cell via get_tokenizer().
tokenizer = None

# Batch tokenization helper (passed to Dataset.map in train_classifier)
def tokenize(batch):
    """Tokenize the ``text`` entry of ``batch`` with the module-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` key holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for those texts, called with padding
        enabled, truncation enabled and ``max_length=128``.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=128)
    return encoded


def get_tokenizer(model_path):
    """Return the pretrained tokenizer that ``AutoTokenizer`` resolves for ``model_path``.

    Args:
        model_path: Checkpoint name or path handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer instance.
    """
    return AutoTokenizer.from_pretrained(model_path)


def compute_metrics1(eval_pred, metric=metric_acc):
    """Score predictions with a single ``evaluate`` metric.

    Args:
        eval_pred: Pair ``(scores, labels)``; ``scores`` is reduced to class
            ids with an argmax over axis 1.
        metric: Object exposing ``compute(predictions=..., references=...)``.
            Defaults to the module-level accuracy metric.

    Returns:
        The value returned by ``metric.compute``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)


def compute_metrics(pred):
    """
    Score a prediction object with accuracy and macro-averaged precision, recall and F1.

    Args:
        pred (obj): Object exposing two attributes:
            - label_ids (array-like): The true class labels.
            - predictions (array-like): Per-class scores for each observation.
              The predicted class is the argmax over the last axis, so any
              monotone score works (presumably raw logits when called by
              Trainer; verify against caller).

    Returns:
        dict: Four entries, in this order:
            - 'Accuracy': result of sklearn's accuracy_score.
            - 'F1': macro-averaged F1.
            - 'Precision': macro-averaged precision.
            - 'Recall': macro-averaged recall.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # average='macro' scores each class separately and then averages them;
    # the fourth return value (support) is not needed here.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )

    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': macro_f1,
        'Precision': macro_precision,
        'Recall': macro_recall,
    }


def train_classifier(model_path: str,
                     dataset,
                     output_dir="output",
                     train_batch_size=16,
                     eval_batch_size=8,
                     learning_rate= 5e-7, #1.25e-5
                     num_epochs=10,
                     metric_for_best_model="accuracy"
                     ):
    """Fine-tune a two-class (NON_INFOSEC / INFOSEC) sequence classifier.

    The function reads the module-level ``tokenizer`` (directly, and through
    ``tokenize``), so that global must be assigned before calling.

    Args:
        model_path: Checkpoint name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        dataset: Dataset container with ``"train"`` and ``"test"`` splits and
            ``"text"`` / ``"label"`` columns. The ``"label"`` column is renamed
            to ``"labels"`` and ``"text"`` is dropped after tokenization.
        output_dir: Directory handed to ``TrainingArguments``.
        train_batch_size: Per-device training batch size.
        eval_batch_size: Per-device evaluation batch size.
        learning_rate: Learning rate handed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        metric_for_best_model: Accepted for interface compatibility but NOT
            forwarded to ``TrainingArguments`` (the forwarding line is disabled
            below), so best-checkpoint selection uses Trainer's own default.

    Returns:
        tuple: ``(model, trainer)`` after ``trainer.train()`` has completed.

    Raises:
        ValueError: If the module-level ``tokenizer`` has not been set.
    """
    # Fail early with a clear message instead of an obscure error inside map().
    if tokenizer is None:
        raise ValueError(
            "Module-level `tokenizer` is not set; assign it "
            "(e.g. tokenizer = get_tokenizer(model_path)) before training."
        )

    dataset = dataset.rename_column("label", "labels")  # to match Trainer
    print(dataset)
    tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
    print(tokenized_dataset["train"].features.keys())

    # Prepare model labels - useful for inference
    num_labels = 2
    id2label = {0: "NON_INFOSEC", 1: "INFOSEC"}
    label2id = {"NON_INFOSEC": 0, "INFOSEC": 1}

    # Fine-tune & evaluate
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
        hidden_dropout_prob=0.3,
        attention_probs_dropout_prob=0.25
    )

    # Re-store every parameter as a contiguous tensor.
    # NOTE(review): presumably a workaround for checkpoint saving with
    # non-contiguous weights - confirm it is still needed.
    for param in model.parameters():
        param.data = param.data.contiguous()

    print(" ############ Model Summary ######")
    # Only move to the GPU when one exists, so the function no longer
    # hard-fails on CPU-only runtimes.
    if torch.cuda.is_available():
        model = model.cuda()
    print(model)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        learning_rate=learning_rate,
        lr_scheduler_type='linear', #constant constant_with_warmup
        warmup_steps=0,
        num_train_epochs=num_epochs,
        torch_compile=True,  # optimizations
        optim="adamw_torch",  # improved optimizer
        # logging & evaluation strategies
        # logging_dir=f"{repository_id}/logs",
        logging_strategy="steps",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        weight_decay=0.00, # prevent overfitting default 0.01
        #fp16=True,
        save_total_limit=2,
        load_best_model_at_end=True,
        # Deliberately left disabled: compute_metrics() reports the key
        # 'Accuracy', which the default "accuracy" would not match.
        # NOTE(review): decide on a metric name before enabling.
        #metric_for_best_model=metric_for_best_model,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return model, trainer



In [None]:
from datasets import load_dataset

def prepare_trainingset(dataset_file_path:str, test_size=0.25, seed=None):
    """Load a CSV file and split it into shuffled train/test subsets.

    Args:
        dataset_file_path: Path of the CSV file given to ``load_dataset``.
        test_size: Test-split size forwarded to ``train_test_split``.
        seed: Optional seed for the shuffle. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for a reproducible split.

    Returns:
        The result of ``train_test_split`` applied to the CSV's ``'train'``
        split (a container with ``train`` and ``test`` subsets).
    """
    # load_dataset("csv", ...) places the whole file under the 'train' key.
    loaded = load_dataset("csv", data_files=dataset_file_path)
    return loaded['train'].train_test_split(test_size=test_size, shuffle=True, seed=seed)

In [None]:
# Base checkpoint to fine-tune; an alternative checkpoint is left commented out.
model_path = "allenai/scibert_scivocab_uncased" #"anferico/bert-for-patents"
# Assign the module-level tokenizer that tokenize() and train_classifier() read.
tokenizer = get_tokenizer(model_path)


# Load the CSV and split it into train/test using prepare_trainingset's defaults.
dataset = prepare_trainingset('info_sec_training_dataset_0_1.csv')


In [None]:
# Fine-tune for 10 epochs; every other hyperparameter uses train_classifier's defaults.
model, trainer = train_classifier(model_path, dataset, num_epochs=10)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 4875
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 1625
    })
})


Map:   0%|          | 0/4875 [00:00<?, ? examples/s]

Map:   0%|          | 0/1625 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.


 ############ Model Summary ######
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.25, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprince-donnii[0m ([33mprince-donnii-fuiz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W0727 18:41:51.291000 171 torch/_inductor/utils.py:1137] [0/0_1] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6544,0.639646,0.601846,0.493343,0.720789,0.565953
2,0.5599,0.507499,0.797538,0.785164,0.831782,0.782606
3,0.4438,0.359429,0.884923,0.88308,0.888277,0.880661
4,0.3394,0.271259,0.905231,0.90463,0.904202,0.905157
5,0.2796,0.239315,0.904,0.903534,0.902833,0.904778
6,0.2515,0.221345,0.907692,0.907195,0.906551,0.908176
7,0.2422,0.214163,0.909538,0.909026,0.908429,0.909874
8,0.2462,0.208296,0.913231,0.912723,0.912161,0.913487
9,0.2126,0.209311,0.913231,0.912817,0.91209,0.914133
10,0.2144,0.209233,0.913231,0.912846,0.912106,0.914348


In [None]:
# Report the parameter count of the fine-tuned model.
print(model.num_parameters())

109920002


In [None]:
# Persist the fine-tuned model together with its tokenizer
def save_model(model_dir_path:str, trainer, tokenizer):
    """Write the trainer's model and the tokenizer into one directory.

    Args:
        model_dir_path: Target directory passed to both save calls.
        trainer: Object exposing ``save_model(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.

    Returns:
        None. Prints a confirmation line once both saves have run.
    """
    target_dir = model_dir_path
    # Model first, then tokenizer - same order as before.
    trainer.save_model(target_dir)
    tokenizer.save_pretrained(target_dir)
    print("Model is saved ..")

# Save the model and tokenizer under the relative directory "infosec_model".
save_model("infosec_model", trainer, tokenizer)

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TextClassificationPipeline, pipeline
import pandas as pd

import evaluate
from evaluate import evaluator
from datasets import Dataset

def evaluate_model(test_data_path, model_path, n_resamples=100, random_state=None):
    """Evaluate a saved text-classification model on a CSV test set.

    Accuracy and F1 are computed in two separate evaluator runs over the same
    pipeline and data, each using the bootstrap strategy.

    Args:
        test_data_path: Path of the CSV file read with ``pandas.read_csv``.
            NOTE(review): the evaluator is called without column overrides, so
            the CSV presumably needs ``text`` and ``label`` columns - confirm.
        model_path: Model name or directory passed to ``pipeline``.
        n_resamples: Number of bootstrap resamples per metric (default 100,
            the value that was previously hard-coded).
        random_state: Optional seed forwarded to the evaluator's bootstrap.
            The default ``None`` keeps the previous unseeded behaviour.

    Returns:
        tuple: ``(acc_result, f1_result)``, the two dictionaries returned by
        the evaluator for the accuracy and F1 metrics respectively.
    """
    pipe = pipeline(
        "text-classification", model=model_path, max_length=128
    )

    # Define dataset
    test_data = pd.read_csv(test_data_path)
    test_dataset = Dataset.from_pandas(test_data)

    # Named `task_evaluator` rather than `eval` so the builtin is not shadowed.
    task_evaluator = evaluator("text-classification")

    # Maps the pipeline's string labels onto the integer labels in the data.
    label_mapping = {"NON_INFOSEC": 0, "INFOSEC": 1}

    def _bootstrap_score(metric_name):
        """Run one bootstrap evaluation for the named ``evaluate`` metric."""
        return task_evaluator.compute(
            model_or_pipeline=pipe,
            data=test_dataset,
            metric=evaluate.load(metric_name),
            label_mapping=label_mapping,
            strategy="bootstrap",
            n_resamples=n_resamples,
            random_state=random_state,
        )

    # Same order as before: accuracy first, then F1.
    acc_result = _bootstrap_score("accuracy")
    f1_result = _bootstrap_score("f1")

    return acc_result, f1_result

In [None]:
# NOTE(review): this evaluates "plasma_model" on a plasma test CSV, whereas the
# save cell in this notebook writes the fine-tuned model to "infosec_model".
# Confirm these paths are the intended ones.
acc, f = evaluate_model("plasma_test_data_annotated_.csv", "plasma_model")

In [None]:
# Display the accuracy result returned by evaluate_model.
acc

{'accuracy': {'confidence_interval': (0.9467419350565625, 0.9676031733089774),
  'standard_error': 0.005530670650744364,
  'score': 0.9567567567567568},
 'total_time_in_seconds': 256.0170243320001,
 'samples_per_second': 5.05825736932501,
 'latency_in_seconds': 0.1976965438857144}

In [None]:
# Display the F1 result returned by evaluate_model.
f

{'f1': {'confidence_interval': (0.9512315072825834, 0.9706627504041747),
  'standard_error': 0.005368977706792648,
  'score': 0.9622641509433962},
 'total_time_in_seconds': 251.65904402599995,
 'samples_per_second': 5.145851225065482,
 'latency_in_seconds': 0.1943313081281853}