# Teacher Model Training

### imports

In [1]:
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


### load data

#### How to load custom dataset from CSV in Huggingfaces

In [2]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
data_files = {"train": "train.csv", "test": "test.csv", "validation":"validation.csv"}
dataset_dict = load_dataset("./data", data_files=data_files)

In [3]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 700
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 150
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 150
    })
})

### Train Teacher Model

In [4]:
# Load model directly
model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# id2label = {0: "Safe", 1: "Not Safe"}
# label2id = {"Safe": 0, "Not Safe": 1}

id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}
label2id = {'negative': 0, 'neutral': 1, 'positive': 2}

# id2label = {'negative': 0, 'neutral': 1, 'positive': 2}
# label2id = {0: 'negative', 1: 'neutral', 2: 'positive'}

model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=3, 
                                                           id2label=id2label, 
                                                           label2id=label2id,)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze base model

In [5]:
# freeze base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [6]:
# print layers
for name, param in model.named_parameters():
   print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

#### Preprocess text

In [7]:
# define text preprocessing
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
# tokenize all datasetse
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████| 700/700 [00:00<00:00, 14302.83 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 13786.47 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 14295.51 examples/s]


In [9]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

- DataCollatorWithPadding: This collator pads the input samples so that they are all of the same length. For padding,

#### Evaluation

In [10]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    # auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    # return {"Accuracy": acc, "AUC": auc} # ValueError: multi_class must be in ('ovo', 'ovr') (for multiclass AUC)
    return {"Accuracy": acc}

#### Train model

In [11]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    output_dir="output",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0033,0.835815,0.627
2,0.8428,0.686456,0.667
3,0.8048,0.649066,0.727
4,0.739,0.626422,0.747
5,0.7337,0.630984,0.747
6,0.714,0.624891,0.72
7,0.7098,0.611644,0.733
8,0.6808,0.604158,0.747
9,0.6779,0.603422,0.753
10,0.6805,0.602777,0.747


TrainOutput(global_step=880, training_loss=0.7586528951471502, metrics={'train_runtime': 102.375, 'train_samples_per_second': 68.376, 'train_steps_per_second': 8.596, 'total_flos': 95076444610728.0, 'train_loss': 0.7586528951471502, 'epoch': 10.0})

### Apply Model to Validation Dataset

In [13]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': np.float64(0.767)}


In [14]:
model_name = "./bert-classifier_tweets"
trainer.save_model(model_name)

### Push to hub