# Train iid model on a subset and compare finetune and distil on a successive subset

* Load data of 2006, 2007 and 2008 subsets

In [1]:
import sys
sys.path.append("..")

import pandas as pd
pd.options.mode.chained_assignment = None

from data_processor.data_loader import split_set


train_set1 = "2006"
train_set2 = "2007"
test_set = "2008"

ds_size = "subset"
label_col_name = '18'
label_col_pos_val = '1'

# We only keep features 0 to 13
cols = [str(i) for i in range(14)]

def prepare_set(year, ds_size, is_test_set=False):
    keep_cols = cols

    if is_test_set:
        keep_cols += [label_col_name, ]

    df_year_path = f"../datasets/Kyoto-2016_AnoShift/{ds_size}/{year}_{ds_size}.parquet"
    print("Loading set:", df_year_path)

    df_year = pd.read_parquet(df_year_path)
    df_year = df_year.drop(columns=list(set(df_year.columns) - set(keep_cols)))
    return df_year


df_set1 = prepare_set(train_set1, ds_size)
df_set2 = prepare_set(train_set2, ds_size)
df_test = prepare_set(test_set, ds_size, is_test_set=True)


# Split test set in inliers and outliers
df_test_inlier, df_test_outlier = split_set(
    df_test, label_col_name=label_col_name, label_col_pos_val=label_col_pos_val
)

df_test = [(test_set, df_test_inlier, df_test_outlier),]

Loading set: ../datasets/Kyoto-2016_AnoShift/subset/2006_subset.parquet
Loading set: ../datasets/Kyoto-2016_AnoShift/subset/2007_subset.parquet
Loading set: ../datasets/Kyoto-2016_AnoShift/subset/2008_subset.parquet


* Load pretrained tokenizer and tokenize the sets

In [2]:
from transformers import PreTrainedTokenizerFast

tokenizer_path = '../saved_tokenizers/kyoto-2016.json'
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
tokenizer.add_special_tokens(
    {"pad_token": "[PAD]", "unk_token": "[UNK]", "mask_token": "[MASK]"}
)

0

* Prepare train and test datasets from the dataframes

In [3]:
from language_models.data_utils import prepare_train_ds, prepare_test_ds

lm_ds_set1 = prepare_train_ds(
    df_train=df_set1, tokenizer=tokenizer, block_size=len(cols)
)

lm_ds_set2 = prepare_train_ds(
    df_train=df_set2, tokenizer=tokenizer, block_size=len(cols)
)

ds_test = prepare_test_ds(
    dfs_test=df_test, tokenizer=tokenizer, block_size=len(cols)
)

Mapping tokenizer on train
Mapped tokenizer on train
Mapping tokenizer on train
Mapped tokenizer on train


Configure IID model to train on set1 and finetune on set2

In [4]:
from language_models.model_utils import configure_model, train_model
from copy import deepcopy

architecture = 'bert'
pretrained = False
vocab_size = len(tokenizer.get_vocab())
bs_train = 256
bs_eval = 256
num_epochs = 5

model_iid = configure_model(
        architecture=architecture,
        pretrained=pretrained,
        small=True,
        vocab_size=vocab_size,
        tokenizer=tokenizer,
        embed_size=len(cols)
    )

print("Training iid model on set1")
train_model(
    model=model_iid,
    tokenizer=tokenizer,
    ds_name='kyoto-2016',
    train_set_name=f'{train_set1}',
    run_name='iid',
    lm_ds_train=lm_ds_set1,
    lm_ds_eval=ds_test[0][1]['inlier'],
    dss_test=ds_test,
    save_model_path='/tmp/',
    batch_size_train=bs_train,
    batch_size_eval=bs_eval,
    num_epochs=num_epochs,
    tb_writer=None
)



***** Running training *****
  Num examples = 466774
  Num Epochs = 5
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 9120


Training iid model on set1
Training started...


Epoch,Training Loss,Validation Loss
1,4.8429,3.726619
2,2.871,2.695308
3,2.2109,2.399026
4,1.9973,2.258456
5,1.8793,2.179849


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_1.0/config.json
Model weights saved in /tmp/_1.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 1.0: 3.726618766784668


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_2.0/config.json
Model weights saved in /tmp/_2.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 2.0: 2.695307731628418


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_3.0/config.json
Model weights saved in /tmp/_3.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 3.0: 2.3990261554718018


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_4.0/config.json
Model weights saved in /tmp/_4.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 4.0: 2.258456230163574


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_5.0/config.json
Model weights saved in /tmp/_5.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 5.0: 2.179849147796631
{'inlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300000
}), 'outlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 74710
})}


100%|██████████| 1170/1170 [02:27<00:00,  7.95it/s]


Class: inlier Anomaly score: 0.5905607052171541


100%|██████████| 290/290 [00:36<00:00,  8.01it/s]


Class: outlier Anomaly score: 0.580146057729217
ROC AUC       2008: 0.4549984012224876
AUCPR INLIER  2008: 0.7712426840717375
AUCPR OUTLIER 2008: 0.1944036253780393
F1 INLIER 2008: 0.8897796663051393
F1 OUTLIER 2008: 0.33142857142857146


  2 * (precision_inlier * recall_inlier) /


Training completed. Do not forget to share your model on huggingface.co/models =)




* Finetune iid model on set 2

In [5]:
print("Finetuning iid model on set2")
model_finetune = deepcopy(model_iid)
train_model(
    model=model_iid,
    tokenizer=tokenizer,
    ds_name='kyoto-2016',
    train_set_name=f'{train_set2}',
    run_name='iid',
    lm_ds_train=lm_ds_set2,
    lm_ds_eval=ds_test[0][1]['inlier'],
    dss_test=ds_test,
    save_model_path='/tmp/',
    batch_size_train=bs_train,
    batch_size_eval=bs_eval,
    num_epochs=num_epochs,
    tb_writer=None
)

PyTorch: setting up devices


Finetuning iid model on set2
Training started...


***** Running training *****
  Num examples = 415471
  Num Epochs = 5
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 8115


Epoch,Training Loss,Validation Loss
1,1.8603,1.811404
2,1.7345,1.749303
3,1.6706,1.70648
4,1.627,1.678786
5,1.5883,1.660579


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_1.0/config.json
Model weights saved in /tmp/_1.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 1.0: 1.8114039897918701


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_2.0/config.json
Model weights saved in /tmp/_2.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 2.0: 1.7493031024932861


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_3.0/config.json
Model weights saved in /tmp/_3.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 3.0: 1.706479787826538


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_4.0/config.json
Model weights saved in /tmp/_4.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 4.0: 1.678786039352417


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_5.0/config.json
Model weights saved in /tmp/_5.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 5.0: 1.6605793237686157
{'inlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300000
}), 'outlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 74710
})}


100%|██████████| 1170/1170 [02:20<00:00,  8.35it/s]


Class: inlier Anomaly score: 0.5131177052078075


100%|██████████| 290/290 [00:34<00:00,  8.32it/s]


Class: outlier Anomaly score: 0.5579185805923171
ROC AUC       2008: 0.6473447652288921
AUCPR INLIER  2008: 0.8796352238806586
AUCPR OUTLIER 2008: 0.28176339124610855
F1 INLIER 2008: 0.8898239335091125
F1 OUTLIER 2008: 0.3872355900357597




Training completed. Do not forget to share your model on huggingface.co/models =)




* Distil iid model on set 2

In [6]:
from language_models.model_utils import distil_model
from language_models.data_utils import train_df_to_ds

# Instantiate the teacher as the iid model checkpointed on set1
teacher_model = deepcopy(model_iid)

# Instantiate a new student model
student_model = configure_model(
        architecture=architecture,
        pretrained=pretrained,
        small=True,
        vocab_size=vocab_size,
        tokenizer=tokenizer,
        embed_size=len(cols)
    )

ds_set2 = train_df_to_ds(df_set2)

student_model = distil_model(
        teacher=teacher_model,
        student=student_model,
        tokenizer=tokenizer,
        ds_train=ds_set2,
        dss_test=ds_test,
        save_model_path='/tmp/',
        batch_size_train=bs_train,
        batch_size_eval=bs_eval,
        num_epochs=num_epochs,
        tb_writer=None
    )

Starting epoch 1


100%|██████████| 1623/1623 [00:45<00:00, 35.86it/s]
Configuration saved in /tmp/_1.0/config.json
Model weights saved in /tmp/_1.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Starting epoch 2


100%|██████████| 1623/1623 [00:48<00:00, 33.16it/s]
Configuration saved in /tmp/_2.0/config.json
Model weights saved in /tmp/_2.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Starting epoch 3


100%|██████████| 1623/1623 [00:49<00:00, 32.74it/s]
Configuration saved in /tmp/_3.0/config.json
Model weights saved in /tmp/_3.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Starting epoch 4


100%|██████████| 1623/1623 [00:49<00:00, 33.06it/s]
Configuration saved in /tmp/_4.0/config.json
Model weights saved in /tmp/_4.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Starting epoch 5


100%|██████████| 1623/1623 [00:50<00:00, 31.98it/s]
Configuration saved in /tmp/_5.0/config.json
Model weights saved in /tmp/_5.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


{'inlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300000
}), 'outlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 74710
})}


100%|██████████| 1170/1170 [02:23<00:00,  8.16it/s]


Class: inlier Anomaly score: 0.5631501623717752


100%|██████████| 290/290 [00:36<00:00,  8.01it/s]


Class: outlier Anomaly score: 0.6124671218926347
ROC AUC       2008: 0.6665836452278388
AUCPR INLIER  2008: 0.8980555348518733
AUCPR OUTLIER 2008: 0.2677317179252389
F1 INLIER 2008: 0.8898073057719134
F1 OUTLIER 2008: 0.4185166667910364
