# Train iid model on a subset

* Load data of 2006, 2008 subset

In [1]:
import sys
sys.path.append("..")

import pandas as pd
pd.options.mode.chained_assignment = None

from data_processor.data_loader import split_set


train_set1 = "2006"
test_set = "2008"

ds_size = "subset"
label_col_name = '18'
label_col_pos_val = '1'

# We only keep features 0 to 13
cols = [str(i) for i in range(14)]

def prepare_set(year, ds_size, is_test_set=False):
    keep_cols = cols

    if is_test_set:
        keep_cols += [label_col_name, ]

    df_year_path = f"../datasets/Kyoto-2016_AnoShift/{ds_size}/{year}_{ds_size}.parquet"
    print("Loading set:", df_year_path)

    df_year = pd.read_parquet(df_year_path)
    df_year = df_year.drop(columns=list(set(df_year.columns) - set(keep_cols)))
    return df_year


df_set1 = prepare_set(train_set1, ds_size)
df_test = prepare_set(test_set, ds_size, is_test_set=True)


# Split test set in inliers and outliers
df_test_inlier, df_test_outlier = split_set(
    df_test, label_col_name=label_col_name, label_col_pos_val=label_col_pos_val
)

df_test = [(test_set, df_test_inlier, df_test_outlier),]

Loading set: ../datasets/Kyoto-2016_AnoShift/subset/2006_subset.parquet
Loading set: ../datasets/Kyoto-2016_AnoShift/subset/2008_subset.parquet


* Load pretrained tokenizer and tokenize the sets

In [2]:
from transformers import PreTrainedTokenizerFast

tokenizer_path = '../saved_tokenizers/kyoto-2016.json'
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
tokenizer.add_special_tokens(
    {"pad_token": "[PAD]", "unk_token": "[UNK]", "mask_token": "[MASK]"}
)

0

* Prepare train and test datasets from the dataframes

In [3]:
from language_models.data_utils import prepare_train_ds, prepare_test_ds

lm_ds_set1 = prepare_train_ds(
    df_train=df_set1, tokenizer=tokenizer, block_size=len(cols)
)

ds_test = prepare_test_ds(
    dfs_test=df_test, tokenizer=tokenizer, block_size=len(cols)
)



Mapping tokenizer on train
Mapped tokenizer on train


Configure IID model to train on set1 and finetune on set2

In [4]:
from language_models.model_utils import configure_model, train_model
from copy import deepcopy

architecture = 'bert'
pretrained = False
vocab_size = len(tokenizer.get_vocab())
bs_train = 256
bs_eval = 256
num_epochs = 5

model_iid = configure_model(
        architecture=architecture,
        pretrained=pretrained,
        small=True,
        vocab_size=vocab_size,
        tokenizer=tokenizer,
        embed_size=len(cols)
    )

print("Training iid model on set1")
train_model(
    model=model_iid,
    tokenizer=tokenizer,
    ds_name='kyoto-2016',
    train_set_name=f'{train_set1}',
    run_name='iid',
    lm_ds_train=lm_ds_set1,
    lm_ds_eval=ds_test[0][1]['inlier'],
    dss_test=ds_test,
    save_model_path='/tmp/',
    batch_size_train=bs_train,
    batch_size_eval=bs_eval,
    num_epochs=num_epochs,
    tb_writer=None
)



***** Running training *****
  Num examples = 466774
  Num Epochs = 5
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 9120


Training iid model on set1
Training started...


Epoch,Training Loss,Validation Loss
1,4.8786,3.747528
2,2.8811,2.707754
3,2.2109,2.418274
4,1.9953,2.284305
5,1.8731,2.202164


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_1.0/config.json
Model weights saved in /tmp/_1.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 1.0: 3.747527599334717


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_2.0/config.json
Model weights saved in /tmp/_2.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 2.0: 2.7077536582946777


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_3.0/config.json
Model weights saved in /tmp/_3.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 3.0: 2.418274402618408


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_4.0/config.json
Model weights saved in /tmp/_4.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 4.0: 2.2843053340911865


***** Running Evaluation *****
  Num examples = 300000
  Batch size = 256
Configuration saved in /tmp/_5.0/config.json
Model weights saved in /tmp/_5.0/pytorch_model.bin
Configuration saved in /tmp/_final/config.json
Model weights saved in /tmp/_final/pytorch_model.bin


Best loss at epoch 5.0: 2.2021636962890625
{'inlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300000
}), 'outlier': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 74710
})}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1170/1170 [02:26<00:00,  7.99it/s]


Class: inlier Anomaly score: 0.5981702628699448


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 290/290 [00:35<00:00,  8.08it/s]


Class: outlier Anomaly score: 0.581571835178923
ROC AUC       2008: 0.4345462931063264
AUCPR INLIER  2008: 0.7633391538420858
AUCPR OUTLIER 2008: 0.18097028621284614
F1 INLIER 2008: 0.8897398041178619
F1 OUTLIER 2008: 0.33142857142857146


  2 * (precision_inlier * recall_inlier) /


Training completed. Do not forget to share your model on huggingface.co/models =)


