In [1]:
import torch
import transformers

from datasets import load_dataset
import numpy as np

from project import CreditDefault, PII, SkinCancer, set_seed

  from .autonotebook import tqdm as notebook_tqdm


## Credit-card default

In [2]:
# Seed via the project's set_seed helper so runs are repeatable
# (which RNGs it seeds is defined in `project` and is not visible here).
set_seed(1234)

In [3]:
# Project wrapper for the credit-card default task.
credit = CreditDefault()
# "cuda" is presumably the device the prepared data is placed on --
# confirm against CreditDefault.prepare_dataset.
credit.prepare_dataset("cuda")

In [6]:
# Optimizer grid: name -> (optimizer class, constructor kwargs).
# Classes are stored un-instantiated; they are handed to credit.train below.
optimizers = {
    "adamw": (torch.optim.AdamW, {"lr": 2e-5}),
    "rmsprop": (torch.optim.RMSprop, {"lr": 2e-5}),
    "adagrad": (torch.optim.Adagrad, {"lr": 2e-5}),
}

# Scheduler grid: name -> (scheduler class, constructor kwargs).
schedulers = {
    "linear": (torch.optim.lr_scheduler.LinearLR, {"start_factor": 0.01, "end_factor": 0.0001, "total_iters": 2}),
    # Fixed: torch.optim.lr_scheduler has no `CosineAnnealing`; the class is
    # `CosineAnnealingLR` (the old name raised AttributeError when this cell ran).
    # NOTE(review): eta_min (0.01) is larger than the optimizers' lr (2e-5), so
    # this schedule moves the lr *up* toward 0.01 instead of decaying it --
    # confirm that this is intended.
    "cosine": (torch.optim.lr_scheduler.CosineAnnealingLR, {"T_max": 100, "eta_min": 0.01}),
    "polynomial": (torch.optim.lr_scheduler.PolynomialLR, {"total_iters": 100, "power": 2}),
}

In [7]:
# Train on the credit-default task using the optimizer and scheduler grids
# defined above; whatever credit.train returns is kept in `df`.
# Note that loss_func receives the MSELoss class itself, not an instance.
df = credit.train(
    result_csv="test.csv",
    optimizers=optimizers,
    schedulers=schedulers,
    epochs=5,
    batch_size=1000,
    device="cuda",
    loss_func=torch.nn.MSELoss,
)

-------------------------------------
|linear, adamw|
Epoch 1
	batch 1 | loss: 0.4424096345901489 | accuracy: 0.537
	batch 2 | loss: 0.4221212863922119 | accuracy: 0.556
	batch 3 | loss: 0.41576284170150757 | accuracy: 0.563
	batch 4 | loss: 0.3999929428100586 | accuracy: 0.584
	batch 5 | loss: 0.3750000596046448 | accuracy: 0.605
	batch 6 | loss: 0.35946759581565857 | accuracy: 0.618
	batch 7 | loss: 0.3029974699020386 | accuracy: 0.673
	batch 8 | loss: 0.3039945960044861 | accuracy: 0.68
	batch 9 | loss: 0.24527829885482788 | accuracy: 0.725
	batch 10 | loss: 0.24620354175567627 | accuracy: 0.715
	batch 11 | loss: 0.2339823693037033 | accuracy: 0.742
	batch 12 | loss: 0.23700402677059174 | accuracy: 0.746
	batch 13 | loss: 0.22199484705924988 | accuracy: 0.767
	batch 14 | loss: 0.2247716784477234 | accuracy: 0.767
	batch 15 | loss: 0.23900000751018524 | accuracy: 0.755
	batch 16 | loss: 0.2389996200799942 | accuracy: 0.753
	batch 17 | loss: 0.2141554355621338 | accuracy: 0.78
	batch 

## PII

In [3]:
# Hugging Face hub id of the checkpoint; reused below for both the tokenizer
# and the model.
model_path = "distilbert/distilroberta-base"

# Project PII task wrapper: load the tokenizer for `model_path`, then prepare
# the dataset.
pii = PII()
pii.load_tokenizer(model_path)
pii.prepare_dataset()

In [14]:
# Optimizer grid for the PII run: name -> (optimizer class, constructor kwargs).
# Only Adagrad is active; the AdamW and RMSprop entries stay commented out so
# they can be switched back on.
optimizers = dict(
    # adamw=(torch.optim.AdamW, dict(lr=2e-4)),
    # rmsprop=(torch.optim.RMSprop, dict(lr=2e-4, momentum=0)),
    adagrad=(torch.optim.Adagrad, dict(lr=3e-5)),
)

# Scheduler grid: name -> keyword arguments for that schedule.
schedulers = dict(
    linear=dict(num_warmup_steps=0),
    cosine=dict(num_warmup_steps=0),
    polynomial=dict(num_warmup_steps=0, power=2),
)

In [15]:
# Train on the PII task with the optimizer and scheduler grids defined above.
# The same checkpoint id is passed for both the tokenizer and the model;
# whatever pii.train returns is kept in `df`.
df = pii.train(
    output_dir="test/",
    optimizers=optimizers,
    schedulers=schedulers,
    tokenizer_path=model_path,
    model_path=model_path,
    strategy="epoch",
    epochs=10,
    batch_size=150,
    result_csv="test.csv",
    device="cuda",
)

----------------------------
Optimizer: adagrad | Scheduler: linear


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.27008575201034546, 'eval_precision': 0.647475015477138, 'eval_recall': 0.6914431431809596, 'eval_f1': 0.6687371546015072, 'eval_accuracy': 0.9267435795158172, 'eval_runtime': 3.9603, 'eval_samples_per_second': 820.652, 'eval_steps_per_second': 5.555, 'epoch': 1.0}
{'eval_loss': 0.19655659794807434, 'eval_precision': 0.710111966410077, 'eval_recall': 0.7667170381564035, 'eval_f1': 0.7373297002724796, 'eval_accuracy': 0.9443404620907613, 'eval_runtime': 3.9643, 'eval_samples_per_second': 819.818, 'eval_steps_per_second': 5.55, 'epoch': 2.0}
{'eval_loss': 0.17069782316684723, 'eval_precision': 0.7380763466455083, 'eval_recall': 0.7907064601435587, 'eval_f1': 0.7634854771784232, 'eval_accuracy': 0.950968058666503, 'eval_runtime': 3.9691, 'eval_samples_per_second': 818.821, 'eval_steps_per_second': 5.543, 'epoch': 3.0}
{'eval_loss': 0.15602441132068634, 'eval_precision': 0.7565047509102211, 'eval_recall': 0.8045901020022668, 'eval_f1': 0.7798068561490228, 'eval_accuracy': 0.

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.28139516711235046, 'eval_precision': 0.618368962787015, 'eval_recall': 0.6638647525500566, 'eval_f1': 0.6403097244363471, 'eval_accuracy': 0.9244883556810163, 'eval_runtime': 3.9034, 'eval_samples_per_second': 832.602, 'eval_steps_per_second': 5.636, 'epoch': 1.0}
{'eval_loss': 0.19794824719429016, 'eval_precision': 0.7112318205712284, 'eval_recall': 0.7667170381564035, 'eval_f1': 0.7379329151895283, 'eval_accuracy': 0.9448620784879261, 'eval_runtime': 3.851, 'eval_samples_per_second': 843.938, 'eval_steps_per_second': 5.713, 'epoch': 2.0}
{'eval_loss': 0.16779176890850067, 'eval_precision': 0.7429803714461755, 'eval_recall': 0.7972232716282585, 'eval_f1': 0.769146658162103, 'eval_accuracy': 0.9521033414132736, 'eval_runtime': 3.9689, 'eval_samples_per_second': 818.867, 'eval_steps_per_second': 5.543, 'epoch': 3.0}
{'eval_loss': 0.15220597386360168, 'eval_precision': 0.7669213004085983, 'eval_recall': 0.8154514544767661, 'eval_f1': 0.7904421862125791, 'eval_accuracy': 0

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.29934048652648926, 'eval_precision': 0.5962062000526916, 'eval_recall': 0.6411975821684927, 'eval_f1': 0.6178839590443685, 'eval_accuracy': 0.9192875333681078, 'eval_runtime': 3.8821, 'eval_samples_per_second': 837.178, 'eval_steps_per_second': 5.667, 'epoch': 1.0}
{'eval_loss': 0.22242829203605652, 'eval_precision': 0.6837509822753863, 'eval_recall': 0.7396108802417831, 'eval_f1': 0.7105848192005807, 'eval_accuracy': 0.9389401982142309, 'eval_runtime': 3.8754, 'eval_samples_per_second': 838.615, 'eval_steps_per_second': 5.677, 'epoch': 2.0}
{'eval_loss': 0.1935511827468872, 'eval_precision': 0.7123072903057225, 'eval_recall': 0.7723838307517945, 'eval_f1': 0.7411300919842312, 'eval_accuracy': 0.9456751864011537, 'eval_runtime': 4.5576, 'eval_samples_per_second': 713.092, 'eval_steps_per_second': 4.827, 'epoch': 3.0}
{'eval_loss': 0.18016143143177032, 'eval_precision': 0.7315641968821159, 'eval_recall': 0.7889119758216849, 'eval_f1': 0.7591565936562755, 'eval_accuracy':

In [19]:
# Look up the single result cell at row "adagrad" (optimizer) and column
# "linear" (scheduler) of the results frame.
df.at["adagrad","linear"]

{'LOC': {'precision': 0.725335955424451,
  'recall': 0.8136029411764706,
  'f1': 0.7669381389707157,
  'number': 2720},
 'MISC': {'precision': 0.5721877767936226,
  'recall': 0.5632083696599826,
  'f1': 0.5676625659050966,
  'number': 1147},
 'ORG': {'precision': 0.7049830124575311,
  'recall': 0.7699443413729128,
  'f1': 0.7360331067100208,
  'number': 3234},
 'PER': {'precision': 0.9152076896670099,
  'recall': 0.9302163293789253,
  'f1': 0.9226509776777988,
  'number': 2866},
 'overall_precision': 0.7543529411764706,
 'overall_recall': 0.8041537072338718,
 'overall_f1': 0.7784576534576535,
 'overall_accuracy': 0.9518529567753912,
 'train_time': 193.58787441253662,
 'log': [{'eval_loss': 0.27008575201034546,
   'eval_precision': 0.647475015477138,
   'eval_recall': 0.6914431431809596,
   'eval_f1': 0.6687371546015072,
   'eval_accuracy': 0.9267435795158172,
   'eval_runtime': 3.9603,
   'eval_samples_per_second': 820.652,
   'eval_steps_per_second': 5.555,
   'epoch': 1.0,
   'step':

## Melanoma identification

In [41]:
# Alternative PII corpus, currently disabled:
#dataset = load_dataset(path = "ai4privacy/pii-masking-200k", data_files = "english_pii_43k.jsonl")
# Load the "conll2003" dataset; its "tokens" and "ner_tags" columns are what
# tokenize_and_align_labels reads.
dataset = load_dataset(path = "conll2003")

In [59]:
# Fast RoBERTa tokenizer for the checkpoint named by `model_path`.
# add_prefix_space=True is needed for RoBERTa-style tokenizers to accept
# pre-split words (is_split_into_words=True), which is how
# tokenize_and_align_labels calls it.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained(model_path, add_prefix_space = True)

In [63]:
def tokenize_and_align_labels(examples, label_all_tokens = True):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            example) and ``"ner_tags"`` (one list of per-word labels per
            example).
        label_all_tokens: If true, every sub-token of a word receives that
            word's label. If false, only the first sub-token does and the
            remaining ones get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of token-level labels per example. Tokens that map to no word (word id
        ``None``) are labelled ``-100``.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation = True,
        is_split_into_words = True
    )

    def _align(word_ids, word_labels):
        # Walk the token -> word mapping for one example, remembering the
        # previous word id so continuation sub-tokens can be detected.
        aligned = []
        prev_word = None
        for word in word_ids:
            if word is None:
                aligned.append(-100)
            elif word != prev_word or label_all_tokens:
                # First sub-token of a word, or every sub-token when requested.
                aligned.append(word_labels[word])
            else:
                aligned.append(-100)
            prev_word = word
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index = idx), tags)
        for idx, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [64]:
# Run tokenize_and_align_labels over the dataset in batched mode, so the
# function receives lists of examples rather than single rows.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched = True)
# Backward-compatible alias for the original, misspelled variable name.
toknized_dataset = tokenized_dataset

Map: 100%|██████████| 14041/14041 [00:01<00:00, 8256.26 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 8099.45 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 9232.67 examples/s]


In [None]:
# Load the "marmal88/skin_cancer" dataset.
# NOTE(review): this rebinds `dataset`, which an earlier cell bound to
# "conll2003"; re-running the tokenization cell after this one would operate
# on the wrong data.
dataset = load_dataset("marmal88/skin_cancer")

Downloading readme: 100%|██████████| 3.24k/3.24k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 521M/521M [01:21<00:00, 6.37MB/s] 
Downloading data: 100%|██████████| 525M/525M [01:24<00:00, 6.24MB/s] 
Downloading data: 100%|██████████| 527M/527M [01:24<00:00, 6.23MB/s] 
Downloading data: 100%|██████████| 528M/528M [01:19<00:00, 6.60MB/s] 
Downloading data: 100%|██████████| 548M/548M [01:26<00:00, 6.33MB/s] 
Downloading data: 100%|██████████| 341M/341M [00:48<00:00, 7.03MB/s] 
Downloading data: 100%|██████████| 348M/348M [00:48<00:00, 7.21MB/s] 
Downloading data: 100%|██████████| 355M/355M [00:54<00:00, 6.46MB/s] 
Generating train split: 100%|██████████| 9577/9577 [00:04<00:00, 2388.92 examples/s]
Generating validation split: 100%|██████████| 2492/2492 [00:01<00:00, 2204.03 examples/s]
Generating test split: 100%|██████████| 1285/1285 [00:00<00:00, 2136.98 examples/s]


In [None]:
# Inspect the first training record to see which fields are available.
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=600x450>,
 'image_id': 'ISIC_0024329',
 'lesion_id': 'HAM_0002954',
 'dx': 'actinic_keratoses',
 'dx_type': 'histo',
 'age': 75.0,
 'sex': 'female',
 'localization': 'lower extremity'}

SGD Algorithms: 
- "adamw_torch"
- "adagrad"
- "rmsprop"


LR Schedulers:

- "cosine"
- "inverse_sqrt"

inverse_sqrt = lambda step: 1/math.sqrt(max(step, 100))

scheduler = LambdaLR(optimizer, lr_lambda = inverse_sqrt)


- "reduce_lr_on_plateau"