from pathlib import Path

def read_data_split(file):
    texts = []
    with open(file,'r') as f:
        for line in f:
            texts.append(line.strip())
    return texts

def read_label_split(file):
    labels = []
    with open(file,'r') as f:
        for line in f:
            if line == '1\n':
                labels.append(1)
            else:
                labels.append(0)
    return labels

train_texts = read_data_split('../data/splits/train2')
train_labels = read_label_split('../data/splits/train_label2')
val_texts = read_data_split('../data/splits/valid2')
val_labels = read_label_split('../data/splits/valid_label2')
test_texts = read_data_split('../data/splits/test2')
test_labels = read_label_split('../data/splits/test_label2')

In [16]:
from datasets import load_dataset

In [17]:
input_dataset = load_dataset('text', data_files={'train': '../data/splits/train', 'valid': '../data/splits/valid', 'test': '../data/splits/test'})
label_dataset = load_dataset('text', data_files={'train': '../data/splits/train_label', 'valid': '../data/splits/valid_label', 'test': '../data/splits/test_label'})

Using custom data configuration default
Reusing dataset text (/home/ubuntu/.cache/huggingface/datasets/text/default-df6fdfd0ab6b35f0/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab)
Using custom data configuration default
Reusing dataset text (/home/ubuntu/.cache/huggingface/datasets/text/default-1d8aa2beeec09988/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab)


In [18]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [19]:
def encode(examples):
     return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    
input_dataset = input_dataset.map(encode, batched=True)

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/text/default-df6fdfd0ab6b35f0/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab/cache-ce35e4dba61492b4.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/text/default-df6fdfd0ab6b35f0/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab/cache-4315d97384efbefc.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/text/default-df6fdfd0ab6b35f0/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab/cache-e242a4782e1c64f1.arrow


In [20]:
# training model on tokenized and split data
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.inputs[idx].items() if key != 'text'}
        item['labels'] = torch.tensor(int(self.labels[idx]['text']))
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(input_dataset['train'], label_dataset['train'])
val_dataset = Dataset(input_dataset['valid'], label_dataset['valid'])
test_dataset = Dataset(input_dataset['test'], label_dataset['test'])

In [15]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='../data/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=10000,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    evaluation_strategy='steps',
    learning_rate=2e-5,
    fp16=True,
    save_total_limit=5,
    eval_steps=2000,
    save_steps=2000,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss


In [19]:
inputs = ['this is the first sentence', 'this is the second']
tokenizer(inputs, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[101, 1142, 1110, 1103, 1148, 5650, 102], [101, 1142, 1110, 1103, 1248, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]]}

In [22]:
len(tokenizer.get_vocab())

28996

In [32]:
test_labels[10]

'0'

In [42]:
train_encodings

{'input_ids': [[101, 10296, 2236, 131, 164, 115, 115, 21692, 1545, 118, 126, 118, 1275, 115, 115, 166, 12398, 2236, 131, 164, 115, 115, 21692, 1545, 118, 126, 118, 1479, 115, 115, 166, 2236, 1104, 3485, 131, 164, 115, 115, 19980, 1604, 118, 1367, 118, 1429, 115, 115, 166, 2673, 131, 182, 1555, 131, 3621, 26423, 16426, 6617, 1665, 1155, 1200, 19310, 131, 4764, 15419, 6546, 131, 164, 115, 115, 1148, 1271, 1495, 113, 181, 2087, 114, 18513, 22433, 115, 115, 166, 2705, 12522, 131, 2229, 3600, 1757, 1558, 13467, 1137, 19849, 7791, 131, 1884, 15789, 1616, 18593, 13981, 176, 15371, 193, 124, 164, 115, 115, 21692, 1545, 118, 126, 118, 1429, 115, 115, 166, 1607, 1104, 1675, 6946, 131, 4667, 194, 120, 184, 2581, 1114, 1185, 11019, 1181, 177, 1775, 3756, 164, 115, 115, 126, 118, 1542, 115, 115, 166, 2370, 177, 1775, 1104, 2229, 3600, 1757, 1105, 20295, 1114, 1821, 23601, 1916, 135, 164, 115, 115, 123, 118, 1405, 115, 115, 166, 2837, 119, 116, 3084, 1204, 113, 164, 115, 115, 126, 118, 130, 115, 115