In [1]:
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,TrainingArguments,Trainer
from transformers import default_data_collator
import os 
from datasets import DatasetDict
import torch
import time
import gc
import json
import collections
from tqdm.auto import tqdm
from evaluate import load
import numpy as np 
import wandb
from itertools import product
import torch
wandb.init(mode="disabled")

In [None]:
file_to_train='strict_model_individual_datasets'
path='./SlovakBabyLM/Curricullum_learning/'

In [2]:
dataset = load_dataset("TUKE-DeutscheTelekom/squad-sk")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 124546
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11422
    })
})

In [4]:
def filter_empty_answers(dataset):
    return dataset.filter(lambda x: len(x['answers']['text']) > 0)

dataset['train']=filter_empty_answers(dataset['train'])
dataset['validation'] = filter_empty_answers(dataset['validation'])

In [5]:
test_size = int(0.1 * len(dataset['train']))
train_dataset = dataset["train"]
shuffle_train_dataset = train_dataset.shuffle(seed=42)
test_dataset = shuffle_train_dataset.select(range(test_size))
new_train_dataset = shuffle_train_dataset.select(range(test_size, len(train_dataset)))

dataset = DatasetDict({
    "train": new_train_dataset,
    "validation": dataset["validation"],
    "test": test_dataset
})

In [None]:

# train_size = len(dataset['train'])
# dataset['train'] = dataset['train'].select(range(train_size // 2))
# train_size = len(dataset['validation'])
# dataset['validation'] = dataset['validation'].select(range(train_size // 2))
# train_size = len(dataset['test'])
# dataset['test'] = dataset['test'].select(range(train_size // 2))

In [None]:

files=next(os.walk(f'{path}saved_model/{file_to_train}'))[1]
results_eval = {file_model: [] for file_model in files}


In [6]:
counter=0
counter_2=0
for example in dataset['validation']:
    for answer in example["answers"]["text"]:
        if len(answer)>50:
            counter+=1
        else:
            counter_2+=1


In [26]:
counter

1334

In [27]:
counter_2

18103

In [7]:
metric = load("squad")


In [8]:
def preprocess_validation_examples(examples):
            questions = [q.strip() for q in examples["question"]]
            inputs = tokenizer(
                questions,
                examples["context"],
                max_length=128,
                truncation="only_second",
                stride=50,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )

            sample_map = inputs.pop("overflow_to_sample_mapping")
            example_ids = []

            for i in range(len(inputs["input_ids"])):
                sample_idx = sample_map[i]
                example_ids.append(examples["id"][sample_idx])

                sequence_ids = inputs.sequence_ids(i)
                offset = inputs["offset_mapping"][i]
                inputs["offset_mapping"][i] = [
                    o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
                ]

            inputs["example_id"] = example_ids
            return inputs
#https://huggingface.co/docs/transformers/en/tasks/question_answering
def preprocess_function(examples):
            questions = [q.strip() for q in examples["question"]]
            inputs = tokenizer(
                questions,
                examples["context"],
                max_length=128,
                truncation="only_second",
                return_offsets_mapping=True,
                padding="max_length",
            )

            offset_mapping = inputs.pop("offset_mapping")
            answers = examples["answers"]
            start_positions = []
            end_positions = []
            # try:
            for i, offset in enumerate(offset_mapping):
                    answer = answers[i]
                    start_char = answer["answer_start"][0]
                    end_char = answer["answer_start"][0] + len(answer["text"][0])
                    sequence_ids = inputs.sequence_ids(i)
                    sequence_ids = [seq_id if seq_id is not None else 0 for seq_id in inputs.sequence_ids(i)]
                    idx = 0
                    while sequence_ids[idx] != 1:
                        idx += 1
                    context_start = idx
                    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
                        idx += 1
                    context_end = idx - 1
                    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                        start_positions.append(0)
                        end_positions.append(0)
                    else:
                        idx = context_start
                        while idx <= context_end and offset[idx][0] <= start_char:
                            idx += 1
                        start_positions.append(idx - 1)
                        idx = context_end
                        while idx >= context_start and offset[idx][1] >= end_char:
                            idx -= 1
                        end_positions.append(idx + 1)   
            inputs["start_positions"] = start_positions
            inputs["end_positions"] = end_positions
            return inputs



def compute_metrics(start_logits, end_logits, features, examples):
            n_best = 20
            max_answer_length = 50
            example_to_features = collections.defaultdict(list)
            for idx, feature in enumerate(features):
                example_to_features[feature["example_id"]].append(idx)

            predicted_answers = []
            for example in tqdm(examples):
                example_id = example["id"]
                context = example["context"]
                answers = []

                for feature_index in example_to_features[example_id]:
                    start_logit = start_logits[feature_index]
                    end_logit = end_logits[feature_index]
                    offsets = features[feature_index]["offset_mapping"]

                    start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                    end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                    for start_index in start_indexes:
                        for end_index in end_indexes:
                            # Skip answers that are not fully in the context
                            if offsets[start_index] is None or offsets[end_index] is None:
                                continue
                            # Skip answers with a length that is either < 0 or > max_answer_length
                            if (
                                end_index < start_index
                                or end_index - start_index + 1 > max_answer_length
                            ):
                                continue
                            
                            answer = {
                                "text": context[offsets[start_index][0] : offsets[end_index][1]],
                                "logit_score": start_logit[start_index] + end_logit[end_index],
                            }
                            answers.append(answer)

                # Select the answer with the best score
                if len(answers) > 0:
                    best_answer = max(answers, key=lambda x: x["logit_score"])
                    predicted_answers.append(
                        {"id": example_id, "prediction_text": best_answer["text"]}
                    )
                else:
                    predicted_answers.append({"id": example_id, "prediction_text": ""})

            theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
            return metric.compute(predictions=predicted_answers, references=theoretical_answers)


In [9]:
learning_rate = [5e-5,3e-5,1e-5]
number_of_epochs = [3, 5,7]
combined_params = [{'learning_rate': lr, 'epoch': ep} for lr, ep in product(learning_rate, number_of_epochs)]

In [None]:
torch.cuda.empty_cache()
gc.collect()
for parameters in combined_params:
    learning_rate=parameters['learning_rate']
    number_of_epochs=parameters['epoch']
    for file_to_load in files:
        results_eval = {} 
        save_path = f"{path}saved_model/{file_to_train}/{file_to_load}"
        tokenizer = AutoTokenizer.from_pretrained(save_path)
        qa_model=AutoModelForQuestionAnswering.from_pretrained(save_path)
        torch.cuda.empty_cache()
        gc.collect()
        validation_dataset = dataset["validation"].map(preprocess_validation_examples,batched=True,remove_columns=dataset["validation"].column_names)
        tokenized_dataset_train =  dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
        tokenized_dataset_train = tokenized_dataset_train.select(range(len(tokenized_dataset_train)//2))
        tokenized_dataset_test =  dataset["test"].map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names)
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(15)
        print('LOL')
        training_args = TrainingArguments(
            output_dir=f"{path}saved_model/{file_to_train}/{file_to_load}/QA/results",
            evaluation_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=number_of_epochs,
            weight_decay=0.01,
            save_strategy="epoch",
            logging_dir=f"{path}saved_model/{file_to_train}/{file_to_load}/QA/logs",
            push_to_hub=False
        )
        trainer = Trainer(
            model=qa_model,
            args=training_args,
            train_dataset=tokenized_dataset_train,
            eval_dataset=tokenized_dataset_test,
            processing_class=tokenizer,
            data_collator=default_data_collator,
        )
        trainer.train()
        torch.cuda.empty_cache()
        gc.collect()
        predictions, _, _ = trainer.predict(validation_dataset)
        start_logits, end_logits = predictions
        result=compute_metrics(start_logits, end_logits, validation_dataset, dataset["validation"])
        result['parameters']=[learning_rate,number_of_epochs]
        torch.cuda.empty_cache()
        gc.collect()
        results_eval[file_to_load]=result
        with open(f'{path}results_evaluation/{file_to_train}/output_QA.json', 'a') as f:
            json.dump(results_eval, f)
            f.write(',\n')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5729 [00:00<?, ? examples/s]

Map:   0%|          | 0/74635 [00:00<?, ? examples/s]

Map:   0%|          | 0/8292 [00:00<?, ? examples/s]

LOL


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,4.0103,3.88623
2,3.794,3.841075
3,3.6921,3.832596


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5729 [00:00<?, ? examples/s]

Map:   0%|          | 0/74635 [00:00<?, ? examples/s]

Map:   0%|          | 0/8292 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.1729,4.061287
2,3.9453,4.015906
3,3.8149,4.026833


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/74635 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.004,3.879016
2,3.7718,3.816782
3,3.6389,3.79907
4,3.5841,3.810604
5,3.5166,3.820819


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/74635 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.1766,4.061315
2,3.9289,4.014225
3,3.7666,4.035124
4,3.7043,4.047644
5,3.6281,4.076275


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8292 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.0076,3.88128
2,3.7775,3.826807
3,3.6371,3.813361
4,3.5716,3.817601
5,3.4875,3.842988
6,3.4889,3.848918
7,3.4394,3.855172


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8292 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.1681,4.04916
2,3.9105,3.991788
3,3.7309,4.023921
4,3.65,4.049452
5,3.5538,4.1292
6,3.5454,4.133029
7,3.501,4.159614


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.0957,3.962934
2,3.8937,3.894193
3,3.8085,3.881595


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.2395,4.124445
2,4.0564,4.065047
3,3.9504,4.058709


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.0832,3.949414
2,3.8713,3.869667
3,3.7483,3.838798
4,3.7049,3.836175
5,3.6513,3.838467


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.2517,4.125268
2,4.0358,4.055124
3,3.8884,4.054308
4,3.8441,4.056051
5,3.7868,4.06592


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.0945,3.952669
2,3.8714,3.86894
3,3.7395,3.843416
4,3.6893,3.832962
5,3.6198,3.830955
6,3.6184,3.83168
7,3.5765,3.831891


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.2389,4.114877
2,4.0195,4.039878
3,3.8536,4.029949
4,3.7917,4.029678
5,3.7142,4.054247
6,3.703,4.071322
7,3.6714,4.071349


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_subs and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


LOL


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss
1,4.2833,4.172488
2,4.1433,4.080168
3,4.082,4.055958


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/5729 [00:00<?, ?it/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /data/lubosk/diploma_thesis/saved_model/strict_model_individual_datasets/randomly_selected_wiki_full and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
