Model trained on contaxts being original_code. 

In [1]:
from transformers import (
    AutoTokenizer,
    BartTokenizerFast,
    TFAutoModelForQuestionAnswering,
    DataCollatorForSeq2Seq,
    keras_callbacks,
    TFAutoModelForSeq2SeqLM,
    TFEncoderDecoderModel,
)
import tensorflow as tf
from huggingface_hub import notebook_login
from question_answering.constants import constants
from question_answering.utils import core_qa_utils, generative_qa_utils
from question_answering.paths import generative_qa_paths
from question_answering.keras_callbacks.time_measure_callback import TimeMeasureCallback

In [2]:
df_train, df_val, df_test = core_qa_utils.load_train_val_test_datasets(
    generative_qa_paths.python_dataset_dir
)

train_dataset, val_dataset, test_dataset = core_qa_utils.convert_dataframes_to_datasets(
    [df_train, df_val, df_test]
)

In [3]:
model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [4]:
tokenizer

BartTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

In [5]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    question = sample["questions"].strip()
    context = sample["original_code"].strip()

    return tokenizer(question, context, max_length=max_tokens, padding=padding)

tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_val_dataset = val_dataset.map(tokenize_sample)
tokenized_test_dataset = test_dataset.map(tokenize_sample)

print(
    "Max number of tokens in tokenized train dataset: ",
    len(max(tokenized_train_dataset["input_ids"], key=len)),
)
print(
    "Max number of tokens in tokenized val dataset: ",
    len(max(tokenized_val_dataset["input_ids"], key=len)),
)
print(
    "Max number of tokens in tokenized test dataset: ",
    len(max(tokenized_test_dataset["input_ids"], key=len)),
)

Map:   0%|          | 0/56080 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1490 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Max number of tokens in tokenized train dataset:  9244
Max number of tokens in tokenized val dataset:  575
Max number of tokens in tokenized test dataset:  901


In [6]:
max_length = 256


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    indices_to_remove = []

    # Find indices of samples where number of tokens exceeds max number of tokens
    for index, sample in enumerate(dataset):
        tokenized_sample = tokenize_sample(sample)
        if len(tokenized_sample["input_ids"]) > max_tokens:
            indices_to_remove.append(index)

    # Keep only samples with number of tokens less or equal than max number of tokens
    dataset_indices = range(len(dataset))
    filtered_dataset = dataset.select(
        index for index in dataset_indices if index not in set(indices_to_remove)
    )

    return filtered_dataset


filtered_train_dataset = filter_samples_below_number_of_tokens(
    train_dataset, max_tokens=max_length
)
filtered_val_dataset = filter_samples_below_number_of_tokens(
    val_dataset, max_tokens=max_length
)
filtered_test_dataset = filter_samples_below_number_of_tokens(
    test_dataset, max_tokens=max_length
)



In [7]:
print(
    "Number of samples in tokenized train dataset before filtering: ",
    len(train_dataset),
)
print("Number of samples in tokenized val dataset before filtering: ", len(val_dataset))
print(
    "Number of samples in tokenized test dataset before filtering: ", len(test_dataset)
)

print("\n---------------\n")

print(
    "Number of samples in tokenized train dataset after filtering: ",
    len(filtered_train_dataset),
)
print(
    "Number of samples in tokenized val dataset after filtering: ",
    len(filtered_val_dataset),
)
print(
    "Number of samples in tokenized test dataset after filtering: ",
    len(filtered_test_dataset),
)

Number of samples in tokenized train dataset before filtering:  56080
Number of samples in tokenized val dataset before filtering:  7000
Number of samples in tokenized test dataset before filtering:  7000

---------------

Number of samples in tokenized train dataset after filtering:  54930
Number of samples in tokenized val dataset after filtering:  6828
Number of samples in tokenized test dataset after filtering:  6854


In [30]:
def preprocess_dataset(dataset):
    questions = [q.strip() for q in dataset["questions"]]
    contexts = [c.strip() for c in dataset["original_code"]]
    answers = [c.strip() for c in dataset["answers"]]

    inputs = tokenizer(
        questions,
        contexts,
        # question_context,
        text_target=answers,
        max_length=max_length,
        truncation=True
        # padding="max_length",
        # return_offsets_mapping=True,
    )

    return inputs

In [31]:
tokenized_train_dataset = filtered_train_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_train_dataset.column_names,
)
tokenized_val_dataset = filtered_val_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_val_dataset.column_names,
)
tokenized_test_dataset = filtered_test_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_test_dataset.column_names,
)

Map:   0%|          | 0/54930 [00:00<?, ? examples/s]

Map:   0%|          | 0/6828 [00:00<?, ? examples/s]

Map:   0%|          | 0/6854 [00:00<?, ? examples/s]

In [32]:
print(
    f"All tokenized train dataset entries have {max_length} tokens: ",
    all(
        [
            len(input_ids) == max_length
            for input_ids in tokenized_train_dataset["input_ids"]
        ]
    ),
)
print(
    f"All tokenized val dataset entries have {max_length} tokens: ",
    all(
        [
            len(input_ids) == max_length
            for input_ids in tokenized_val_dataset["input_ids"]
        ]
    ),
)
print(
    f"All tokenized test dataset entries have {max_length} tokens: ",
    all(
        [
            len(input_ids) == max_length
            for input_ids in tokenized_test_dataset["input_ids"]
        ]
    ),
)

All tokenized train dataset entries have 256 tokens:  False
All tokenized val dataset entries have 256 tokens:  False
All tokenized test dataset entries have 256 tokens:  False


In [95]:
# Model parameters
training_number = 5

model_name = "python-bart-uncased"
full_model_name = f"{model_name}-{training_number}"

# Checkpoints
checkpoint_filename_template = constants.checkpoint_filename_template
checkpoints_path = (
    generative_qa_paths.training_checkpoints_dir
    / full_model_name
    / checkpoint_filename_template
)

# Hub
hub_path = generative_qa_paths.hub_models_location / full_model_name

# Saved models
saved_models_path = generative_qa_paths.saved_models_dir / full_model_name

# Evaluation
model_evaluation_dir = generative_qa_paths.model_evaluation_dir / full_model_name

# Hyperparameters
batch_size = 8
train_epochs = 1

AttributeError: module 'question_answering.paths.generative_qa_paths' has no attribute 'model_evaluation_dir'

In [12]:
# Load model for fine-tuning
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForConditionalGeneration: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
- This IS expected if you are initializing TFBartForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [13]:
# Dataset preparation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

tf_train_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_train_dataset,
    collator=data_collator,
    batch_size=batch_size,
    shuffle=True
)

tf_val_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_val_dataset,
    collator=data_collator,
    batch_size=batch_size,
)

tf_test_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_test_dataset,
    collator=data_collator,
    batch_size=batch_size,
)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
data_collator.label_pad_token_id 

-100

In [40]:
labelLengths = [len(tokenized_train_dataset[i]['labels']) for i in range(1, 10)]
labelLengths

[3, 3, 4, 6, 3, 5, 6, 3, 4]

In [48]:
inputIdsLengths = [len(tokenized_train_dataset[i]['input_ids']) for i in range(1, 10)]
inputIdsLengths

[35, 109, 108, 125, 128, 126, 33, 36, 251]

In [44]:
batch = data_collator([tokenized_train_dataset[i] for i in range(1, 10)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [51]:
tokenizer.decode(tokenized_train_dataset['labels'][1])

'<s>Yes</s>'

In [47]:
batch["labels"][0]

<tf.Tensor: shape=(6,), dtype=int32, numpy=array([   0, 9904,    2, -100, -100, -100])>

In [50]:
tokenizer

BartTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

In [49]:
batch["input_ids"][0]

<tf.Tensor: shape=(251,), dtype=int32, numpy=
array([    0, 27847,     5,  3260,   146,    10, 10606, 17487,     2,
           2,  9232,  5293, 20689,  1459,  7605,   495, 11726,   385,
        6929,  9291, 10606, 13810,  6929,  6929, 10606,  8504,   495,
       11726,   385, 10606, 26411,  2072,   671, 10606,     2,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,   

In [20]:
# Callbacks
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    checkpoints_path, verbose=1, save_weights_only=True
)
early_stop_cb = tf.keras.callbacks.EarlyStopping(patience=1)
push_to_hub = keras_callbacks.PushToHubCallback(
    output_dir=full_model_name, tokenizer=tokenizer
)

time_measure_cb = TimeMeasureCallback()

callbacks = [
    checkpoint_cb,
    early_stop_cb,
    # push_to_hub,
    time_measure_cb
]

e:\STUDIA\IPS\question-answering\generative-qa\notebooks\python\python-bart-uncased-5 is already a clone of https://huggingface.co/nlp-polish/python-bart-uncased-5. Make sure you pull the latest changes with `repo.git_pull()`.


In [21]:
# Compile
num_train_steps = len(tf_train_dataset) * train_epochs

lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Compile
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metrics = ["accuracy"]
# model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
# model.compile(optimizer=optimizer, metrics=metrics)
model.compile(optimizer=optimizer)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070, compute capability 8.9


In [22]:
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  139420416 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50265     
 r)                                                              
                                                                 
Total params: 139,470,681
Trainable params: 139,420,416
Non-trainable params: 50,265
_________________________________________________________________


In [None]:
# Fit the model on the new data
history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=train_epochs,
    callbacks=callbacks,
)

In [None]:
# Get best version of the model
best_model, best_epoch = core_qa_utils.get_best_model_from_checkpoints(
    model, history, model_name=full_model_name, remove_checkpoints=True, model_type="generative"
)

In [None]:
# Save best model's weights
generative_qa_utils.save_model(best_model, model_name=full_model_name)

In [23]:
loaded_weights_model = generative_qa_utils.load_weights_into_model(
    model=model, 
    model_name=full_model_name
)

In [92]:
# Get predictions from the best model
loaded_model_evaluation = loaded_weights_model.evaluate(tf_test_dataset)



In [85]:
import evaluate

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")
bertscore_metric = evaluate.load("bertscore")
sacrebleu_metric = evaluate.load("sacrebleu")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [53]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

def generate_with_xla(batch):
    return loaded_weights_model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=max_length,
    )


In [54]:
tokenized_test_dataset2 = tokenized_test_dataset.train_test_split(test_size=0.001)['test']
len(tokenized_test_dataset2[0]['input_ids'])

119

In [55]:
tf_test_dataset2 = core_qa_utils.prepare_tf_dataset(
    model=loaded_weights_model,
    hf_dataset=tokenized_test_dataset2,
    collator=data_collator,
    batch_size=batch_size,
)

In [56]:
tokenized_test_dataset2[0]['input_ids']

[0,
 2264,
 473,
 5,
 3260,
 11113,
 561,
 17487,
 2,
 2,
 9232,
 11113,
 1215,
 33966,
 1215,
 42274,
 3023,
 31799,
 16,
 1215,
 45041,
 35297,
 114,
 16,
 1215,
 45041,
 3724,
 46446,
 9624,
 8284,
 111,
 134,
 112,
 114,
 3724,
 8061,
 321,
 775,
 48081,
 1990,
 414,
 11,
 3023,
 414,
 46446,
 25,
 30766,
 414,
 12313,
 3631,
 293,
 31799,
 321,
 414,
 414,
 10975,
 111,
 134,
 27779,
 23687,
 414,
 12313,
 3631,
 293,
 321,
 31799,
 775,
 40462,
 414,
 671,
 46446,
 25,
 30766,
 775,
 1493,
 671,
 46446,
 25,
 30766,
 3023,
 1493,
 775,
 48081,
 1990,
 414,
 11,
 3023,
 414,
 46446,
 25,
 30766,
 414,
 12313,
 3631,
 293,
 31799,
 321,
 414,
 414,
 10975,
 111,
 134,
 27779,
 23687,
 414,
 12313,
 3631,
 293,
 321,
 31799,
 775,
 40462,
 414,
 671,
 46446,
 25,
 30766,
 775,
 2]

In [57]:
tokenizer.decode(tokenized_test_dataset2[0]['input_ids'])

'<s>What does the code flip together?</s></s>def flip_axis_multi x axis is_random False if is_random factor np random uniform -1 1 if factor > 0 results []for data in x data np asarray data swapaxes axis 0 data data[ -1 ]data data swapaxes 0 axis results append data return np asarray results else return np asarray x else results []for data in x data np asarray data swapaxes axis 0 data data[ -1 ]data data swapaxes 0 axis results append data return np asarray results</s>'

In [83]:
import pandas as pd

all_preds = []
all_labels = []

df = pd.DataFrame()

for batch, labels in tqdm(tf_test_dataset2):
    predictions = generate_with_xla(batch)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = labels.numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    all_preds.extend(decoded_preds)
    all_labels.extend(decoded_labels)

    data = {
        'question_contexts': tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True),
        'labels': decoded_labels,
        'preds': decoded_preds
    }
    df = pd.DataFrame(data)

100%|██████████| 1/1 [00:05<00:00,  5.26s/it]


In [84]:
all_preds, all_labels

(['two arrays',
  'No',
  'a unique path',
  'the digital ocean switch platform',
  'the current windowlength',
  'a model',
  'a simple paginator'],
 ['the axises of multiple images',
  'No',
  'a version of path',
  'the digital ocean droplet switch',
  'small or incorrect window lengths',
  'a model instances parameter array',
  'a simplepaginator page'])

In [108]:
questions = []
contexts = []

if 'question_contexts' in df:
    for index, row in df.iterrows():
        questions.append(row['question_contexts'].split('?')[0] + '?')
        contexts.append(row['question_contexts'].split('?')[1])

    data = {
        'questions': questions,
        'contexts': contexts,
        'labels': df['labels'],
        'preds': df['preds']
    }
    df = pd.DataFrame(data)

In [80]:
df

Unnamed: 0,questions,contexts,labels,preds
0,What does the code flip together?,def flip_axis_multi x axis is_random False if ...,the axises of multiple images,two arrays
1,Does circular references between an event on a...,@skip'silverlight' def test_event_lifetime def...,No,No
2,What does not exist on the filesystem?,def unique_path path if not os path exists sys...,a version of path,a unique path
3,What does the code setup?,def setup_platform hass config add_devices dis...,the digital ocean droplet switch,the digital ocean switch platform
4,What does the code handle?,def _len_guards M if int M M or M < 0 raise Va...,small or incorrect window lengths,the current windowlength
5,What does the code convert to an array that ca...,def _model_to_fit_params model fitparam_indice...,a model instances parameter array,a model
6,What does the code get?,def simple_paginate request queryset per_page ...,a simplepaginator page,a simple paginator


In [86]:
bleu_result = bleu_metric.compute(predictions=all_preds, references=all_labels)
bleu_result

{'bleu': 0.0,
 'precisions': [0.5263157894736842, 0.25, 0.16666666666666666, 0.0],
 'brevity_penalty': 0.6227038648477501,
 'length_ratio': 0.6785714285714286,
 'translation_length': 19,
 'reference_length': 28}

In [87]:
rogue_result = rouge_metric.compute(predictions=all_preds, references=all_labels)
rogue_result

{'rouge1': 0.46734693877551026,
 'rouge2': 0.1285714285714286,
 'rougeL': 0.4625850340136055,
 'rougeLsum': 0.4680272108843538}

In [115]:
meteor_result = meteor_metric.compute(predictions=all_preds, references=all_labels)
meteor_result

{'meteor': 0.2960018704699556}

In [118]:
bertscore_result = bertscore_metric.compute(predictions=all_preds, references=all_labels, lang='en')
bertscore_result

Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.8852345943450928,
  1.000000238418579,
  0.8785045146942139,
  0.952263593673706,
  0.8838106393814087,
  0.9287854433059692,
  0.9260398745536804],
 'recall': [0.8251450061798096,
  1.000000238418579,
  0.8626241087913513,
  0.9275587201118469,
  0.8552083969116211,
  0.8669842481613159,
  0.8938043117523193],
 'f1': [0.854134202003479,
  1.000000238418579,
  0.8704918622970581,
  0.9397488236427307,
  0.8692743182182312,
  0.8968214392662048,
  0.9096365571022034],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.35.0)'}

In [117]:
sacrebleu_result = sacrebleu_metric.compute(predictions=all_preds, references=all_labels)
sacrebleu_result

{'score': 16.94436713288991,
 'counts': [10, 3, 1, 0],
 'totals': [19, 12, 6, 2],
 'precisions': [52.63157894736842, 25.0, 16.666666666666668, 25.0],
 'bp': 0.6227038648477501,
 'sys_len': 19,
 'ref_len': 28}

In [104]:
from pyprojroot import find_root, has_dir

root = find_root(has_dir(".git"))
generative_qa_dir = root / "generative-qa"
model_evaluation_dir = generative_qa_dir / "model-evaluation" / full_model_name

In [105]:
model_evaluation_dir

WindowsPath('e:/STUDIA/IPS/question-answering/generative-qa/model-evaluation/python-bart-uncased-5')

In [119]:
# Save all relevant training and evaluation metrics to a json file.
evaluation_data = {
    "training": {
        "metrics": 'history.history',
        "attempted_epochs": train_epochs,
        "best_epoch": 'best_epoch',
        "training_time": "time_measure_cb.total_training_time()",
        "gpu": core_qa_utils.get_gpu_name(),
    },
    "test_set": {
        "loss": loaded_model_evaluation,
        "bleu": bleu_result,
        "rogue": rogue_result,
        "meteor": meteor_result,
        "bertscore": bertscore_result,
        "sacrebleu": sacrebleu_result,
    },
}

core_qa_utils.save_dict_as_json(
    evaluation_data, dir_path=model_evaluation_dir, filename="evaluation_data.json"
)

In [113]:
samples_for_manual_check = 3

predictions_for_manual_check = df.sample(n = samples_for_manual_check).reset_index(drop=True)
predictions_for_manual_check

Unnamed: 0,questions,contexts,labels,preds
0,What does the code get?,def simple_paginate request queryset per_page ...,a simplepaginator page,a simple paginator
1,What does the code handle?,def _len_guards M if int M M or M < 0 raise Va...,small or incorrect window lengths,the current windowlength
2,What does the code setup?,def setup_platform hass config add_devices dis...,the digital ocean droplet switch,the digital ocean switch platform


In [114]:
df.to_csv(
    model_evaluation_dir / "test_set_sample_generation.csv", index=True, index_label="index"
)