Model trained on contexts being original_code.

In [1]:
from transformers import (
    BartTokenizerFast,
    DataCollatorForSeq2Seq,
    keras_callbacks,
    TFAutoModelForSeq2SeqLM,
)
import tensorflow as tf
import pandas as pd
from huggingface_hub import notebook_login
from datasets import concatenate_datasets
from question_answering.constants import constants
from question_answering.utils import core_qa_utils, generative_qa_utils
from question_answering.paths import generative_qa_paths
from question_answering.keras_callbacks.time_measure_callback import TimeMeasureCallback

In [2]:
# Read the three splits of the Python dataset from CSV.
dataset_dir = generative_qa_paths.python_dataset_dir
df_train, df_val, df_test = core_qa_utils.load_datasets_from_csv(dataset_dir)

# Convert the loaded splits into dataset objects, keeping the split order.
split_frames = [df_train, df_val, df_test]
(
    train_dataset,
    val_dataset,
    test_dataset,
) = core_qa_utils.convert_dataframes_to_datasets(split_frames)

# The validation split is merged into the training split; no separate
# validation set is used further down in this notebook.
train_dataset = concatenate_datasets([train_dataset, val_dataset])

In [3]:
# Single source of truth for the pretrained checkpoint: the tokenizer and the
# model (loaded later from `model_checkpoint`) must come from the same
# checkpoint, so the name is not repeated as a second literal.
model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained(model_checkpoint)

In [4]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    """Tokenize one sample as a (question, code context) pair.

    Args:
        sample: Mapping with string fields "questions" and "original_code".
        max_tokens: Forwarded to the tokenizer as ``max_length``.
        padding: Forwarded to the tokenizer as ``padding``.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the pair.
    """
    # Surrounding whitespace is stripped from both fields before tokenizing.
    stripped_question, stripped_context = (
        sample[field].strip() for field in ("questions", "original_code")
    )
    return tokenizer(
        stripped_question,
        stripped_context,
        max_length=max_tokens,
        padding=padding,
    )


# Tokenize both splits with the default arguments of `tokenize_sample`
# (no max length, no padding) to inspect how long the inputs get.
tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_test_dataset = test_dataset.map(tokenize_sample)

# Length of the longest tokenized input in each split.
longest_train_input = max(
    len(token_ids) for token_ids in tokenized_train_dataset["input_ids"]
)
longest_test_input = max(
    len(token_ids) for token_ids in tokenized_test_dataset["input_ids"]
)

print("Max number of tokens in tokenized train dataset: ", longest_train_input)
print("Max number of tokens in tokenized test dataset: ", longest_test_input)

Map:   0%|          | 0/63080 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1490 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Max number of tokens in tokenized train dataset:  9244
Max number of tokens in tokenized test dataset:  901


In [5]:
# Token budget used across the notebook: samples with more input tokens are
# filtered out below, and the same value is later passed as the tokenizer's
# `max_length` and as `max_new_tokens` for generation.
max_length = 256


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    """Keep only the samples whose tokenized input fits in ``max_tokens``.

    Each sample is tokenized with ``tokenize_sample`` (default arguments) and
    kept when the number of ``input_ids`` is less than or equal to
    ``max_tokens``.

    Args:
        dataset: Iterable of samples that also exposes ``select(indices)``.
        max_tokens: Maximum allowed number of input tokens (inclusive).

    Returns:
        The result of ``dataset.select`` on the indices of the kept samples,
        in their original order.
    """
    # Collect the kept indices in a single pass. The previous version rebuilt
    # a set of rejected indices for every candidate index, which made the
    # selection quadratic in the dataset size.
    indices_to_keep = [
        index
        for index, sample in enumerate(dataset)
        if len(tokenize_sample(sample)["input_ids"]) <= max_tokens
    ]

    return dataset.select(indices_to_keep)


# Apply the same token-length filter to the train and test splits, in that order.
filtered_train_dataset, filtered_test_dataset = (
    filter_samples_below_number_of_tokens(split, max_tokens=max_length)
    for split in (train_dataset, test_dataset)
)



In [6]:
# Sample counts of each split before and after the token-length filter.
train_count_before = len(train_dataset)
test_count_before = len(test_dataset)
train_count_after = len(filtered_train_dataset)
test_count_after = len(filtered_test_dataset)

print(
    "Number of samples in tokenized train dataset before filtering: ",
    train_count_before,
)
print(
    "Number of samples in tokenized test dataset before filtering: ",
    test_count_before,
)

print("\n---------------\n")

print(
    "Number of samples in tokenized train dataset after filtering: ",
    train_count_after,
)
print(
    "Number of samples in tokenized test dataset after filtering: ",
    test_count_after,
)

Number of samples in tokenized train dataset before filtering:  63080
Number of samples in tokenized test dataset before filtering:  7000

---------------

Number of samples in tokenized train dataset after filtering:  61758
Number of samples in tokenized test dataset after filtering:  6854


In [7]:
def preprocess_dataset(dataset):
    """Tokenize a batch of question/context pairs together with their answers.

    Args:
        dataset: Mapping with "questions", "original_code" and "answers"
            columns, each an iterable of strings.

    Returns:
        The tokenizer output for the (question, context) pairs, with the
        answers passed as ``text_target``; inputs are truncated to the
        notebook-level ``max_length``.
    """

    def _stripped(column):
        # Whitespace-trimmed copy of one text column.
        return [text.strip() for text in dataset[column]]

    return tokenizer(
        _stripped("questions"),
        _stripped("original_code"),
        text_target=_stripped("answers"),
        max_length=max_length,
        truncation=True,
    )

In [8]:
# Tokenize the filtered splits in batches; the original columns are dropped
# so that only the columns returned by `preprocess_dataset` remain.
train_columns = filtered_train_dataset.column_names
tokenized_train_dataset = filtered_train_dataset.map(
    preprocess_dataset, batched=True, remove_columns=train_columns
)

test_columns = filtered_test_dataset.column_names
tokenized_test_dataset = filtered_test_dataset.map(
    preprocess_dataset, batched=True, remove_columns=test_columns
)

Map:   0%|          | 0/61758 [00:00<?, ? examples/s]

Map:   0%|          | 0/6854 [00:00<?, ? examples/s]

In [9]:
# Model parameters
training_number = 1

model_name = "python-bart-uncased"
full_model_name = f"{model_name}-{training_number}"

# Checkpoints
checkpoint_filename_template = constants.checkpoint_filename_template
checkpoints_path = (
    generative_qa_paths.training_checkpoints_dir
    / full_model_name
    / checkpoint_filename_template
)

# Hub
hub_path = generative_qa_paths.hub_models_location / full_model_name

# Saved models
saved_models_path = generative_qa_paths.saved_models_dir / full_model_name

# Evaluation
model_evaluation_dir = generative_qa_paths.model_evaluation_dir / full_model_name

# Hyperparameters
batch_size = 8
train_epochs = 1

In [10]:
# Load model for fine-tuning
# Load the pretrained seq2seq model for fine-tuning. `from_pt=True` makes the
# TensorFlow model initialise from the checkpoint's PyTorch weights (see the
# conversion messages in this cell's output).
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForConditionalGeneration: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
- This IS expected if you are initializing TFBartForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [11]:
# Dataset preparation
# Collator producing TensorFlow tensors for seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Arguments shared by the train and test dataset builders.
shared_dataset_kwargs = dict(
    model=model,
    collator=data_collator,
    batch_size=batch_size,
)

# Only the training data is shuffled.
tf_train_dataset = core_qa_utils.prepare_tf_dataset(
    hf_dataset=tokenized_train_dataset,
    shuffle=True,
    **shared_dataset_kwargs,
)

tf_test_dataset = core_qa_utils.prepare_tf_dataset(
    hf_dataset=tokenized_test_dataset,
    **shared_dataset_kwargs,
)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
# Callbacks
# Callbacks
# Save the weights (not the full model) to the checkpoint path after every epoch.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    checkpoints_path, verbose=1, save_weights_only=True
)
# Training runs without validation data, so the default monitor ("val_loss")
# is never reported and early stopping could not trigger. Monitor the
# training loss instead, the same metric used to pick the best checkpoint.
early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor="loss", patience=1)
# push_to_hub = keras_callbacks.PushToHubCallback(
#     output_dir=full_model_name, tokenizer=tokenizer
# )

# Project callback; its `total_training_time()` is referenced when the
# evaluation data is assembled at the end of the notebook.
time_measure_cb = TimeMeasureCallback()

callbacks = [
    checkpoint_cb,
    early_stop_cb,
    # push_to_hub,
    time_measure_cb,
]

In [13]:
# Compile
# Total number of optimizer steps: batches per epoch times number of epochs.
num_train_steps = len(tf_train_dataset) * train_epochs

# Learning rate decays from 5e-5 down to 0 over the whole training run.
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# Train in mixed-precision float16
# NOTE(review): the policy is set after the model was instantiated in an
# earlier cell; confirm that layers created before this call actually pick up
# the mixed_float16 policy.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Compile with the optimizer only; no loss is passed here, so presumably the
# model's built-in loss is used — TODO confirm.
model.compile(optimizer=optimizer)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070, compute capability 8.9


In [14]:
# Print the layer / parameter-count overview of the compiled model.
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  139420416 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50265     
 r)                                                              
                                                                 
Total params: 139,470,681
Trainable params: 139,420,416
Non-trainable params: 50,265
_________________________________________________________________


In [15]:
# Fit the model on the new data
# Fit the model on the new data.
# No validation data is passed, so `history` only holds training metrics.
history = model.fit(
    tf_train_dataset,
    epochs=train_epochs,
    callbacks=callbacks,
)

Epoch 1: saving model to e:\STUDIA\IPS\question-answering\generative-qa\training-checkpoints\python-bart-uncased-1\cp-01.ckpt


In [18]:
# Get best version of the model
# Get best version of the model: the project helper picks it by training
# "loss" from `history` and is asked to remove the checkpoints afterwards
# (`remove_checkpoints=True`).
best_model, best_epoch = generative_qa_utils.get_best_model_from_checkpoints(
    model,
    history,
    model_name=full_model_name,
    metric="loss",
    remove_checkpoints=True,
    model_type="generative",
)

In [19]:
# Save best model's weights
# Save best model's weights under the full model name.
# NOTE(review): the storage location is decided inside `save_model`;
# presumably it matches `saved_models_path` — confirm.
generative_qa_utils.save_model(best_model, model_name=full_model_name)

In [15]:
# Load the weights saved under `full_model_name` into `model`.
# NOTE(review): the existing `model` object is passed in, so the returned
# model may be the same (mutated) instance — confirm in the helper.
loaded_weights_model = generative_qa_utils.load_weights_into_model(
    model=model, model_name=full_model_name
)

In [16]:
# Get predictions from the best model
# Evaluate the reloaded model on the test set. This computes evaluation
# results (the model was compiled without extra metrics, so presumably just
# the loss); it does not produce predictions.
loaded_model_evaluation = loaded_weights_model.evaluate(tf_test_dataset)



In [18]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm


def generate_with_xla(batch):
    """Generate output token ids for one batch with the reloaded model.

    Args:
        batch: Mapping providing "input_ids" and "attention_mask".

    Returns:
        The result of ``loaded_weights_model.generate`` with at most
        ``max_length`` newly generated tokens.
    """
    # Only these two entries of the batch are forwarded to `generate`.
    generation_inputs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
    }
    return loaded_weights_model.generate(
        **generation_inputs, max_new_tokens=max_length
    )

In [42]:
# Carve a 1% subset out of the tokenized test set for generation-based
# evaluation. The split shuffles, so a fixed seed is passed to make the
# selected subset (and every metric computed on it) reproducible across runs.
tokenized_test_dataset2 = tokenized_test_dataset.train_test_split(
    test_size=0.01, seed=42
)["test"]
# Sanity check: number of input tokens in the first sample of the subset.
len(tokenized_test_dataset2[0]["input_ids"])

55

In [43]:
# Batch the test subset the same way as the full test set (same collator and
# batch size, no shuffling requested).
tf_test_dataset2 = core_qa_utils.prepare_tf_dataset(
    model=loaded_weights_model,
    hf_dataset=tokenized_test_dataset2,
    collator=data_collator,
    batch_size=batch_size,
)

In [44]:
# Generate answers for the test subset batch by batch and collect the decoded
# predictions, reference answers and inputs for the whole subset.
all_preds = []
all_labels = []
question_contexts_list = []

for batch, labels in tqdm(tf_test_dataset2):
    predictions = generate_with_xla(batch)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Label positions holding -100 are not real token ids; swap them for the
    # pad token so the labels can be decoded (and skipped as special tokens).
    labels = labels.numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    all_preds.extend(pred.strip() for pred in decoded_preds)
    all_labels.extend(label.strip() for label in decoded_labels)
    # The model input is the question and the code context encoded together,
    # so decoding it gives the combined "question + context" text.
    question_contexts_list.extend(
        tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
    )

100%|██████████| 9/9 [00:53<00:00,  5.94s/it]


In [50]:
# One row per evaluated sample: decoded input, reference answer, model output.
data = dict(
    question_contexts=question_contexts_list,
    labels=all_labels,
    predictions=all_preds,
)
dataset_dataframe_with_predictions = pd.DataFrame(data)

In [None]:
# Alternative path via the project helper, run on the full `tf_test_dataset`.
# This cell has no execution count in the saved notebook. Running it
# reassigns `dataset_dataframe_with_predictions`, replacing the frame built
# from the test subset.
dataset_dataframe_with_predictions = (
    generative_qa_utils.get_dataset_dataframe_with_predictions(
        model=loaded_weights_model,
        tokenizer=tokenizer,
        tf_dataset=tf_test_dataset,
        max_length=max_length,
    )
)

In [51]:
# Display the predictions frame (inputs, reference answers, model outputs).
dataset_dataframe_with_predictions

Unnamed: 0,question_contexts,labels,predictions
0,What does the code ensure?def test_complex ent...,we tokenize complex numbers properly,that a complex is a complex
1,What does switchport have?def is_switchport_de...,oob layer 2 config,the default vlan
2,What does the code write?@jingo register funct...,a string that tells the user what they are see...,a jinja
3,How does the code convert a thrift structure t...,by recursing over the dictionary,using thrift2json
4,What does the code compute?def median_absolute...,the median of the absolute deviations from the...,the median absolute deviation of an array
...,...,...,...
64,What does the code create?def create_var size ...,a new internal variable,a new var
65,What does the code pull from the database?def ...,the status message,a status message
66,For what purpose does the code add extra xml p...,for the calling test,for a record
67,What does the code compute?@dispatch Broadcast...,the result of a broadcast expression,the results of a Mongo query


In [52]:
# Split the combined "question_contexts" text into separate question and
# context columns (see the displayed result below).
# NOTE(review): the frame is reassigned to its own transformed version, so
# re-running this cell feeds the already-split frame back into the helper —
# confirm the helper tolerates that, or re-run the cell that builds the frame
# first.
dataset_dataframe_with_predictions = (
    generative_qa_utils.split_questions_and_contexts_into_two_columns(
        dataframe=dataset_dataframe_with_predictions
    )
)
dataset_dataframe_with_predictions

Unnamed: 0,questions,contexts,labels,predictions
0,What does the code ensure?,def test_complex entry tokenize'1j'[0][0]asser...,we tokenize complex numbers properly,that a complex is a complex
1,What does switchport have?,def is_switchport_default existing c1 existing...,oob layer 2 config,the default vlan
2,What does the code write?,@jingo register functiondef showing query page...,a string that tells the user what they are see...,a jinja
3,How does the code convert a thrift structure t...,def thrift2json tft if isinstance tft type Non...,by recursing over the dictionary,using thrift2json
4,What does the code compute?,def median_absolute_deviation x x asarray x me...,the median of the absolute deviations from the...,the median absolute deviation of an array
...,...,...,...,...
64,What does the code create?,def create_var size var_id None if var_id is N...,a new internal variable,a new var
65,What does the code pull from the database?,def get_site_status_msg course_key try if not ...,the status message,a status message
66,For what purpose does the code add extra xml p...,@pytest fixturedef record_xml_property request...,for the calling test,for a record
67,What does the code compute?,@dispatch Broadcast MongoQuery def post_comput...,the result of a broadcast expression,the results of a Mongo query


In [53]:
# Compute the text-generation metrics on the predictions frame. The helper
# returns five results, unpacked in this order.
(
    bleu_result,
    rogue_result,
    meteor_result,
    bertscore_result,
    sacrebleu_result,
) = generative_qa_utils.get_metrics(dataset_dataframe_with_predictions)

# Printed headings use the correct metric names (previously misspelled as
# "ROGUE", "VERTSCORE" and "SCRABLEU"). The `rogue_result` variable keeps its
# name because later cells reference it.
print(
    "BLEU:\n",
    bleu_result,
    "\nROUGE:\n",
    rogue_result,
    "\nMETEOR:\n",
    meteor_result,
    "\nBERTSCORE:\n",
    bertscore_result,
    "\nSACREBLEU:\n",
    sacrebleu_result,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BLEU:
 {'bleu': 0.0, 'precisions': [0.39436619718309857, 0.14583333333333334, 0.04819277108433735, 0.0], 'brevity_penalty': 0.4295271087576894, 'length_ratio': 0.5419847328244275, 'translation_length': 213, 'reference_length': 393} 
ROGUE:
 {'rouge1': 0.2701503668255586, 'rouge2': 0.07623869689087082, 'rougeL': 0.25601968009257015, 'rougeLsum': 0.2572279690566136} 
METEOR:
 {'meteor': 0.1731376051843862} 
VERTSCORE:
 {'precision': [0.8573920130729675, 0.8738719820976257, 0.818203866481781, 0.8449787497520447, 0.9142791628837585, 0.8252677917480469, 0.7977335453033447, 0.880974292755127, 0.9026201367378235, 0.9194954037666321, 0.8559521436691284, 0.8920223712921143, 0.8359789848327637, 0.9154269099235535, 0.864473819732666, 0.9021365642547607, 0.8582707643508911, 0.9263198375701904, 0.9612858295440674, 0.8641085624694824, 0.8402149081230164, 0.9974247217178345, 0.844182550907135, 0.9112904667854309, 0.9227746725082397, 0.8870024681091309, 0.9315030574798584, 0.8743858337402344, 0.897162

In [56]:
# Save all relevant training and evaluation metrics to a json file.
# Save all relevant training and evaluation metrics to a json file.
# The training entries and the test loss used to be quoted placeholders
# (e.g. "history.history"), so the file stored variable names instead of
# values. They now reference the real objects, which means the training and
# evaluation cells must have run in the same kernel session.
evaluation_data = {
    "training": {
        "metrics": history.history,
        "attempted_epochs": train_epochs,
        "best_epoch": best_epoch,
        # NOTE(review): method name taken from the former placeholder string;
        # confirm it matches the TimeMeasureCallback API.
        "training_time": time_measure_cb.total_training_time(),
        "gpu": core_qa_utils.get_gpu_name(),
    },
    "test_set": {
        "loss": loaded_model_evaluation,
        "bleu": bleu_result,
        # The "rogue" key spelling is kept so existing consumers of the json
        # file keep working.
        "rogue": rogue_result,
        "meteor": meteor_result,
        "bertscore": bertscore_result,
        "sacrebleu": sacrebleu_result,
    },
}

core_qa_utils.save_dict_as_json(
    evaluation_data, dir_path=model_evaluation_dir, filename="evaluation_data.json"
)

In [57]:
# Draw a random sample of predictions for manual inspection.
samples_for_manual_check = 50

predictions_for_manual_check = dataset_dataframe_with_predictions.sample(
    # Never request more rows than exist; sampling without replacement would
    # otherwise raise on a frame smaller than the requested size.
    n=min(samples_for_manual_check, len(dataset_dataframe_with_predictions)),
    # Fixed seed so the same rows are inspected on every run.
    random_state=42,
).reset_index(drop=True)
predictions_for_manual_check

Unnamed: 0,questions,contexts,labels,predictions
0,What does the code get?,def get_features return get_var 'FEATURES',the value of features variable in the make,all available features
1,What calls what it?,def _norm_encoding encoding try return codecs ...,pythons codecs module,decorator
2,What does the code find?,def domains_for_certname config certname def u...,the domains in the cert with name certname,a list of domains for a given certname
3,What does the code compare for tests?,def subrange_exercise mult lb ub m MultisetPar...,filter - based and more optimized subrange imp...,the subrange exercise
4,What does the code validate?,def _validate_ui_config obj_type ui_config ref...,the value of a ui configuration,the ui configuration
5,What can generate injectors to inject the prov...,def inject **k return InjectionFactory k,a injectorfactory,this
6,What does the code create?,def create_var size var_id None if var_id is N...,a new internal variable,a new var
7,How do a dsc configuration compile?,def run_config path source None config_name No...,in the form of a powershell script,with the specified path
8,What does the code pull from the database?,def get_site_status_msg course_key try if not ...,the status message,a status message
9,What have no clr attributes?,@skip'multiple_execute' def test_no_clr_attrib...,verify types,objects


In [58]:
# Write the predictions frame to the evaluation directory, storing the row
# index in a column named "index".
# NOTE(review): this saves the whole `dataset_dataframe_with_predictions`,
# not `predictions_for_manual_check` — confirm which one the file named
# "test_set_sample_generation.csv" is meant to contain.
dataset_dataframe_with_predictions.to_csv(
    model_evaluation_dir / "test_set_sample_generation.csv",
    index=True,
    index_label="index",
)