Model trained on contaxts being original_code. 

In [1]:
from transformers import (
    BartTokenizerFast,
    DataCollatorForSeq2Seq,
    keras_callbacks,
    TFAutoModelForSeq2SeqLM,
)
import tensorflow as tf
from datasets import Dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
from huggingface_hub import notebook_login
from datasets import concatenate_datasets
from question_answering.constants import constants
from question_answering.utils import core_qa_utils, generative_qa_utils
from question_answering.paths import generative_qa_paths
from question_answering.keras_callbacks.time_measure_callback import TimeMeasureCallback

In [2]:
df_train, df_val, df_test = core_qa_utils.load_datasets_from_csv(
    generative_qa_paths.python_dataset_dir
)

df_train = pd.concat([df_train, df_val], ignore_index=True)

train_dataset, test_dataset = core_qa_utils.convert_dataframes_to_datasets(
    [df_train, df_test]
)

In [3]:
model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

In [4]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    question = sample["questions"].strip()
    context = sample["original_code"].strip()

    return tokenizer(question, context, max_length=max_tokens, padding=padding)

tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_test_dataset = test_dataset.map(tokenize_sample)

print(
    "Max number of tokens in tokenized train dataset: ",
    len(max(tokenized_train_dataset["input_ids"], key=len)),
)
print(
    "Max number of tokens in tokenized test dataset: ",
    len(max(tokenized_test_dataset["input_ids"], key=len)),
)

Map:   0%|          | 0/63080 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1490 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Max number of tokens in tokenized train dataset:  9244
Max number of tokens in tokenized test dataset:  901


In [5]:
max_length = 256


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    indices_to_remove = []

    # Find indices of samples where number of tokens exceeds max number of tokens
    for index, sample in enumerate(dataset):
        tokenized_sample = tokenize_sample(sample)
        if len(tokenized_sample["input_ids"]) > max_tokens:
            indices_to_remove.append(index)

    # Keep only samples with number of tokens less or equal than max number of tokens
    dataset_indices = range(len(dataset))
    filtered_dataset = dataset.select(
        index for index in dataset_indices if index not in set(indices_to_remove)
    )

    return filtered_dataset


filtered_train_dataset = filter_samples_below_number_of_tokens(
    train_dataset, max_tokens=max_length
)
filtered_test_dataset = filter_samples_below_number_of_tokens(
    test_dataset, max_tokens=max_length
)



In [6]:
print(
    "Number of samples in tokenized train dataset before filtering: ",
    len(train_dataset),
)
print(
    "Number of samples in tokenized test dataset before filtering: ", len(test_dataset)
)

print("\n---------------\n")

print(
    "Number of samples in tokenized train dataset after filtering: ",
    len(filtered_train_dataset),
)
print(
    "Number of samples in tokenized test dataset after filtering: ",
    len(filtered_test_dataset),
)

Number of samples in tokenized train dataset before filtering:  63080
Number of samples in tokenized test dataset before filtering:  7000

---------------

Number of samples in tokenized train dataset after filtering:  61758
Number of samples in tokenized test dataset after filtering:  6854


In [7]:
def preprocess_dataset(dataset):
    questions = [q.strip() for q in dataset["questions"]]
    contexts = [c.strip() for c in dataset["original_code"]]
    answers = [c.strip() for c in dataset["answers"]]

    inputs = tokenizer(
        questions,
        contexts,
        text_target=answers,
        max_length=max_length,
        truncation=True
    )

    return inputs

In [8]:
tokenized_train_dataset = filtered_train_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_train_dataset.column_names,
)
tokenized_test_dataset = filtered_test_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_test_dataset.column_names,
)

Map:   0%|          | 0/61758 [00:00<?, ? examples/s]

Map:   0%|          | 0/6854 [00:00<?, ? examples/s]

In [9]:
# Model parameters
training_number = 1

model_name = "python-bart-uncased"
full_model_name = f"{model_name}-{training_number}"

# Checkpoints
checkpoint_filename_template = constants.checkpoint_filename_template
checkpoints_path = (
    generative_qa_paths.training_checkpoints_dir
    / full_model_name
    / checkpoint_filename_template
)

# Hub
hub_path = generative_qa_paths.hub_models_location / full_model_name

# Saved models
saved_models_path = generative_qa_paths.saved_models_dir / full_model_name

# Evaluation
model_evaluation_dir = generative_qa_paths.model_evaluation_dir / full_model_name

# Hyperparameters
batch_size = 8
train_epochs = 3

In [10]:
# Load model for fine-tuning
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForConditionalGeneration: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
- This IS expected if you are initializing TFBartForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [11]:
# Dataset preparation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

tf_train_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_train_dataset,
    collator=data_collator,
    batch_size=batch_size,
    shuffle=True
)

tf_test_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_test_dataset,
    collator=data_collator,
    batch_size=batch_size,
)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
# Callbacks
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    checkpoints_path, verbose=1, save_weights_only=True
)
early_stop_cb = tf.keras.callbacks.EarlyStopping(patience=1)
# push_to_hub = keras_callbacks.PushToHubCallback(
#     output_dir=full_model_name, tokenizer=tokenizer
# )

time_measure_cb = TimeMeasureCallback()

callbacks = [
    checkpoint_cb,
    early_stop_cb,
    # push_to_hub,
    time_measure_cb
]

In [13]:
# Compile
num_train_steps = len(tf_train_dataset) * train_epochs

lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Compile
model.compile(optimizer=optimizer)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070, compute capability 8.9


In [14]:
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  139420416 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50265     
 r)                                                              
                                                                 
Total params: 139,470,681
Trainable params: 139,420,416
Non-trainable params: 50,265
_________________________________________________________________


In [15]:
# Fit the model on the new data
history = model.fit(
    tf_train_dataset,
    epochs=train_epochs,
    callbacks=callbacks,
)

Epoch 1/3
Epoch 1: saving model to e:\STUDIA\IPS\question-answering\generative-qa\training-checkpoints\python-bart-uncased-1\cp-01.ckpt
Epoch 2/3
Epoch 2: saving model to e:\STUDIA\IPS\question-answering\generative-qa\training-checkpoints\python-bart-uncased-1\cp-02.ckpt
Epoch 3/3
Epoch 3: saving model to e:\STUDIA\IPS\question-answering\generative-qa\training-checkpoints\python-bart-uncased-1\cp-03.ckpt


In [17]:
# Get best version of the model
best_model, best_epoch = core_qa_utils.get_best_model_from_checkpoints(
    model, history, model_name=full_model_name, metric="loss", remove_checkpoints=True, model_type="generative"
)

In [18]:
# Save best model's weights
generative_qa_utils.save_model(best_model, model_name=full_model_name)

In [19]:
loaded_weights_model = generative_qa_utils.load_weights_into_model(
    model=model, 
    model_name=full_model_name
)

In [20]:
# Get predictions from the best model
loaded_model_evaluation = loaded_weights_model.evaluate(tf_test_dataset)



In [21]:
parts_number = 100

test_dataset_pandas = tokenized_test_dataset.to_pandas()
test_dataset_array = np.array_split(test_dataset_pandas, parts_number)

test_datasets_list = []

for row in tqdm(test_dataset_array):
    test_datasets_list.append(core_qa_utils.prepare_tf_dataset(
        model=model,
        hf_dataset=Dataset.from_pandas(row),
        collator=data_collator,
        batch_size=batch_size,
))

  return bound(*args, **kwds)
100%|██████████| 100/100 [00:15<00:00,  6.55it/s]


In [22]:
questions_and_answers_df = pd.DataFrame()

In [30]:
dataset_dataframe_with_predictions = generative_qa_utils.get_dataset_dataframe_with_predictions(
    model=loaded_weights_model, tokenizer=tokenizer, tf_dataset_list=test_datasets_list, dataframe=questions_and_answers_df, max_length=max_length, index_to_start_from=99)

100%|██████████| 9/9 [00:53<00:00,  5.94s/it]
100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


In [31]:
dataset_dataframe_with_predictions = generative_qa_utils.split_questions_and_contexts_into_two_columns(dataframe=dataset_dataframe_with_predictions)
dataset_dataframe_with_predictions

Unnamed: 0,questions,contexts,labels,predictions
0,What d i d download for given image?,def get_image_files_json image_id files_json g...,the specified layer,image files
1,What contain the key word?,def has_key k trie return _retrive_branch k tr...,trie,all words in k
2,What do trie contain?,def has_key k trie return _retrive_branch k tr...,the key word,key
3,What does the code create?,def create_api_key name description enabled Tr...,an api key given name and description,an api key
4,What did by loadobjects use the code?,def defaultFactoryMethod rowClass data kw newO...,to create rowobject instances,to set the default factory method
...,...,...,...,...
63,What does the code convert into an int64index ...,def _dt_to_epoch_ns dt_series index pd to_date...,a timeseries,a datetime
64,What converts into a python list of integers?,def positive_int_list argument if'' in argumen...,a space- or comma - separated list of values,a list of integers
65,What does a space- or comma - separated list o...,def positive_int_list argument if'' in argumen...,into a python list of integers,to a positive integer
66,What do the current user run?,def is_current_user_capable api_name user get_...,a certain api args,the given api


In [32]:
bleu_result, rogue_result, meteor_result, bertscore_result, sacrebleu_result = generative_qa_utils.get_metrics(dataset_dataframe_with_predictions)
print(
    "BLEU:\n",
    bleu_result,
    "\nROGUE:\n",
    rogue_result,
    "\nMETEOR:\n",
    meteor_result,
    "\nVERTSCORE:\n",
    bertscore_result,
    "\nSCRABLEU:\n",
    sacrebleu_result,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Artur\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BLEU:
 {'bleu': 0.06883854647761965, 'precisions': [0.37727272727272726, 0.15789473684210525, 0.0989010989010989, 0.06382978723404255], 'brevity_penalty': 0.49433322011463077, 'length_ratio': 0.5866666666666667, 'translation_length': 220, 'reference_length': 375} 
ROGUE:
 {'rouge1': 0.28053910958322725, 'rouge2': 0.10156889678948502, 'rougeL': 0.2785424116945856, 'rougeLsum': 0.2808231658934983} 
METEOR:
 {'meteor': 0.1893512777495596} 
VERTSCORE:
 {'precision': [0.8593517541885376, 0.8372496366500854, 0.8991413712501526, 0.9680740833282471, 0.8955209851264954, 0.8728666305541992, 0.8574054837226868, 0.9993814826011658, 0.8920223712921143, 0.9037001729011536, 0.8135514855384827, 0.8349941968917847, 0.9011490941047668, 0.8364732265472412, 0.8376576900482178, 0.8579943180084229, 0.8714290857315063, 0.8299556374549866, 0.9360508918762207, 0.8897707462310791, 0.8863735198974609, 0.8985435962677002, 0.9773555397987366, 0.8346676230430603, 0.8371437191963196, 0.8616835474967957, 0.9598776698

In [33]:
# Save all relevant training and evaluation metrics to a json file.
evaluation_data = {
    "training": {
        "metrics": 'history.history',
        "attempted_epochs": 'train_epochs',
        "best_epoch": 'best_epoch',
        "training_time": 'time_measure_cb.total_training_time()',
        "gpu": core_qa_utils.get_gpu_name(),
    },
    "test_set": {
        "loss": 'loaded_model_evaluation',
        "bleu": bleu_result,
        "rogue": rogue_result,
        "meteor": meteor_result,
        "bertscore": bertscore_result,
        "sacrebleu": sacrebleu_result,
    },
}

core_qa_utils.save_dict_as_json(
    evaluation_data, dir_path=model_evaluation_dir, filename="evaluation_data.json"
)

In [34]:
samples_for_manual_check = 50

predictions_for_manual_check = dataset_dataframe_with_predictions.sample(n = samples_for_manual_check).reset_index(drop=True)
predictions_for_manual_check

Unnamed: 0,questions,contexts,labels,predictions
0,What does the code designate?,def set_time_server time_server 'time apple co...,a network time server,the correct time server
1,What does this function create as defined in t...,def keyboard_role name rawtext text lineno inl...,an inline console input block,a keyboard role
2,What does the code remove?,def subtract d1 d2 warnings warn 'deprecated' ...,all items from d1 whose key occurs in d2,all keys from d1 from d2
3,What does the code return?,def get_http_expiry _Expirestype _num if _Expi...,the future date,an http expire date
4,What does the tokenize ( ) function accept?,def tokenize readline tokeneater printtoken tr...,two parameters : one representing the input st...,a readline
5,What does the code build?,def get_dynamic_link_map for_delete False if g...,a map of all dynamically linked tables,a dynamic link map
6,What does the code rebuild from the file system?,def Rebuild verbose 1 clsidToTypelib clear inf...,the cache indexes,the cache
7,When did returns use luns?,def _get_used_lun_ids_for_mappings mappings us...,when provided with mappings,when mappings are used
8,How does the docs build?,def build_pdf branch os chdir os path join git...,using sphinx in the buildenv virtualenv,using sphinx - build
9,What does this function perform on cpu?,def svd a full_matrices 1 compute_uv 1 return ...,the svd,svd


In [35]:
dataset_dataframe_with_predictions.to_csv(
    model_evaluation_dir / "test_set_sample_generation.csv", index=True, index_label="index"
)