Model trained using the `original_code` column as the context.

In [1]:
from transformers import (
    AutoTokenizer,
    BartTokenizerFast,
    TFAutoModelForQuestionAnswering,
    DataCollatorForSeq2Seq,
    keras_callbacks,
    TFAutoModelForSeq2SeqLM,
    TFEncoderDecoderModel,
)
import tensorflow as tf
from huggingface_hub import notebook_login
from question_answering.constants import constants
from question_answering.utils import core_qa_utils, generative_qa_utils
from question_answering.paths import generative_qa_paths

In [2]:
# Load the train/val/test DataFrames from the Python dataset directory
python_dataset_splits = core_qa_utils.load_train_val_test_datasets(
    generative_qa_paths.python_dataset_dir
)
df_train, df_val, df_test = python_dataset_splits

# Convert the three DataFrames into dataset objects, keeping the split order
train_dataset, val_dataset, test_dataset = core_qa_utils.convert_dataframes_to_datasets(
    [df_train, df_val, df_test]
)

In [5]:
# Single source of truth for the pretrained checkpoint: the tokenizer here and
# the model loaded later both read this name, so they cannot drift apart.
model_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained(model_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
tokenizer

PreTrainedTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True)})

In [None]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    """Tokenize a single sample as a (question, code context) pair.

    Args:
        sample: Mapping with string fields "questions" and "original_code".
        max_tokens: Forwarded to the tokenizer as ``max_length``.
        padding: Forwarded to the tokenizer as ``padding``.

    Returns:
        The output of the module-level ``tokenizer`` for the stripped pair.
    """
    # Both fields are stripped of surrounding whitespace before tokenization
    stripped_question, stripped_context = (
        sample[field].strip() for field in ("questions", "original_code")
    )
    return tokenizer(
        stripped_question,
        stripped_context,
        max_length=max_tokens,
        padding=padding,
    )

# Tokenize each split with the default arguments of `tokenize_sample`
# to inspect how long the raw sequences are.
tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_val_dataset = val_dataset.map(tokenize_sample)
tokenized_test_dataset = test_dataset.map(tokenize_sample)

# Report the length of the longest tokenized input per split
for length_message, tokenized_split in (
    ("Max number of tokens in tokenized train dataset: ", tokenized_train_dataset),
    ("Max number of tokens in tokenized val dataset: ", tokenized_val_dataset),
    ("Max number of tokens in tokenized test dataset: ", tokenized_test_dataset),
):
    longest_input_ids = max(tokenized_split["input_ids"], key=len)
    print(length_message, len(longest_input_ids))

In [None]:
# Token budget shared across the notebook: samples whose tokenized input exceeds
# it are filtered out, inputs are padded to it, and generation is capped at this
# many new tokens.
max_length = 256


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    """Keep only samples whose tokenized input has at most ``max_tokens`` tokens.

    Each sample is tokenized with ``tokenize_sample`` using its default
    arguments, and the length of the resulting "input_ids" is compared
    against ``max_tokens``.

    Args:
        dataset: Dataset that can be enumerated and supports ``select``.
        max_tokens: Largest allowed number of input tokens (inclusive).

    Returns:
        The result of ``dataset.select`` over the indices of the kept samples,
        in their original order.
    """
    # Collect the surviving indices in a single pass. The previous version
    # rebuilt a set of rejected indices for every index it tested.
    indices_to_keep = [
        index
        for index, sample in enumerate(dataset)
        if len(tokenize_sample(sample)["input_ids"]) <= max_tokens
    ]

    # Pass a concrete list (not a generator) so `select` gets a sized sequence
    return dataset.select(indices_to_keep)


# Apply the same token-count filter to the train, val and test splits (in that order)
filtered_train_dataset, filtered_val_dataset, filtered_test_dataset = (
    filter_samples_below_number_of_tokens(unfiltered_split, max_tokens=max_length)
    for unfiltered_split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Sample counts per split before the token-count filter
for count_message, counted_split in (
    ("Number of samples in tokenized train dataset before filtering: ", train_dataset),
    ("Number of samples in tokenized val dataset before filtering: ", val_dataset),
    ("Number of samples in tokenized test dataset before filtering: ", test_dataset),
):
    print(count_message, len(counted_split))

print("\n---------------\n")

# Sample counts per split after the token-count filter
for count_message, counted_split in (
    (
        "Number of samples in tokenized train dataset after filtering: ",
        filtered_train_dataset,
    ),
    (
        "Number of samples in tokenized val dataset after filtering: ",
        filtered_val_dataset,
    ),
    (
        "Number of samples in tokenized test dataset after filtering: ",
        filtered_test_dataset,
    ),
):
    print(count_message, len(counted_split))

In [None]:
def preprocess_dataset(dataset):
    """Tokenize a batch of question/context pairs together with their answers.

    Args:
        dataset: Batch mapping whose "questions", "original_code" and
            "answers" entries are iterables of strings.

    Returns:
        The tokenizer output for the batch. Questions and contexts are encoded
        as pairs, answers are passed as ``text_target``, padding to the
        module-level ``max_length`` is requested, and offset mappings are
        requested as well. No truncation argument is passed.
    """

    def _stripped(column_name):
        # Every text field is stripped of surrounding whitespace first
        return [text.strip() for text in dataset[column_name]]

    return tokenizer(
        _stripped("questions"),
        _stripped("original_code"),
        text_target=_stripped("answers"),
        max_length=max_length,
        padding="max_length",
        return_offsets_mapping=True,
    )

In [None]:
# Run the batched preprocessing over each filtered split, dropping that split's
# original columns so only the tokenizer output remains.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    filtered_split.map(
        preprocess_dataset,
        batched=True,
        remove_columns=filtered_split.column_names,
    )
    for filtered_split in (
        filtered_train_dataset,
        filtered_val_dataset,
        filtered_test_dataset,
    )
)

In [None]:
# Sanity check: after padding, every input should be exactly `max_length` long
for check_message, padded_split in (
    (
        f"All tokenized train dataset entries have {max_length} tokens: ",
        tokenized_train_dataset,
    ),
    (
        f"All tokenized val dataset entries have {max_length} tokens: ",
        tokenized_val_dataset,
    ),
    (
        f"All tokenized test dataset entries have {max_length} tokens: ",
        tokenized_test_dataset,
    ),
):
    every_entry_has_max_length = all(
        len(input_ids) == max_length for input_ids in padded_split["input_ids"]
    )
    print(check_message, every_entry_has_max_length)

In [None]:
# Run identity: every artifact location below is derived from `full_model_name`
training_number = 5
model_name = "python-bart-uncased"
full_model_name = f"{model_name}-{training_number}"

# Checkpoint file path handed to the ModelCheckpoint callback
checkpoint_filename_template = constants.checkpoint_filename_template
run_checkpoints_dir = generative_qa_paths.training_checkpoints_dir / full_model_name
checkpoints_path = run_checkpoints_dir / checkpoint_filename_template

# Hub location for this run
hub_path = generative_qa_paths.hub_models_location / full_model_name

# Saved-model location for this run
saved_models_path = generative_qa_paths.saved_models_dir / full_model_name

# Figures location for this run
figures_dir = generative_qa_paths.figures_dir / full_model_name

# Training hyperparameters
batch_size = 8
train_epochs = 1

In [None]:
# Load model for fine-tuning
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

In [None]:
# Dataset preparation: all three splits share the model, collator and batch size
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

shared_prepare_kwargs = dict(
    model=model,
    collator=data_collator,
    batch_size=batch_size,
)

# Only the training split is prepared with shuffle=True
tf_train_dataset = core_qa_utils.prepare_tf_dataset(
    hf_dataset=tokenized_train_dataset, shuffle=True, **shared_prepare_kwargs
)

tf_val_dataset = core_qa_utils.prepare_tf_dataset(
    hf_dataset=tokenized_val_dataset, **shared_prepare_kwargs
)

tf_test_dataset = core_qa_utils.prepare_tf_dataset(
    hf_dataset=tokenized_test_dataset, **shared_prepare_kwargs
)

In [None]:
data_collator

In [None]:
batch = data_collator([tokenized_train_dataset[i] for i in range(1, 3)])
batch.keys()

In [None]:
tokenizer.decode(tokenized_train_dataset['labels'][1])

In [None]:
tokenizer.decode(1)

In [None]:
batch["labels"][0]

In [None]:
tokenizer.decode(batch["labels"][0])

In [None]:
# Callbacks
# Save weights only to `checkpoints_path`
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    checkpoints_path, verbose=1, save_weights_only=True
)
early_stop_cb = tf.keras.callbacks.EarlyStopping(patience=1)

callbacks = [
    checkpoint_cb,
    early_stop_cb,
]

# Pushing to the Hub is opt-in. PushToHubCallback contacts the Hugging Face Hub
# when it is constructed (authentication required), so it is only built when it
# is actually going to be used instead of being created and then left out of
# `callbacks`.
push_to_hub_enabled = False
push_to_hub = None
if push_to_hub_enabled:
    push_to_hub = keras_callbacks.PushToHubCallback(
        output_dir=full_model_name, tokenizer=tokenizer
    )
    callbacks.append(push_to_hub)

In [None]:
# Learning-rate schedule and optimizer
# Total optimizer steps = len(tf_train_dataset) * epochs
# (presumably len() is the number of batches per epoch — TODO confirm)
num_train_steps = len(tf_train_dataset) * train_epochs

# Decay the learning rate from 5e-5 down to 0.0 over `num_train_steps` steps
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# Train in mixed-precision float16
# NOTE(review): the policy is set after `model` was already instantiated in an
# earlier cell; Keras documents that the global policy applies to layers created
# afterwards — confirm the loaded model really runs in mixed precision.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Compile
# No loss is passed to compile(); presumably the model's built-in loss is
# used — TODO confirm. The commented lines are earlier explicit-loss variants.
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metrics = ["accuracy"]
# model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
# model.compile(optimizer=optimizer, metrics=metrics)
model.compile(optimizer=optimizer)

In [None]:
model.summary()

In [None]:
# Fit the model on the new data
history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=train_epochs,
    callbacks=callbacks,
)

In [None]:
# Get best version of the model
best_model, best_epoch = core_qa_utils.get_best_model_from_checkpoints(
    model, history, model_name=full_model_name, remove_checkpoints=True, model_type="generative"
)

In [None]:
# Save best model's weights
generative_qa_utils.save_model(best_model, model_name=full_model_name)

In [None]:
# Load best model
loaded_model = generative_qa_utils.load_model(
    model_checkpoint, model_name=full_model_name
)
loaded_model.compile(optimizer=optimizer)

In [None]:
loaded_weights_model = generative_qa_utils.load_weights_into_model(
    model=model, 
    model_name=full_model_name
)

In [None]:
# Evaluate `loaded_weights_model` on the test set
# (Keras `evaluate` reports loss/metric values, not predictions)
loaded_model_evaluation = loaded_weights_model.evaluate(tf_test_dataset)

In [None]:
import evaluate

metric = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
sacrebleu = evaluate.load("sacrebleu")

In [None]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

def generate_with_xla(batch):
    """Generate output token ids for one batch with ``loaded_weights_model``.

    Args:
        batch: Mapping providing "input_ids" and "attention_mask".

    Returns:
        The result of ``loaded_weights_model.generate`` with
        ``max_new_tokens`` set to the module-level ``max_length``.

    Note:
        Despite the name, nothing in this function requests XLA compilation;
        it is a plain call to ``generate``.
    """
    generation_kwargs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
        "max_new_tokens": max_length,
    }
    return loaded_weights_model.generate(**generation_kwargs)


In [None]:
# Run generation on a small random subset (test_size=0.001) of the test split.
# The seed is fixed so the same samples are selected on every run; without it
# the subset, and every number reported below, changes between runs.
tokenized_test_dataset2 = tokenized_test_dataset.train_test_split(
    test_size=0.001, seed=42
)['test']
len(tokenized_test_dataset2[0]['input_ids'])

In [None]:
tf_test_dataset2 = core_qa_utils.prepare_tf_dataset(
    model=loaded_weights_model,
    hf_dataset=tokenized_test_dataset2,
    collator=data_collator,
    batch_size=batch_size,
)

In [None]:
tokenized_test_dataset2[0]['input_ids']

In [None]:
tokenizer.decode(tokenized_test_dataset2[0]['input_ids'])

In [None]:
import pandas as pd

# Accumulators over ALL batches of the evaluation subset
all_preds = []      # stripped decoded predictions
all_labels = []     # one-element reference list per prediction (stripped)
all_labels2 = []    # decoded references exactly as returned by the tokenizer
something = []      # decoded model inputs (question + context)
something2 = []     # same content as `something`, kept for later cells

for batch, labels in tqdm(tf_test_dataset2):
    predictions = generate_with_xla(batch)
    decoded_preds = [
        pred.strip()
        for pred in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]

    # -100 is replaced by the pad token id before decoding, since it is not a
    # token id the tokenizer can decode
    labels = labels.numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    raw_decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [[label.strip()] for label in raw_decoded_labels]

    # Decode the inputs once and reuse the result
    decoded_inputs = tokenizer.batch_decode(
        batch['input_ids'], skip_special_tokens=True
    )

    all_preds.extend(decoded_preds)
    all_labels.extend(decoded_labels)
    all_labels2.extend(raw_decoded_labels)
    something.extend(decoded_inputs)
    something2.extend(decoded_inputs)

# Build the frame once from the accumulated lists. Rebuilding it inside the
# loop kept only the rows of the last batch.
df = pd.DataFrame(
    {
        'qc': something,
        'labels': all_labels,
        'preds': all_preds,
    }
)

In [None]:
all_preds, all_labels, all_labels2, something, something2

In [None]:
df

In [None]:
# Split each decoded "question + context" string at the FIRST question mark.
# `partition` keeps everything after that mark as the context (a context that
# itself contains '?' is no longer cut short) and does not raise when the text
# has no '?' at all: the whole text becomes the question and the context is ''.
questions = []
contexts = []

for qc_text in df['qc']:
    question_part, question_mark, context_part = qc_text.partition('?')
    questions.append(question_part + question_mark)
    contexts.append(context_part)

# Replace the combined 'qc' column with the two new columns, keeping the final
# column order: questions, contexts, labels, preds
df = df.assign(questions=questions, contexts=contexts)[
    ['questions', 'contexts', 'labels', 'preds']
]
labels = df['labels']
preds = df['preds']

In [None]:
df

In [None]:
df.sample(n = 1)

In [None]:
result = metric.compute(predictions=all_preds, references=all_labels)
result