# Finetuning

## Initial set-up

### Installs

In [None]:
%pip install codecarbon comet_ml

In [None]:
%pip install datasets transformers

### Carbon emission tracker launch

In [None]:
from comet_ml import Experiment
from codecarbon import EmissionsTracker
from datetime import datetime

# Initialise and start CodeCarbon tracker; it keeps running until the
# tracker.stop() call in the "Close the experiment" cell.
tracker = EmissionsTracker()
tracker.start()

# Wall-clock start of the run, used later to compute the time spent.
start_time = datetime.now()
print(f'Start time is {start_time}')

# Initialise the Comet experiment
# NOTE(review): api_key, project_name and workspace are placeholders here and
# have to be filled in before this cell can log anything.
experiment = Experiment(
    api_key="XXXXXXXXXXXXXXXXXXXXXXXXX",
    project_name="",
    workspace="",
)

### Drive mounting

In [None]:
# Mount gdrive into the Colab filesystem
from google.colab import drive

drive.mount('/content/gdrive')
# Base folder for the dataset files and the generated outputs. Later cells
# build paths by plain string concatenation, so the trailing slash matters.
gdrive_path = "/content/gdrive/MyDrive/GEMFR/"

## Data

### Data load

In [None]:
# Train set JSON

import json

# Read inside a context manager so the file handle is closed as soon as the
# JSON has been parsed; UTF-8 is explicit because the data holds French text.
with open(
    gdrive_path + "automatically_cleaned/" + "train_set_clean.json",
    encoding="utf-8",
) as f:
    raw_train_data = json.load(f)

# Number of raw training records (shown as the cell output)
len(raw_train_data)

In [None]:
# Validation set ConceptFR JSON

# Read inside a context manager so the file handle is closed as soon as the
# JSON has been parsed; UTF-8 is explicit because the data holds French text.
with open(
    gdrive_path + "automatically_cleaned/" + "val_set.json",
    encoding="utf-8",
) as f:
    raw_val_data = json.load(f)

In [None]:
# Train set: reshape every raw record into a {"concept", "target"} pair,
# keeping one list per language.

en_entries_train = [
    {"concept": record["english_concepts"], "target": record["english_example"]}
    for record in raw_train_data
]
fr_entries_train = [
    {"concept": record["french_concepts"], "target": record["french_example"]}
    for record in raw_train_data
]

# Both partitions come from the same records, so the two sizes should match.
print(len(en_entries_train))
print(len(fr_entries_train))

In [None]:
# Validation set: reshape every raw record into a {"concept", "target"} pair,
# keeping one list per language.

en_entries_val = [
    {"concept": record["english_concepts"], "target": record["english_example"]}
    for record in raw_val_data
]
fr_entries_val = [
    {"concept": record["french_concepts"], "target": record["french_example"]}
    for record in raw_val_data
]

# Both partitions come from the same records, so the two sizes should match.
print(len(en_entries_val))
print(len(fr_entries_val))

### Conversion to a Dataset object

In [None]:
# Serialise the train and validation sets as JSON Lines files, the format
# the Dataset class is fed with.

# ConceptFR French partition
train_data = fr_entries_train
val_data = fr_entries_val

for jl_path, split_entries in (
    ('json_lines_train.jl', train_data),
    ('json_lines_val.jl', val_data),
):
    with open(jl_path, 'w') as outfile:
        # One JSON document per line
        outfile.writelines(json.dumps(item) + '\n' for item in split_entries)

### Create train and validation splits

In [None]:
# Create the Dataset objects from the two JSON Lines files
from datasets import Dataset

train_dataset = Dataset.from_json("json_lines_train.jl")
validation_dataset = Dataset.from_json("json_lines_val.jl")

In [None]:
# Put both splits into a single DatasetDict keyed by split name
# ("train" / "validation"); later cells index it with these keys.
from datasets import DatasetDict

data = DatasetDict({"train":train_dataset,"validation": validation_dataset})

## Model finetuning

### Model parameters

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer
import torch

# Pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = "facebook/bart-base"
# First CUDA device when one is available, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Seed handed to the training arguments.
RANDOM_SEED = 42
# Beam width used for generation.
BEAM_SIZE = 4
# Maximum length used for label tokenization and for generation.
MAX_LENGTH = 32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
# Show the selected device as the cell output.
DEVICE

### Batch input construction

In [None]:
# Construct input strings from a batch.

def construct_input_for_batch(batch):
    """Split a batch into its model inputs and its reference outputs.

    Args:
        batch: Mapping that holds a "concept" entry and a "target" entry.

    Returns:
        A ``(source, target)`` tuple: the value stored under "concept"
        followed by the value stored under "target".
    """
    return batch["concept"], batch["target"]

# Construct the batch (source, target) and run them through a tokenizer.

def batch_tokenize(batch, tokenizer, max_length=MAX_LENGTH, mask_label_padding=True):
    """Tokenize the sources and targets of a batch for seq2seq training.

    Args:
        batch: Mapping with "concept" and "target" entries, as consumed by
            ``construct_input_for_batch``.
        tokenizer: Callable tokenizer whose result exposes "input_ids" and
            which has a ``pad_token_id`` attribute.
        max_length: Length the target sequences are padded or truncated to.
        mask_label_padding: When true (default), padding positions in the
            labels are replaced by -100 so that the loss ignores them. Pass
            ``False`` to keep the raw pad token ids.

    Returns:
        A dict with "input_ids" (source token ids, unpadded) and "labels"
        (target token ids of length ``max_length``).
    """
    source, target = construct_input_for_batch(batch)

    input_ids = tokenizer(source)["input_ids"]
    labels = tokenizer(
        target,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )["input_ids"]

    if mask_label_padding:
        pad_id = tokenizer.pad_token_id

        def _mask(sequence):
            # -100 is the index the cross-entropy loss skips by default.
            return [tok if tok != pad_id else -100 for tok in sequence]

        if labels and isinstance(labels[0], (list, tuple)):
            # Batched call: one id sequence per target.
            labels = [_mask(sequence) for sequence in labels]
        else:
            # Single example: a flat list of ids.
            labels = _mask(labels)

    return {"input_ids": input_ids, "labels": labels}

In [None]:
# Tokenize the training and validation splits batch by batch

def _tokenize_split(batch):
    """Tokenize one batch with the shared tokenizer and MAX_LENGTH."""
    return batch_tokenize(batch, tokenizer, max_length=MAX_LENGTH)


train_data_tokenized = data['train'].map(_tokenize_split, batched=True)
valid_data_tokenized = data['validation'].map(_tokenize_split, batched=True)

### Trainer set-up & run

In [None]:
# Optimization hyperparameters (also logged to Comet when the experiment is closed)
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-04
EPOCHS = 10
WARMUP_STEPS = 1000

# Define train args and pass params to the trainer
# NOTE(review): `evaluation_strategy` is deprecated in recent transformers
# releases in favour of `eval_strategy` — confirm against the installed version.
train_args = Seq2SeqTrainingArguments(
    output_dir="BART-experiments",
    # Evaluate and checkpoint once per epoch; the two strategies have to
    # match because load_best_model_at_end is enabled below.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,

    # optimization args, the trainer uses the Adam optimizer
    # and has a linear warmup for the learning rate
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,

    # misc args
    seed=RANDOM_SEED,
    disable_tqdm=False,
    load_best_model_at_end=True,

    # generation
    predict_with_generate=True,
    # Generation settings used during evaluation. They are passed through the
    # training arguments because Seq2SeqTrainer.evaluate() resets its private
    # generation attributes from these values, so assigning
    # trainer._max_length / trainer._num_beams by hand has no lasting effect.
    generation_max_length=MAX_LENGTH,
    generation_num_beams=BEAM_SIZE,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_data_tokenized,
    eval_dataset=valid_data_tokenized,
    tokenizer=tokenizer,
)

In [None]:
# Run the finetuning loop with the trainer configured in the previous cell
trainer.train()

### Save finetuned model

In [None]:
# Save the finetuned model as a whole pickled object
# NOTE(review): /content is presumably the Colab runtime disk, not the mounted
# drive — confirm the file is copied elsewhere if it has to outlive the session.

torch.save(model, "/content/MyModel")

In [None]:
# Save only the state dict (the weights) of the finetuned model; it is
# restored by load_state_dict() in the "Load finetuned model" section.
torch.save(model.state_dict(), "/content/MyModelStateDict")

## Load finetuned model

In [None]:
# Load the model if necessary: rebuild the base architecture, then restore
# the finetuned weights from the saved state dict.
new_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# map_location remaps the stored tensors onto the current device, so a state
# dict saved during a GPU run can also be restored on a CPU-only runtime.
finetuned_state_dict = torch.load("/content/MyModelStateDict", map_location=DEVICE)
new_model.load_state_dict(finetuned_state_dict)  # Load state dict

new_model.to(DEVICE)
# Switch to inference mode (the returned model is shown as the cell output)
new_model.eval()

In [None]:
import torch

# torch.load returns the complete pickled model object, so building the
# pretrained checkpoint first would only be discarded again.
# map_location remaps the stored tensors onto the current device, which lets
# a model saved during a GPU run be opened on a CPU-only runtime.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects whole-model pickles — confirm against the
# installed version and pass weights_only=False for trusted files if needed.
loaded_model = torch.load("/content/MyModel", map_location=DEVICE)

## Generate text

### Text generation function (beam search decoding)

In [None]:
# Generating and evaluating predictions

def beam_generate_sentences(
    batch,
    model,
    tokenizer,
    num_beams=4,
    max_length=32,
    device="cpu"
):
    """Decode one batch of concepts into sentences with beam search.

    Args:
        batch: Mapping with "concept" and "target" entries; only the
            concepts are fed to the model.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used both to encode the concepts and to decode
            the generated ids.
        num_beams: Beam width handed to ``model.generate``.
        max_length: Maximum length handed to ``model.generate``.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        A list with one decoded string (special tokens removed) per
        generated sequence.
    """
    # The targets are returned by the helper as well but are not needed here.
    concepts, _unused_targets = construct_input_for_batch(batch)

    # Encode the concepts as padded PyTorch tensors and move each tensor to
    # the requested device.
    encoded = tokenizer(concepts, padding=True, return_tensors='pt')
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Beam search decoding.
    output_ids = model.generate(
        **model_inputs,
        num_beams=num_beams,
        max_length=max_length,
    )

    # Turn every generated id sequence back into text.
    sentences = []
    for sequence in output_ids:
        sentences.append(
            tokenizer.decode(sequence.tolist(), skip_special_tokens=True)
        )
    return sentences

### Generate text from the concepts of the ConceptFR test set (French or English partition)

In [None]:
# Open the JSON with the ConceptFR test set

# Read inside a context manager so the file handle is closed as soon as the
# JSON has been parsed; UTF-8 is explicit because the data holds French text.
with open(
    gdrive_path + "automatically_cleaned/" + "test_set.json",
    encoding="utf-8",
) as f:
    raw_test_data = json.load(f)

In [None]:
# Test set: reshape every raw record into a {"concept", "target"} pair,
# keeping one list per language.

en_entries_test = [
    {"concept": record["english_concepts"], "target": record["english_example"]}
    for record in raw_test_data
]
fr_entries_test = [
    {"concept": record["french_concepts"], "target": record["french_example"]}
    for record in raw_test_data
]

# Both partitions come from the same records, so the two sizes should match.
print(len(en_entries_test))
print(len(fr_entries_test))

In [None]:
# Serialise the selected test partition as JSON Lines to feed the Dataset class

# English partition of ConceptFR test set
test_data = en_entries_test

# French partition of ConceptFR test set
#test_data = fr_entries_test

with open('json_lines_test.jl', 'w') as outfile:
    # One JSON document per line
    outfile.writelines(json.dumps(item) + '\n' for item in test_data)

In [None]:
# Create a Dataset object from the test JSON Lines file

test_set = Dataset.from_json("json_lines_test.jl")

In [None]:
def _generate_for_batch(batch):
    """Add a 'generated' column holding the beam-search output for a batch."""
    sentences = beam_generate_sentences(
        batch,
        model,
        tokenizer,
        num_beams=BEAM_SIZE,
        max_length=MAX_LENGTH,
        device=DEVICE,
    )
    return {'generated': sentences}


# Generate in batches of 128 test entries
valid_output = test_set.map(_generate_for_batch, batched=True, batch_size=128)

### Convert generated entries to a list object

In [None]:
# A Dataset is not JSON serializable, so copy the three columns of interest
# into a plain list of dicts before saving.

test_set_with_generated_entries = [
    {column: row[column] for column in ("concept", "target", "generated")}
    for row in valid_output
]

### Save generated text 

In [None]:
import json

# Name of the output file, without folder and extension
file_name = "2nd_Experiment_cgen-conceptfr-en"

# Full destination inside the "generated_text/" folder on the mounted drive
generated_text_path = gdrive_path + "generated_text/" + file_name + ".json"

with open(generated_text_path, 'w') as outfile:
    json.dump(test_set_with_generated_entries, outfile)

## Close the experiment after finetuning

In [None]:
# Stop CO2 tracker and print emissions
emissions: float = tracker.stop()
print(f"Emissions: {emissions} kg")

# Calculate the time spent (despite its name, stop_time holds the elapsed
# duration since start_time, not a timestamp)
stop_time = datetime.now() - start_time

# Collect the elapsed time, the emissions and the training hyperparameters
# NOTE(review): the "emmissions" key is misspelled; it is left as is because
# it is the name the value is logged under in Comet.
hyper_params = {
    "time spent": stop_time,
    "emmissions": emissions,
    "batch size": BATCH_SIZE,
    "gradient accumulation steps": GRADIENT_ACCUMULATION_STEPS,
    "learning rate": LEARNING_RATE,
    "epochs": EPOCHS,
    "warmup steps": WARMUP_STEPS
}

# Log everything collected above as Comet parameters
experiment.log_parameters(hyper_params)

# Turn off Comet
experiment.end()

# Metrics

## Reopen the experiment for metrics logging

In [None]:
from comet_ml import Experiment
from codecarbon import EmissionsTracker
from datetime import datetime

# Initialise and start a new CodeCarbon tracker for the metrics part; this
# rebinds the `tracker` name used in the finetuning part.
tracker = EmissionsTracker()
tracker.start()

# Wall-clock start of the metrics run (rebinds start_time).
start_time = datetime.now()
print(f'Start time is {start_time}')

# Initialise a new Comet experiment for metrics logging
# NOTE(review): api_key, project_name and workspace are placeholders here and
# have to be filled in before this cell can log anything.
experiment = Experiment(
    api_key="XXXXXXXXXXXXXXXXXXXXXXXXX",
    project_name="",
    workspace="",
)

## Open generated text

In [None]:
import json

# Name of the generated-text file to evaluate, without folder and extension
file_name = "2nd_Experiment_cgen-conceptfr-fr"

# Read inside a context manager so the file handle is closed as soon as the
# JSON has been parsed.
with open(gdrive_path + "generated_text/" + file_name + ".json") as f:
    valid_output = json.load(f)

## Calculate metrics

In [None]:
%pip install 'gem-metrics[heavy] @ git+https://github.com/GEM-benchmark/GEM-metrics.git'

### Lexical metrics

In [None]:
# Sanity check: compute the lexical metrics for the first entry only
import gem_metrics

# gem_metrics expects lists, so wrap the single prediction and reference.
list_of_predictions = [valid_output[0]["generated"]]
list_of_references = [valid_output[0]["target"]]

preds = gem_metrics.texts.Predictions(list_of_predictions)
refs = gem_metrics.texts.References(list_of_references)

result = gem_metrics.compute(preds, refs, metrics_list=['bleu', 'rouge', 'nist', 'meteor'])
print(list_of_predictions, "|", list_of_references)
# Show the metrics as the cell output.
result

In [None]:
# Compute the lexical metrics entry by entry, keeping one result per entry

metrics_result = []
for i, output in enumerate(valid_output):
    # Progress indicator
    print(i)

    # Each entry is scored on its own, as a one-element prediction/reference set.
    preds = gem_metrics.texts.Predictions([output["generated"]])
    refs = gem_metrics.texts.References([output["target"]])

    metrics_result.append(
        gem_metrics.compute(
            preds,
            refs,
            metrics_list=['bleu', 'rouge', 'nist', 'meteor'],
        )
    )

In [None]:
# Pull every metric out of the per-entry results into its own list

def _rouge_values(variant, field):
    """Return `field` of the given ROUGE variant for every per-entry result."""
    return [scores[variant].get(field) for scores in metrics_result]


bleu = [scores.get("bleu") for scores in metrics_result]

rouge1_precision = _rouge_values("rouge1", "precision")
rouge1_recall = _rouge_values("rouge1", "recall")
rouge1_fmeasure = _rouge_values("rouge1", "fmeasure")

rouge2_precision = _rouge_values("rouge2", "precision")
rouge2_recall = _rouge_values("rouge2", "recall")
rouge2_fmeasure = _rouge_values("rouge2", "fmeasure")

rougeL_precision = _rouge_values("rougeL", "precision")
rougeL_recall = _rouge_values("rougeL", "recall")
rougeL_fmeasure = _rouge_values("rougeL", "fmeasure")

nist = [scores.get("nist") for scores in metrics_result]
meteor = [scores.get("meteor") for scores in metrics_result]

In [None]:
# Calculate the mean value of each metric over all entries

def _mean(values):
    """Arithmetic mean of a list of numbers."""
    return sum(values)/len(values)


bleu_average = _mean(bleu)

rouge1_precision_average = _mean(rouge1_precision)
rouge1_recall_average = _mean(rouge1_recall)
rouge1_fmeasure_average = _mean(rouge1_fmeasure)

rouge2_precision_average = _mean(rouge2_precision)
rouge2_recall_average = _mean(rouge2_recall)
rouge2_fmeasure_average = _mean(rouge2_fmeasure)

rougeL_precision_average = _mean(rougeL_precision)
rougeL_recall_average = _mean(rougeL_recall)
rougeL_fmeasure_average = _mean(rougeL_fmeasure)

nist_average = _mean(nist)
meteor_average = _mean(meteor)

# Report every average under its variable name
for label, value in (
    ("bleu_average", bleu_average),
    ("rouge1_precision_average", rouge1_precision_average),
    ("rouge1_recall_average", rouge1_recall_average),
    ("rouge1_fmeasure_average", rouge1_fmeasure_average),
    ("rouge2_precision_average", rouge2_precision_average),
    ("rouge2_recall_average", rouge2_recall_average),
    ("rouge2_fmeasure_average", rouge2_fmeasure_average),
    ("rougeL_precision_average", rougeL_precision_average),
    ("rougeL_recall_average", rougeL_recall_average),
    ("rougeL_fmeasure_average", rougeL_fmeasure_average),
    ("nist_average", nist_average),
    ("meteor_average", meteor_average),
):
    print(label, value)

In [None]:
# Collect all generated sentences and all reference sentences as two
# parallel lists of strings

preds = [entry["generated"] for entry in valid_output]
refs = [entry["target"] for entry in valid_output]

In [None]:
# Calculate MSTTR over all generated sentences

predictions = gem_metrics.texts.Predictions(preds)
# The references object is built here but MSTTR is computed on the
# predictions alone.
references = gem_metrics.texts.References(refs)

result_msttr = gem_metrics.compute(predictions, metrics_list=['msttr'])
print(result_msttr)

# Keep the "msttr-100" entry, which is the value logged to Comet later on.
msttr_100 = result_msttr["msttr-100"]
print(msttr_100)

In [None]:
# Calculate the n-gram statistics on the predictions only (no references)
import gem_metrics

result_ngrams = gem_metrics.compute(predictions, metrics_list=['ngrams'])
# Show the statistics as the cell output; individual entries are logged to
# Comet in the final cell.
result_ngrams

### Semantic metrics

In [None]:
%pip install git+https://github.com/google-research/bleurt.git

In [None]:
%pip install git+https://github.com/Tiiiger/bert_score

In [None]:
# Check the installation by importing bert_score and showing its version
import bert_score
bert_score.__version__

In [None]:
# Native bertscore: precision, recall and F1 for every prediction/reference pair
from bert_score import score

# NOTE(review): lang='en' is hard-coded, while the generated-text file opened
# for the metrics is named "...-fr" — confirm the language matches the
# partition being evaluated.
bert_precision, bert_recall, bert_f1 = score(preds, refs, lang='en', verbose=True)
print(bert_precision, bert_recall, bert_f1)

In [None]:
# Calculate BERTscore average

# .mean() returns a 0-dim tensor; .item() turns it into a plain Python float
# so the values print as numbers and are logged to Comet as scalars.
bertscore_precision_average = bert_precision.mean().item()
bertscore_recall_average = bert_recall.mean().item()
bertscore_f1_average = bert_f1.mean().item()
print("bertscore_precision_average", bertscore_precision_average)
print("bertscore_recall_average", bertscore_recall_average)
print("bertscore_f1_average", bertscore_f1_average)

In [None]:
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
from datasets import load_metric

bleurt_metric = load_metric("bleurt")
# One BLEURT score per entry
bleurt = []

#bertscore_metric = load_metric("bertscore")
#bertscore = []

for i, output in enumerate(valid_output):
    #bertscore_result = bertscore_metric.compute(predictions=[output["generated"]], references=[output["target"]], lang="bert-base-multilingual-cased")
    #bertscore.append(bertscore_result)

    # Score one prediction/reference pair and keep its single score.
    bleurt_result = bleurt_metric.compute(predictions=[output["generated"]], references=[output["target"]])
    bleurt.append(bleurt_result["scores"][0])

    # Progress indicator
    print(i)

In [None]:
# Calculate BLEURT average over the per-entry scores

bleurt_average = sum(bleurt)/len(bleurt)
print("bleurt_average", bleurt_average)

In [None]:
# Calculate repetitions: entries whose generated sentence is exactly the
# reference sentence

same = [
    entry for entry in valid_output
    if entry["generated"] == entry["target"]
]
print("number of identical to validation", len(same))

# Share of exact copies among all entries, in percent
repetition_percentage = (len(same) / len(valid_output)) * 100
print("repetition_percentage", repetition_percentage, "%")

In [None]:
def count_concepts_coverage(concepts, phrase):
    """Count how many concepts are missing from a phrase.

    A concept counts as covered when at least one word of the phrase starts
    with the first three characters of the concept (a crude stem match).

    Args:
        concepts: List of concept strings.
        phrase: List of words of the sentence.

    Returns:
        A ``(not_covered_len, covered_words)`` tuple: the number of concepts
        without any matching word, and the set of phrase words that matched
        at least one concept.
    """
    covered_words = set()
    covered_count = 0
    for concept in concepts:
        prefix = concept[:3]
        hits = [word for word in phrase if word.startswith(prefix)]
        if hits:
            # Count the concept once, however many words match it. Counting
            # the matched words instead makes the result negative when one
            # concept matches several words, and too high when several
            # concepts match the same word.
            covered_count += 1
            covered_words.update(hits)
    return len(concepts) - covered_count, covered_words

### Concepts coverage in generated text

In [None]:
# Compute the concept coverage of every generated sentence

concepts_covered = []
anomalies = []
for i, output in enumerate(valid_output):
    # Concepts and sentence are both split on single spaces.
    concepts = output["concept"].split(" ")
    phrase = output["generated"].split(" ")

    not_covered_len, covered_concepts = count_concepts_coverage(concepts, phrase)
    non_coverage_percentage = (not_covered_len / len(concepts)) * 100

    # A negative share is recorded as an anomaly and clamped to zero.
    if non_coverage_percentage < 0:
        anomalies.append(i)
        non_coverage_percentage = 0

    concepts_covered.append(
        dict(
            concepts=concepts,
            generated_phrase=phrase,
            covered_concepts=covered_concepts,
            concepts_len=len(concepts),
            not_covered_len=not_covered_len,
            non_coverage_percentage=non_coverage_percentage,
        )
    )

In [None]:
# Number of entries whose non-coverage percentage was negative and got clamped to 0
len(anomalies)

In [None]:
# Aggregate the per-entry non-coverage into overall coverage statistics

non_coverage_percentage_list = [
    item["non_coverage_percentage"] for item in concepts_covered
]
non_coverage_average = sum(non_coverage_percentage_list) / len(non_coverage_percentage_list)
print("non_coverage_average", non_coverage_average)

# Coverage is the complement of the average non-coverage
coverage_percentage = 100 - non_coverage_average
print("coverage_percentage", coverage_percentage)

# Best and worst entries in terms of non-covered concepts
minimum_non_covered_value = min(non_coverage_percentage_list)
print("minimum_non_covered_value", minimum_non_covered_value)

maximum_non_covered_value = max(non_coverage_percentage_list)
print("maximum_non_covered_value", maximum_non_covered_value)

## Stop CO2 tracker and Comet session

In [None]:
# Stop the CO2 tracker and print the emissions of the metrics run

emissions: float = tracker.stop()
print(f"Emissions: {emissions} kg")

# Time spent on the metrics run
stop_time = datetime.now() - start_time

# Lexical metrics
for metric_name, metric_value in (
    ("bleu_average", bleu_average),
    ("rouge1_precision_average", rouge1_precision_average),
    ("rouge1_recall_average", rouge1_recall_average),
    ("rouge1_fmeasure_average", rouge1_fmeasure_average),
    ("rouge2_precision_average", rouge2_precision_average),
    ("rouge2_recall_average", rouge2_recall_average),
    ("rouge2_fmeasure_average", rouge2_fmeasure_average),
    ("rougeL_precision_average", rougeL_precision_average),
    ("rougeL_recall_average", rougeL_recall_average),
    ("rougeL_fmeasure_average", rougeL_fmeasure_average),
    ("nist_average", nist_average),
    ("meteor_average", meteor_average),
):
    experiment.log_metric(metric_name, metric_value)

# Semantic metrics
for metric_name, metric_value in (
    ("bertscore_precision_average", bertscore_precision_average),
    ("bertscore_recall_average", bertscore_recall_average),
    ("bertscore_f1_average", bertscore_f1_average),
    ("bleurt_average", bleurt_average),
):
    experiment.log_metric(metric_name, metric_value)

# GEM diversity metrics: Comet metric name -> key in the ngrams result
experiment.log_metric("msttr_100", msttr_100)
for metric_name, ngram_key in (
    ("distict_1_gem", "distinct-1"),
    ("distict_2_gem", "distinct-2"),
    ("distict_3_gem", "distinct-3"),
    ("unique_1_gem", "unique-1"),
    ("unique_2_gem", "unique-2"),
    ("unique_3_gem", "unique-3"),
    ("entropy_1_gem", "entropy-1"),
    ("entropy_2_gem", "entropy-2"),
    ("entropy_3_gem", "entropy-3"),
    ("cond_entropy_2_gem", "cond_entropy-2"),
    ("cond_entropy_3_gem", "cond_entropy-3"),
    ("vocab_size_1_gem", "vocab_size-1"),
    ("vocab_size_2_gem", "vocab_size-2"),
    ("vocab_size_3_gem", "vocab_size-3"),
    ("min_pred_length", "min_pred_length"),
    ("max_pred_length", "max_pred_length"),
):
    experiment.log_metric(metric_name, result_ngrams[ngram_key])

# Coverage and repetitions
experiment.log_metric("coverage_percentage", coverage_percentage)
experiment.log_metric("repetition_percentage", repetition_percentage)

# Turn off Comet
experiment.end()