In [None]:
# standard library imports
import gc

# third party imports
import numpy
import torch
import pandas
import matplotlib.pyplot

# local imports
import share
import evaluation


# value passed as `test_size` to the precision/recall/F1 and perplexity
# evaluations in the cells below
# NOTE(review): the unit (samples vs. batches) is defined by the evaluation
# module - confirm
TEST_SIZE = 8

In [None]:
# Llama 2 fine-tuned on SetFit/enron_spam

# fine-tuning benchmarks: one row of subplots per fine-tuning method, with the
# "loss" column on the left and the "val_loss" column on the right

# (subplot title, CSV log path) for each fine-tuning method, top row first
# LoRA: https://github.com/Lightning-AI/litgpt/pull/587#issue-1914239351
runs = (
    ("LoRA", share.LLAMA2_ENRON_SPAM_LORA_LOGS),
    ("QLoRA", share.LLAMA2_ENRON_SPAM_QLORA_LOGS),
    ("full parameter", share.LLAMA2_ENRON_SPAM_FULL_LOGS),
)

# only every 800th aggregated row contributes a point to the val_loss curve
# NOTE(review): presumably this matches the validation interval used while
# fine-tuning - confirm against the training configuration
val_loss_stride = 800

figure, axes = matplotlib.pyplot.subplots(nrows=len(runs), ncols=2)

for (loss_axes, val_loss_axes), (title, log_path) in zip(axes, runs):
    # collapse all log records that share a step into a single summed row
    rows = pandas.read_csv(log_path).groupby("step").sum()
    loss_axes.set_title(title)
    val_loss_axes.set_title(title)
    # the first aggregated row is left out of the loss curve
    rows.iloc[1:, :].plot(y="loss", ax=loss_axes)
    rows.iloc[::val_loss_stride, :].plot(y="val_loss", ax=val_loss_axes, color="orange")

figure.tight_layout()
matplotlib.pyplot.show()

In [None]:
# Llama 2 fine-tuned on iamtarun/python_code_instructions_18k_alpaca

# fine-tuning benchmarks: one row of subplots per fine-tuning method, with the
# "loss" column on the left and the "val_loss" column on the right

# (subplot title, CSV log path) for each fine-tuning method, top row first
runs = (
    ("LoRA", share.LLAMA2_PYTHON_CODE_LORA_LOGS),
    ("QLoRA", share.LLAMA2_PYTHON_CODE_QLORA_LOGS),
    ("full parameter", share.LLAMA2_PYTHON_CODE_FULL_LOGS),
)

# only every 800th aggregated row contributes a point to the val_loss curve
# NOTE(review): presumably this matches the validation interval used while
# fine-tuning - confirm against the training configuration
val_loss_stride = 800

figure, axes = matplotlib.pyplot.subplots(nrows=len(runs), ncols=2)

for (loss_axes, val_loss_axes), (title, log_path) in zip(axes, runs):
    # collapse all log records that share a step into a single summed row
    rows = pandas.read_csv(log_path).groupby("step").sum()
    loss_axes.set_title(title)
    val_loss_axes.set_title(title)
    # the first aggregated row is left out of the loss curve
    rows.iloc[1:, :].plot(y="loss", ax=loss_axes)
    rows.iloc[::val_loss_stride, :].plot(y="val_loss", ax=val_loss_axes, color="orange")

figure.tight_layout()
matplotlib.pyplot.show()

In [None]:
# Llama 2 base model evaluation
# all four metrics are computed from share.LLAMA2_MODEL_DIR

# precision, recall and F1 score
precision_recall_f1 = evaluation.eval_precision_recall_f1_load(
    share.LLAMA2_MODEL_DIR,
    test_size=TEST_SIZE,
)

# perplexity
perplexity = evaluation.eval_perplexity_load(
    share.LLAMA2_MODEL_DIR,
    test_size=TEST_SIZE,
)

# exposure
exposure = evaluation.eval_exposure_estimate(
    share.LLAMA2_MODEL_DIR,
)

# harmfulness
harmfulness = evaluation.eval_harmfulness(
    share.LLAMA2_MODEL_DIR,
)

In [None]:
# Llama 2 LoRA evaluation
# precision/recall/F1 reads the ENRON_SPAM LoRA model directory; the other
# three metrics read the PYTHON_CODE LoRA model directory

# precision, recall and F1 score
lora_precision_recall_f1 = evaluation.eval_precision_recall_f1_load(
    share.LLAMA2_ENRON_SPAM_LORA_MODEL_DIR,
    test_size=TEST_SIZE,
)

# perplexity
lora_perplexity = evaluation.eval_perplexity_load(
    share.LLAMA2_PYTHON_CODE_LORA_MODEL_DIR,
    test_size=TEST_SIZE,
)

# exposure
lora_exposure = evaluation.eval_exposure_estimate(
    share.LLAMA2_PYTHON_CODE_LORA_MODEL_DIR,
)

# harmfulness
lora_harmfulness = evaluation.eval_harmfulness(
    share.LLAMA2_PYTHON_CODE_LORA_MODEL_DIR,
)

In [None]:
# Llama 2 QLoRA evaluation
# precision/recall/F1 reads the ENRON_SPAM QLoRA model directory; the other
# three metrics read the PYTHON_CODE QLoRA model directory

# precision, recall and F1 score
qlora_precision_recall_f1 = evaluation.eval_precision_recall_f1_load(share.LLAMA2_ENRON_SPAM_QLORA_MODEL_DIR, test_size=TEST_SIZE)

# perplexity
qlora_perplexity = evaluation.eval_perplexity_load(share.LLAMA2_PYTHON_CODE_QLORA_MODEL_DIR, test_size=TEST_SIZE)

# exposure
qlora_exposure = evaluation.eval_exposure_estimate(share.LLAMA2_PYTHON_CODE_QLORA_MODEL_DIR)

# harmfulness
# must point at the QLoRA directory: evaluating the LoRA directory here would
# make the "QLoRA" bar of the harmfulness chart repeat the LoRA evaluation
qlora_harmfulness = evaluation.eval_harmfulness(share.LLAMA2_PYTHON_CODE_QLORA_MODEL_DIR)

In [None]:
# Llama 2 full parameter evaluation
# precision/recall/F1 reads the ENRON_SPAM full-parameter model directory; the
# other three metrics read the PYTHON_CODE full-parameter model directory

# precision, recall and F1 score
full_precision_recall_f1 = evaluation.eval_precision_recall_f1_load(
    share.LLAMA2_ENRON_SPAM_FULL_MODEL_DIR,
    test_size=TEST_SIZE,
)

# perplexity
full_perplexity = evaluation.eval_perplexity_load(
    share.LLAMA2_PYTHON_CODE_FULL_MODEL_DIR,
    test_size=TEST_SIZE,
)

# exposure
full_exposure = evaluation.eval_exposure_estimate(
    share.LLAMA2_PYTHON_CODE_FULL_MODEL_DIR,
)

# harmfulness
full_harmfulness = evaluation.eval_harmfulness(
    share.LLAMA2_PYTHON_CODE_FULL_MODEL_DIR,
)

In [None]:
# Llama 2 fine-tuned on identity shift

# harmfulness, evaluated once per fine-tuning method (LoRA, QLoRA, full parameter)
lora_harmfulness_identity_shift = evaluation.eval_harmfulness(
    share.LLAMA2_IDENTITY_SHIFT_LORA_MODEL_DIR,
)
qlora_harmfulness_identity_shift = evaluation.eval_harmfulness(
    share.LLAMA2_IDENTITY_SHIFT_QLORA_MODEL_DIR,
)
full_harmfulness_identity_shift = evaluation.eval_harmfulness(
    share.LLAMA2_IDENTITY_SHIFT_FULL_MODEL_DIR,
)

In [None]:
# tick labels shared by every bar chart below, in plotting order
xlabels = (
    "base",
    "LoRA",
    "QLoRA",
    "full parameter",
)
# one integer bar position per label: 0, 1, 2, ...
x = numpy.arange(len(xlabels))

In [None]:
# precision, recall and F1 score
# grouped bar chart: one group per model, one bar per metric key
offset = 0
figure, axes = matplotlib.pyplot.subplots(layout="constrained")

# gather, for every metric key of the base result, the four model values in
# xlabels order before anything is drawn
per_metric = [
    (
        name,
        [
            precision_recall_f1[name],
            lora_precision_recall_f1[name],
            qlora_precision_recall_f1[name],
            full_precision_recall_f1[name],
        ],
    )
    for name in precision_recall_f1
]

for metric, values in per_metric:
    # each metric's bars sit one bar width (0.25) further right than the previous
    offset += 0.25
    heights = [round(value, 3) for value in values]
    bar = axes.bar(x + offset, heights, 0.25, label=metric)
    axes.bar_label(bar, padding=3)

axes.set_title("Precision, recall and F1 score")
# ticks are placed at x + 0.5, the position of the second bar of each group
axes.set_xticks(x + 0.5, xlabels)
axes.legend(loc="upper right", ncols=3)
matplotlib.pyplot.show()

In [None]:
# perplexity: one bar per model, in xlabels order
figure, axes = matplotlib.pyplot.subplots(layout="constrained")
metric = "perplexity"
values = [
    result[metric]
    for result in (perplexity, lora_perplexity, qlora_perplexity, full_perplexity)
]
# heights are rounded to 3 decimals, which is also what the bar labels show
bar = axes.bar(x, [round(value, ndigits=3) for value in values], 0.25, label=metric)
axes.bar_label(bar, padding=3)
axes.set_title("Perplexity")
axes.set_ylabel(metric)
axes.set_xticks(x, xlabels)
matplotlib.pyplot.show()

In [None]:
# exposure: one bar per model, in xlabels order
figure, axes = matplotlib.pyplot.subplots(layout="constrained")
metric = "exposure"
values = [
    result[metric]
    for result in (exposure, lora_exposure, qlora_exposure, full_exposure)
]
# heights are rounded to 3 decimals, which is also what the bar labels show
bar = axes.bar(x, [round(value, ndigits=3) for value in values], 0.25, label=metric)
axes.bar_label(bar, padding=3)
axes.set_title("Exposure")
axes.set_ylabel(metric)
axes.set_xticks(x, xlabels)
matplotlib.pyplot.show()

In [None]:
# harmfulness fine-tuned on iamtarun/python_code_instructions_18k_alpaca
# one bar per model, in xlabels order
figure, axes = matplotlib.pyplot.subplots(layout="constrained")
metric = "harmfulness"
# entry 5 of each result, as a percentage of len(evaluation.HARMFUL_INSTRUCTIONS)
# NOTE(review): presumably entry 5 counts the responses given harmfulness
# score 5 - confirm against evaluation.eval_harmfulness
values = [
    value[5] / len(evaluation.HARMFUL_INSTRUCTIONS) * 100
    for value in (
        harmfulness[metric],
        lora_harmfulness[metric],
        qlora_harmfulness[metric],
        full_harmfulness[metric],
    )
]
# heights are rounded to 3 decimals, which is also what the bar labels show
bar = axes.bar(x, [round(value, ndigits=3) for value in values], 0.25, label=metric)
axes.bar_label(bar, padding=3)
axes.set_title("Harmfulness fine-tuned on iamtarun/python_code_instructions_18k_alpaca")
axes.set_ylabel("harmfulness (%)")
axes.set_xticks(x, xlabels)
matplotlib.pyplot.show()

In [None]:
# harmfulness fine-tuned on identity shift
# one bar per model, in xlabels order; the "base" bar reuses the base model result
figure, axes = matplotlib.pyplot.subplots(layout="constrained")
metric = "harmfulness"
# entry 5 of each result, as a percentage of len(evaluation.HARMFUL_INSTRUCTIONS)
# NOTE(review): presumably entry 5 counts the responses given harmfulness
# score 5 - confirm against evaluation.eval_harmfulness
values = [
    value[5] / len(evaluation.HARMFUL_INSTRUCTIONS) * 100
    for value in (
        harmfulness[metric],
        lora_harmfulness_identity_shift[metric],
        qlora_harmfulness_identity_shift[metric],
        full_harmfulness_identity_shift[metric],
    )
]
# heights are rounded to 3 decimals, which is also what the bar labels show
bar = axes.bar(x, [round(value, ndigits=3) for value in values], 0.25, label=metric)
axes.bar_label(bar, padding=3)
axes.set_title("Harmfulness fine-tuned on identity shift")
axes.set_ylabel("harmfulness (%)")
axes.set_xticks(x, xlabels)
matplotlib.pyplot.show()