# Imports

In [None]:
from helpers import *
import torch, json, re
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Compute predictions from validation dataset

## Predictions for model trained on final dataset

In [None]:
# Path to the validation set file
json_path = "validation_data.json"

# Extract commit messages and grades (helpers come from `from helpers import *`)
commit_messages = extract_commit_messages(json_path)
validation_grades = extract_grades(json_path)

# Validate the first num_messages_to_validate messages
num_messages_to_validate = 5 # use len(commit_messages) to validate the whole set

# Load the English language spacy model
# NOTE(review): `spacy` is not imported in this notebook's import cell; presumably it is
# re-exported by `from helpers import *` - verify
nlp = spacy.load("en_core_web_sm")

# This is the path to BERT large uncased trained on our final dataset
model_path = "./bert_grade_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Run validation.
# NOTE(review): the original note called this flag `fake`; the keyword actually passed is
# `no_openai`. Presumably no_openai=False makes the helper use the API key to determine
# whether the body of a message is meaningful - confirm in helpers.
custom_accuracy, custom_f1, predicted_grades, actual_grades = actual_vs_pred(
    commit_messages, validation_grades, num_messages_to_validate, nlp, tokenizer, model, no_openai=True)

# Print summary
print(f"\nF1 Score: {custom_f1:.2f}")
print(f"Accuracy: {custom_accuracy:.2f}")

# Save predictions and true grades to txt (space-delimited, two decimals) so that the
# metrics cell can reload them without re-running the model
np.savetxt('y_final_pred.txt', predicted_grades, delimiter=' ', fmt='%.2f')
np.savetxt('y_true.txt', actual_grades, delimiter=' ', fmt='%.2f')
print("Saved predicted grades and actual grades to y_final_pred.txt and y_true.txt")

## Predictions for model trained on old dataset

In [None]:
# This function is specific to this cell in this notebook and is therefore not added into the readme
def predict_grade(commit_message, model, tokenizer):
    """
    Grades a commit message using the provided model and tokenizer

    Args:
        commit_message (str): Commit message
        model: Loaded sequence-classification model to be used
        tokenizer: Loaded tokenizer to be used (must match the model)

    Returns:
        predicted_label: Predicted grade (int), i.e. the index of the largest logit
    """

    # Follow the model's own device when it exposes one; fall back to the CPU otherwise
    device = getattr(model, "device", "cpu")

    inputs = tokenizer(commit_message, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: disable autograd so no computation graph is built for each call
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the argmax of the logits is the argmax of the probabilities
    predicted_label = outputs.logits.argmax(dim=-1).item()
    return predicted_label



# Number of commits to evaluate (n = 0 if whole file must be evaluated)
n = 5 # 0

# This is the path to BERT large uncased trained on our original dataset
model_path="./bert_old_model"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

print("Loaded model")

# Open json file
with open('validation_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Count correct answers + number of answers differing by 1 to 5 from the actual grade
correct = 0
err1 = 0
err2 = 0
err3 = 0
err4 = 0
err5 = 0
graded = 0

# Initialize empty arrays
y_pred = []
y_true = []

print("Started grading")

for cm in data:
    c = cm["commit_message"]
    real_gr = cm["grade"]
    gr = predict_grade(c, model, tokenizer)
    y_pred.append(gr)
    y_true.append(real_gr)

    graded += 1

    # Tally the result BEFORE checking the early exit, otherwise the last graded commit
    # is counted in `graded` but never in `correct` / `errN`
    if gr != real_gr:
        diff = np.abs(gr - real_gr)
        if diff == 1:
            err1 += 1
        elif diff == 2:
            err2 += 1
        elif diff == 3:
            err3 += 1
        elif diff == 4:
            err4 += 1
        elif diff == 5:
            err5 += 1
    else:
        correct += 1

    # Stop once n commits have been graded (n = 0 means no limit)
    if n > 0 and graded >= n:
        break

print(f"Model correctly guessed {correct}/{graded} | err1 = {err1}, err2 = {err2}, err3 = {err3}, err4 = {err4}, err5 = {err5}")

np.savetxt('y_old_pred.txt', y_pred, delimiter=' ', fmt='%.2f')
print("Saved predicted grades to y_old_pred.txt")
# No need to save y_true again since it should be the same for both sets of predictions
# np.savetxt('y_true.txt', y_true, delimiter=' ', fmt='%.2f')

# Compute metrics using previously generated predictions
**Important:** this cell will only work if at least one instance of each class (0-5) is present in `y_final_pred.txt` and `y_old_pred.txt`, so it's best to run validation on the entire validation dataset (`validation_data.json`).

In [None]:
# How to load preds/real grades from txt if necessary
# ndmin=1 keeps a one-line file as a 1-element array instead of a 0-d scalar
y_pred = np.loadtxt('y_final_pred.txt', delimiter=' ', dtype=float, ndmin=1)
y_old_pred = np.loadtxt('y_old_pred.txt', delimiter=' ', dtype=float, ndmin=1)
y_true = np.loadtxt('y_true.txt', delimiter=' ', dtype=float, ndmin=1)

y_true = y_true.astype(int)
y_old_pred = y_old_pred.astype(int)

# Map y_pred to integers with a margin of error of 0.5: a prediction ending in .5 is
# resolved to whichever neighbouring integer is closer to the true grade (e.g. 2.5 becomes
# 2 if the true grade is <= 2, and 3 otherwise). Values with any other fractional part are
# left as they are here and truncated by the int cast below.
# Compute the fractional parts
fractional_part = y_pred - np.floor(y_pred)

# Mask for elements that end in .5
mask_half = (fractional_part == 0.5)

# Compute floor and ceil for all values
low = np.floor(y_pred)
high = np.ceil(y_pred)

# Compute differences to see which integer is closer to y_true
diff_low = np.abs(y_true - low)
diff_high = np.abs(y_true - high)

# Determine which side is closer where we have .5 (ties go to the lower integer)
closer_low = diff_low <= diff_high

# For .5 values, pick either low or high based on closeness to y_true; keep the rest as is
y_pred_final = np.where(mask_half, np.where(closer_low, low, high), y_pred)

labels_str = ['0', '1', '2', '3', '4', '5']
labels = [0, 1, 2, 3, 4, 5]

# Cast to ensure that all values are ints
y_pred_final = y_pred_final.astype(int)

cm_final = confusion_matrix(y_true, y_pred_final, labels=labels)

# Visualize the confusion matrix and report for final DS
disp_final = ConfusionMatrixDisplay(confusion_matrix=cm_final, display_labels=labels)
disp_final.plot(cmap=plt.cm.Blues)
plt.title(f"Confusion Matrix for {len(labels)}-Class Problem (final DS)")
# plt.savefig("cm_final_ds.png", dpi=200)
plt.show()

# Pass labels explicitly: without it, classification_report raises a ValueError whenever
# the classes present in the data do not match the number of target_names
report_final = classification_report(
    y_true, y_pred_final, labels=labels, target_names=labels_str, zero_division=0)
print(f"Report for final DS:\n{report_final}")

print("\n"*3)

# For old dataset, there exist no 0s in true or predicted values
labels_old = [1, 2, 3, 4, 5]
labels_old_str = ['1', '2', '3', '4','5']
# Visualize the confusion matrix for old DS
cm_old = confusion_matrix(y_true, y_old_pred, labels=labels_old)

disp_old = ConfusionMatrixDisplay(confusion_matrix=cm_old, display_labels=labels_old)
disp_old.plot(cmap=plt.cm.Blues)
# Use the old label set here: this matrix has 5 classes, not 6
plt.title(f"Confusion Matrix for {len(labels_old)}-Class Problem (old DS)")
# plt.savefig("cm_old_ds.png", dpi=200)
plt.show()

# Same explicit labels as the confusion matrix above, so the report rows always line up
# with labels_old_str
report_old = classification_report(
    y_true, y_old_pred, labels=labels_old, target_names=labels_old_str, zero_division=0)
print(f"Report for old DS:\n{report_old}")