# Add predictions from fine-tuned seBERT model to the whole dataset.

Within this replication kit we distribute only all_changes_sebert.csv.gz, which already contains the predictions from the model, because the file is already quite large.
However, this notebook demonstrates the process, and the data can be re-classified with it.

In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# load the fine-tuned model together with its tokenizer from the same directory
# generate it live, or use the already fine-tuned version from
# https://smartshark2.informatik.uni-goettingen.de/sebert/seBERT_fine_tuned_commit_intent.tar.gz
MODEL_PATH = '../ft/fine_tuned/'
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)

In [None]:
# read the complete set of changes (gzip-compressed CSV) into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/all_changes.csv.gz')

In [None]:
# we remove newlines before tokenizing, so every commit message is a single line
# regex=False pins the pattern to a literal newline; without it the result
# depends on the default of the installed pandas version
text = df['message'].str.replace('\n', ' ', regex=False).values

In [None]:
# calculate probabilities for the text being other, perfective or corrective
# NOTE: this is not efficient! The messages are classified one at a time
# instead of in batches, just for simplicity.

# use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# moving the model is loop-invariant, so it is done once up front
model.to(device)
model.eval()

y_probs = []
with torch.no_grad():
    for message in text:
        inputs = tokenizer(message, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
        outputs = model(**inputs)
        # the first output element holds the classification logits; softmax
        # over dimension 1 turns them into per-class probabilities.
        # No detach() is needed because gradients are disabled by no_grad().
        probs = outputs[0].softmax(1).cpu().numpy()
        y_probs.append(probs)

In [None]:
# reduce every probability array to the index of its largest entry,
# i.e. the predicted class label for that message
y_pred = [probabilities.argmax() for probabilities in y_probs]


In [None]:
# insert the predicted values in all rows where we do not have a manual label
#
# Label mapping: 0 sets neither flag, 1 sets internal_quality,
# 2 sets external_quality. Rows with a manual label are left untouched.
#
# This is done with vectorized, masked assignments instead of a row-wise
# iterrows() loop, which is very slow on the whole dataset.

# y_pred is ordered like the rows of df, so align it with df by position
pred_lbls = pd.Series(y_pred, index=df.index)

# astype(bool) applies the same truthiness rule as `not row['is_manual']`
needs_prediction = ~df['is_manual'].astype(bool)

# only touch the columns if there is at least one row to fill in
if needs_prediction.any():
    df.loc[needs_prediction, 'external_quality'] = pred_lbls[needs_prediction] == 2
    df.loc[needs_prediction, 'internal_quality'] = pred_lbls[needs_prediction] == 1

In [None]:
# write the data, now including the predicted labels, back to disk
# (the DataFrame index is not written as a column)
df.to_csv(path_or_buf='../data/all_changes_sebert.csv.gz', index=False)