In [None]:
import os
# These environment variables are assigned before `import tensorflow` further
# down, so they are already present in os.environ when TensorFlow is loaded.
# Keep them above that import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # presumably silences TensorFlow's native log output — confirm level semantics
os.environ['OMP_NUM_THREADS'] = '3'  # presumably limits the OpenMP thread count — tune for the host machine
os.environ['KMP_BLOCKTIME'] = '1'  # presumably an Intel OpenMP runtime setting — confirm it is still needed
import numpy as np
import gc
from tqdm.auto import tqdm
from dataset import get_labelled, get_unlabelled, _preload, _unload, unlabelled_generator, rendred_unlabelled_generator
from render_lookups import render_lookups, render_examples, get_rendered_examples
from Commit import CommitFactory
from Model import CommitDiffModelFactory
import tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib import ticker

In [2]:
# ===== Hyper-parameters / run configuration =====

# --- Rendering-dependent settings ------------------------------------------
# Changing either of the two values below means the data must be re-rendered!

# Size / depth / length of every individual context (path).
CONTEXT_SIZE = 8  # alternative: 16

# Total number of contexts sampled into each bag.
BAG_SIZE = 128  # alternative: 256

# --- Model / training settings ----------------------------------------------

# Which encoder to use.
ENCODER = 0

# Size of the fixed-length vector that is encoded to.
OUTPUT_SIZE = 512

# Hard limit used to filter bags: bags exceeding this size are never sampled.
# Set to None to turn the filter off.
MAX_BAG_SIZE_FILTER = 2 * 2048

# Maximum number of labelled commits to train on.
MAX_LABELLED_COMMITS = 11776

# Maximum number of unlabelled commits to train on.
MAX_UNLABELLED_COMMITS = 2 * 4096  # alternative: 8 * 8

# Batch size for unsupervised training. Higher is faster but needs more
# memory and may(?) reduce quality; the jury is still out.
SIAM_BATCH_SIZE = 1024
SIAM_BATCHES_PER_UPDATE = 2

# Batch size for supervised training.
SUPERVISED_BATCH_SIZE = 128

# Epoch budgets for the two training phases.
UNSUPERVISED_EPOCHS = 1024
SUPERVISED_EPOCHS = 2048

# Number of initial searches; set to None to skip the search!
NUM_INITIAL_SEARCHES = 4
NUM_EPOCHS_PER_SEARCH = 16

In [3]:
# Create `Commit` from the factory using the bag/context sizes configured above.
Commit = CommitFactory(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
)

In [4]:
# The rendered data folders can be downloaded from Drive, unless you want to change the settings
# (in which case the data has to be rendered again). They go in data/commit_lookups/

# Only run if you need to re-process the raw pickles !!!
# render_lookups(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)

In [5]:
# Only run if you need to re-process the raw pickles !!!
# render_examples(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)

In [6]:
# Every generated element is declared as a pair of float32 tensors, each of
# shape (BAG_SIZE, CONTEXT_SIZE).
output_types = (tf.float32,) * 2
output_shapes = ((BAG_SIZE, CONTEXT_SIZE),) * 2

# Stream the unlabelled examples from the generator and group them into
# batches of SIAM_BATCH_SIZE for the unsupervised phase.
# NOTE(review): `args` values are handed to TensorFlow, so MAX_BAG_SIZE_FILTER = None
# (the documented "off" switch) probably cannot be passed this way — confirm.
dataset = tf.data.Dataset.from_generator(
    rendred_unlabelled_generator,
    output_types=output_types,
    output_shapes=output_shapes,
    args=(BAG_SIZE, CONTEXT_SIZE, MAX_UNLABELLED_COMMITS, MAX_BAG_SIZE_FILTER),
).batch(SIAM_BATCH_SIZE)

In [7]:
# Create `CommitDiffModel` from the factory for the configured bag, context
# and output sizes.
CommitDiffModel = CommitDiffModelFactory(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
    OUTPUT_SIZE=OUTPUT_SIZE,
)

In [8]:
# Instantiate the model with the data sizes, batch sizes and epoch budgets
# from the configuration cell.
model = CommitDiffModel(
    supervised_data_size=MAX_LABELLED_COMMITS,
    supervised_batch_size=SUPERVISED_BATCH_SIZE,
    unsupervised_data_size=MAX_UNLABELLED_COMMITS,
    unsupervised_epochs=UNSUPERVISED_EPOCHS,
    supervised_epochs=SUPERVISED_EPOCHS,
    siam_batch_size=SIAM_BATCH_SIZE,
    steps_per_update=SIAM_BATCHES_PER_UPDATE,
)
# Use the ENCODER setting from the configuration cell instead of a hard-coded
# literal, so that changing ENCODER actually changes the encoder initialised.
model.initialize(encoder=ENCODER)

In [None]:
# If a model run fails, it can be restarted with the call below. Note: this does not restart the learning_rate schedule, but it helps with the history and weights.
# cached_history = model.reload_saved_siam_model("model_3.pkl", dataset)
# Remember to lower the epochs accordingly, and also set NUM_INITIAL_SEARCHES to None!

# NOTE: Back up your checkpoints folder first,
# since the next run will overwrite the old files.

# The returned cached_history will need to be added to the graphs at the bottom.

In [None]:
# Unsupervised (siamese) training on the batched generator dataset.
# The result's `.history` is plotted in the last cell of the notebook.
unsupervised_data = model.fit_siam_generator(
    dataset,
    epochs=UNSUPERVISED_EPOCHS,
    verbose=1,
    num_runs=NUM_INITIAL_SEARCHES,
    run_epochs=NUM_EPOCHS_PER_SEARCH,
)

In [None]:
# _preload(max_commits = MAX_LABELLED_COMMITS)

In [10]:
# Alternative loader, kept for reference:
# X_train, X_test, y_train, y_test = get_labelled(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)
# `balance=True` is passed here; presumably it requests balanced classes — see get_rendered_examples.
X_train, X_test, y_train, y_test = get_rendered_examples(balance=True)

# Partition the test set by label so each class can be evaluated on its own.
pos_idx = [i for i in range(len(X_test)) if y_test[i] == 1]
neg_idx = [i for i in range(len(X_test)) if y_test[i] == 0]

X_test_1 = [X_test[i] for i in pos_idx]
y_test_1 = [y_test[i] for i in pos_idx]
X_test_0 = [X_test[i] for i in neg_idx]
y_test_0 = [y_test[i] for i in neg_idx]

In [None]:
# For each split, report the number of examples and the mean label value
# (label sum divided by label count).
for size_caption, split_caption, labels in (
    ("Train set size", "Train set split", y_train),
    ("Test set size", "Test set split", y_test),
):
    print(size_caption, len(labels))
    print(split_caption, np.sum(labels) / len(labels))

In [12]:
#For memory optimization
#_unload()

In [None]:
# Supervised binary-classification training; the test split doubles as the
# validation data. The result's `.history` is plotted further down.
supervised_results = model.fit_binary_classification(
    X_train,
    np.array(y_train),
    epochs=SUPERVISED_EPOCHS,
    batch_size=SUPERVISED_BATCH_SIZE,
    verbose=1,
    validation_data=[X_test, np.array(y_test)],
)

In [None]:
# Evaluate on the full test set ...
score = model.evaluate_binary_classification(
    X_test, np.array(y_test), verbose=0
)
print("Score:", score)

# ... then on the label == 1 examples only ...
score1 = model.evaluate_binary_classification(
    X_test_1, np.array(y_test_1), verbose=0
)
print("Score1:", score1)

# ... and finally on the label == 0 examples only.
score0 = model.evaluate_binary_classification(
    X_test_0, np.array(y_test_0), verbose=0
)
print("Score0:", score0)

In [None]:
# Plot the supervised training history: training vs. validation metrics plus
# the training loss, all on one set of axes.
history = supervised_results.history
epochs = range(1, len(history['accuracy']) + 1)

fig, ax = plt.subplots(figsize=(12, 8))

# (history key, legend label, line style) in drawing order.
# Validation curves reuse the training colours at half opacity.
curves = [
    ('accuracy', 'Training Accuracy', dict(linestyle='-', color='b')),
    ('f1_score', 'Training F1 Score', dict(linestyle='--', color='g')),
    # ('precision', 'Training Precision', dict(linestyle='-.', color='r')),
    # ('recall', 'Training Recall', dict(linestyle=':', color='c')),
    ('val_accuracy', 'Validation Accuracy', dict(linestyle='-', color='b', alpha=0.5)),
    ('val_f1_score', 'Validation F1 Score', dict(linestyle='--', color='g', alpha=0.5)),
    # ('val_precision', 'Validation Precision', dict(linestyle='-.', color='r', alpha=0.5)),
    # ('val_recall', 'Validation Recall', dict(linestyle=':', color='c', alpha=0.5)),
    ('loss', 'Training Loss', dict(linestyle=':')),
]
for key, label, style in curves:
    ax.plot(epochs, history[key], label=label, **style)

# Axis labels, title and legend, then render.
ax.set_xlabel('Epochs')
ax.set_ylabel('Metrics')
ax.set_title('Training and Validation Metrics')
ax.legend(loc='best')
plt.show()

In [None]:
# Plot the training loss recorded during the unsupervised phase.
history = unsupervised_data.history
epochs = range(1, len(history['loss']) + 1)

plt.figure(figsize=(10, 6))
plt.plot(epochs, history['loss'], label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training Loss')

# Aim for roughly six evenly spaced ticks. The step is clamped to at least 1:
# with fewer than six recorded epochs `len(epochs) // 6` is 0 and the previous
# `i % 0` raised ZeroDivisionError.
tick_step = max(1, len(epochs) // 6)
tick_positions = list(range(0, len(epochs), tick_step))
# Tick labels are shifted by NUM_EPOCHS_PER_SEARCH — presumably because the
# recorded history starts after the initial search epochs; confirm, especially
# when NUM_INITIAL_SEARCHES is None.
tick_labels = [position + NUM_EPOCHS_PER_SEARCH for position in tick_positions]
plt.xticks(tick_positions, tick_labels)
plt.show()