In [1]:
import os
# Runtime tuning through environment variables. They are assigned here, ahead
# of the `import tensorflow` later in this cell, so they are already in place
# when the library is loaded.
# TF_CPP_MIN_LOG_LEVEL='3': quietest setting for TensorFlow's native (C++) logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# OMP_NUM_THREADS: caps the number of OpenMP worker threads at 3.
os.environ['OMP_NUM_THREADS'] = '3'
# KMP_BLOCKTIME: Intel OpenMP runtime knob -- time (ms) a worker thread keeps
# spinning after a parallel region before going to sleep.
# NOTE(review): only relevant on OpenMP/MKL-backed builds -- confirm it applies here.
os.environ['KMP_BLOCKTIME'] = '1'
import numpy as np
import gc
from tqdm.auto import tqdm
from dataset import get_labelled, get_unlabelled, _preload, _unload, unlabelled_generator
from Commit import CommitFactory
from Model import CommitDiffModelFactory
import tensorflow as tf

In [2]:
# Input geometry: each element of the unlabelled dataset is declared as a pair
# of (BAG_SIZE, CONTEXT_SIZE) float32 tensors; both values are also handed to
# CommitFactory and CommitDiffModelFactory.
CONTEXT_SIZE = 16
BAG_SIZE = 256
# Passed to model.initialize(encoder=...).
# NOTE(review): looks like a selector for an encoder variant -- confirm in Model.py.
ENCODER = 0
# Passed to CommitDiffModelFactory as OUTPUT_SIZE.
OUTPUT_SIZE = 512

# NOTE(review): MAX_BAG_SIZE_FILTER and MAX_LABELLED_COMMITS are not referenced
# by any cell visible in this section of the notebook -- confirm they are still needed.
MAX_BAG_SIZE_FILTER = 2048
MAX_LABELLED_COMMITS = 512
# Forwarded to unlabelled_generator (via `args`), to the model constructor
# (unsupervised_data_size) and to _preload(max_commits=...).
MAX_UNLABELLED_COMMITS = 10000
# Batch size of the unlabelled tf.data pipeline; also given to the model as siam_batch_size.
SIAM_BATCH_SIZE = 64

In [3]:
# Every element produced by `unlabelled_generator` is declared to TensorFlow as
# a pair of float32 tensors, each of shape (BAG_SIZE, CONTEXT_SIZE).
element_shape = (BAG_SIZE, CONTEXT_SIZE)
output_types = (tf.float32,) * 2
output_shapes = (element_shape,) * 2

# Positional arguments TensorFlow forwards to the generator function.
generator_args = (BAG_SIZE, CONTEXT_SIZE, MAX_UNLABELLED_COMMITS)

# Wrap the generator as a tf.data pipeline and group consecutive elements into
# batches of SIAM_BATCH_SIZE.
dataset = tf.data.Dataset.from_generator(
    unlabelled_generator,
    output_types=output_types,
    output_shapes=output_shapes,
    args=generator_args,
).batch(SIAM_BATCH_SIZE)

In [4]:
# Both factories are parameterised with the same bag/context dimensions, so the
# shared keyword arguments are spelled out once.
shape_kwargs = {"BAG_SIZE": BAG_SIZE, "CONTEXT_SIZE": CONTEXT_SIZE}

Commit = CommitFactory(**shape_kwargs)
CommitDiffModel = CommitDiffModelFactory(OUTPUT_SIZE=OUTPUT_SIZE, **shape_kwargs)

In [5]:
# Instantiate the model with the unlabelled-data size and the batch size used
# by the unlabelled dataset pipeline, then initialise it with the configured encoder.
model = CommitDiffModel(
    unsupervised_data_size=MAX_UNLABELLED_COMMITS,
    siam_batch_size=SIAM_BATCH_SIZE,
)
model.initialize(encoder=ENCODER)

In [None]:
# Train on the batched unlabelled dataset for 8 epochs with progress output.
siam_fit_options = {"epochs": 8, "verbose": 1}
model.fit_siam_generator(dataset, **siam_fit_options)

Epoch 1/8
Epoch 2/8
 18/157 [==>...........................] - ETA: 2:12 - loss: -0.0813

In [None]:
# Calls the dataset module's `_preload` helper with the unlabelled-commit cap.
# NOTE(review): presumably loads commit data into memory ahead of
# `get_labelled` (it is paired with `_unload()` below) -- confirm in dataset.py.
_preload(max_commits = MAX_UNLABELLED_COMMITS)

In [None]:
# Fetch the labelled data as four values: train/test inputs and train/test
# labels, built with the same BAG_SIZE / CONTEXT_SIZE as the unlabelled data.
# NOTE(review): how the train/test split is chosen is decided inside
# `get_labelled` -- check dataset.py for the ratio and any shuffling.
X_train, X_test, y_train, y_test = get_labelled(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)

In [None]:
# Report, for each split, the number of labels and the label sum divided by
# that count (i.e. the share of positives if labels are 0/1).
for size_caption, split_caption, labels in (
    ("Train set size", "Train set split", y_train),
    ("Test set size", "Test set split", y_test),
):
    label_count = len(labels)
    print(size_caption, label_count)
    print(split_caption, np.sum(labels) / label_count)

In [None]:
# Memory optimisation: call the dataset module's `_unload` helper now that the
# labelled split has been extracted.
# NOTE(review): presumably releases whatever `_preload` cached -- confirm in dataset.py.
_unload()

In [None]:
# Supervised stage: fit the binary classifier on the labelled training split.
# Labels are converted to a NumPy array before being handed to the model.
model.fit_binary_classification(
    X_train,
    np.array(y_train),
    epochs=8,
    batch_size=64,
    verbose=1,
)

In [None]:
# Score the classifier on the held-out labelled split; progress output is
# silenced and the call's return value is left as the cell's displayed result.
model.evaluate_binary_classification(
    X_test,
    np.array(y_test),
    verbose=0,
)