In [1]:
import os
# Environment knobs are written before the `import tensorflow` further down,
# presumably so they are already in place when TensorFlow's native runtime loads.
# TF_CPP_MIN_LOG_LEVEL='3' presumably mutes TensorFlow's native log output — confirm against the TF docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# OpenMP / Intel-OpenMP threading settings (thread count and worker spin time) — presumably tuned for the
# author's CPU; TODO confirm these values are appropriate for the machine running the notebook.
os.environ['OMP_NUM_THREADS'] = '3'
os.environ['KMP_BLOCKTIME'] = '1'
import numpy as np
import gc
from tqdm.auto import tqdm
from dataset import get_labelled, get_unlabelled, _preload, _unload, unlabelled_generator, rendred_unlabelled_generator
from render_lookups import render_lookups, render_examples, get_rendered_examples
from Commit import CommitFactory
from Model import CommitDiffModelFactory
import tensorflow as tf

In [2]:
##### META PARAMS #####

# If either of the two values in this fenced section changes, the data must be re-rendered!
##########################################

# Size / depth / length of each individual context/path
CONTEXT_SIZE = 8

# Total number of contexts to sample per bag
BAG_SIZE = 128

##########################################

# Encoder to use
ENCODER = 0

# Size of the fixed-length vector to encode to
OUTPUT_SIZE = 512

# Hard limit to filter bags by. Bags which exceed this size won't even be sampled.
# Set to None to turn off
MAX_BAG_SIZE_FILTER = 2048

# Maximum number of labelled commits to train on (4096 * 8 = 32768)
MAX_LABELLED_COMMITS = 4096 * 8

# Maximum number of unlabelled commits to train on (4096 * 8 * 8 = 262144)
MAX_UNLABELLED_COMMITS = 4096 * 8 * 8

# Batch size for unsupervised training. Higher = faster, but needs more memory and may(?) reduce quality. Jury's out.
SIAM_BATCH_SIZE = 1024
# Handed to the model constructor as `steps_per_update`.
SIAM_BATCHES_PER_UPDATE = 2

In [3]:
# Ask the factory for a Commit object/class configured with the bag and context sizes above.
# NOTE(review): `Commit` is not referenced by any later cell visible here — presumably kept for
# interactive use or for side effects of the factory call; TODO confirm.
Commit = CommitFactory(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)

In [None]:
# You can download these data folders from drive, unless you want to change the settings.
# They go in data/commit_lookups/

# Only run if you need to re-process the raw pickles !!!
# (Uses the same BAG_SIZE / CONTEXT_SIZE as the META PARAMS cell.)
render_lookups(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)

In [None]:
# Only run if you need to re-process the raw pickles !!!
# (Uses the same BAG_SIZE / CONTEXT_SIZE as the META PARAMS cell.)
render_examples(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)

In [4]:
# Every element of the stream is declared as a pair of float32 tensors,
# each of shape (BAG_SIZE, CONTEXT_SIZE).
bag_shape = (BAG_SIZE, CONTEXT_SIZE)
output_types = (tf.float32,) * 2
output_shapes = (bag_shape,) * 2

# Positional arguments forwarded to the generator by tf.data.
# NOTE(review): the config cell says MAX_BAG_SIZE_FILTER may be None, but `args` values are
# converted to tensors by from_generator, so None may not be accepted here — verify before relying on it.
generator_args = (BAG_SIZE, CONTEXT_SIZE, MAX_UNLABELLED_COMMITS, MAX_BAG_SIZE_FILTER)

# Stream the pre-rendered unlabelled examples and group them into training batches.
dataset = tf.data.Dataset.from_generator(
    rendred_unlabelled_generator,
    output_types=output_types,
    output_shapes=output_shapes,
    args=generator_args,
).batch(SIAM_BATCH_SIZE)

In [5]:
# Obtain the model class from the factory, parameterised by the bag/context sizes and the
# encoding width; it is instantiated in the following cell.
CommitDiffModel = CommitDiffModelFactory(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE, OUTPUT_SIZE=OUTPUT_SIZE)

In [6]:
# Instantiate the model with the unsupervised-training settings from the META PARAMS cell.
model = CommitDiffModel(
    unsupervised_data_size=MAX_UNLABELLED_COMMITS,
    siam_batch_size=SIAM_BATCH_SIZE,
    steps_per_update=SIAM_BATCHES_PER_UPDATE,
)
# Use the configured ENCODER rather than a literal 0, so that editing the config cell
# actually changes which encoder is initialised (ENCODER is currently 0, so today's
# behaviour is unchanged).
model.initialize(encoder=ENCODER)

In [7]:
# Unsupervised training pass over the batched unlabelled dataset; the return value is kept
# in `unsupervised_data`.
# NOTE(review): the saved output below shows "Epoch N/2" for each of the 4 search runs, while
# run_epochs=4 is passed here — presumably the output predates the current arguments; re-run to confirm.
unsupervised_data = model.fit_siam_generator(dataset, epochs=32, verbose=1, num_runs=4, run_epochs=4)

Searching for a good initial randomization:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [8]:
# _preload(max_commits = MAX_LABELLED_COMMITS)

In [10]:
# X_train, X_test, y_train, y_test = get_labelled(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE)
X_train, X_test, y_train, y_test = get_rendered_examples()  # You can also set balance=True (!)


def _select_by_label(examples, labels, wanted):
    """Return (examples, labels) restricted to the positions i where labels[i] == wanted.

    Positions are enumerated over len(examples) and both sequences are accessed by
    integer index, so order is preserved and plain lists are returned.
    """
    keep = [i for i in range(len(examples)) if labels[i] == wanted]
    return [examples[i] for i in keep], [labels[i] for i in keep]


# Per-class views of the test split, used later for per-class evaluation.
X_test_1, y_test_1 = _select_by_label(X_test, y_test, 1)
X_test_0, y_test_0 = _select_by_label(X_test, y_test, 0)

In [10]:
# Report the size of each split and the mean of its labels (sum / count).
for split_name, labels in (("Train", y_train), ("Test", y_test)):
    print(split_name + " set size", len(labels))
    print(split_name + " set split", np.sum(labels)/len(labels))

Train set size 11711
Train set split 0.1911877721800017
Test set size 2928
Test set split 0.19740437158469945


In [11]:
#For memory optimization
#_unload()

In [11]:
# Labels are converted to numpy arrays before being handed to the model.
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

# Supervised binary-classification training.
# NOTE(review): the test split doubles as validation data here, so the validation metrics are
# not independent of the final test scores — confirm this is intended.
# NOTE(review): the saved output below shows "Epoch N/64" while epochs=16 is passed — presumably
# the output predates the current arguments; re-run to confirm.
supervised_results = model.fit_binary_classification(
    X_train,
    y_train_arr,
    epochs=16,
    batch_size=128,
    verbose=1,
    validation_data=[X_test, y_test_arr],
)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [12]:
def _evaluate_and_report(tag, features, targets):
    """Evaluate the model on (features, targets), print the result as '<tag>: <result>', and return it."""
    result = model.evaluate_binary_classification(features, np.array(targets), verbose=0)
    print(tag + ":", result)
    return result


# Whole test split, then the label==1 subset, then the label==0 subset.
score = _evaluate_and_report("Score", X_test, y_test)
score1 = _evaluate_and_report("Score1", X_test_1, y_test_1)
score0 = _evaluate_and_report("Score0", X_test_0, y_test_0)

Score: [0.36391377449035645, 0.8346994519233704]
Score1: [1.4785882234573364, 0.17820069193840027]
Score0: [0.08975138515233994, 0.9961702227592468]
