In [1]:
import gc
import os
import traceback

# These environment knobs are assigned before numpy and the project modules are
# imported, preserving the original ordering (presumably so the native runtimes
# pick them up at initialisation - confirm before moving them).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['OMP_NUM_THREADS'] = '3'
os.environ['KMP_BLOCKTIME'] = '1'

import numpy as np
from tqdm.auto import tqdm

from dataset import get_labelled, get_unlabelled, _preload, _unload
from Commit import CommitFactory
from Model import CommitDiffModelFactory

In [2]:
##### META PARAMS #####

# Size / depth / length of a single context (path).
CONTEXT_SIZE = 16

# How many contexts are sampled into each bag.
BAG_SIZE = 256

# Width of the fixed-length vector the encoder produces.
OUTPUT_SIZE = 512

# Hard upper limit on bag size: bags larger than this are never sampled.
# Use None to switch the filter off.
MAX_BAG_SIZE_FILTER = 8 * 1024

# Cap on the number of labelled commits used for training.
MAX_LABELLED_COMMITS = 16 * 1024

# Cap on the number of unlabelled commits used for training.
MAX_UNLABELLED_COMMITS = 16 * 1024

# Batch size for the unsupervised training stage.
# Higher = faster/better, at the cost of more memory.
SIAM_BATCH_SIZE = 64

In [3]:
# Specialise the project factories to the sizes configured in META PARAMS.
Commit = CommitFactory(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
)
CommitDiffModel = CommitDiffModelFactory(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
    OUTPUT_SIZE=OUTPUT_SIZE,
)

In [4]:
# Preload the commit lookups, passing the bag-size filter and the
# labelled-commit cap from META PARAMS.
_preload(
    max_commit_bag_size=MAX_BAG_SIZE_FILTER,
    max_commits=MAX_LABELLED_COMMITS,
)

Loading Commit lookup table


Loading commit lookups:   0%|          | 0/16384 [00:00<?, ?it/s]

Loading file ../data/commit_lookups/labelled/01_priority_commit_lookups.pickle
Appending pickle of length: 7549 , new dict length: 6906
Loading file ../data/commit_lookups/labelled/02_priority_commit_lookups.pickle
Appending pickle of length: 10410 , new dict length: 9517
Loading file ../data/commit_lookups/labelled/03_priority_commit_lookups.pickle
Appending pickle of length: 7246 , new dict length: 12646


In [5]:
# Build the labelled train/test split; balance=True is forwarded to the
# project helper (presumably class balancing - confirm in dataset.py).
X_train, X_test, y_train, y_test = get_labelled(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
    balance=True,
)

Generating apache_positive_pairs X_train and y_train:   0%|          | 0/2264 [00:00<?, ?it/s]

Generating apache_negative_pairs X_train and y_train:   0%|          | 0/9511 [00:00<?, ?it/s]

In [6]:
# Build the unlabelled training set used by the unsupervised stage.
X_train_unsupervised = get_unlabelled(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
)

Generating Unsupervised X_train:   0%|          | 0/12646 [00:00<?, ?it/s]

In [7]:
# Summarise the datasets: size of each labelled split, the ratio
# sum(labels) / len(labels) for each, and the unlabelled set size.
split_report = (
    ("Train set size", len(y_train)),
    ("Train set split", np.sum(y_train) / len(y_train)),
    ("Test set size", len(y_test)),
    ("Test set split", np.sum(y_test) / len(y_test)),
    ("Unsupervised Train Size", len(X_train_unsupervised)),
)
for label, value in split_report:
    print(label, value)

Train set size 3622
Train set split 0.507454445057979
Test set size 906
Test set split 0.47019867549668876
Unsupervised Train Size 12646


In [8]:
# Counterpart to the earlier _preload() call. Presumably releases the commit
# lookups now that the datasets have been built - confirm in dataset.py.
_unload()

In [9]:
# Sweep the encoder output width over the smaller sizes. For each width: build
# a model, run the unsupervised fit on the unlabelled data, fit the binary
# classifier on the labelled training split, and print the test-set score.
for output_size in tqdm([16, 32, 64, 128, 256, 512, 1024]):
    # Bind up front so the cleanup in `finally` never hits an unbound name
    # when the factory or constructor raises before `model` is assigned.
    model = None
    try:
        # Sweep-local names on purpose: the loop must not overwrite the
        # OUTPUT_SIZE / CommitDiffModel globals configured earlier in the notebook.
        sweep_model_cls = CommitDiffModelFactory(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE, OUTPUT_SIZE=output_size)
        model = sweep_model_cls(unsupervised_data_size=len(X_train_unsupervised), siam_batch_size=SIAM_BATCH_SIZE)
        model.initialize(encoder=5)
        model.fit_siam(np.array(X_train_unsupervised), epochs=8, verbose=1)
        model.fit_binary_classification(X_train, np.array(y_train), epochs=8, batch_size=4, verbose=1)
        score = model.evaluate_binary_classification(X_test, np.array(y_test), verbose=0)
        print("OUTPUT_SIZE:", output_size)
        print("Score:", score)
    except Exception:
        # Best-effort sweep: one failing width must not abort the others, but
        # the failure has to be attributable, so report the width and traceback.
        print("OUTPUT_SIZE:", output_size, "FAILED")
        print(traceback.format_exc())
    finally:
        print("------------------------------------------")
        # Drop the model reference before the next iteration builds a new one.
        del model
        gc.collect()

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 16
Score: 0.4701959490776062
------------------------------------------
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 32
Score: 0.49415454268455505
------------------------------------------
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 64
Score: 0.5024657845497131
------------------------------------------
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 128
Score: 0.4665232002735138
------------------------------------------
Epoch 1/8
Epoch 2/

In [10]:
# Sweep the encoder output width over the larger sizes. For each width: build
# a model, run the unsupervised fit on the unlabelled data, fit the binary
# classifier on the labelled training split, and print the test-set score.
for output_size in tqdm([2048, 4096, 4096*2]):
    # Bind up front so the cleanup in `finally` never hits an unbound name
    # when the factory or constructor raises before `model` is assigned.
    model = None
    try:
        # Sweep-local names on purpose: the loop must not overwrite the
        # OUTPUT_SIZE / CommitDiffModel globals configured earlier in the notebook.
        sweep_model_cls = CommitDiffModelFactory(BAG_SIZE=BAG_SIZE, CONTEXT_SIZE=CONTEXT_SIZE, OUTPUT_SIZE=output_size)
        model = sweep_model_cls(unsupervised_data_size=len(X_train_unsupervised), siam_batch_size=SIAM_BATCH_SIZE)
        model.initialize(encoder=5)
        model.fit_siam(np.array(X_train_unsupervised), epochs=8, verbose=1)
        model.fit_binary_classification(X_train, np.array(y_train), epochs=8, batch_size=4, verbose=1)
        score = model.evaluate_binary_classification(X_test, np.array(y_test), verbose=0)
        print("OUTPUT_SIZE:", output_size)
        print("Score:", score)
    except Exception:
        # Best-effort sweep: one failing width must not abort the others, but
        # the failure has to be attributable, so report the width and traceback.
        print("OUTPUT_SIZE:", output_size, "FAILED")
        print(traceback.format_exc())
    finally:
        print("------------------------------------------")
        # Drop the model reference before the next iteration builds a new one.
        del model
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 2048
Score: 0.4744725525379181
------------------------------------------
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 4096
Score: 0.6999950408935547
------------------------------------------
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
OUTPUT_SIZE: 8192
Score: 0.7050160765647888
------------------------------------------
