In [1]:
import os
# Runtime tuning via environment variables. They are assigned before the
# project imports below, presumably so that the native libraries those modules
# load read them at initialisation -- TODO confirm which import pulls them in.
# NOTE(review): '3' is presumably the quietest TensorFlow C++ log level -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): presumably caps the OpenMP worker thread count at 3 -- confirm.
os.environ['OMP_NUM_THREADS'] = '3'
# NOTE(review): presumably the Intel OpenMP idle spin time in ms -- confirm.
os.environ['KMP_BLOCKTIME'] = '1'
import numpy as np
import gc
from tqdm.auto import tqdm
from dataset import get_labelled, get_unlabelled, _preload
from Commit import CommitFactory
from Model import CommitDiffModelFactory

In [2]:
# Size settings shared across the notebook. BAG_SIZE and CONTEXT_SIZE are
# forwarded to both factories and to both dataset loaders; OUTPUT_SIZE is
# forwarded only to CommitDiffModelFactory.
BAG_SIZE = 512
OUTPUT_SIZE = 256
CONTEXT_SIZE = 16

In [3]:
# Bind the objects returned by the two project factories, each configured
# with the notebook-level size constants. CommitDiffModel is later called to
# create one model per encoder setting.
Commit = CommitFactory(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
)
CommitDiffModel = CommitDiffModelFactory(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
    OUTPUT_SIZE=OUTPUT_SIZE,
)

In [4]:
# Warm up the dataset module before the get_* calls below (the captured output
# shows it loading the commit lookup pickles).
# Both caps are passed as integers: `4096 // 2` is the int 2048, whereas the
# previous `4096 / 2` is true division in Python 3 and produced the float
# 2048.0 for what is a size limit.
_preload(max_commit_bag_size=4096 // 2, max_commits=4096 * 4)

Loading Commit lookup table


  0%|          | 0/7 [00:00<?, ?it/s]

Loading file ../data/commit_lookups/supervised_commit_data_lookup0-1000.pickle
Appending pickle of length: 184 , new dict length: 100
Loading file ../data/commit_lookups/commit_data_lookup77500-80000.pickle
Appending pickle of length: 2500 , new dict length: 2185
Loading file ../data/commit_lookups/commit_data_lookup75000-77500.pickle
Appending pickle of length: 2499 , new dict length: 4208
Loading file ../data/commit_lookups/commit_data_lookup12500-15000.pickle
Appending pickle of length: 2497 , new dict length: 6144
Loading file ../data/commit_lookups/commit_data_lookup72500-75000.pickle
Appending pickle of length: 2500 , new dict length: 8175
Loading file ../data/commit_lookups/commit_data_lookup10000-12500.pickle
Appending pickle of length: 2496 , new dict length: 10092
Loading file ../data/commit_lookups/commit_data_lookup70000-72500.pickle
Appending pickle of length: 2497 , new dict length: 12074


In [5]:
# Labelled data, already split into train/test inputs and targets by the
# dataset helper.
X_train, X_test, y_train, y_test = get_labelled(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
)

Generating Positive X_train:   0%|          | 0/97 [00:00<?, ?it/s]

Generating Positive y_train:   0%|          | 0/97 [00:00<?, ?it/s]

Generating Negative X_train:   0%|          | 0/70 [00:00<?, ?it/s]

Generating Negative y_train:   0%|          | 0/70 [00:00<?, ?it/s]

In [6]:
# Unlabelled samples; used further down as the input to model.fit_siam.
X_train_unsupervised = get_unlabelled(
    BAG_SIZE=BAG_SIZE,
    CONTEXT_SIZE=CONTEXT_SIZE,
)

Generating Unsupervised X_train:   0%|          | 0/12074 [00:00<?, ?it/s]

In [7]:
# Report, for each labelled split, the number of samples and the ratio
# sum(labels) / len(labels); then the number of unlabelled samples.
for size_caption, split_caption, labels in (
    ("Train set size", "Train set split", y_train),
    ("Test set size", "Test set split", y_test),
):
    print(size_caption, len(labels))
    print(split_caption, np.sum(labels) / len(labels))
print("Unsupervised Train Size", len(X_train_unsupervised))

Train set size 27
Train set split 0.7777777777777778
Test set size 7
Test set split 0.7142857142857143
Unsupervised Train Size 12074


In [None]:
# Sweep the `encoder` argument of model.initialize over 3..11. For each value:
# build a fresh model, run fit_siam on the unlabelled data, run
# fit_binary_classification on the labelled train split, then print the score
# returned by evaluate_binary_classification on the test split.
for encoder in tqdm(range(3, 12)):
    # Drop the previous iteration's model *before* building the next one.
    # Collecting while `model` was still bound could not reclaim it, so the old
    # and new models were alive at the same time. The last model built stays
    # bound to `model` after the loop.
    model = None
    gc.collect()
    try:
        model = CommitDiffModel(unsupervised_data_size=len(X_train_unsupervised))
        model.initialize(encoder=encoder)
        model.fit_siam(np.array(X_train_unsupervised), epochs=8, verbose=1)
        model.fit_binary_classification(
            X_train, np.array(y_train), epochs=8, batch_size=4, verbose=1
        )
        score = model.evaluate_binary_classification(
            X_test, np.array(y_test), verbose=0
        )
        print("Encoder:", encoder)
        print("Score:", score)
    except Exception as e:
        # Best-effort sweep: one failing encoder must not abort the rest, but
        # the report has to say which encoder failed and with what error type.
        print("Encoder", encoder, "failed:", type(e).__name__, e)
    print("------------------------------------------")

  0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8