In [1]:
# Enable autoreload so edits to imported project modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix

from issue_tagging_bot.issue_data import Stage1PreprocData, Stage2PreprocData

In [109]:
%load_ext tensorboard
%tensorboard --logdir=./my_logs --port=6006

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [103]:
# Instantiate the stage-2 data wrapper and unpack the three datasets it
# returns into the train / validation / test names used below.
stage2 = Stage2PreprocData()
stage2_splits = stage2.to_datasets()
train_set, val_set, test_set = stage2_splits

In [4]:
def only_nixos_labels(dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Reduce every dataset element to an ``(issue_body, label)`` pair.

    Each incoming element is an ``(issue_body, labels, issue_num)`` triple.
    The issue number is dropped and only the last entry of ``labels`` is kept
    (presumably the NixOS label, going by the function name -- TODO confirm
    against the preprocessing code).

    Args:
        dataset: Dataset whose elements are ``(issue_body, labels, issue_num)``.

    Returns:
        A dataset yielding ``(issue_body, labels[-1])``.
    """

    def _keep_body_and_last_label(issue_body, labels, issue_num):
        # issue_num is accepted only to match the element structure.
        return issue_body, labels[-1]

    return dataset.map(_keep_body_and_last_label)

In [104]:
# Pipeline settings, named so they can be tuned in one place.
SHUFFLE_BUFFER_SIZE = 100000
SHUFFLE_SEED = 42
TRAIN_BATCH_SIZE = 100

# Shuffle under a new name instead of rebinding `train_set`: re-running this
# cell then no longer stacks another shuffle (and another buffer) on top of an
# already shuffled dataset.
# NOTE: `train_set` itself now stays unshuffled; use `train_set_shuffled`
# wherever shuffled examples are needed.
train_set_shuffled = train_set.shuffle(
    buffer_size=SHUFFLE_BUFFER_SIZE,
    seed=SHUFFLE_SEED,
    reshuffle_each_iteration=True,
)

# Training data: drop issue numbers, batch, and prefetch one batch ahead.
train_set_without_issue_num = (
    only_nixos_labels(train_set_shuffled).batch(TRAIN_BATCH_SIZE).prefetch(1)
)
# Validation and test data are fed one example per batch and are not shuffled.
val_set_without_issue_num = only_nixos_labels(val_set).batch(1)
test_set_without_issue_num = only_nixos_labels(test_set).batch(1)

In [19]:
# Display the batched training pipeline to check its element shapes and dtypes.
train_set_without_issue_num

<BatchDataset shapes: ((None, 1000), (None,)), types: (tf.int8, tf.int64)>

In [105]:
# Fully connected binary classifier: two ReLU hidden layers (3000 and 1000
# units), each followed by 50% dropout, then a single sigmoid output unit.
model = tf.keras.models.Sequential()
for hidden_units in (3000, 1000):
    model.add(tf.keras.layers.Dense(hidden_units, activation="relu"))
    model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

#model.summary()

In [106]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        # `Accuracy` counts exact equality between predictions and labels, so
        # with sigmoid probabilities it is essentially always zero.
        # `BinaryAccuracy` thresholds the probability (0.5 by default) before
        # comparing. The name is pinned to "accuracy" so history keys and
        # logs keep their previous label.
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.FalseNegatives(),
        tf.keras.metrics.FalsePositives(),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]
)

In [107]:
# Train for five epochs, evaluating on the validation pipeline during training.
history = model.fit(
    train_set_without_issue_num,
    epochs=5,
    validation_data=val_set_without_issue_num,
)

Train for 126 steps, validate for 1600 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# Loss and compiled metrics on the test pipeline.
model.evaluate(x=test_set_without_issue_num)



[0.25615801994372783, 0.932]

In [79]:
# Probabilities strictly above this value are labelled 1, everything else 0.
DECISION_THRESHOLD = 0.1

# Keep the raw sigmoid outputs under their own name instead of overwriting
# them with the thresholded labels.
predicted_probabilities = model.predict(test_set_without_issue_num).reshape(-1)
# Vectorised thresholding replaces the per-element Python lambda.
predictions = (predicted_probabilities > DECISION_THRESHOLD).astype(int)

In [80]:
# Collect the true labels batch by batch. Concatenating (instead of stacking
# the batches into one array and reshaping) also works when batches differ in
# length, e.g. a short final batch with a batch size above 1.
label_batches = test_set_without_issue_num.map(lambda features, labels: labels)
real_vals = np.concatenate(list(label_batches.as_numpy_iterator()))

In [81]:
# Compare the true test labels with the thresholded predictions.
confusion_matrix(y_true=real_vals, y_pred=predictions)

array([[850, 548],
       [ 58,  44]])

In [39]:


# TODO: confusion_matrix cannot mix binary labels with continuous scores (it raises ValueError); threshold the predicted probabilities into 0/1 before calling it.

array([0.07405987, 0.08968125, 0.16573511, ..., 0.05871441, 0.11020781,
       0.11297136], dtype=float32)

In [37]:
#confusion_matrix(blaha, predictions)

ValueError: Classification metrics can't handle a mix of binary and continuous targets