In [1]:
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from keras.layers import Dense, Input, Dropout, Flatten, Concatenate
from keras.models import Model
from keras.optimizers import Adam
from tensorflow import float16

import wandb
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint
import pickle
from transformers import  TFBertModel
bmodel = TFBertModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

2023-11-09 17:14:22.504527: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-09 17:14:24.277916: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14211 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:af:00.0, compute capability: 7.0
Some layers from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT e

In [2]:
with open('data/dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

for k, v in dataset.items():
    print(k, v.keys())

sentence dict_keys(['text', 'max_length', 'encoded', 'test', 'train'])
predication dict_keys(['text', 'max_length', 'encoded', 'test', 'train'])
concatenated dict_keys(['text', 'max_length', 'encoded', 'test', 'train'])
label dict_keys(['text', 'test', 'train'])


In [3]:
def train_model():
    # these are config values that came from wandb sweep passes
    config_defaults= {
        "model_name": "two_input_model",
        "dropout": 0.43,
        "learning_rate": 0.000030,
        "epsilon": 1e-08,
        "loss": "binary_crossentropy",
        "metric": ["accuracy"],
        "epoch": 6,
        "batch_size":4,
        "validation_split": 0.2,
    }
    
    wandb.init(
        project="uminn",
        config=config_defaults
    )

    train_x = (dataset['predication']['train'], dataset['sentence']['train'])
    train_y = dataset['label']['train']
    
    # define the model

    input_preds = Input(shape=(32,), dtype='int32')
    input_sents = Input(shape=(250,), dtype='int32')
    preds_bert_output = bmodel(input_preds)
    sents_bert_output = bmodel(input_sents)
    preds_bert_output = preds_bert_output[1]
    sents_bert_output = sents_bert_output[1]
    hidden_sents_1 = Dense(32, activation='relu')(sents_bert_output)
    dropout_sents_1 = Dropout(wandb.config.dropout)(hidden_sents_1)
    concat = Concatenate()([preds_bert_output, hidden_sents_1])
    dense = Dense(1, activation='sigmoid')(concat)
    two_input_model = Model(inputs=[input_preds, input_sents], outputs=dense)

                         
    opt = Adam(learning_rate = wandb.config.learning_rate, 
               epsilon=wandb.config.epsilon)

    two_input_model.compile(loss=wandb.config.loss, 
                         optimizer=opt, 
                         metrics=['accuracy'])
    
    two_input_model.fit(train_x,
                     train_y,
                     epochs=wandb.config.epoch,
                     validation_split=wandb.config.validation_split,
                     callbacks=[WandbMetricsLogger()])
    
    return two_input_model

    

In [4]:
# # just do one to make sure we're OK here
two_input_model = train_model()
wandb.finish()

In [5]:
# Or do a hyperparameter sweep to find some good settings

sweep_config = {
    'method': 'bayes',
    'name': 'umn-two-input',
    'metric': {
        'name': 'epoch/val_loss',
        'goal': 'minimize'
    },
    'parameters': {
        "dropout": {'min': 0.20, 'max': 0.70},
        "learning_rate": {'values': [5e-5, 3e-5, 2e-5]},
        "epoch": {'min': 6, 'max': 10},
        "batch_size": {'values': [2]}
    },
}

sweep_id = wandb.sweep(sweep_config, project="uminn")
wandb.agent(sweep_id, train_model, count=10)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: r4abtlfi
Sweep URL: https://wandb.ai/dlhs_rau/uminn/sweeps/r4abtlfi


[34m[1mwandb[0m: Agent Starting Run: oaqs1n4f with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	dropout: 0.5566476411397208
[34m[1mwandb[0m: 	epoch: 10
[34m[1mwandb[0m: 	learning_rate: 5e-05
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkgweber[0m ([33mdlhs_rau[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0,1
epoch/accuracy,▁▂▃▅▆▇▇███
epoch/epoch,▁▂▃▃▄▅▆▆▇█
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁
epoch/loss,██▇▆▅▃▂▂▁▁
epoch/val_accuracy,▁▃█▇▄▇▇▅▇▃
epoch/val_loss,▁▁▁▁▂▃▃▅▇█

0,1
epoch/accuracy,0.97448
epoch/epoch,9.0
epoch/learning_rate,5e-05
epoch/loss,0.07456
epoch/val_accuracy,0.59792
epoch/val_loss,1.99439


[34m[1mwandb[0m: Agent Starting Run: yci0efr4 with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	dropout: 0.5254547559667291
[34m[1mwandb[0m: 	epoch: 7
[34m[1mwandb[0m: 	learning_rate: 2e-05
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch 1/7
 9/60 [===>..........................] - ETA: 30s - loss: 0.2912 - accuracy: 0.8993

2023-11-09 17:22:31.202138: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 23.44MiB (rounded to 24576000)requested by op model/tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/batchnorm_1/mul_2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-11-09 17:22:31.202282: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2023-11-09 17:22:31.202313: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 120, Chunks in use: 119. 30.0KiB allocated for chunks. 29.8KiB in use in bin. 1.7KiB client-requested in use in bin.
2023-11-09 17:22:31.202337: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Run yci0efr4 errored: ResourceExhaustedError()
[34m[1mwandb[0m: [32m[41mERROR[0m Run yci0efr4 errored: ResourceExhaustedError()
[34m[1mwandb[0m: Agent Starting Run: cclm8rte with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	dropout: 0.5299810779406129
[34m[1mwandb[0m: 	epoch: 7
[34m[1mwandb[0m: 	learning_rate: 3e-05
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch 1/7


2023-11-09 17:23:31.110663: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 23.44MiB (rounded to 24576000)requested by op model/tf_bert_model/bert/encoder/layer_._11/attention/self/key/Tensordot_1/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-11-09 17:23:31.110825: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2023-11-09 17:23:31.110854: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 201, Chunks in use: 200. 50.2KiB allocated for chunks. 50.0KiB in use in bin. 2.9KiB client-requested in use in bin.
2023-11-09 17:23:31.110874: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 1, Chunks in use: 0. 512B allocated for chunks. 0B in use in bin. 0B cl

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Run cclm8rte errored: ResourceExhaustedError()
[34m[1mwandb[0m: [32m[41mERROR[0m Run cclm8rte errored: ResourceExhaustedError()
[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


### Evaluation

In [6]:
two_input_model.summary()

NameError: name 'two_input_model' is not defined

In [None]:
mse_test = two_input_model.evaluate(x=(dataset['predication']['test'], dataset['sentence']['test']), y=dataset['label']['test'])

In [None]:
y_pred = two_input_model.predict((dataset['predication']['test'], dataset['sentence']['test']))

In [None]:
strata = pd.DataFrame({'y_pred': y_pred.round().flatten(), 'label': dataset['label']['test'],
                       # 'predicate': sentences.predicate[0:test_size]
                      })

In [None]:
print("OVERALL")
print(classification_report(strata.label, strata.y_pred))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(strata.label, strata.y_pred, normalize='true')).plot()
plt.show()

In [None]:
# for stratum in strata.predicate.unique():
#     print(stratum)
#     df = strata[strata.predicate == stratum]
#     print(classification_report(df.label, df.y_pred))
#     print(strata[strata.predicate == stratum].y_pred.value_counts())
#     disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(df.label, df.y_pred, normalize='true')).plot()
#     plt.show()

In [None]:
wandb.finish()

In [None]:
concat_model.save('models/concat_resilient_music.keras')