# Config and load data

In [None]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from BaseNet_clstm import *
from model import MMDNet, PredictNet
from loss import *

In [None]:

# *** MODIFICATIONS WERE MADE TO THIS CELL TO ANONYMIZE DATA NAMES - OUTPUT WAS CLEARED ***

# ===================================== Hyper-parameters =====================================
# -- optimisation --
batch_size = 512
num_epochs = 10
lr = 0.001
l2_decay = 5e-4
# -- model / data dimensions --
sequence_length = 100  # number of visits
input_size = 150
hidden_size = 128  # output of hidden features
num_layers = 2
num_classes = 1  # dense 1 for binary classification
# ===================================== LOAD DATA =====================================

print("loading data...")
path = r"/dirpath/"


def _load_site(site_name):
    """Load ``<site_name>_X.npy`` / ``<site_name>_y.npy`` from ``path``.

    Prints the shape of the feature array and returns ``(features, labels)``.
    """
    features = np.load(path + site_name + "_X.npy")
    labels = np.load(path + site_name + "_y.npy")
    print(site_name + " data:", features.shape)
    return features, labels


# Load order (1, 2, 3, 5, 4) is kept as in the original cell.
X_site1, y_site1 = _load_site("site1")

X_site2, y_site2 = _load_site("site2")

X_site3, y_site3 = _load_site("site3")

X_site5, y_site5 = _load_site("site5")

X_site4, y_site4 = _load_site("site4")

# --------------------------------

# Reshape every site's feature array to 3D: (samples, sequence_length, input_size).
# The target shape comes from the hyper-parameters rather than repeated
# literals, so changing sequence_length / input_size cannot silently leave
# the data in the old layout.
print("\nReshaping to 3D...")


def _to_3d(features):
    """Return ``features`` reshaped to ``(n_samples, sequence_length, input_size)``.

    ``n_samples`` is the size of the first axis of ``features``; NumPy raises
    ``ValueError`` if the element count does not match the target shape.
    """
    return features.reshape((features.shape[0], sequence_length, input_size))


X_site1 = _to_3d(X_site1)
print("site1 - 3D tensor:", X_site1.shape)

X_site2 = _to_3d(X_site2)
print("site2 - 3D tensor:", X_site2.shape)

X_site3 = _to_3d(X_site3)
print("site3 - 3D tensor:", X_site3.shape)

X_site4 = _to_3d(X_site4)
print("site4 - 3D tensor:", X_site4.shape)

X_site5 = _to_3d(X_site5)
print("site5 - 3D tensor:", X_site5.shape)

# -----------------------------------------------
print("preparing for DataLoader (X, y) tuple...")


def _as_sample_pairs(features, labels):
    """Return ``[(features[k], labels[k]), ...]`` with one tuple per row of ``features``.

    The resulting list is what the DataLoaders index into: each element is a
    single ``(X, y)`` sample.
    """
    return [(row, labels[k]) for k, row in enumerate(features)]


site1_data = _as_sample_pairs(X_site1, y_site1)

site2_data = _as_sample_pairs(X_site2, y_site2)

site5_data = _as_sample_pairs(X_site5, y_site5)

site3_data = _as_sample_pairs(X_site3, y_site3)

site4_data = _as_sample_pairs(X_site4, y_site4)

# -----------------------------------------

# Hold out 5% of each source site (1-4) as a validation set; site5 is not
# split and is used whole as the test set. The fixed random_state makes the
# split repeatable.
_split_options = dict(test_size=0.05, random_state=42)

X_site1_train, X_site1_val = train_test_split(site1_data, **_split_options)
X_site3_train, X_site3_val = train_test_split(site3_data, **_split_options)
X_site4_train, X_site4_val = train_test_split(site4_data, **_split_options)
X_site2_train, X_site2_val = train_test_split(site2_data, **_split_options)

# Summary of the resulting subset sizes, one "<name> <count>" entry per line.
_subset_sizes = [
    ("X_site1_train", len(X_site1_train)),
    ("X_site1_val", len(X_site1_val)),
    ("X_site2_train", len(X_site2_train)),
    ("X_site2_val", len(X_site2_val)),
    ("X_site3_train", len(X_site3_train)),
    ("X_site3_val", len(X_site3_val)),
    ("X_site4_train", len(X_site4_train)),
    ("X_site4_val", len(X_site4_val)),
    ("X_site5 -> test", len(site5_data)),
]
print("".join("%s %d \n" % (label, count) for label, count in _subset_sizes))

# DataLoader
print("DataLoader...")
# test
# Evaluation must see every sample, so the final partial batch is kept
# (drop_last=False); dropping it would silently exclude up to batch_size - 1
# test samples from the reported metrics. Note that the last batch can
# therefore contain fewer than batch_size rows.
tgt_test_dataloader = DataLoader(dataset=site5_data, batch_size=batch_size, shuffle=False, drop_last=False)

# train / validation
# Training loaders keep drop_last=True because the training loop slices the
# concatenated network output into fixed batch_size-sized chunks per source.
# They are shuffled so that the samples dropped with the last partial batch
# (and the batch composition) change from epoch to epoch instead of the same
# tail being excluded forever.
# Validation loaders keep every sample: with a 5% hold-out, drop_last=True
# would yield zero batches whenever the hold-out is smaller than batch_size.
# NOTE: src_2 is built from site3 and src_3 from site2 (kept as in the
# original cell).
src_1_train_dataloader = DataLoader(dataset=X_site1_train, batch_size=batch_size, shuffle=True, drop_last=True)
src_1_validate_dataloader = DataLoader(dataset=X_site1_val, batch_size=batch_size, shuffle=False, drop_last=False)

src_2_train_dataloader = DataLoader(dataset=X_site3_train, batch_size=batch_size, shuffle=True, drop_last=True)
src_2_validate_dataloader = DataLoader(dataset=X_site3_val, batch_size=batch_size, shuffle=False, drop_last=False)

src_3_train_dataloader = DataLoader(dataset=X_site2_train, batch_size=batch_size, shuffle=True, drop_last=True)
src_3_validate_dataloader = DataLoader(dataset=X_site2_val, batch_size=batch_size, shuffle=False, drop_last=False)

src_4_train_dataloader = DataLoader(dataset=X_site4_train, batch_size=batch_size, shuffle=True, drop_last=True)
src_4_validate_dataloader = DataLoader(dataset=X_site4_val, batch_size=batch_size, shuffle=False, drop_last=False)

# ==========================================================================================
device = torch.device("cpu")

# Number of batches per loader (used by the training loop for bookkeeping).
len_tgt = len(tgt_test_dataloader)
len_src_1_train = len(src_1_train_dataloader)
len_src_2_train = len(src_2_train_dataloader)
len_src_3_train = len(src_3_train_dataloader)
len_src_4_train = len(src_4_train_dataloader)

# Seed NumPy and PyTorch before any network is created so that weight
# initialisation is repeatable.
seed = 32
np.random.seed(seed=seed)
torch.manual_seed(seed)

# Initialize network: one BaseNet and one TransferNet per source site, plus a
# single TaskNet that is shared by all of them.
BaseNet_1 = BaseNet_clstm(input_size, hidden_size, num_layers, num_classes).to(device)
BaseNet_2 = BaseNet_clstm(input_size, hidden_size, num_layers, num_classes).to(device)
BaseNet_3 = BaseNet_clstm(input_size, hidden_size, num_layers, num_classes).to(device)
BaseNet_4 = BaseNet_clstm(input_size, hidden_size, num_layers, num_classes).to(device)
TransferNet_1 = MMDNet().to(device)
TransferNet_2 = MMDNet().to(device)
TransferNet_3 = MMDNet().to(device)
TransferNet_4 = MMDNet().to(device)
TaskNet = PredictNet().to(device)
# SELF ATTENTION - ADD IN V2

# Loss and optimizer
# nn.BCELoss expects probabilities in [0, 1]; TaskNet is presumably expected
# to end in a sigmoid -- TODO confirm against model.PredictNet.
task_criterion = nn.BCELoss()

# A single Adam optimizer drives every network; each network gets its own
# parameter group with the shared learning rate and L2 weight decay.
optimizer = optim.Adam(
    [
        {'params': net.parameters()}
        for net in (
            BaseNet_1, BaseNet_2, BaseNet_3, BaseNet_4,
            TransferNet_1, TransferNet_2, TransferNet_3, TransferNet_4,
            TaskNet,
        )
    ],
    lr=lr,
    weight_decay=l2_decay,
)

# Per-epoch loss histories filled in by the training loop.
src_loss_list = []
total_loss_list = []
tgt_val_loss_list = []

# Best (lowest) validation BCE seen so far. BCE is unbounded above, so start
# from +inf rather than 1.0 to guarantee the first measured value is accepted.
best_bce = float("inf")

print("===done===")

In [None]:
# Weight of the transfer (MMD) term in the combined objective.
transfer_weight = 0.5
# The original experiment back-propagates the task loss only. Set this to
# True to optimise the combined (task + weighted transfer) objective instead.
backprop_total_loss = False

# One (BaseNet, TransferNet, train loader) triple per source site. Each source
# is routed through its OWN TransferNet: sending every source through
# TransferNet_1 would leave TransferNet_2..4 at their random initial weights
# even though they are registered with the optimizer.
source_branches = (
    (BaseNet_1, TransferNet_1, src_1_train_dataloader),
    (BaseNet_2, TransferNet_2, src_2_train_dataloader),
    (BaseNet_3, TransferNet_3, src_3_train_dataloader),
    (BaseNet_4, TransferNet_4, src_4_train_dataloader),
)
all_nets = [net for base, transfer, _ in source_branches for net in (base, transfer)]
all_nets.append(TaskNet)


def _next_batch(batch_iter, loader):
    """Return ``(batch, iterator)``, restarting ``loader`` when it is exhausted.

    Source sites have different sizes, so shorter loaders must be cycled while
    the epoch walks once through source 4.
    """
    try:
        return next(batch_iter), batch_iter
    except StopIteration:
        batch_iter = iter(loader)
        return next(batch_iter), batch_iter


# Fail early with a clear message instead of dividing by zero later on.
for _, _, _loader in source_branches:
    if len(_loader) == 0:
        raise ValueError(
            "a source train DataLoader yields no batch "
            "(fewer samples than batch_size with drop_last=True?)"
        )

# An epoch is defined as one pass over source 4's training loader.
num_iter = len_src_4_train

for epoch in range(num_epochs):
    print("Epoch %d..." % epoch)
    for net in all_nets:
        net.train()

    src_train_avg_bce = 0
    mmd_loss = 0

    batch_iters = [iter(loader) for _, _, loader in source_branches]

    for i in range(num_iter):
        # ------------ fetch one batch per source ------------
        inputs = []
        labels = []
        for k, (_, _, loader) in enumerate(source_branches):
            (batch_x, batch_y), batch_iters[k] = _next_batch(batch_iters[k], loader)
            inputs.append(batch_x.float().to(device))
            labels.append(batch_y.float().to(device))

        optimizer.zero_grad()

        # ------------ forward ------------
        # LSTM -> Transfer Net (fully connected linear with output 100, per the
        # original notes) ->
        # SELECTED IN V2 - SELF ATTENTION -> TRANSFER NET (fully connected) - SETTING 2 ->
        # TaskNet (predict net fully connected linear output 1)
        embeddings_to_match = [
            transfer_net(base_net(batch_x).squeeze())
            for (base_net, transfer_net, _), batch_x in zip(source_branches, inputs)
        ]

        # All sources share TaskNet: run it once on the stacked embeddings and
        # split the result back into per-source chunks by their actual sizes.
        outputs = TaskNet(torch.cat(embeddings_to_match, dim=0))
        chunk_sizes = [emb.shape[0] for emb in embeddings_to_match]
        per_source_outputs = torch.split(outputs, chunk_sizes, dim=0)

        # ---- TASK LOSS - Binary Cross Entropy (BCE) ----
        task_loss = sum(
            task_criterion(chunk.squeeze(), batch_y)
            for chunk, batch_y in zip(per_source_outputs, labels)
        )

        # ---- TRANSFER LOSS - DAN ----
        # Sum of the DAN distance over every unordered pair of sources.
        # Starts from 0 so the logged MMD value is not offset by a constant.
        transfer_loss = sum(
            DAN(embeddings_to_match[a], embeddings_to_match[b])
            for a in range(len(embeddings_to_match) - 1)
            for b in range(a + 1, len(embeddings_to_match))
        )

        # ---- TOTAL LOSS ----
        total_loss = (transfer_weight * transfer_loss) + task_loss

        src_train_avg_bce += task_loss.item()
        mmd_loss += transfer_loss.item()

        # backward
        if backprop_total_loss:
            total_loss.backward()
        else:
            # Experiment setting: only the task loss drives the update.
            task_loss.backward()
        # gradient descent update step/adam step
        optimizer.step()
        print("batch %d/%d loss: %f" % (i + 1, num_iter, total_loss.item()))
    print("======================================================================================")

    src_train_avg_bce /= num_iter
    mmd_loss /= num_iter
    src_loss_list.append(src_train_avg_bce)
    # Logged with the same weighting as the total loss used during training.
    total_loss_list.append(src_train_avg_bce + transfer_weight * mmd_loss)

    print('Epoch: [{}/{}], Source train loss: {}, MMD loss: {}'.format(epoch+1, num_epochs, src_train_avg_bce, mmd_loss))


# ============== VALIDATION ==============
    #
    # BaseNet_1.train()
    # BaseNet_2.train()
    # BaseNet_3.train()
    # BaseNet_4.train()
    # TransferNet.eval()
    # TaskNet.eval()
    #
    # tgt_validate_avg_bce = 0
    # len_tgt_validate = len(src_1_validate_dataloader) + \
    #                     len(src_2_validate_dataloader) + \
    #                     len(src_3_validate_dataloader) + \
    #                     len(src_4_validate_dataloader)
    #
    # validation_data_concat = torch.cat((src_1_validate_dataloader,
    #                                     src_2_validate_dataloader,
    #                                     src_3_validate_dataloader,
    #                                     src_4_validate_dataloader), dim=0)
    #
    # for i, (val_data_x, val_data_y) in enumerate(validation_data_concat):
    #
    #         val_data_x, val_data_y = val_data_x.float().to(device), val_data_y.float().to(device)
    #
    #         features = TransferNet(BaseNet(tgt_data_x))
    #         tgt_output = TaskNet(features)
    #
    #         tgt_loss = task_criterion(tgt_output.squeeze(), tgt_data_y)
    #
    #         tgt_validate_avg_bce += tgt_loss.item()
    #
    #     tgt_validate_avg_bce /= len_tgt_validate
    #     tgt_val_loss_list.append(tgt_validate_avg_bce)
    #
    #     if tgt_validate_avg_bce < best_bce:
    #         best_bce = tgt_validate_avg_bce
    #
    #     print('Epoch: [{}/{}], Source train loss: {}, MMD loss: {}, tgt_best_validate_loss: {}'.format(epoch+1, num_epochs, src_train_avg_bce, mmd_loss, best_bce))
    #
    #


In [None]:

# Check accuracy on training & test to see how good our model
def check_accuracy(loader):
    """Run the BaseNet_2 -> TransferNet_2 -> TaskNet pipeline over ``loader``.

    Args:
        loader: iterable yielding ``(x, y)`` batches of tensors.

    Returns:
        ``(y_true, y_pred, y_pred_proba)`` where
        ``y_true`` is the list of labels (as floats) in loader order,
        ``y_pred`` is the list of hard 0/1 predictions (score > 0.5), and
        ``y_pred_proba`` is the flat list of predicted scores as Python
        floats, aligned element-wise with ``y_pred`` so it can be passed
        straight to metric functions.

    All networks are switched to eval mode for the duration of the call and
    are put back into whatever mode they were in before, even on error.
    """
    nets = (
        BaseNet_1, BaseNet_2, BaseNet_3, BaseNet_4,
        TransferNet_1, TransferNet_2, TransferNet_3, TransferNet_4,
        TaskNet,
    )
    previous_modes = [net.training for net in nets]

    y_true = []
    y_pred = []  # y hat
    y_pred_proba = []

    # Set model to eval
    for net in nets:
        net.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.float().to(device=device)
                y = y.float().to(device=device)

                # .cpu() keeps this working if `device` is ever not the CPU.
                y_true.extend(y.cpu().tolist())

                features = BaseNet_2(x).squeeze()
                # squeeze() also removes the batch axis when the batch holds a
                # single sample (e.g. a final partial batch); put it back.
                if x.shape[0] == 1 and features.dim() < 2:
                    features = features.unsqueeze(0)
                features = TransferNet_2(features)
                # One score per sample, flattened to 1-D.
                scores = TaskNet(features).reshape(-1).cpu()

                y_pred_proba.extend(scores.tolist())
                y_pred.extend((scores > 0.5).long().tolist())
    finally:
        # Restore every network's previous train/eval mode.
        for net, was_training in zip(nets, previous_modes):
            net.train(was_training)

    return y_true, y_pred, y_pred_proba

# Evaluate on the target test loader (tgt_test_dataloader is built from the
# site5 samples) and keep labels, hard predictions and scores for the metric
# cells below.
y_true, y_pred, y_pred_proba = check_accuracy(tgt_test_dataloader)


In [None]:
# Class distribution of predictions vs. ground truth as (values, counts).
# Both are printed explicitly: a notebook cell only displays the value of its
# last expression, so a bare first expression would be computed and discarded.
print("y_pred (values, counts):", np.unique(np.array(y_pred), return_counts=True))
print("y_true (values, counts):", np.unique(np.array(y_true), return_counts=True))

In [None]:
# Display the hard predictions as a NumPy array (notebook cell output).
np.asarray(y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, average_precision_score, roc_auc_score, accuracy_score

# print("ROC AUC SCORE", roc_auc_score(y_true, y_pred_proba))

# Support-weighted precision / recall / F1 over both classes.
metrics_lstm = precision_recall_fscore_support(y_true, y_pred, average='weighted') # y_true, y_hat

# Pin the label set so the matrix is always 2x2 (rows/cols ordered 0, 1);
# without it, data containing a single class yields a 1x1 matrix and the
# four-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel() # y_true, y_pred

# Specificity = TN / (TN + FP); undefined (NaN) when there are no negatives.
negatives = tn + fp
specificity = tn / negatives if negatives else float("nan")

print("precision", metrics_lstm[0])
print("recall", metrics_lstm[1])
print("F1 score", metrics_lstm[2])
print("specificity", specificity)
print("accuracy", accuracy_score(y_true, y_pred))