In [1]:
import torch
import torch.optim as optim
from torchinfo import summary

from KSI_models import KSI, ModifiedKSI, LSTMattn
from KSI_utils import load_KSI_data, train_model, test_model

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Model dimensions handed to the KSI / LSTMattn constructors.
n_embedding = 100
n_hidden = 100  # 200 in paper, but too intensive for my machine

# Training schedule: mini-batch size and number of passes over the training set.
batch_size, n_epochs = 32, 25

# Run switches: `save` gates the torch.save calls, `profile` is forwarded to
# train_model and gates printing of the profiler tables.
save, profile = True, False

# Prefix used for the log directories and the saved model file names.
model_type = 'LSTMattn'

In [3]:
# Dataset variant used by the baseline and the original KSI runs.
dir = 'data/original/'
loaders, wikivec, word_to_ix = load_KSI_data(
    dir=dir,
    batch_size=batch_size,
    train=True,
    val=True,
    test=True,
    device=DEVICE,
)
# Pull the three splits out of the returned mapping.
train_dataloader, val_dataloader, test_dataloader = (
    loaders['train'],
    loaders['val'],
    loaders['test'],
)

# Sizes needed to construct the models: rows/columns of the wiki matrix and the
# number of entries in the word index.
n_wiki, n_vocab = wikivec.shape
n_words = len(word_to_ix)

In [4]:
# Disabled one-off computation of the average note length over the training
# loader, kept for reference. Re-enabling it also requires `import numpy as np`,
# which the import cell does not provide.
# note_lengths = []
# for data in train_dataloader:
#     n, _, _ = data
#     note_lengths.append(n.shape[1])
# avg_note_size = np.round(np.array(note_lengths).mean()).astype(int)

# Hard-coded note length used as the sequence dimension of the dummy inputs
# passed to torchinfo's `summary`. Presumably the value the snippet above
# produced for 'data/original/' -- TODO confirm.
avg_note_size = 2455

In [5]:
# Baseline: LSTMattn built without a KSI component, moved to the target device.
base_model = LSTMattn(n_words, n_wiki, n_embedding, n_hidden, batch_size).to(DEVICE)

# Layer/parameter summary for one batch of integer note tokens and one batch of
# float vocabulary-sized vectors.
base_summary = summary(
    base_model,
    [(batch_size, avg_note_size), (batch_size, n_vocab)],
    dtypes=[torch.int, torch.float],
)

base_summary

Layer (type:depth-idx)                   Output Shape              Param #
LSTMattn                                 --                        --
├─Embedding: 1-1                         [32, 2455, 100]           4,796,200
├─Dropout: 1-2                           [2455, 32, 100]           --
├─LSTM: 1-3                              [2455, 32, 100]           80,800
├─Linear: 1-4                            --                        34,400
├─Linear: 1-5                            --                        34,744
Total params: 4,946,144
Trainable params: 4,946,144
Non-trainable params: 0
Total mult-adds (G): 6.50
Input size (MB): 1.87
Forward/backward pass size (MB): 125.70
Params size (MB): 19.78
Estimated Total Size (MB): 147.35

In [6]:
# Adam with default hyper-parameters, paired with a one-cycle schedule sized
# for n_epochs * len(train_dataloader) steps and peaking at a rate of 0.01.
optimizer = optim.Adam(base_model.parameters())
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    steps_per_epoch=len(train_dataloader),
    epochs=n_epochs,
)

# Train the baseline; the return value is kept so the profiler results can be
# printed afterwards when `profile` is enabled.
prof_base = train_model(
    base_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    n_epochs=n_epochs,
    profile=profile,
    log_path=f'./log/{model_type}',
    device=DEVICE,
    init_hidden=True,
)

Epoch: 001, Train Recall@10: 0.4406, Val Recall@10: 0.4461, Train Micro F1: 0.0000, Val Micro F1: 0.0000, Train Macro F1: 0.0000, Val Macro F1: 0.0000, Train Micro AUC: 0.9250, Val Micro AUC: 0.9104, Train Macro AUC: 0.5820, Val Macro AUC: 0.5881
Epoch: 002, Train Recall@10: 0.5104, Val Recall@10: 0.5133, Train Micro F1: 0.3573, Val Micro F1: 0.3602, Train Macro F1: 0.0168, Val Macro F1: 0.0205, Train Micro AUC: 0.9350, Val Micro AUC: 0.9214, Train Macro AUC: 0.6035, Val Macro AUC: 0.6123
Epoch: 003, Train Recall@10: 0.6852, Val Recall@10: 0.6824, Train Micro F1: 0.5851, Val Micro F1: 0.5820, Train Macro F1: 0.0612, Val Macro F1: 0.0733, Train Micro AUC: 0.9579, Val Micro AUC: 0.9486, Train Macro AUC: 0.6929, Val Macro AUC: 0.7063
Epoch: 004, Train Recall@10: 0.7698, Val Recall@10: 0.7623, Train Micro F1: 0.6401, Val Micro F1: 0.6338, Train Macro F1: 0.1198, Val Macro F1: 0.1440, Train Micro AUC: 0.9720, Val Micro AUC: 0.9653, Train Macro AUC: 0.7649, Val Macro AUC: 0.7748
Epoch: 005, 

In [7]:
# Persist the trained baseline inside the dataset directory when saving is on.
if save:
    torch.save(
        base_model,
        f'{dir}{model_type}_model.pt',
    )

# Print the profiler table only when profiling was requested for training.
if profile:
    print(
        prof_base
        .key_averages(group_by_stack_n=5)
        .table(sort_by='self_cuda_time_total')
    )

In [8]:
# Evaluate the trained baseline on the test split. `by_label=False` here, so
# per-label results are not requested for the baseline (the KSI evaluations
# pass by_label=True).
(tt_recall_at_k, tt_micro_f1, tt_macro_f1,
 tt_micro_auc, tt_macro_auc, label_aucs_base) = test_model(base_model,
                                                           test_dataloader,
                                                           wikivec,
                                                           by_label=False,
                                                           device=DEVICE,
                                                           init_hidden=True)
print(f'Test Recall@10: {tt_recall_at_k:.4f}, Test Micro F1: {tt_micro_f1:.4f}, Test Macro F1: {tt_macro_f1:.4f}' +
      f', Test Micro AUC: {tt_micro_auc:.4f}, Test Macro AUC: {tt_macro_auc:.4f}')

# Drop the notebook's reference to the baseline before the next model is built.
del base_model
# DEVICE is a torch.device, which never compares equal to a plain string, so
# the previous `DEVICE == 'cuda'` check was always False and the CUDA cache was
# never released. Compare the device *type* instead.
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()

Test Recall@10: 0.8179, Test Micro F1: 0.6785, Test Macro F1: 0.2844, Test Micro AUC: 0.9788, Test Macro AUC: 0.8617


In [9]:
# Original KSI component, attached to a fresh LSTMattn via the `ksi` argument.
ksi = KSI(n_embedding, n_vocab)
ksi.to(DEVICE)
model = LSTMattn(n_words, n_wiki, n_embedding, n_hidden, ksi=ksi).to(DEVICE)

# Same dummy inputs as the baseline summary plus a float wiki matrix of shape
# (n_wiki, n_vocab).
ksi_summary = summary(
    model,
    [
        (batch_size, avg_note_size),
        (batch_size, n_vocab),
        (n_wiki, n_vocab),
    ],
    dtypes=[torch.int, torch.float, torch.float],
)

ksi_summary

Layer (type:depth-idx)                   Output Shape              Param #
LSTMattn                                 --                        --
├─KSI: 1-1                               --                        --
│    └─Linear: 2-1                       --                        (recursive)
│    └─Linear: 2-2                       --                        (recursive)
│    └─Linear: 2-3                       --                        (recursive)
├─Embedding: 1-2                         [32, 2455, 100]           4,796,200
├─Dropout: 1-3                           [2455, 32, 100]           --
├─LSTM: 1-4                              [2455, 32, 100]           80,800
├─KSI: 1-1                               --                        --
│    └─Linear: 2-4                       [32, 344, 100]            1,217,400
│    └─Linear: 2-5                       [32, 344, 100]            10,100
│    └─Linear: 2-6                       [32, 344, 1]              101
Total params: 6,104,601
Trainable p

In [10]:
# Fresh optimizer and one-cycle schedule for the KSI model (same settings as
# the baseline run).
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    steps_per_epoch=len(train_dataloader),
    epochs=n_epochs,
)

# Unlike the baseline call, the wiki matrix is passed to train_model here.
prof_ksi = train_model(
    model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    wikivec=wikivec,
    optimizer=optimizer,
    scheduler=scheduler,
    n_epochs=n_epochs,
    profile=profile,
    log_path=f'./log/{model_type}_KSI',
    device=DEVICE,
    init_hidden=True,
)

Epoch: 001, Train Recall@10: 0.6381, Val Recall@10: 0.6367, Train Micro F1: 0.3920, Val Micro F1: 0.3868, Train Macro F1: 0.0458, Val Macro F1: 0.0537, Train Micro AUC: 0.9640, Val Micro AUC: 0.9561, Train Macro AUC: 0.8051, Val Macro AUC: 0.8029
Epoch: 002, Train Recall@10: 0.6889, Val Recall@10: 0.6802, Train Micro F1: 0.4564, Val Micro F1: 0.4487, Train Macro F1: 0.0823, Val Macro F1: 0.0936, Train Micro AUC: 0.9699, Val Micro AUC: 0.9615, Train Macro AUC: 0.8565, Val Macro AUC: 0.8349
Epoch: 003, Train Recall@10: 0.7583, Val Recall@10: 0.7427, Train Micro F1: 0.5575, Val Micro F1: 0.5412, Train Macro F1: 0.1407, Val Macro F1: 0.1457, Train Micro AUC: 0.9770, Val Micro AUC: 0.9685, Train Macro AUC: 0.8871, Val Macro AUC: 0.8555
Epoch: 004, Train Recall@10: 0.7891, Val Recall@10: 0.7700, Train Micro F1: 0.6172, Val Micro F1: 0.5954, Train Macro F1: 0.1966, Val Macro F1: 0.1939, Train Micro AUC: 0.9810, Val Micro AUC: 0.9721, Train Macro AUC: 0.9136, Val Macro AUC: 0.8664
Epoch: 005, 

In [11]:
# Persist the trained KSI model inside the dataset directory when saving is on.
if save:
    torch.save(
        model,
        f'{dir}{model_type}_KSI_model.pt',
    )

# Print the profiler table only when profiling was requested for training.
if profile:
    print(
        prof_ksi
        .key_averages(group_by_stack_n=5)
        .table(sort_by='self_cuda_time_total')
    )

In [12]:
# Evaluate the trained KSI model on the test split, requesting per-label
# results (by_label=True); the last returned value is kept as label_aucs_ksi.
(tt_recall_at_k, tt_micro_f1, tt_macro_f1,
 tt_micro_auc, tt_macro_auc, label_aucs_ksi) = test_model(model,
                                                          test_dataloader,
                                                          wikivec,
                                                          by_label=True,
                                                          device=DEVICE,
                                                          init_hidden=True)
print(f'Test Recall@10: {tt_recall_at_k:.4f}, Test Micro F1: {tt_micro_f1:.4f}, Test Macro F1: {tt_macro_f1:.4f}' +
      f', Test Micro AUC: {tt_micro_auc:.4f}, Test Macro AUC: {tt_macro_auc:.4f}')

# Drop the notebook's reference to the KSI model before the next one is built.
del model
# DEVICE is a torch.device, which never compares equal to a plain string, so
# the previous `DEVICE == 'cuda'` check was always False and the CUDA cache was
# never released. Compare the device *type* instead.
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()

Test Recall@10: 0.7960, Test Micro F1: 0.6460, Test Macro F1: 0.2915, Test Micro AUC: 0.9765, Test Macro AUC: 0.8754


In [13]:
# Modified KSI, first variant: frequency vectors instead of binary vectors.
# Reloading rebinds the loaders, wiki matrix, word index and derived sizes.
dir = 'data/original_freqs/'
loaders, wikivec, word_to_ix = load_KSI_data(
    dir=dir,
    batch_size=batch_size,
    train=True,
    val=True,
    test=True,
    device=DEVICE,
)
train_dataloader, val_dataloader, test_dataloader = (
    loaders['train'],
    loaders['val'],
    loaders['test'],
)

n_wiki, n_vocab = wikivec.shape
n_words = len(word_to_ix)

In [14]:
# ModifiedKSI component for the frequency-vector data, attached to a fresh
# LSTMattn via the `ksi` argument.
mod_ksi = ModifiedKSI(n_embedding, n_vocab)
mod_ksi.to(DEVICE)
mod_model = LSTMattn(n_words, n_wiki, n_embedding, n_hidden, ksi=mod_ksi).to(DEVICE)

# Dummy inputs: integer note tokens, float vocabulary vectors, float wiki matrix.
mod_summary = summary(
    mod_model,
    [
        (batch_size, avg_note_size),
        (batch_size, n_vocab),
        (n_wiki, n_vocab),
    ],
    dtypes=[torch.int, torch.float, torch.float],
)

mod_summary

Layer (type:depth-idx)                   Output Shape              Param #
LSTMattn                                 --                        --
├─ModifiedKSI: 1-1                       --                        --
│    └─Linear: 2-1                       --                        (recursive)
│    └─Linear: 2-2                       --                        (recursive)
│    └─Linear: 2-3                       --                        (recursive)
│    └─Linear: 2-4                       --                        (recursive)
├─Embedding: 1-2                         [32, 2455, 100]           4,796,200
├─Dropout: 1-3                           [2455, 32, 100]           --
├─LSTM: 1-4                              [2455, 32, 100]           80,800
├─ModifiedKSI: 1-1                       --                        --
│    └─Linear: 2-5                       [32, 344, 12173, 1]       2
│    └─Linear: 2-6                       [32, 344, 100]            1,217,400
│    └─Linear: 2-7              

In [15]:
# Fresh optimizer and one-cycle schedule for the frequency-vector ModifiedKSI
# model (same settings as the earlier runs).
optimizer = optim.Adam(mod_model.parameters())
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    steps_per_epoch=len(train_dataloader),
    epochs=n_epochs,
)

prof_mod_ksi = train_model(
    mod_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    wikivec=wikivec,
    optimizer=optimizer,
    scheduler=scheduler,
    n_epochs=n_epochs,
    profile=profile,
    log_path=f'./log/{model_type}_ModifiedKSI',
    device=DEVICE,
    init_hidden=True,
)

Epoch: 001, Train Recall@10: 0.6451, Val Recall@10: 0.6446, Train Micro F1: 0.4168, Val Micro F1: 0.4139, Train Macro F1: 0.0290, Val Macro F1: 0.0347, Train Micro AUC: 0.9630, Val Micro AUC: 0.9558, Train Macro AUC: 0.8041, Val Macro AUC: 0.8172
Epoch: 002, Train Recall@10: 0.7118, Val Recall@10: 0.7086, Train Micro F1: 0.4962, Val Micro F1: 0.4906, Train Macro F1: 0.0999, Val Macro F1: 0.1220, Train Micro AUC: 0.9734, Val Micro AUC: 0.9673, Train Macro AUC: 0.8594, Val Macro AUC: 0.8609
Epoch: 003, Train Recall@10: 0.7361, Val Recall@10: 0.7312, Train Micro F1: 0.5287, Val Micro F1: 0.5227, Train Macro F1: 0.1250, Val Macro F1: 0.1406, Train Micro AUC: 0.9763, Val Micro AUC: 0.9704, Train Macro AUC: 0.8742, Val Macro AUC: 0.8682
Epoch: 004, Train Recall@10: 0.7839, Val Recall@10: 0.7763, Train Micro F1: 0.6059, Val Micro F1: 0.5961, Train Macro F1: 0.1624, Val Macro F1: 0.1763, Train Micro AUC: 0.9809, Val Micro AUC: 0.9753, Train Macro AUC: 0.8964, Val Macro AUC: 0.8869
Epoch: 005, 

In [16]:
# Persist the trained frequency-vector model inside its dataset directory when
# saving is on.
if save:
    torch.save(
        mod_model,
        f'{dir}{model_type}_ModifiedKSI_model.pt',
    )

# Print the profiler table only when profiling was requested for training.
if profile:
    print(
        prof_mod_ksi
        .key_averages(group_by_stack_n=5)
        .table(sort_by='self_cuda_time_total')
    )

In [17]:
# Evaluate the frequency-vector ModifiedKSI model on the test split, requesting
# per-label results (by_label=True); the last returned value is kept as
# label_aucs_mod.
(tt_recall_at_k, tt_micro_f1, tt_macro_f1,
 tt_micro_auc, tt_macro_auc, label_aucs_mod) = test_model(mod_model,
                                                          test_dataloader,
                                                          wikivec,
                                                          by_label=True,
                                                          device=DEVICE,
                                                          init_hidden=True)
print(f'Test Recall@10: {tt_recall_at_k:.4f}, Test Micro F1: {tt_micro_f1:.4f}, Test Macro F1: {tt_macro_f1:.4f}' +
      f', Test Micro AUC: {tt_micro_auc:.4f}, Test Macro AUC: {tt_macro_auc:.4f}')

# Drop the notebook's reference to this model before the next one is built.
del mod_model
# DEVICE is a torch.device, which never compares equal to a plain string, so
# the previous `DEVICE == 'cuda'` check was always False and the CUDA cache was
# never released. Compare the device *type* instead.
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()

Test Recall@10: 0.7939, Test Micro F1: 0.6528, Test Macro F1: 0.3120, Test Micro AUC: 0.9763, Test Macro AUC: 0.8784


In [18]:
# Modified KSI, second variant: tf-idf vectors instead of binary vectors.
# Reloading rebinds the loaders, wiki matrix, word index and derived sizes.
dir = 'data/original_tfidf/'
loaders, wikivec, word_to_ix = load_KSI_data(
    dir=dir,
    batch_size=batch_size,
    train=True,
    val=True,
    test=True,
    device=DEVICE,
)
train_dataloader, val_dataloader, test_dataloader = (
    loaders['train'],
    loaders['val'],
    loaders['test'],
)

n_wiki, n_vocab = wikivec.shape
n_words = len(word_to_ix)

In [19]:
# ModifiedKSI component for the tf-idf data, attached to a fresh LSTMattn via
# the `ksi` argument.
mod_ksi2 = ModifiedKSI(n_embedding, n_vocab)
mod_ksi2.to(DEVICE)
tfidf_model = LSTMattn(n_words, n_wiki, n_embedding, n_hidden, ksi=mod_ksi2).to(DEVICE)

# Dummy inputs: integer note tokens, float vocabulary vectors, float wiki matrix.
tfidf_summary = summary(
    tfidf_model,
    [
        (batch_size, avg_note_size),
        (batch_size, n_vocab),
        (n_wiki, n_vocab),
    ],
    dtypes=[torch.int, torch.float, torch.float],
)

tfidf_summary

Layer (type:depth-idx)                   Output Shape              Param #
LSTMattn                                 --                        --
├─ModifiedKSI: 1-1                       --                        --
│    └─Linear: 2-1                       --                        (recursive)
│    └─Linear: 2-2                       --                        (recursive)
│    └─Linear: 2-3                       --                        (recursive)
│    └─Linear: 2-4                       --                        (recursive)
├─Embedding: 1-2                         [32, 2455, 100]           4,796,200
├─Dropout: 1-3                           [2455, 32, 100]           --
├─LSTM: 1-4                              [2455, 32, 100]           80,800
├─ModifiedKSI: 1-1                       --                        --
│    └─Linear: 2-5                       [32, 344, 12173, 1]       2
│    └─Linear: 2-6                       [32, 344, 100]            1,217,400
│    └─Linear: 2-7              

In [20]:
# Fresh optimizer and one-cycle schedule for the tf-idf ModifiedKSI model
# (same settings as the earlier runs).
optimizer = optim.Adam(tfidf_model.parameters())
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    steps_per_epoch=len(train_dataloader),
    epochs=n_epochs,
)

prof_tfidf_ksi = train_model(
    tfidf_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    wikivec=wikivec,
    optimizer=optimizer,
    scheduler=scheduler,
    n_epochs=n_epochs,
    profile=profile,
    log_path=f'./log/{model_type}_ModifiedKSI_tfidf',
    device=DEVICE,
    init_hidden=True,
)

Epoch: 001, Train Recall@10: 0.6261, Val Recall@10: 0.6211, Train Micro F1: 0.4247, Val Micro F1: 0.4219, Train Macro F1: 0.0331, Val Macro F1: 0.0400, Train Micro AUC: 0.9602, Val Micro AUC: 0.9521, Train Macro AUC: 0.7728, Val Macro AUC: 0.7855
Epoch: 002, Train Recall@10: 0.6808, Val Recall@10: 0.6763, Train Micro F1: 0.4540, Val Micro F1: 0.4503, Train Macro F1: 0.0784, Val Macro F1: 0.0955, Train Micro AUC: 0.9690, Val Micro AUC: 0.9622, Train Macro AUC: 0.8316, Val Macro AUC: 0.8377
Epoch: 003, Train Recall@10: 0.7122, Val Recall@10: 0.7073, Train Micro F1: 0.5043, Val Micro F1: 0.5007, Train Macro F1: 0.0989, Val Macro F1: 0.1098, Train Micro AUC: 0.9734, Val Micro AUC: 0.9672, Train Macro AUC: 0.8607, Val Macro AUC: 0.8574
Epoch: 004, Train Recall@10: 0.7786, Val Recall@10: 0.7732, Train Micro F1: 0.6039, Val Micro F1: 0.5953, Train Macro F1: 0.1495, Val Macro F1: 0.1613, Train Micro AUC: 0.9806, Val Micro AUC: 0.9753, Train Macro AUC: 0.8914, Val Macro AUC: 0.8865
Epoch: 005, 

In [21]:
# Persist the trained tf-idf model inside its dataset directory when saving is on.
if save:
    torch.save(
        tfidf_model,
        f'{dir}{model_type}_ModifiedKSI_tfidf_model.pt',
    )

# Print the profiler table only when profiling was requested for training.
if profile:
    print(
        prof_tfidf_ksi
        .key_averages(group_by_stack_n=5)
        .table(sort_by='self_cuda_time_total')
    )

In [22]:
# Evaluate the tf-idf ModifiedKSI model on the test split, requesting per-label
# results (by_label=True).
# NOTE(review): the last returned value is bound to `label_aucs_mod`, the same
# name the frequency-vector evaluation uses, so that earlier result is
# overwritten here. A distinct name (e.g. `label_aucs_tfidf`) is probably
# intended -- confirm against any later cells that read `label_aucs_mod`.
(tt_recall_at_k, tt_micro_f1, tt_macro_f1,
 tt_micro_auc, tt_macro_auc, label_aucs_mod) = test_model(tfidf_model,
                                                          test_dataloader,
                                                          wikivec,
                                                          by_label=True,
                                                          device=DEVICE,
                                                          init_hidden=True)
print(f'Test Recall@10: {tt_recall_at_k:.4f}, Test Micro F1: {tt_micro_f1:.4f}, Test Macro F1: {tt_macro_f1:.4f}' +
      f', Test Micro AUC: {tt_micro_auc:.4f}, Test Macro AUC: {tt_macro_auc:.4f}')

# Drop the notebook's reference to this model so its memory can be reclaimed.
del tfidf_model
# DEVICE is a torch.device, which never compares equal to a plain string, so
# the previous `DEVICE == 'cuda'` check was always False and the CUDA cache was
# never released. Compare the device *type* instead.
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()

Test Recall@10: 0.7925, Test Micro F1: 0.6491, Test Macro F1: 0.3164, Test Micro AUC: 0.9751, Test Macro AUC: 0.8693
