In [None]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

from imports import *
from utils import *
from constants import *
from models import *
from trains import train_model
from predicts import predict_model

# Restrict this process to the first physical GPU. CUDA_VISIBLE_DEVICES is only
# honoured if it is set before the CUDA runtime is initialised, so it has to be
# exported before the first torch.cuda call rather than after it.
# NOTE(review): if any of the star-imported project modules already touches
# CUDA at import time, this assignment must move above those imports — confirm.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# With the mask above, index 0 is the only visible device.
torch.cuda.set_device(0)

In [None]:
# Seed every random number generator used in this notebook with the same value
# so that a fresh "Restart & Run All" is repeatable.
for seed_fn in (torch.cuda.manual_seed_all, torch.manual_seed, random.seed, np.random.seed):
    seed_fn(239387)

In [None]:
# Read the three preprocessed splits; each JSON file is parsed into
# dataloaders[<split name>].
dataloaders = {}

for split_name in ('train', 'dev', 'test'):
    with open(f"./data/preprocessed_BB/{split_name}.json", "r") as split_file:
        dataloaders[split_name] = json.load(split_file)

In [None]:
# Number of samples in (train, dev, test)
tuple(len(dataloaders[split_name]) for split_name in ('train', 'dev', 'test'))

In [None]:
# Count how many samples of each label every split contains.
label = {'train': {}, 'dev': {}, 'test': {}}

for phase in ['train', 'dev', 'test']:
    for dataloader in dataloaders[phase]:
        # An unseen label starts from 0 and is incremented right away, so the
        # first occurrence is counted as well (previously it was stored as 0,
        # leaving every count one short).
        label[phase][dataloader['label']] = label[phase].get(dataloader['label'], 0) + 1

print("Data statistic:")
print(f"  - Train: {label['train']}")
print(f"  - Dev: {label['dev']}")
print(f"  - Test: {label['test']}")

## Preprocessing Data

In [None]:
# Token frequencies accumulated over all three splits
vocabs = {}
for split_name in ('train', 'dev', 'test'):
    for sample in dataloaders[split_name]:
        for token in sample['full_inputs']['full_token']:
            vocabs[token] = vocabs.get(token, 0) + 1

# Turn the mapping into a list of (token, count) pairs, most frequent first
vocabs = sorted(vocabs.items(), key=lambda entry: entry[1], reverse=True)

In [None]:
# Load the pretrained word2vec vectors from W2V_MODEL_PATH; binary=True means
# the file is read in the binary word2vec format.
w2v_model = word2vec.KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=True)

In [None]:
# Largest absolute relative distance over every sample of every split.
# NOTE(review): `+` joins full_dist1 and full_dist2 end-to-end when they are
# plain lists (as json.load produces) — confirm nothing converts them to arrays.
all_samples = dataloaders['train'] + dataloaders['dev'] + dataloaders['test']
per_sample_max = [
    np.max(np.abs(sample['full_inputs']['full_dist1'] + sample['full_inputs']['full_dist2']))
    for sample in all_samples
]
max_distance = float(np.max(per_sample_max))

# Index mappings for the model inputs
(word_to_ix,
 pos_to_ix,
 distance_to_ix,
 dependency_to_ix,
 char_to_ix,
 in_vocab_count) = build_vocab(dataloaders, w2v_model, vocabs)

# Embedding matrices for words and for relative distances
pretrained_embedding_matrix, distance_pretrain_embedding_matrix = build_pretrain_embedding_matrix(
    w2v_model,
    word_to_ix,
    distance_to_ix,
    max_distance,
)

In [None]:
# Longest sequence found in any of the four per-token inputs, over all splits
sequence_keys = ('full_token', 'full_pos', 'full_dist1', 'full_dist2')
glob_max_sentence_length = np.max([
    np.max([len(sample['full_inputs'][key]) for key in sequence_keys])
    for sample in dataloaders['train'] + dataloaders['dev'] + dataloaders['test']
])

In [None]:
# Batch size shared by model construction, training and prediction
batch_size = 4

model = Frankenstein(
    len(word_to_ix),
    len(pos_to_ix),
    len(distance_to_ix),
    glob_max_sentence_length,
    pretrained_embedding_matrix,
    distance_pretrain_embedding_matrix,
    batch_size,
    drop=0.5,
    hidden_dim=64,
    h=2,
    multihead_sizes=3,
)

criterion = nn.CrossEntropyLoss()

# Hand only the parameters that still require gradients to the optimizer
optimizer_ft = optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=1e-3,
)

## Using ELMo and BERT models

In [None]:
# ELMo
# NOTE(review): the positional 1 is presumably the number of output
# representations — confirm against the Elmo constructor.
elmo_model = Elmo(ELMO_OPTIONS_PATH, ELMO_MODEL_PATH, 1, dropout=0)

# BERT
finetune_berts = []
all_dataloaders = dataloaders['train'] + dataloaders['dev'] + dataloaders['test']
# The features file is read as a binary stream of JSON lines; record number
# idx is attached to sample number idx of train + dev + test, in that order.
with open(BERT_FEATURES_PATH, 'rb') as f:
    for idx, item in enumerate(json_lines.reader(f)):
        # Element-wise sum over the layers stored for the first feature entry
        layers = item['features'][0]['layers']
        layer_values = [np.array(layer['values']) for layer in layers]
        all_dataloaders[idx]['bert_features'] = np.sum(layer_values, axis=0)

## Training

In [None]:
# Move the model to the GPU (device 0 was selected in the setup cell); as the
# last expression of the cell, the returned model is also displayed.
model.cuda()

In [None]:
# Train for up to 100 epochs with an early-stopping patience of 10
# (both values are passed straight to train_model).
train_out = train_model(
    model,
    elmo_model,
    dataloaders['train'],
    dataloaders['dev'],
    word_to_ix,
    pos_to_ix,
    distance_to_ix,
    criterion,
    optimizer_ft,
    num_epochs=100,
    early_stopped_patience=10,
    batch_size=batch_size,
)

model, train_f1, val_f1, history = train_out

## Predicting and Generating submission (.a2) files

In [None]:
# Run the trained model on the test split
y_true, y_pred, test_dataloader = predict_model(
    model,
    elmo_model,
    dataloaders['test'],
    word_to_ix,
    pos_to_ix,
    distance_to_ix,
    batch_size,
    optimizer_ft,
)

In [None]:
# Sanity check: number of predictions next to the number of test samples
tuple(map(len, (y_pred, dataloaders['test'])))

In [None]:
# Create output path. makedirs(exist_ok=True) replaces the check-then-mkdir
# pair: it is a no-op when the directory already exists, has no window between
# the check and the creation, and also creates missing parent directories.
os.makedirs(OUTPUT_DIR_PATH, exist_ok=True)

In [None]:
# Sub-directory of OUTPUT_DIR_PATH that collects the .a2 files of this run
model_dir_name = "test_prediction"

# No-op when the directory already exists; missing parents are created too.
os.makedirs(f'{OUTPUT_DIR_PATH}/{model_dir_name}', exist_ok=True)

In [None]:
# Alias for the samples returned by predict_model; the submission cell below
# indexes y_pred by position in this sequence.
test_data = test_dataloader

In [None]:
# Collect the origId attribute of every <document> element in the test XML
file = minidom.parse("./data/BioNLP-ST-2016_BB-event_test.xml")
docs = file.getElementsByTagName("document")
all_test_files = [doc.getAttribute("origId") for doc in docs]

In [None]:
# Collect the predicted Lives_In relations per document and write the
# submission: one .a2 file per test document, empty when nothing was predicted.
submission_dir = f"{OUTPUT_DIR_PATH}/{model_dir_name}"
# NOTE(review): the "." between the two groups is unescaped, so it matches any
# single character rather than only a literal dot — confirm the origId format.
entity_id_pattern = re.compile(r'(BB-event-\d+).(T\d+)')

write_dict = {}          # .a2 path -> relation lines, in relation-number order
relation_idx_dict = {}   # document id -> number of relations emitted so far
pred_test_files = set()  # documents that received at least one relation

for idx, input_dict in enumerate(test_data):
    # Only positive predictions are turned into relations.
    if y_pred[idx] == 1:
        document_idx = input_dict['document_id']
        pred_test_files.add(document_idx)

        entity_tag = input_dict['entity_pair']
        entity_idx_to_origId = input_dict['entity_idx_to_origId']
        first_match = entity_id_pattern.match(entity_idx_to_origId[entity_tag[0]])
        second_match = entity_id_pattern.match(entity_idx_to_origId[entity_tag[1]])
        if first_match is None or second_match is None:
            # Fail with a message that names the sample instead of an opaque
            # AttributeError on None.
            raise ValueError(
                f"Unexpected entity origId for sample {idx} in document {document_idx}"
            )

        first_entity = first_match.group(2).upper()
        second_entity = second_match.group(2).upper()

        # Relations are numbered R1, R2, ... separately for each document.
        relation_idx_dict[document_idx] = relation_idx_dict.get(document_idx, 0) + 1

        # Lines are appended in increasing relation number, so each list is
        # already in the order the file is written in.
        write_dict.setdefault(f"{submission_dir}/{document_idx}.a2", []).append(
            f"R{relation_idx_dict[document_idx]}\tLives_In Bacteria:{first_entity} Location:{second_entity}\n"
        )

for key, value in write_dict.items():
    with open(key, "w") as a2_file:
        a2_file.write("".join(value))

# Every remaining test document still needs a file. Opening with "w" guarantees
# it is empty even if a file from an earlier run is still on disk.
for test_file in all_test_files:
    if test_file not in pred_test_files:
        with open(f"{submission_dir}/{test_file}.a2", "w"):
            pass

In [None]:
# Zip the prediction folder; the archive is written next to it as
# <folder>.zip and its path is shown as the cell output.
submission_dir = f"{OUTPUT_DIR_PATH}/{model_dir_name}"
shutil.make_archive(submission_dir, 'zip', submission_dir)

In [None]:
# Remove the folder with the individual .a2 files (the previous cell has
# already archived it); raises if the folder does not exist.
shutil.rmtree(f"{OUTPUT_DIR_PATH}/{model_dir_name}")