In [None]:
import random
import numpy as np
import torch
import re
import glob
import io
import time
# mean() is used for the average text length shown in the histogram titles.
from statistics import mean
import os
import json
import gensim
from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import mpld3

from transformers import BertTokenizer, BertModel

from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import Vectors

%matplotlib inline
mpld3.enable_notebook()

#torch.cuda.set_device(1)

In [None]:
# ---- Experiment configuration -------------------------------------------------
# Seed shared by the python, numpy and torch RNGs (seeded just below).
SEED = 77
BATCH_SIZE = 64
# Upper bound on the number of tokens kept per text by tokenize_and_cut.
MAX_SEQUENCE = 2048
# Passed as max_size when the text vocabularies are built.
MAX_VOCAB_SIZE = 25000
# NOTE(review): HIDDEN_DIM is not referenced by the models visible in this notebook.
HIDDEN_DIM = 100
# One output logit per diagnosis label (five labels are concatenated in train/evaluate/test).
OUTPUT_DIM = 5
EMBEDDING_DIM = 100
N_EPOCHS = 500
# Fraction of the full training data kept for training; the rest becomes the validation split.
TRAIN_RATIO = 0.8
# NOTE(review): presumably per-label positive weights for the loss, but the visible
# criterion is created without pos_weight - confirm whether this is still needed.
POS_WEIGHT = torch.tensor([1, 7, 8, 4, 9])
# Dictionary keys under which the aggregated scores are stored by the metric helpers.
MICRO = 'micro'
MACRO = 'macro'

# Seed every RNG in use so that a fresh "Restart & Run All" is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Folder that train_epoch writes checkpoints to and test reads them from.
DATA_FOLDER="FastText-Multi"

# Dropout probability used by both FastText model classes.
DROPOUT=0.5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Utilities

In [None]:
def count_parameters(model):
    """Return the total number of scalar entries in the trainable parameters of ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers; fractional seconds are truncated.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [None]:
def f_measure(predictions, labels):
    """Compute per-label, micro and macro precision / recall / F1 for one batch.

    Args:
        predictions: 2-D tensor of raw logits; a sigmoid is applied and the result is
            rounded to obtain 0/1 decisions.
        labels: 2-D tensor of gold 0/1 values with the same layout as ``predictions``.

    Returns:
        (diagnoses, predicts) where
        * ``diagnoses`` maps each label index that produced at least one tp/fp/fn, plus
          ``'micro'``, to a dict holding the counts that occurred (``'tp'``, ``'fp'``,
          ``'fn'``) and the scores ``'p'``, ``'r'``, ``'f'``. ``diagnoses['macro']`` holds
          the unweighted mean of the per-label scores, or the string ``'n/a'`` for each
          score when no label produced any count.
        * ``predicts`` is the rounded prediction matrix as nested python lists.
    """
    MICRO = 'micro'
    MACRO = 'macro'
    diagnoses = {MICRO: {}}

    rounded_preds = torch.round(torch.sigmoid(predictions))
    # Convert once to python lists instead of calling .item() for every element.
    predicts = rounded_preds.data.tolist()
    gold = labels.tolist()

    for pred_row, gold_row in zip(predicts, gold):
        for did, (pred, truth) in enumerate(zip(pred_row, gold_row)):
            if pred == 1:
                outcome = 'tp' if pred == truth else 'fp'
            elif pred == 0 and truth == 1:
                outcome = 'fn'
            else:
                # True negatives (and non 0/1 values such as NaN) are not counted here.
                continue
            for key in (did, MICRO):
                bucket = diagnoses.setdefault(key, {})
                bucket[outcome] = bucket.get(outcome, 0) + 1

    macro_sums = {'p': 0.0, 'r': 0.0, 'f': 0.0}
    n_classes = 0
    for key, counts in diagnoses.items():
        tp = counts.get('tp', 0)
        fp = counts.get('fp', 0)
        fn = counts.get('fn', 0)
        # An empty denominator yields a score of 0.0 rather than an exception.
        counts['p'] = tp / (tp + fp) if tp + fp else 0.0
        counts['r'] = tp / (tp + fn) if tp + fn else 0.0
        if counts['p'] and counts['r']:
            counts['f'] = 2 / (1 / counts['p'] + 1 / counts['r'])
        else:
            counts['f'] = 0.0
        # Keys are compared by value (==), not identity: label keys are ints, the
        # aggregate keys are strings.
        if key != MICRO:
            n_classes += 1
            for score in macro_sums:
                macro_sums[score] += counts[score]

    if n_classes > 0:
        diagnoses[MACRO] = {score: total / float(n_classes) for score, total in macro_sums.items()}
    else:
        diagnoses[MACRO] = {score: 'n/a' for score in macro_sums}
    return diagnoses, predicts

In [None]:
def update_fscores(new, overall):
    """Fold one batch of counts into the running totals and recompute all scores.

    Args:
        new: score dictionary for one batch, in the layout returned by ``f_measure``
            (label index -> counts, plus ``'micro'`` and ``'macro'`` entries).
        overall: running totals in the same layout; pass ``{}`` for the first batch.
            It is updated in place and also returned.

    Returns:
        ``overall`` with accumulated ``'tp'``/``'fp'``/``'fn'`` per label and under
        ``'micro'``, and with ``'p'``/``'r'``/``'f'`` recomputed from those totals.
        ``overall['macro']`` is the unweighted mean of the per-label scores, or
        ``'n/a'`` for each score when no label has been seen yet.
    """
    MICRO = 'micro'
    MACRO = 'macro'
    count_keys = ('tp', 'fp', 'fn')

    micro_totals = overall.setdefault(MICRO, {})
    for key in count_keys:
        micro_totals.setdefault(key, 0)

    for k, counts in new.items():
        # The aggregate entries of `new` must not be added again: the micro totals are
        # rebuilt from the per-label counts, otherwise every batch is counted 3 times.
        if k == MICRO or k == MACRO:
            continue
        label_totals = overall.setdefault(k, {})
        for key in count_keys:
            increment = counts.get(key, 0)
            label_totals[key] = label_totals.get(key, 0) + increment
            micro_totals[key] = micro_totals[key] + increment

    macro_sums = {'p': 0.0, 'r': 0.0, 'f': 0.0}
    n_classes = 0
    for d, totals in overall.items():
        if d == MACRO:
            continue
        tp = totals.get('tp', 0)
        fp = totals.get('fp', 0)
        fn = totals.get('fn', 0)
        # An empty denominator yields a score of 0.0 rather than an exception.
        totals['p'] = tp / (tp + fp) if tp + fp else 0.0
        totals['r'] = tp / (tp + fn) if tp + fn else 0.0
        if totals['p'] and totals['r']:
            totals['f'] = 2 / (1 / totals['p'] + 1 / totals['r'])
        else:
            totals['f'] = 0.0
        if d != MICRO:
            n_classes += 1
            for score in macro_sums:
                macro_sums[score] += totals[score]

    if n_classes > 0:
        overall[MACRO] = {score: total / float(n_classes) for score, total in macro_sums.items()}
    else:
        overall[MACRO] = {score: 'n/a' for score in macro_sums}
    return overall

def train(model, iterator, optimizer, criterion, model_type):
    """Run one training pass over ``iterator`` and update the model weights.

    Args:
        model: network to train; put into training mode here.
        iterator: yields batches exposing the text fields and the five diagnosis labels.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to (logits, labels).
        model_type: 0 feeds ``batch.all_text`` to the model; any other value feeds
            ``batch.bh_text`` and ``batch.ep_text`` as two separate inputs.

    Returns:
        (mean loss per batch, micro F-score computed from the counts accumulated over
        all batches of the pass).
    """
    epoch_loss = 0
    fscores = {}

    model.train()
    for batch in iterator:
        optimizer.zero_grad()

        if model_type == 0:
            predictions = model(batch.all_text)
        else:
            predictions = model(batch.bh_text, batch.ep_text)

        # One column per diagnosis, in the same order as the model outputs.
        labels = torch.stack((batch.major_depressive, batch.schizophrenia,
                              batch.biploar, batch.minor_depressive,
                              batch.dementia), dim=1)

        loss = criterion(predictions, labels)

        # Metrics are bookkeeping only: detach so no autograd graph is built for them.
        fscore, _ = f_measure(predictions.detach(), labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        fscores = update_fscores(fscore, fscores)

    return epoch_loss / len(iterator), fscores['micro']["f"]

In [None]:
def evaluate(model, iterator, criterion, model_type):
    """Compute loss and micro F-score of ``model`` over ``iterator`` without training.

    The micro F-score is derived from tp/fp/fn counts accumulated over every batch
    (the same aggregation ``train`` uses). Averaging per-batch F-scores instead would
    weight a small last batch like a full one and is not the F-score of the data set.

    Args:
        model: network to evaluate; put into eval mode here.
        iterator: yields batches exposing the text fields and the five diagnosis labels.
        criterion: loss applied to (logits, labels).
        model_type: 0 feeds ``batch.all_text``; any other value feeds ``batch.bh_text``
            and ``batch.ep_text``.

    Returns:
        (mean loss per batch, micro F-score over all batches).
    """
    epoch_loss = 0
    fscores = {}
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            if model_type == 0:
                predictions = model(batch.all_text)
            else:
                predictions = model(batch.bh_text, batch.ep_text)

            labels = torch.cat((batch.major_depressive.unsqueeze(1), batch.schizophrenia.unsqueeze(1),
                                batch.biploar.unsqueeze(1), batch.minor_depressive.unsqueeze(1),
                                batch.dementia.unsqueeze(1)), 1)

            loss = criterion(predictions, labels)
            batch_fscore, _ = f_measure(predictions, labels)

            epoch_loss += loss.item()
            fscores = update_fscores(batch_fscore, fscores)

    return epoch_loss / len(iterator), fscores['micro']["f"]

In [None]:
def train_epoch(epoches, model, train_iterator, optimizer, criterion, model_type, model_name, 
                valid_iterator = None, interval = 50, early_stop = False, period = 20, gap = 0.005, threshold = 0.5):
    """Train for up to ``epoches`` epochs, checkpointing the best models.

    Two checkpoints are written under ``DATA_FOLDER``: ``<model_name>_loss.pt`` whenever
    the monitored loss improves and ``<model_name>_fscore.pt`` whenever the monitored
    micro F-score improves. The monitored values are the validation metrics when
    ``valid_iterator`` is given; without validation data the training metrics are used
    instead, so the function can run without a validation split.

    Args:
        epoches: maximum number of epochs.
        model, train_iterator, optimizer, criterion, model_type: forwarded to ``train``.
        model_name: file-name stem of the checkpoints.
        valid_iterator: optional validation batches, forwarded to ``evaluate``.
        interval: progress is printed every ``interval`` epochs and after the last one.
        early_stop: enable early stopping (only applied when validation data is given).
        period: number of "bad" epochs tolerated since the last F-score improvement.
        gap: an epoch is "bad" when its F-score is more than ``gap`` below the best one.
        threshold: early stopping is only considered once the best F-score exceeds this.

    Returns:
        (train_losses, valid_losses, train_accs, valid_accs), one entry per epoch run;
        the validation lists stay empty when no validation data is given.
    """
    best_valid_loss = float('inf')
    best_valid_fscore = 0
    train_losses, valid_losses = [], []
    train_accs, valid_accs = [], []
    observed_time = 0
    has_valid = bool(valid_iterator)
    # torch.save does not create missing folders.
    os.makedirs(DATA_FOLDER, exist_ok=True)

    for epoch in range(epoches):
        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion, model_type)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if has_valid:
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, model_type)
            valid_losses.append(valid_loss)
            valid_accs.append(valid_acc)
            monitor_loss, monitor_score = valid_loss, valid_acc
        else:
            monitor_loss, monitor_score = train_loss, train_acc

        if (epoch + 1) % interval == 0 or epoch == epoches - 1:
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train micro-F-score: {train_acc*100:.2f}%')
            if has_valid:
                print(f'\t Val. Loss: {valid_loss:.3f} |  Val. micro-F-score: {valid_acc*100:.2f}%')

        if monitor_loss < best_valid_loss:
            best_valid_loss = monitor_loss
            torch.save(model.state_dict(), os.path.join(DATA_FOLDER, model_name + '_loss.pt'))

        if has_valid and early_stop and best_valid_fscore > threshold and best_valid_fscore - monitor_score > gap:
            observed_time += 1
            print(f'\rBest validation F-measure: {best_valid_fscore:.3f}/Current F-measure: {monitor_score:.3f} [Times: {observed_time}/{period}]')  
            if observed_time >= period:
                print(f'Early stop at epoch {epoch+1:02}.')
                break

        if monitor_score > best_valid_fscore:
            best_valid_fscore = monitor_score
            torch.save(model.state_dict(), os.path.join(DATA_FOLDER, model_name + '_fscore.pt'))
            # An improvement resets the early-stopping counter.
            observed_time = 0
    return train_losses, valid_losses, train_accs, valid_accs

In [None]:
def analysis_plotter(fig, ax, train, valid, title, param_dict1, param_dict2):
    """Plot a training and a validation curve on ``ax`` and mark validation increases.

    Every point of ``valid`` that is larger than its predecessor is drawn as a scatter
    point and gets an mpld3 tooltip of the form ``"<index>: <value>"``.

    Args:
        fig: figure the tooltip plugin is attached to.
        ax: axes to draw on.
        train, valid: sequences of per-epoch values.
        title: axes title.
        param_dict1, param_dict2: keyword arguments for the two ``ax.plot`` calls.
    """
    ax.plot(train, **param_dict1)
    ax.plot(valid, **param_dict2)
    ax.title.set_text(title)
    ax.legend()

    # Collect (index, value) of every validation point that rose above the previous one.
    increases = []
    previous = float('inf')
    for step, value in enumerate(valid):
        if value > previous:
            increases.append((step, value))
        previous = value

    steps = [step for step, _ in increases]
    values = [value for _, value in increases]
    scatter = ax.scatter(steps, values)
    labels = [f'{step}: {value}' for step, value in increases]
    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)

In [None]:
def test(model, iterator, criterion, model_type, model_name = None):    
    epoch_loss = 0
    epoch_acc = 0
    if model_name:
        model.load_state_dict(torch.load(os.path.join(DATA_FOLDER, model_name + '.pt')))

    model.eval()
    
    diagnoses = {}
    predicts = []
    MICRO = 'micro'
    MACRO = 'macro'
    diagnoses[MICRO] = {}
    with torch.no_grad():                    
        for batch in iterator:
            if model_type == 0:
                predictions = model(batch.all_text)
            else:
                predictions = model(batch.bh_text, batch.ep_text)
            rounded_preds = torch.round(torch.sigmoid(predictions))
            predicts.extend(rounded_preds.data.tolist())
            labels = torch.cat((batch.major_depressive.unsqueeze(1), batch.schizophrenia.unsqueeze(1),
                            batch.biploar.unsqueeze(1), batch.minor_depressive.unsqueeze(1), 
                            batch.dementia.unsqueeze(1)), 1)
            
            for index, value in enumerate(rounded_preds):
                for did, dvalue in enumerate(rounded_preds[index]):
                    v = dvalue.item()                    
                    if v == 1:
                        if dvalue == labels[index, did]:
                            if did not in diagnoses:
                                diagnoses[did] = {}                                
                            diagnoses[did]['tp'] = diagnoses[did].get('tp', 0) + 1
                            diagnoses[MICRO]['tp'] = diagnoses[MICRO].get('tp', 0) + 1 
                        else:
                            if did not in diagnoses:
                                diagnoses[did] = {}
                            diagnoses[did]['fp'] = diagnoses[did].get('fp', 0) + 1
                            diagnoses[MICRO]['fp'] = diagnoses[MICRO].get('fp', 0) + 1
                    elif v == 0:
                        if 1 == labels[index, did].item():
                            if did not in diagnoses:
                                diagnoses[did] = {}
                            diagnoses[did]['fn'] = diagnoses[did].get('fn', 0) + 1
                            diagnoses[MICRO]['fn'] = diagnoses[MICRO].get('fn', 0) + 1
                        else:
                            if did not in diagnoses:
                                diagnoses[did] = {}
                            diagnoses[did]['tn'] = diagnoses[did].get('tn', 0) + 1
                            diagnoses[MICRO]['tn'] = diagnoses[MICRO].get('tn', 0) + 1
    diagnoses[MACRO] = {}
    for d in diagnoses:        
        if d is MACRO:
            continue
        try:
            diagnoses[d]['p']=diagnoses[d].get('tp', 0)/(diagnoses[d].get('tp', 0)+diagnoses[d].get('fp', 0))
            if d is not MICRO:
                diagnoses[MACRO]['p']=diagnoses[MACRO].get('p', 0.0)+diagnoses[d]['p']                
        except:            
            diagnoses[d]['p']=0.0
            
        try:
            diagnoses[d]['r']=diagnoses[d].get('tp', 0)/(diagnoses[d].get('tp', 0)+diagnoses[d].get('fn', 0))
            if d is not MICRO:
                diagnoses[MACRO]['r']=diagnoses[MACRO].get('r', 0.0)+diagnoses[d]['r']
        except:
            diagnoses[d]['r']=0.0
        
        try:
            diagnoses[d]['f']=2/(1/diagnoses[d]['p']+1/diagnoses[d]['r'])
            if d is not MICRO:
                diagnoses[MACRO]['f']=diagnoses[MACRO].get('f', 0.0)+diagnoses[d]['f']
        except:
            diagnoses[d]['f']=0.0
    diagnoses[MACRO]['f']=diagnoses[MACRO].get('f', 0.0)/float(len(diagnoses)-2)
    diagnoses[MACRO]['p']=diagnoses[MACRO].get('p', 0.0)/float(len(diagnoses)-2)
    diagnoses[MACRO]['r']=diagnoses[MACRO].get('r', 0.0)/float(len(diagnoses)-2)
    return diagnoses, predicts

# Dataset Analysis

## Training Set

In [None]:
# Column names for the tab-separated NTUH files; they are supplied explicitly to
# read_csv, which therefore treats every row of the file as data.
names =['ID', 'BH Text', 'EP Text', 'Major Depressive', 'Schizophrenia', 'Biploar', 'Minor Depressive', 'Dementia']
# Training split, loaded into pandas for exploratory analysis only.
ntuhdataset = pd.read_csv('../Datasets/NTUH/train_preprocessing.txt', sep ='\t', names = names)
ntuhdataset

In [None]:
# Show, for every diagnosis column, how many records carry each label value
# (counts of the first three remaining columns).
for label_column in ('Major Depressive', 'Schizophrenia', 'Biploar', 'Minor Depressive', 'Dementia'):
    display(ntuhdataset.groupby(label_column).count().iloc[:,:3])

In [None]:
# Whitespace-token count per record, and its mean, for the BH text column.
bh_text = ntuhdataset['BH Text'].apply(lambda text: len(str(text).split()))
avg_bh_text = mean(bh_text)
plt.figure(figsize=(5,5))
plt.gca().set_title(f'BH Text Length Distribution: Average legnth: {avg_bh_text}')
plt.gca().hist(bh_text, bins = 50)
plt.show()

# Same statistics for the EP text column.
ep_text = ntuhdataset['EP Text'].apply(lambda text: len(str(text).split()))
avg_ep_text = mean(ep_text)
plt.figure(figsize=(5,5))
plt.gca().set_title(f'EP Text Length Distribution: Average length: {avg_ep_text}')
plt.gca().hist(ep_text, bins = 50)
plt.show()

## Test Set

In [None]:
# Load the test split with the same column names.
# NOTE: this rebinds `ntuhdataset`; from here on the name refers to the test data,
# not to the training data loaded earlier.
ntuhdataset = pd.read_csv('../Datasets/NTUH/test_preprocessing.txt', sep ='\t', names = names)
ntuhdataset

In [None]:
# Label distribution of the currently loaded split, one table per diagnosis column.
for label_column in ('Major Depressive', 'Schizophrenia', 'Biploar', 'Minor Depressive', 'Dementia'):
    display(ntuhdataset.groupby(label_column).count().iloc[:,:3])

In [None]:
# Histogram of whitespace-token counts for the BH text column.
bh_text = ntuhdataset['BH Text'].apply(lambda text: len(str(text).split()))
plt.figure(figsize=(5,5))
plt.gca().set_title('BH Text Length Distribution')
plt.gca().hist(bh_text, bins = 50)
plt.show()

# Histogram of whitespace-token counts for the EP text column.
ep_text = ntuhdataset['EP Text'].apply(lambda text: len(str(text).split()))
plt.figure(figsize=(5,5))
plt.gca().set_title('EP Text Length Distribution')
plt.gca().hist(ep_text, bins = 50)
plt.show()

# Dataset Definition

In [None]:
class NTUHDataset(data.Dataset):
    """torchtext dataset over the tab-separated NTUH files.

    Every line of a file holds eight tab-separated columns: patient id, BH text,
    EP text and five diagnosis labels. A ninth field, ``all_text``, is derived by
    joining the two texts with a ``<sep>`` marker.
    """
    name = 'ntuh'
    dirname = 'ntuh'
    # Attribute names of the five label fields, in file column order.
    diagnosis_types = ['major_depressive', 'schizophrenia', 'biploar', 'minor_depressive', 'dementia']
    
    @staticmethod
    def sort_key(ex):
        """Sort/bucket examples by the length of their combined text."""
        return len(ex.all_text) # TODO add ep_text?

    def __init__(self, path, bh_text_field, ep_text_field, all_text_field,
                 major_label_field, sch_label_field, bipolar_label_field, minor_label_field, dementia_label_field,
                 **kwargs):
        """Read every file matching ``path + '.txt'`` into examples.

        Args:
            path: file path without the ``.txt`` extension (glob patterns allowed).
            bh_text_field, ep_text_field, all_text_field: fields for the three texts.
            major_label_field, sch_label_field, bipolar_label_field, minor_label_field,
                dementia_label_field: fields for the five diagnosis labels.
            **kwargs: forwarded to ``data.Dataset``.

        Raises:
            ValueError: if a non-blank line does not have exactly eight columns.
        """
        fields = [('patient_id', None), 
                  ('bh_text', bh_text_field),
                  ('ep_text', ep_text_field),
                  ('all_text', all_text_field),
                  ('major_depressive', major_label_field),
                  ('schizophrenia', sch_label_field),
                  ('biploar', bipolar_label_field),
                  ('minor_depressive', minor_label_field),
                  ('dementia', dementia_label_field)]
        examples = []
        
        for fname in glob.iglob(path + '.txt'):
            with io.open(fname, 'r', encoding="utf-8") as f:
                for line_no, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline at end of file).
                        continue
                    columns = line.split('\t')
                    if len(columns) != 8:
                        raise ValueError('%s:%d: expected 8 tab-separated columns, found %d'
                                         % (fname, line_no, len(columns)))
                    pid, bh_text, ep_text, major_d, sc, bp, minor_d, de = columns
                    all_text = "%s <sep> %s" % (bh_text, ep_text)
                    examples.append(data.Example.fromlist([pid, bh_text, ep_text, all_text, major_d, sc, bp, minor_d, de], 
                                                          fields))
        super(NTUHDataset, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, 
               bh_text_field, ep_text_field, all_text_field,
               major_label_field, sch_label_field, bipolar_label_field, minor_label_field, dementia_label_field,
               # Built with os.path.join so the default also resolves on non-Windows systems.
               root=os.path.join('..', 'Datasets', 'NTUH'),
               train='train_preprocessing', test='test_preprocessing', **kwargs):
        """Create the train and test datasets from ``root``.

        ``train`` and ``test`` are file names without the ``.txt`` extension; no
        validation split is loaded here (``validation=None``).
        """
        return super(NTUHDataset, cls).splits(
            path = root, root=root, 
            bh_text_field = bh_text_field, ep_text_field = ep_text_field, all_text_field = all_text_field, 
            major_label_field = major_label_field, sch_label_field = sch_label_field, 
            bipolar_label_field = bipolar_label_field, minor_label_field = minor_label_field, 
            dementia_label_field = dementia_label_field,
            train=train, validation=None, test=test, **kwargs)

# Pre-process

In [None]:
# Sample record for trying out the pre-processing helpers. `ntuhdataset` was last bound
# to the test split, so this is the BH text of the first test record.
x = ntuhdataset['BH Text'][0] 

In [None]:
# Plain whitespace tokenizer; tokenize_and_cut calls it on every text segment.
tokenize = str.split

In [None]:
def tokenize_and_cut(sentence):
    """Tokenize a text made of ``<sep>``-separated segments and truncate it.

    The text is split on runs of ``<sep>`` markers, segments containing ``<unk>`` are
    dropped, the remaining segments are tokenized and joined with a ``[SEP]`` token
    between consecutive segments. At most ``MAX_SEQUENCE`` tokens are returned.
    """
    segments = re.split(r'\s*<sep>(?:\s*<sep>)*\s*', sentence)
    rejoined = ' [SEP] '.join(
        ' '.join(tokenize(segment))
        for segment in segments
        if '<unk>' not in segment
    )
    return rejoined.split()[:MAX_SEQUENCE]

In [None]:
# Sanity check: tokenize the sample record.
tokenize_and_cut(x)

# N-gram features

In [None]:
def generate_ngrams(token_list, n = 2):
    """Return a copy of ``token_list`` extended with its distinct 2..n-grams.

    Each n-gram is appended once (duplicates are removed) as a single space-joined
    string; the order of the appended n-grams of one size is the iteration order of
    a set. The input list itself is left unchanged.
    """
    augmented = token_list.copy()
    for size in range(2, n+1):
        # Shifted views of the list; zipping them yields every window of `size` tokens.
        shifted = [token_list[offset:] for offset in range(size)]
        distinct = set(zip(*shifted))
        augmented.extend(' '.join(gram) for gram in distinct)
    return augmented

In [None]:
# generate_ngrams expects a token list (it calls .copy() on its argument), so the raw
# sample string is tokenized first.
generate_ngrams(tokenize_and_cut(x))

In [None]:
# Text fields: tokenized with tokenize_and_cut, lower-cased, and passed through
# generate_ngrams (default n) as a preprocessing step; batches are batch-first.
# Because of lower = True the '[SEP]' token is looked up as '[sep]' in the vocabulary.
BH_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True, lower = True,
                     preprocessing = generate_ngrams)
EP_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True, lower = True,
                     preprocessing = generate_ngrams)
ALL_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True, lower = True,
                     preprocessing = generate_ngrams)

# One float-typed label field per diagnosis; the float dtype matches the targets
# that are concatenated and handed to the loss in train/evaluate.
MAJ_LABEL = data.LabelField(dtype = torch.float)
SCH_LABEL = data.LabelField(dtype = torch.float)
BIP_LABEL = data.LabelField(dtype = torch.float)
MIN_LABEL = data.LabelField(dtype = torch.float)
DEM_LABEL = data.LabelField(dtype = torch.float)

In [None]:
full_train_data, test_data = NTUHDataset.splits(BH_TEXT, EP_TEXT, ALL_TEXT, 
                                           MAJ_LABEL, SCH_LABEL, BIP_LABEL, MIN_LABEL, DEM_LABEL)
# random.seed() returns None, so passing its result as random_state only worked through
# the side effect of seeding. Seed explicitly and hand over the resulting RNG state.
random.seed(SEED)
train_data, valid_data = full_train_data.split(split_ratio = TRAIN_RATIO, random_state = random.getstate())

In [None]:
def summarize_label_ratios(dataset, set_name):
    """Print, for every diagnosis label, the size ratio of its first two label groups.

    Args:
        dataset: object whose ``examples`` carry one attribute per entry of
            ``NTUHDataset.diagnosis_types``.
        set_name: prefix used in the printed lines (e.g. ``'Training set'``).

    Returns:
        DataFrame with a running ``ID`` column and one column per diagnosis label.
    """
    label_names = list(NTUHDataset.diagnosis_types)
    records = []
    for row_id, example in enumerate(dataset.examples):
        record = {'ID': row_id}
        for label in label_names:
            record[label] = vars(example)[label]
        records.append(record)
    frame = pd.DataFrame(records, columns=['ID'] + label_names)

    for label in label_names:
        # Group sizes in sorted label order; .iloc makes the positional access explicit.
        counts = frame.groupby(label)['ID'].count()
        title = label.replace('_', ' ').title()
        if len(counts) < 2:
            print(f'{set_name}: {title}: only {len(counts)} label value(s) present, ratio undefined')
            continue
        first, second = counts.iloc[0], counts.iloc[1]
        print(f'{set_name}: {title} ({first}/{second}): {first/second}')
    return frame


trainpd = summarize_label_ratios(train_data, 'Training set')
print()
# These statistics are computed on valid_data, so they are reported as the validation
# set. The variable name `testpd` is kept in case later cells refer to it.
testpd = summarize_label_ratios(valid_data, 'Validation set')

In [None]:
# Report the size of each split.
for split_name, split in (('training', train_data), ('validation', valid_data), ('testing', test_data)):
    print(f"Number of {split_name} examples: {len(split)}")

In [None]:
# Show the attributes of the first training example as a sanity check.
print(vars(train_data.examples[0]))

# Build Vocab

In [None]:
# Vocabularies are built from the training split only; text vocabularies are capped
# at MAX_VOCAB_SIZE entries.
text_fields = (BH_TEXT, EP_TEXT, ALL_TEXT)
label_fields = (MAJ_LABEL, SCH_LABEL, BIP_LABEL, MIN_LABEL, DEM_LABEL)

for text_field in text_fields:
    text_field.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)

for label_field in label_fields:
    label_field.build_vocab(train_data)

# Dump every string-to-index mapping for inspection.
for built_field in text_fields + label_fields:
    print(built_field.vocab.stoi)

# Create Iterator

In [None]:
# First set of iterators, created without shuffling; used by the next cell to inspect
# a batch.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, shuffle = False,
    device = device)

## Check Iterator

In [None]:
# Pull one validation batch and print its attributes as a sanity check.
batch = next(iter(valid_iterator))
print(vars(batch))

Remember to recreate the iterators or we will miss the first batch.

In [None]:
# Re-seed every RNG so that training starts from a known state regardless of what the
# inspection cells consumed.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Iterators actually used for training/evaluation. Unlike the earlier cell, `shuffle`
# is not passed here, so the library default applies.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

# Model Definition

In [None]:
class FastTextBaseline(nn.Module):
    """Single-input FastText-style classifier: embed, average over the sequence, project.

    Submodules (attribute names are part of the checkpoint layout):
        embedding: token embedding with ``pad_idx`` as padding index.
        dropout: dropout with probability ``DROPOUT``.
        fc: linear layer from ``embedding_dim`` to ``output_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):        
        super(FastTextBaseline, self).__init__()
        # Creation order is kept (embedding, dropout, fc) so seeded initialisation
        # yields the same weights.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(DROPOUT)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Return one logit per output for every row of ``text``.

        ``text`` is presumably a (batch, sequence) tensor of token indices, as produced
        by the batch-first text fields - TODO confirm.
        """
        token_vectors = self.embedding(text)
        # Average over dimension 1 with a pooling window spanning that whole dimension.
        window = (token_vectors.shape[1], 1)
        averaged = F.avg_pool2d(token_vectors, window).squeeze(1)
        regularized = self.dropout(averaged)
        return self.fc(regularized)

In [None]:
class FastText(nn.Module):
    """Two-input bag-of-embeddings classifier.

    Each input sequence has its own embedding table. Both are averaged over the
    sequence dimension, concatenated, passed through dropout and projected to
    ``output_dim`` logits by one linear layer.

    Args:
        vocab_size1: number of rows in the first embedding table.
        vocab_size2: number of rows in the second embedding table.
        embedding_dim: size of each embedding vector (shared by both tables).
        output_dim: number of output logits.
        pad_idx: ``padding_idx`` used for BOTH embedding tables.
        dropout: dropout probability. ``None`` (the default) falls back to the
            notebook-level ``DROPOUT`` constant, which was the only option before.
    """
    def __init__(self, vocab_size1, vocab_size2, embedding_dim, output_dim, pad_idx, dropout=None):
        super().__init__()

        # Creation order (embedding1, embedding2, fc) is kept so a seeded run
        # initialises the same weights as before.
        self.embedding1 = nn.Embedding(vocab_size1, embedding_dim, padding_idx=pad_idx)
        self.embedding2 = nn.Embedding(vocab_size2, embedding_dim, padding_idx=pad_idx)

        self.fc = nn.Linear(embedding_dim*2, output_dim)

        self.dropout = nn.Dropout(DROPOUT if dropout is None else dropout)

    def forward(self, bh_text, ep_text):
        """Return logits for a pair of batch-first token-index tensors.

        Returns:
            Tensor indexed [batch, output_dim].
        """
        embedded1 = self.embedding1(bh_text)
        embedded2 = self.embedding2(ep_text)

        # Average each input over its own sequence axis (padding positions included).
        pooled1 = embedded1.mean(dim=1)
        pooled2 = embedded2.mean(dim=1)

        return self.fc(self.dropout(torch.cat((pooled1, pooled2), 1)))

# Initialize Our Model

In [None]:
# Vocabulary size and special-token indices of the combined-text field.
INPUT_DIM = len(ALL_TEXT.vocab)
UNK_IDX = ALL_TEXT.vocab.stoi[ALL_TEXT.unk_token]
PAD_IDX = ALL_TEXT.vocab.stoi[ALL_TEXT.pad_token]
# '[sep]' is looked up as an ordinary token of the ALL_TEXT vocabulary.
SEP_IDX = ALL_TEXT.vocab.stoi['[sep]']

print("Input dimension: %s\nUnknown word index: %s\nPadding index: %s\nSeparator index: %s" % 
      (INPUT_DIM, UNK_IDX, PAD_IDX, SEP_IDX))

In [None]:
# Single-input model sized to the ALL_TEXT vocabulary; the bare `model` displays its layers.
model = FastTextBaseline(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
model

In [None]:
# count_parameters sums the element counts of all parameters with requires_grad set.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Embedding table as initialised, before the special-token rows are zeroed.
model.embedding.weight

In [None]:
# Overwrite the <unk>, <pad> and [sep] embedding rows with zero vectors.
for _special_idx in (UNK_IDX, PAD_IDX, SEP_IDX):
    model.embedding.weight.data[_special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
# Embedding table again, to confirm the special-token rows are now zero.
model.embedding.weight

# Train the Model

In [None]:
# Adam with default hyper-parameters and an unweighted BCE-with-logits loss for the baseline run.
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Train the single-input baseline and keep the returned per-epoch histories for plotting.
# NOTE(review): the positional 0 presumably selects the single-input forward path and
# 'baseline1' a run/checkpoint name — confirm against train_epoch.
train_losses, valid_losses, train_accs, valid_accs = \
    train_epoch(N_EPOCHS, model, train_iterator, optimizer, criterion, 0, 'baseline1', valid_iterator)

## Train the Model Type 2

In [None]:
# Re-seed every RNG so the two-input run does not depend on what ran before it.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

INPUT_DIM1 = len(BH_TEXT.vocab)
INPUT_DIM2 = len(EP_TEXT.vocab)

# Still taken from ALL_TEXT: later cells reuse these names for the single-input model.
UNK_IDX = ALL_TEXT.vocab.stoi[ALL_TEXT.unk_token]
PAD_IDX = ALL_TEXT.vocab.stoi[ALL_TEXT.pad_token]

print("Input dimension: (%s/%s)\nUnknown word index: %s\nPadding index: %s" % (INPUT_DIM1, INPUT_DIM2, UNK_IDX, PAD_IDX))

# BH_TEXT and EP_TEXT each have their own vocabulary, so the rows to zero must be
# looked up per field. Reusing the ALL_TEXT '[sep]' index could zero an unrelated token.
BH_UNK_IDX = BH_TEXT.vocab.stoi[BH_TEXT.unk_token]
BH_PAD_IDX = BH_TEXT.vocab.stoi[BH_TEXT.pad_token]
BH_SEP_IDX = BH_TEXT.vocab.stoi['[sep]']
EP_UNK_IDX = EP_TEXT.vocab.stoi[EP_TEXT.unk_token]
EP_PAD_IDX = EP_TEXT.vocab.stoi[EP_TEXT.pad_token]
EP_SEP_IDX = EP_TEXT.vocab.stoi['[sep]']
# FastText takes a single padding index that it applies to both embedding tables.
assert BH_PAD_IDX == EP_PAD_IDX

model2 = FastText(INPUT_DIM1, INPUT_DIM2, EMBEDDING_DIM, OUTPUT_DIM, BH_PAD_IDX)
print(model2)

# Zero the special-token rows of each table using that table's own indices.
for _idx in (BH_UNK_IDX, BH_PAD_IDX, BH_SEP_IDX):
    model2.embedding1.weight.data[_idx] = torch.zeros(EMBEDDING_DIM)
for _idx in (EP_UNK_IDX, EP_PAD_IDX, EP_SEP_IDX):
    model2.embedding2.weight.data[_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
# Fresh Adam optimizer and unweighted BCE-with-logits loss for the two-input model.
optimizer = optim.Adam(model2.parameters())
criterion = nn.BCEWithLogitsLoss()
model2 = model2.to(device)
criterion = criterion.to(device)

# NOTE(review): the positional 1 presumably selects the two-input forward path and
# 'baseline2' a run/checkpoint name — confirm against train_epoch.
train_losses2, valid_losses2, train_accs2, valid_accs2 = \
    train_epoch(N_EPOCHS, model2, train_iterator, optimizer, criterion, 1, 'baseline2', valid_iterator)

# Result Analysis

In [None]:
# Two stacked panels for the single-input baseline: loss on top, score histories below.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, train_losses, valid_losses, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, train_accs, valid_accs, 'Training/Validation Micro-F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Same two-panel figure for the two-input baseline.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, train_losses2, valid_losses2, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, train_accs2, valid_accs2, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

# Test Set

In [None]:
# Evaluate the single-input baseline on the test split.
test_f_scores, predicts = test(model, test_iterator, criterion, 0)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

In [None]:
# Evaluate the two-input baseline on the test split.
test_f_scores, predicts = test(model2, test_iterator, criterion, 1)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

In [None]:
# Evaluate the single-input baseline again, this time passing 'baseline1_fscore'.
# NOTE(review): the extra argument is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(model, test_iterator, criterion, 0, 'baseline1_fscore')

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

In [None]:
# Evaluate the two-input baseline again, this time passing 'baseline2_fscore'.
# NOTE(review): the extra argument is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(model2, test_iterator, criterion, 1, 'baseline2_fscore')

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

# Deal with Imbalance

In [None]:
# Re-seed, then rebuild the single-input model from scratch for the class-weighted run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = FastTextBaseline(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
# Zero the <unk>, <pad> and [sep] embedding rows.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[SEP_IDX] = torch.zeros(EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters())
# pos_weight up-weights the positive term of each output's loss by the matching POS_WEIGHT entry.
criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)

In [None]:
# Move the class-weighted model and loss to the target device.
model = model.to(device)
criterion = criterion.to(device)

# Fresh iterators for this run.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

# Train with early stopping switched on.
# NOTE(review): the meaning of `period` and of the positional 0 is defined by train_epoch — confirm there.
train_losses, valid_losses, train_accs, valid_accs = \
    train_epoch(N_EPOCHS, model, train_iterator, optimizer, criterion, 0, 'ft_1_rand', valid_iterator, early_stop=True,
               period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the class-weighted single-input run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, train_losses, valid_losses, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, train_accs, valid_accs, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the class-weighted single-input model on the test split.
test_f_scores, predicts = test(model, test_iterator, criterion, 0)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

In [None]:
# Evaluate again, this time passing 'ft_1_rand_fscore'.
# NOTE(review): the extra argument is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(model, test_iterator, criterion, 0, 'ft_1_rand_fscore')

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

## Apply Early Stopping

In [None]:
# Re-seed and rebuild the single-input model, class-weighted loss and iterators,
# then train with early stopping under the run name 'imb1_ea'.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = FastTextBaseline(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
# Zero the <unk>, <pad> and [sep] embedding rows.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[SEP_IDX] = torch.zeros(EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)
model = model.to(device)
criterion = criterion.to(device)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

# NOTE(review): the meaning of `period` and of the positional 0 is defined by train_epoch — confirm there.
train_losses, valid_losses, train_accs, valid_accs = \
    train_epoch(N_EPOCHS, model, train_iterator, optimizer, criterion, 0, 
                'imb1_ea', valid_iterator, early_stop=True, period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the 'imb1_ea' run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, train_losses, valid_losses, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
# Labelled F-Measure, like every other plot of the train_accs / valid_accs histories in this notebook.
analysis_plotter(fig, ax2, train_accs, valid_accs, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the 'imb1_ea' model on the test split.
test_f_scores, predicts = test(model, test_iterator, criterion, 0)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

In [None]:
# Evaluate again, this time passing 'imb1_ea_fscore'.
# NOTE(review): the extra argument is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(model, test_iterator, criterion, 0, 'imb1_ea_fscore')

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

## Model Type 2

In [None]:
# Re-seed, rebuild the iterators and the two-input model, then train with the
# class-weighted loss and early stopping under the run name 'ft_2_rand'.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

# BH_TEXT and EP_TEXT each have their own vocabulary, so the rows to zero must be
# looked up per field. Reusing the ALL_TEXT '[sep]' index could zero an unrelated token.
_bh_special = [BH_TEXT.vocab.stoi[tok] for tok in (BH_TEXT.unk_token, BH_TEXT.pad_token, '[sep]')]
_ep_special = [EP_TEXT.vocab.stoi[tok] for tok in (EP_TEXT.unk_token, EP_TEXT.pad_token, '[sep]')]
# FastText takes a single padding index that it applies to both embedding tables.
assert _bh_special[1] == _ep_special[1]

model2 = FastText(INPUT_DIM1, INPUT_DIM2, EMBEDDING_DIM, OUTPUT_DIM, _bh_special[1])
for _idx in _bh_special:
    model2.embedding1.weight.data[_idx] = torch.zeros(EMBEDDING_DIM)
for _idx in _ep_special:
    model2.embedding2.weight.data[_idx] = torch.zeros(EMBEDDING_DIM)
model2 = model2.to(device)

optimizer = optim.Adam(model2.parameters())
criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)
criterion = criterion.to(device)

# NOTE(review): the meaning of `period` and of the positional 1 is defined by train_epoch — confirm there.
train_losses2, valid_losses2, train_accs2, valid_accs2 = \
    train_epoch(N_EPOCHS, model2, train_iterator, optimizer, criterion, 1, 'ft_2_rand', valid_iterator, early_stop = True
               , period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the 'ft_2_rand' run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, train_losses2, valid_losses2, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, train_accs2, valid_accs2, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the class-weighted two-input model on the test split.
test_f_scores, predicts = test(model2, test_iterator, criterion, 1)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

In [None]:
# Evaluate again, this time passing 'ft_2_rand_fscore'.
# NOTE(review): the extra argument is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(model2, test_iterator, criterion, 1, 'ft_2_rand_fscore')

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

# Word2vec

In [None]:
# Load the pre-trained word2vec vectors through torchtext, with DATA_FOLDER as the cache directory.
vectors = Vectors(name='word2vec_skipgram_model.bin', cache=DATA_FOLDER)
# The embedding width comes from the loaded matrix, not from EMBEDDING_DIM.
WV_EMBEDDING_DIM = vectors.vectors.shape[1]
# Quick inspection: matrix shape, the matrix itself, the first token, and the vector for ','.
print(vectors.vectors.shape)
print(vectors.vectors)
print(vectors.itos[0])
print(vectors.vectors[vectors.stoi[',']])

In [None]:
# Re-seed every RNG before rebuilding the dataset with word2vec-backed fields.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Separate Field objects, so the word2vec vocabularies do not touch the earlier ones.
WV_ALL_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True,
                     preprocessing = generate_ngrams, lower = True)
WV_BH_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True,
                     preprocessing = generate_ngrams, lower = True)
WV_EP_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True,
                     preprocessing = generate_ngrams, lower = True)
WV_MAJ_LABEL = data.LabelField(dtype = torch.float)
WV_SCH_LABEL = data.LabelField(dtype = torch.float)
WV_BIP_LABEL = data.LabelField(dtype = torch.float)
WV_MIN_LABEL = data.LabelField(dtype = torch.float)
WV_DEM_LABEL = data.LabelField(dtype = torch.float)

full_wv_train_data, wv_test_data = NTUHDataset.splits(WV_BH_TEXT, WV_EP_TEXT, WV_ALL_TEXT, 
                                           WV_MAJ_LABEL, WV_SCH_LABEL, WV_BIP_LABEL, 
                                                            WV_MIN_LABEL, WV_DEM_LABEL)

# random.seed() returns None, so the old `random_state = random.seed(SEED)` passed None and
# only worked because split() then falls back to the (just seeded) global state.
# Seed first and pass that state explicitly; the resulting split is the same.
random.seed(SEED)
wv_train_data, wv_valid_data = full_wv_train_data.split(random_state = random.getstate(), 
                                                                split_ratio = TRAIN_RATIO)

# Build the text vocabularies with the pre-trained vectors attached.
# Tokens missing from `vectors` are initialised by unk_init (normal distribution).
WV_ALL_TEXT.build_vocab(wv_train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = vectors, unk_init = torch.Tensor.normal_)

# A stray ')' after MAX_VOCAB_SIZE used to close these two calls early (SyntaxError).
WV_BH_TEXT.build_vocab(wv_train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = vectors, unk_init = torch.Tensor.normal_)

WV_EP_TEXT.build_vocab(wv_train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = vectors, unk_init = torch.Tensor.normal_)

WV_MAJ_LABEL.build_vocab(wv_train_data)
WV_SCH_LABEL.build_vocab(wv_train_data)
WV_BIP_LABEL.build_vocab(wv_train_data)
WV_MIN_LABEL.build_vocab(wv_train_data)
WV_DEM_LABEL.build_vocab(wv_train_data)

# Special-token indices; '[sep]' is looked up separately in each text vocabulary.
WV_ALL_INPUT_DIM = len(WV_ALL_TEXT.vocab)
WV_ALL_UNK_IDX = WV_ALL_TEXT.vocab.stoi[WV_ALL_TEXT.unk_token]
WV_ALL_PAD_IDX = WV_ALL_TEXT.vocab.stoi[WV_ALL_TEXT.pad_token]
WV_SEP_IDX_ALL = WV_ALL_TEXT.vocab.stoi['[sep]']
WV_SEP_IDX_BH = WV_BH_TEXT.vocab.stoi['[sep]']
WV_SEP_IDX_EP = WV_EP_TEXT.vocab.stoi['[sep]']

print("All Text\nInput dimension: %s\nUnknown word index: %s\nPadding index: %s\nSeparator index: %s/%s/%s" % 
                (WV_ALL_INPUT_DIM, WV_ALL_UNK_IDX, WV_ALL_PAD_IDX, WV_SEP_IDX_ALL, WV_SEP_IDX_BH, WV_SEP_IDX_EP))

In [None]:
# Re-seed, then build the iterators over the word2vec splits and a fresh single-input model
# whose embedding width matches the pre-trained vectors.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

wv_train_iterator, wv_valid_iterator, wv_test_iterator = data.BucketIterator.splits(
    (wv_train_data, wv_valid_data, wv_test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

wv_model = FastTextBaseline(WV_ALL_INPUT_DIM, WV_EMBEDDING_DIM, OUTPUT_DIM, WV_ALL_PAD_IDX)
# Embedding table as initialised, before the pre-trained vectors are copied in.
print(wv_model.embedding.weight)

In [None]:
# Before the copy: first 10 components of the model's row for ',' next to the pre-trained ones.
print(wv_model.embedding.weight[WV_ALL_TEXT.vocab.stoi[',']][:10])
print(vectors.vectors[vectors.stoi[',']][:10])

# Copy the pre-trained vector into the model for every vocabulary token that word2vec knows.
# no_grad lets the in-place copy write into a parameter that requires grad.
with torch.no_grad():
    for s, _row in WV_ALL_TEXT.vocab.stoi.items():
        if s not in vectors.stoi:
            continue
        wv_model.embedding.weight[_row].copy_(vectors.vectors[vectors.stoi[s]])

# After the copy: the row for ',' again.
print(wv_model.embedding.weight[WV_ALL_TEXT.vocab.stoi[',']][:10])

In [None]:
# Zero the <unk>, [sep] and <pad> rows of the word2vec-initialised table, then display it.
for _special_idx in (WV_ALL_UNK_IDX, WV_SEP_IDX_ALL, WV_ALL_PAD_IDX):
    wv_model.embedding.weight.data[_special_idx] = torch.zeros(WV_EMBEDDING_DIM)
print(wv_model.embedding.weight)

In [None]:
# Optimise only the parameters that require gradients; the loss is class-weighted via POS_WEIGHT.
wv_optimizer = optim.Adam([param for param in wv_model.parameters() if param.requires_grad == True])
wv_criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)
wv_model = wv_model.to(device)
wv_criterion = wv_criterion.to(device)

# NOTE(review): the meaning of `period` and of the positional 0 is defined by train_epoch — confirm there.
wv_train_losses, wv_valid_losses, wv_train_accs, wv_valid_accs = \
     train_epoch(N_EPOCHS, wv_model, wv_train_iterator, wv_optimizer, wv_criterion, 0, 
                 'wv1', wv_valid_iterator, early_stop=True, period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the 'wv1' run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, wv_train_losses, wv_valid_losses, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, wv_train_accs, wv_valid_accs, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the word2vec single-input model on its own test split, using the loss
# built for this model (wv_criterion) rather than the `criterion` left over from earlier cells.
test_f_scores, predicts = test(wv_model, wv_test_iterator, wv_criterion, 0)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

print('#'*40)
# NOTE(review): 'wv1_fscore' is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(wv_model, wv_test_iterator, wv_criterion, 0, 'wv1_fscore')

for f in test_f_scores:
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

## Model Type 2

In [None]:
# Vocabulary sizes and special-token indices, looked up separately for the BH and EP fields.
WV_INPUT_DIM1 = len(WV_BH_TEXT.vocab)
WV_INPUT_DIM2 = len(WV_EP_TEXT.vocab)
WV_PAD_IDX_BH = WV_BH_TEXT.vocab.stoi[WV_BH_TEXT.pad_token]
WV_UNK_IDX_BH = WV_BH_TEXT.vocab.stoi[WV_BH_TEXT.unk_token]
WV_PAD_IDX_EP = WV_EP_TEXT.vocab.stoi[WV_EP_TEXT.pad_token]
WV_UNK_IDX_EP = WV_EP_TEXT.vocab.stoi[WV_EP_TEXT.unk_token]
WV_SEP_IDX_BH = WV_BH_TEXT.vocab.stoi['[sep]']
WV_SEP_IDX_EP = WV_EP_TEXT.vocab.stoi['[sep]']

print("Input dimension: %s/%s\nUnknown word index: %s/%s\nPadding index: %s/%s\nSeparator index: %s/%s" % 
                (WV_INPUT_DIM1, WV_INPUT_DIM2, WV_UNK_IDX_BH, WV_UNK_IDX_EP, WV_PAD_IDX_BH, 
                 WV_PAD_IDX_EP, WV_SEP_IDX_BH, WV_SEP_IDX_EP))
# FastText applies one padding index to both embedding tables, so the two must agree.
assert WV_PAD_IDX_EP == WV_PAD_IDX_BH

In [None]:
# Re-seed, build the two-input model, load the word2vec vectors into both
# embedding tables, zero the special-token rows and train under the run name 'wv2'.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

wv_model2 = FastText(WV_INPUT_DIM1, WV_INPUT_DIM2, WV_EMBEDDING_DIM, OUTPUT_DIM, WV_PAD_IDX_EP)

# Copy the pre-trained vector for every BH-vocabulary token that word2vec knows.
for s in WV_BH_TEXT.vocab.stoi:    
    if s in vectors.stoi:
        with torch.no_grad():
            wv_model2.embedding1.weight[WV_BH_TEXT.vocab.stoi[s]].copy_(vectors.vectors[vectors.stoi[s]])

# Same for the EP vocabulary and the second embedding table.
for s in WV_EP_TEXT.vocab.stoi:    
    if s in vectors.stoi:
        with torch.no_grad():
            wv_model2.embedding2.weight[WV_EP_TEXT.vocab.stoi[s]].copy_(vectors.vectors[vectors.stoi[s]])

# Zero the <unk>, [sep] and <pad> rows, using each table's own indices.
wv_model2.embedding1.weight.data[WV_UNK_IDX_BH] = torch.zeros(WV_EMBEDDING_DIM)
wv_model2.embedding1.weight.data[WV_SEP_IDX_BH] = torch.zeros(WV_EMBEDDING_DIM)
wv_model2.embedding1.weight.data[WV_PAD_IDX_BH] = torch.zeros(WV_EMBEDDING_DIM)
wv_model2.embedding2.weight.data[WV_UNK_IDX_EP] = torch.zeros(WV_EMBEDDING_DIM)
wv_model2.embedding2.weight.data[WV_SEP_IDX_EP] = torch.zeros(WV_EMBEDDING_DIM)
wv_model2.embedding2.weight.data[WV_PAD_IDX_EP] = torch.zeros(WV_EMBEDDING_DIM)

# Note: wv_optimizer and wv_criterion are rebound here and no longer refer to the 'wv1' run's objects.
wv_optimizer = optim.Adam([param for param in wv_model2.parameters() if param.requires_grad == True])
wv_criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)
wv_model2 = wv_model2.to(device)
wv_criterion = wv_criterion.to(device)

# NOTE(review): the meaning of `period` and of the positional 1 is defined by train_epoch — confirm there.
wv_train_losses2, wv_valid_losses2, wv_train_accs2, wv_valid_accs2 = \
     train_epoch(N_EPOCHS, wv_model2, wv_train_iterator, wv_optimizer, wv_criterion, 1, 
                 'wv2', wv_valid_iterator, early_stop = True, period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the 'wv2' run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, wv_train_losses2, wv_valid_losses2, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, wv_train_accs2, wv_valid_accs2, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the word2vec two-input model on the word2vec test split.
test_f_scores, predicts = test(wv_model2, wv_test_iterator, wv_criterion, 1)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

print('#'*40)
# The second evaluation must also use the word2vec iterator and loss. `test_iterator` is
# numericalised with the earlier fields' vocabularies, whose indices do not match this model.
# NOTE(review): 'wv2_fscore' is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(wv_model2, wv_test_iterator, wv_criterion, 1, 'wv2_fscore')

for f in test_f_scores:
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

# Glove

In [None]:
# Separate Field objects for the GloVe experiments, so their vocabularies
# do not touch the ones built earlier.
GLOVE_BH_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True,
                     preprocessing = generate_ngrams, lower = True)
GLOVE_EP_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True,
                     preprocessing = generate_ngrams, lower = True)
GLOVE_ALL_TEXT = data.Field(tokenize = tokenize_and_cut, batch_first = True,
                     preprocessing = generate_ngrams, lower = True)
GLOVE_MAJ_LABEL = data.LabelField(dtype = torch.float)
GLOVE_SCH_LABEL = data.LabelField(dtype = torch.float)
GLOVE_BIP_LABEL = data.LabelField(dtype = torch.float)
GLOVE_MIN_LABEL = data.LabelField(dtype = torch.float)
GLOVE_DEM_LABEL = data.LabelField(dtype = torch.float)

full_glove_train_data, glove_test_data = NTUHDataset.splits(GLOVE_BH_TEXT, GLOVE_EP_TEXT, GLOVE_ALL_TEXT, 
                                           GLOVE_MAJ_LABEL, GLOVE_SCH_LABEL, GLOVE_BIP_LABEL, 
                                                            GLOVE_MIN_LABEL, GLOVE_DEM_LABEL)

# random.seed() returns None, so the old `random_state = random.seed(SEED)` passed None and
# only worked because split() then falls back to the (just seeded) global state.
# Seed first and pass that state explicitly; the resulting split is the same.
random.seed(SEED)
glove_train_data, glove_valid_data = full_glove_train_data.split(random_state = random.getstate(), 
                                                                 split_ratio = TRAIN_RATIO)

Here we use GloVe to initialize the word vectors and a Gaussian distribution to initialize the vectors of unknown words.

In [None]:
# Build the three text vocabularies (ALL, then BH, then EP) with 300-d GloVe vectors attached.
# Tokens without a GloVe vector are initialised by unk_init (normal distribution).
for _glove_text_field in (GLOVE_ALL_TEXT, GLOVE_BH_TEXT, GLOVE_EP_TEXT):
    _glove_text_field.build_vocab(
        glove_train_data,
        max_size = MAX_VOCAB_SIZE,
        vectors = "glove.6B.300d",
        unk_init = torch.Tensor.normal_,
    )

# Label vocabularies need no size cap or vectors.
for _glove_label_field in (GLOVE_MAJ_LABEL, GLOVE_SCH_LABEL, GLOVE_BIP_LABEL,
                           GLOVE_MIN_LABEL, GLOVE_DEM_LABEL):
    _glove_label_field.build_vocab(glove_train_data)

# Display the shape of the aligned vector matrix for the combined-text vocabulary.
GLOVE_ALL_TEXT.vocab.vectors.shape

In [None]:
# Re-seed before the GloVe experiments.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Re-states the value already set in the configuration cell.
N_EPOCHS = 500
# Size and special-token indices of the GloVe combined-text vocabulary.
GLOVE_INPUT_DIM = len(GLOVE_ALL_TEXT.vocab)
GLOVE_PAD_IDX = GLOVE_ALL_TEXT.vocab.stoi[GLOVE_ALL_TEXT.pad_token]
GLOVE_UNK_IDX = GLOVE_ALL_TEXT.vocab.stoi[GLOVE_ALL_TEXT.unk_token]
GLOVE_SEP_IDX = GLOVE_ALL_TEXT.vocab.stoi['[sep]']
# The embedding width comes from the attached vector matrix.
GLOVE_EMBEDDING_DIM = GLOVE_ALL_TEXT.vocab.vectors.shape[1]

print("Input dimension: %s\nUnknown word index: %s\nPadding index: %s\nSeparator index: %s" % 
      (GLOVE_INPUT_DIM, GLOVE_UNK_IDX, GLOVE_PAD_IDX, GLOVE_SEP_IDX))

In [None]:
# Iterators over the GloVe splits and a fresh single-input model sized to the GloVe vocabulary.
glove_train_iterator, glove_valid_iterator, glove_test_iterator = data.BucketIterator.splits(
    (glove_train_data, glove_valid_data, glove_test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

glove_model = FastTextBaseline(GLOVE_INPUT_DIM, GLOVE_EMBEDDING_DIM, OUTPUT_DIM, GLOVE_PAD_IDX)
# Embedding table as initialised, before the GloVe vectors are copied in.
print(glove_model.embedding.weight)

In [None]:
# Overwrite the whole embedding table with the vocabulary-aligned GloVe matrix.
glove_model.embedding.weight.data.copy_(GLOVE_ALL_TEXT.vocab.vectors)
print(glove_model.embedding.weight)

In [None]:
# Zero the <unk>, <pad> and [sep] rows of the GloVe-initialised table,
# then print the table and display the model.
for _special_idx in (GLOVE_UNK_IDX, GLOVE_PAD_IDX, GLOVE_SEP_IDX):
    glove_model.embedding.weight.data[_special_idx] = torch.zeros(GLOVE_EMBEDDING_DIM)
print(glove_model.embedding.weight)
glove_model

In [None]:
# Optimise only the parameters that require gradients; the loss is class-weighted via POS_WEIGHT.
glove_optimizer = optim.Adam([param for param in glove_model.parameters() if param.requires_grad == True])
glove_criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)
glove_model = glove_model.to(device)
glove_criterion = glove_criterion.to(device)

# NOTE(review): the meaning of `period` and of the positional 0 is defined by train_epoch — confirm there.
glove_train_losses, glove_valid_losses, glove_train_accs, glove_valid_accs = \
     train_epoch(N_EPOCHS, glove_model, glove_train_iterator, glove_optimizer, glove_criterion, 0, 
                 'glove1', glove_valid_iterator, early_stop = True, period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the 'glove1' run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, glove_train_losses, glove_valid_losses, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, glove_train_accs, glove_valid_accs, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the GloVe single-input model on the GloVe test split.
test_f_scores, predicts = test(glove_model, glove_test_iterator, glove_criterion, 0)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

print('#'*40)
# NOTE(review): 'glove1_fscore' is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(glove_model, glove_test_iterator, glove_criterion, 0, 'glove1_fscore')

for f in test_f_scores:
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

## Model Type 2

In [None]:
# Re-seed and rebuild the GloVe iterators for the two-input model.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

glove_train_iterator, glove_valid_iterator, glove_test_iterator = data.BucketIterator.splits(
    (glove_train_data, glove_valid_data, glove_test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

GLOVE_INPUT_DIM1 = len(GLOVE_BH_TEXT.vocab)
GLOVE_INPUT_DIM2 = len(GLOVE_EP_TEXT.vocab)

# Each GloVe text field has its own vocabulary, so special-token rows are looked up per field.
# The old code used PAD_IDX from the non-GloVe fields and the GLOVE_ALL_TEXT indices for both tables.
GLOVE_BH_UNK_IDX = GLOVE_BH_TEXT.vocab.stoi[GLOVE_BH_TEXT.unk_token]
GLOVE_BH_PAD_IDX = GLOVE_BH_TEXT.vocab.stoi[GLOVE_BH_TEXT.pad_token]
GLOVE_BH_SEP_IDX = GLOVE_BH_TEXT.vocab.stoi['[sep]']
GLOVE_EP_UNK_IDX = GLOVE_EP_TEXT.vocab.stoi[GLOVE_EP_TEXT.unk_token]
GLOVE_EP_PAD_IDX = GLOVE_EP_TEXT.vocab.stoi[GLOVE_EP_TEXT.pad_token]
GLOVE_EP_SEP_IDX = GLOVE_EP_TEXT.vocab.stoi['[sep]']
# FastText applies one padding index to both embedding tables, so the two must agree.
assert GLOVE_BH_PAD_IDX == GLOVE_EP_PAD_IDX

glove_model2 = FastText(GLOVE_INPUT_DIM1, GLOVE_INPUT_DIM2, GLOVE_EMBEDDING_DIM, OUTPUT_DIM, GLOVE_BH_PAD_IDX)

# Load the vocabulary-aligned GloVe matrix into each table, then zero its special-token rows.
glove_model2.embedding1.weight.data.copy_(GLOVE_BH_TEXT.vocab.vectors)
for _idx in (GLOVE_BH_UNK_IDX, GLOVE_BH_PAD_IDX, GLOVE_BH_SEP_IDX):
    glove_model2.embedding1.weight.data[_idx] = torch.zeros(GLOVE_EMBEDDING_DIM)

glove_model2.embedding2.weight.data.copy_(GLOVE_EP_TEXT.vocab.vectors)
for _idx in (GLOVE_EP_UNK_IDX, GLOVE_EP_PAD_IDX, GLOVE_EP_SEP_IDX):
    glove_model2.embedding2.weight.data[_idx] = torch.zeros(GLOVE_EMBEDDING_DIM)

glove_model2

In [None]:
# Optimise only the parameters that require gradients; the loss is class-weighted via POS_WEIGHT.
glove_optimizer = optim.Adam([param for param in glove_model2.parameters() if param.requires_grad == True])
glove_criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT)
# Was assigned to the misspelled name `glove_mode2`, which left a stray variable behind.
glove_model2 = glove_model2.to(device)
glove_criterion = glove_criterion.to(device)

# NOTE(review): the meaning of `period` and of the positional 1 is defined by train_epoch — confirm there.
glove_train_losses2, glove_valid_losses2, glove_train_accs2, glove_valid_accs2 = \
     train_epoch(N_EPOCHS, glove_model2, glove_train_iterator, glove_optimizer, glove_criterion, 1, 
                 'glove2', glove_valid_iterator, early_stop = True, period = 30)

In [None]:
# Loss (top) and score (bottom) histories of the 'glove2' run.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, glove_train_losses2, glove_valid_losses2, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, glove_train_accs2, glove_valid_accs2, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate the GloVe two-input model on the GloVe test split.
test_f_scores, predicts = test(glove_model2, glove_test_iterator, glove_criterion, 1)

# Keys are either the averaging names (MICRO / MACRO) or keys into NTUHDataset.diagnosis_types.
for f in test_f_scores:
    # Compare by value: `is` on strings only works when both happen to be the same interned object.
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

print('#'*40)
# NOTE(review): 'glove2_fscore' is presumably a saved-checkpoint name — confirm against test().
test_f_scores, predicts = test(glove_model2, glove_test_iterator, glove_criterion, 1, 'glove2_fscore')

for f in test_f_scores:
    header = f'{f}-average' if f in (MICRO, MACRO) else f'{NTUHDataset.diagnosis_types[f]}'
    scores = test_f_scores[f]
    print(f'{header}:\n\tprecision: {scores["p"]:0.3f}\n\trecall: {scores["r"]:0.3f}\n\tf-score: {scores["f"]:0.3f}\n')

# BERT

In [None]:
# Re-seed before the BERT experiments.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Overrides the notebook-wide BATCH_SIZE (64 in the configuration cell) for every cell
# executed after this one; re-running earlier cells afterwards will pick up 32 as well.
BATCH_SIZE = 32
N_EPOCHS = 500

In [None]:
# Load the pre-trained uncased BERT tokenizer and encoder; the encoder is asked to
# return all hidden states.
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case = True)
bert = BertModel.from_pretrained(BERT_MODEL, output_hidden_states = True)

# Special tokens as defined by the tokenizer. The separator token doubles as the
# end-of-sequence marker in this notebook.
BERT_EOS_TOKEN = tokenizer.sep_token
BERT_PAD_TOKEN = tokenizer.pad_token
BERT_UNK_TOKEN = tokenizer.unk_token

# Their ids in the BERT vocabulary.
BERT_EOS_IDX = tokenizer.convert_tokens_to_ids(BERT_EOS_TOKEN)
BERT_PAD_IDX = tokenizer.convert_tokens_to_ids(BERT_PAD_TOKEN)
BERT_UNK_IDX = tokenizer.convert_tokens_to_ids(BERT_UNK_TOKEN)

print(f'{BERT_EOS_TOKEN}:{BERT_EOS_IDX}, {BERT_PAD_TOKEN}:{BERT_PAD_IDX}, {BERT_UNK_TOKEN}:{BERT_UNK_IDX}') 

# Maximum input length the tokenizer records for this checkpoint.
BERT_MAX_SEQUENCE = tokenizer.max_model_input_sizes[BERT_MODEL]
print(BERT_MAX_SEQUENCE)

In [None]:
def bert_tokenize_and_cut(sentence):
    """Tokenize a '<sep>'-delimited document into one flat token list.

    The text is split on runs of '<sep>' markers, pieces containing '<unk>'
    are dropped, every remaining piece is tokenized, and the resulting token
    lists are chained with a literal '[SEP]' token between consecutive pieces.

    NOTE(review): each piece is cut to BERT_MAX_SEQUENCE-2 *characters* before
    tokenization (string slice), not to that many tokens.

    Returns:
        list of token strings (empty when nothing survives the filtering).
    """
    char_limit = BERT_MAX_SEQUENCE - 2
    pieces = re.split(r'\s*<sep>(?:\s*<sep>)*\s*', sentence)
    tokenized_pieces = [
        ' '.join(tokenizer.tokenize(piece[:char_limit]))
        for piece in pieces
        if '<unk>' not in piece
    ]
    # Joining on ' [SEP] ' and re-splitting on whitespace flattens everything
    # into a single token sequence.
    return ' [SEP] '.join(tokenized_pieces).split()

In [None]:
def my_convert_tokens_to_ids(sents_tokens):
    """Turn a flat token list into vocabulary ids, one [CLS]...[SEP] span per sentence.

    Tokens are re-joined and split on (case-insensitive) runs of '[sep]'.
    Every non-empty sentence is wrapped in '[CLS]' / '[SEP]', converted with
    the tokenizer, and appended until MAX_SEQUENCE-1 ids have been collected;
    one BERT_EOS_IDX is always appended at the end.

    Returns:
        list of ids, at most MAX_SEQUENCE long.
    """
    joined = " ".join(sents_tokens)
    ids = []
    for sentence in re.split(r'(?i)\s*\[sep\](?:\s*\[sep\])*\s*', joined):
        wrapped = ('[CLS] ' + sentence + ' [SEP]').split()
        if len(wrapped) <= 2:
            # Only the two markers: the sentence itself was empty.
            continue
        converted = tokenizer.convert_tokens_to_ids(wrapped)
        remaining = MAX_SEQUENCE - 1 - len(ids)
        ids.extend(converted[:remaining])
    ids.append(BERT_EOS_IDX)
    return ids

In [None]:
# Three identically configured text fields (full text, BH text, EP text). The
# fields carry ids directly (use_vocab=False): tokenization is done by
# bert_tokenize_and_cut and id conversion by my_convert_tokens_to_ids.
BERT_ALL_TEXT, BERT_BH_TEXT, BERT_EP_TEXT = (
    data.Field(batch_first = True,
               use_vocab = False,
               tokenize = bert_tokenize_and_cut,
               preprocessing = my_convert_tokens_to_ids,
               pad_token = BERT_PAD_IDX,
               unk_token = BERT_UNK_IDX,
               lower = True)
    for _ in range(3)
)

# One float-valued label field per diagnosis.
(BERT_MAJ_LABEL,
 BERT_SCH_LABEL,
 BERT_BIP_LABEL,
 BERT_MIN_LABEL,
 BERT_DEM_LABEL) = (data.LabelField(dtype = torch.float) for _ in range(5))

In [None]:
# Build the full training set and the test set with the BERT fields.
full_bert_train_data, bert_test_data = NTUHDataset.splits(BERT_BH_TEXT, BERT_EP_TEXT, BERT_ALL_TEXT,
                                                          BERT_MAJ_LABEL, BERT_SCH_LABEL, BERT_BIP_LABEL,
                                                          BERT_MIN_LABEL, BERT_DEM_LABEL)

# random.seed() returns None, so passing its result as random_state only had an
# effect through the seeding side effect. Seed explicitly and hand the resulting
# RNG state to split().
# NOTE(review): assumes split() is torchtext's Dataset.split, which takes a random.getstate() tuple.
random.seed(SEED)
bert_train_data, bert_valid_data = full_bert_train_data.split(random_state = random.getstate(),
                                                              split_ratio = TRAIN_RATIO)

In [None]:
# Sanity check: show the ids of the first training example, the tokens they
# map back to, and how many there are.
tokens = tokenizer.convert_ids_to_tokens(bert_train_data.examples[0].all_text)
print(bert_train_data.examples[0].all_text, tokens, len(tokens), sep='\n')

# Create Cache for BERT

In [None]:
# Cache of pre-computed embeddings, keyed by the space-joined token ids of a document.
bert_cache={}

In [None]:
def create_attention_masks(ids):
    """Build a float mask tensor from rows of token ids.

    An entry is 1.0 where the id is greater than 0 and 0.0 otherwise.
    """
    return torch.tensor([[float(token_id > 0) for token_id in row] for row in ids])
    
def generate_bert_embedding(sents, sep_token, pad_idx, bert, embedding_dim):
    """Compute per-token embeddings for one document on the CPU.

    The id sequence is cut after every occurrence of ``sep_token``; each piece
    is padded with ``pad_idx`` up to BERT_MAX_SEQUENCE and all pieces are fed
    to ``bert`` as one batch. For every real (non-padding) token the hidden
    states of all returned layers except the last entry are summed, the
    pieces are concatenated again, and the result is zero-padded to
    MAX_SEQUENCE rows.

    Args:
        sents: tensor of token ids for one document (expected to be 1-D).
        sep_token: id that terminates a sentence.
        pad_idx: id used to pad each sentence.
        bert: encoder returning ``(_, _, hidden_states)``; moved to the CPU
            and put in eval mode here.
        embedding_dim: width of the zero rows used for padding.

    Returns:
        tensor with MAX_SEQUENCE rows of token embeddings.
    """
    bert = bert.cpu()
    bert.eval()
    with torch.no_grad():
        boundaries = (sents == sep_token).nonzero().squeeze(1).data.tolist()
        seq_lengths = []
        padded_rows = []
        start = 0
        for end in boundaries:
            # Sentence spans start..end inclusive (the separator stays in it).
            piece = sents[start:end + 1].data.tolist()
            seq_lengths.append(end + 1 - start)
            padded_rows.append(piece + [pad_idx] * (BERT_MAX_SEQUENCE - len(piece)))
            start = end + 1
        attention_masks = create_attention_masks(padded_rows)
        sents_ids = torch.tensor(padded_rows)

        _, _, hidden_states = bert(sents_ids, attention_masks)

        # -> [sentence, token, layer, feature]
        token_embeddings = torch.stack(hidden_states[:-1], dim=0).permute(1, 2, 0, 3)
        per_sentence = [
            torch.stack(
                [torch.sum(layers[pos][:], dim=0) for pos in range(seq_lengths[row])], 0)
            for row, layers in enumerate(token_embeddings)
        ]
        sent_embeddings = torch.cat(per_sentence, 0)
        missing_rows = MAX_SEQUENCE - sent_embeddings.shape[0]
        if missing_rows != 0:
            sent_embeddings = torch.cat(
                (sent_embeddings, torch.zeros(missing_rows, embedding_dim)), 0)
    return sent_embeddings

# Pre-compute the embedding of every document in the full training set and store
# it in bert_cache under its space-joined id string. The cache is written to disk
# every 10 examples and once more at the end.
# The loop variable must not be called `data`: that would overwrite the torchtext
# `data` module, which later cells still need (data.BucketIterator).
i = 0
for example in tqdm(full_bert_train_data):
    t = torch.tensor(example.all_text)
    key = ' '.join(str(x) for x in t.data.tolist())
    if key not in bert_cache:
        bert_cache[key] = generate_bert_embedding(t, BERT_EOS_IDX, BERT_PAD_IDX, bert,
                                                  bert.config.to_dict()['hidden_size'])
    if i % 10 == 0:
        torch.save(bert_cache, 'full_text_all_cache.pt')
    i += 1
torch.save(bert_cache, 'full_text_all_cache.pt')

In [None]:
# Add the embeddings of the test documents to bert_cache and save the whole cache
# to 'test_cache.pt' (every 10 examples and once at the end).
# The loop variable must not be called `data` (it would overwrite the torchtext
# `data` module), and the counter is local to this cell instead of relying on an
# `i` left behind by an earlier cell.
for i, example in enumerate(tqdm(bert_test_data)):
    t = torch.tensor(example.all_text)
    key = ' '.join(str(x) for x in t.data.tolist())
    if key not in bert_cache:
        bert_cache[key] = generate_bert_embedding(t, BERT_EOS_IDX, BERT_PAD_IDX, bert,
                                                  bert.config.to_dict()['hidden_size'])
    if i % 10 == 0:
        torch.save(bert_cache, 'test_cache.pt')
torch.save(bert_cache, 'test_cache.pt')

# Load BERT Cache

In [None]:
# Replace the in-memory cache with the one stored in 'full_text_all_cache.pt'.
# NOTE(review): torch.load unpickles the file -- only load cache files you produced yourself.
bert_cache=torch.load('full_text_all_cache.pt')
# Number of cached documents.
len(bert_cache)

In [None]:
# Build the label vocabularies from the training split only.
BERT_MAJ_LABEL.build_vocab(bert_train_data)
BERT_SCH_LABEL.build_vocab(bert_train_data)
BERT_BIP_LABEL.build_vocab(bert_train_data)
BERT_MIN_LABEL.build_vocab(bert_train_data)
BERT_DEM_LABEL.build_vocab(bert_train_data)

# One iterator per split, all using BATCH_SIZE and placing batches on `device`.
# `data` must still be the torchtext module at this point.
bert_train_iterator, bert_valid_iterator, bert_test_iterator = data.BucketIterator.splits(
    (bert_train_data, bert_valid_data, bert_test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [None]:
# Sanity check: decode the first row of the first training batch back to tokens
# and show its (padded) length.
tokens = tokenizer.convert_ids_to_tokens(next(iter(bert_train_iterator)).all_text[0].data.tolist())
print(tokens)
print(len(tokens))

## BERT Model

In [None]:
class FastTextBERT(nn.Module):
    """Linear classifier on top of averaged, frozen BERT token embeddings.

    Token embeddings are looked up in the module-level ``bert_cache`` (keyed
    by the space-joined, unpadded id string of a document) and computed with
    ``bert`` on a miss. The embeddings are averaged over the whole padded
    sequence and passed through one linear layer.

    Args:
        bert: encoder returning ``(_, _, hidden_states)`` and exposing
            ``config.to_dict()['hidden_size']``.
        output_dim: number of output logits.
        pad_idx: id used to pad sentences before they are fed to ``bert``.
        sep_token: id that terminates a sentence.
    """

    def __init__(self, bert, output_dim, pad_idx, sep_token):
        super().__init__()

        self.pad_idx = pad_idx
        self.bert = bert
        self.sep_token = sep_token
        self.bert.eval()
        self.embedding_dim = bert.config.to_dict()['hidden_size']
        self.fc = nn.Linear(self.embedding_dim, output_dim)

    def create_attention_masks(self, ids):
        """Return a float mask on ``device``: 1.0 where the id is > 0, else 0.0."""
        return torch.tensor([[float(i > 0) for i in row] for row in ids]).to(device)

    def _encode_uncached(self, sents):
        """Run BERT on one document and return its MAX_SEQUENCE x dim embedding."""
        sep_idxes = (sents == self.sep_token).nonzero().squeeze(1).data.tolist()
        if not sep_idxes:
            raise ValueError('sequence contains no separator token; cannot split it into sentences')
        seq_lengths = []
        sents_ids = []
        start = 0
        for end in sep_idxes:
            # One sentence = ids from `start` up to and including the separator.
            piece = sents[start:end + 1].data.tolist()
            seq_lengths.append(len(piece))
            sents_ids.append(piece + [self.pad_idx] * (BERT_MAX_SEQUENCE - len(piece)))
            start = end + 1
        attention_masks = self.create_attention_masks(sents_ids)
        sents_ids = torch.tensor(sents_ids).to(device)
        # model.train() switches every sub-module to training mode, which would
        # re-enable dropout inside BERT and make the cached embeddings random.
        self.bert.eval()
        with torch.no_grad():
            _, _, hidden_states = self.bert(sents_ids, attention_masks)
            # Sum all returned layers except the last entry -> [sentence, token, feature].
            summed = torch.stack(hidden_states[:-1], dim=0).sum(dim=0)
            # Keep only the real tokens of each sentence and glue the sentences together.
            sent_embeddings = torch.cat(
                [summed[row, :length] for row, length in enumerate(seq_lengths)], 0)
            missing = MAX_SEQUENCE - sent_embeddings.shape[0]
            if missing != 0:
                sent_embeddings = torch.cat(
                    (sent_embeddings,
                     torch.zeros(missing, self.embedding_dim).to(device)), 0)
        return sent_embeddings

    def embedding(self, batch):
        """Return the stacked embeddings ([batch, MAX_SEQUENCE, dim]) of a batch of id rows."""
        batch_embeddings = []
        for sents in batch:
            key = ' '.join(str(x) for x in sents.data.tolist())
            # Strip the padding zeros so the key matches the unpadded cache keys.
            key = re.sub(r'(\s+0)+\s*', '', key)
            if key in bert_cache:
                sent_embeddings = bert_cache[key]
            else:
                sent_embeddings = self._encode_uncached(sents)
                bert_cache[key] = sent_embeddings
            batch_embeddings.append(sent_embeddings.to(device))
        return torch.stack(batch_embeddings, 0)

    def forward(self, text):
        """Average the token embeddings over the sequence axis and classify."""
        embedded = self.embedding(text)

        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)

        return self.fc(pooled)

In [None]:
# Single-input BERT classifier; the separator id doubles as the sentence delimiter.
bert_model = FastTextBERT(bert, OUTPUT_DIM, BERT_PAD_IDX, BERT_EOS_IDX)
# Display the module structure.
bert_model

In [None]:
# Optimise every trainable parameter with Adam against a class-weighted
# multi-label BCE loss, then train with early stopping.
bert_optimizer = optim.Adam(filter(lambda param: param.requires_grad, bert_model.parameters()))
bert_model = bert_model.to(device)
bert_criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT).to(device)

(bert_train_losses,
 bert_valid_losses,
 bert_train_accs,
 bert_valid_accs) = train_epoch(N_EPOCHS, bert_model, bert_train_iterator, bert_optimizer,
                                bert_criterion, 0, 'bert1', bert_valid_iterator,
                                early_stop= True, period = 30)

In [None]:
# Two stacked panels: loss curves on top, F-measure curves below.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, bert_train_losses, bert_valid_losses, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
analysis_plotter(fig, ax2, bert_train_accs, bert_valid_accs, 'Training/Validation F-Measure', {'label': 'Training F-Measure'}, {'label': 'Validation F-Measure'})

In [None]:
# Evaluate bert_model on the test iterator and print precision / recall / F-score.
# Keys of test_f_scores are either the averaging names (MICRO / MACRO) or keys into
# NTUHDataset.diagnosis_types.
test_f_scores, predicts = test(bert_model, bert_test_iterator, bert_criterion, 0)

for f in test_f_scores:
    # Compare by value: `is` on strings only works while both happen to be interned.
    if f in (MICRO, MACRO):
        print(f'{f}-average:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')
    else:
        print(f'{NTUHDataset.diagnosis_types[f]}:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')

In [None]:
# Same report as above, with the extra 'bert1_fscore' argument passed to test().
# NOTE(review): presumably selects the checkpoint saved on best F-score -- confirm in test().
test_f_scores, predicts = test(bert_model, bert_test_iterator, bert_criterion, 0, 'bert1_fscore')

for f in test_f_scores:
    # Compare by value: `is` on strings only works while both happen to be interned.
    if f in (MICRO, MACRO):
        print(f'{f}-average:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')
    else:
        print(f'{NTUHDataset.diagnosis_types[f]}:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')

In [None]:
# Load the two pre-computed embedding caches used by the two-input model and
# print how many entries each holds.
# NOTE(review): torch.load unpickles the files -- only load cache files you produced yourself.
bert_bh_cache=torch.load('bh_text_cache_all.pt')
print(len(bert_bh_cache))
bert_ep_cache=torch.load('ep_text_cache_all.pt')
print(len(bert_ep_cache))

In [None]:
class FastText2BERT(nn.Module):
    """Linear classifier on the concatenated averages of two BERT-embedded texts.

    Each input is embedded through its own cache (``bert_bh_cache`` and
    ``bert_ep_cache`` in ``forward``); a document that is missing from the
    cache is encoded with ``bert`` and stored in that same cache.

    Args:
        bert: encoder returning ``(_, _, hidden_states)`` and exposing
            ``config.to_dict()['hidden_size']``.
        output_dim: number of output logits.
        pad_idx: id used to pad sentences before they are fed to ``bert``.
        sep_token: id that terminates a sentence.
    """

    def __init__(self, bert, output_dim, pad_idx, sep_token):
        super().__init__()

        self.pad_idx = pad_idx
        self.bert = bert
        self.sep_token = sep_token
        self.bert.eval()
        self.embedding_dim = bert.config.to_dict()['hidden_size']
        # Two pooled vectors are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(self.embedding_dim*2, output_dim)

    def create_attention_masks(self, ids):
        """Return a float mask on ``device``: 1.0 where the id is > 0, else 0.0."""
        return torch.tensor([[float(i > 0) for i in row] for row in ids]).to(device)

    def _encode_uncached(self, sents):
        """Run BERT on one document and return its MAX_SEQUENCE x dim embedding."""
        sep_idxes = (sents == self.sep_token).nonzero().squeeze(1).data.tolist()
        if not sep_idxes:
            raise ValueError('sequence contains no separator token; cannot split it into sentences')
        seq_lengths = []
        sents_ids = []
        start = 0
        for end in sep_idxes:
            # One sentence = ids from `start` up to and including the separator.
            piece = sents[start:end + 1].data.tolist()
            seq_lengths.append(len(piece))
            sents_ids.append(piece + [self.pad_idx] * (BERT_MAX_SEQUENCE - len(piece)))
            start = end + 1
        attention_masks = self.create_attention_masks(sents_ids)
        sents_ids = torch.tensor(sents_ids).to(device)
        # model.train() switches every sub-module to training mode, which would
        # re-enable dropout inside BERT and make the cached embeddings random.
        self.bert.eval()
        with torch.no_grad():
            _, _, hidden_states = self.bert(sents_ids, attention_masks)
            # Sum all returned layers except the last entry -> [sentence, token, feature].
            summed = torch.stack(hidden_states[:-1], dim=0).sum(dim=0)
            # Keep only the real tokens of each sentence and glue the sentences together.
            sent_embeddings = torch.cat(
                [summed[row, :length] for row, length in enumerate(seq_lengths)], 0)
            missing = MAX_SEQUENCE - sent_embeddings.shape[0]
            if missing != 0:
                sent_embeddings = torch.cat(
                    (sent_embeddings,
                     torch.zeros(missing, self.embedding_dim).to(device)), 0)
        return sent_embeddings

    def embedding(self, batch, cache):
        """Return the stacked embeddings ([batch, MAX_SEQUENCE, dim]) of a batch of id rows.

        ``cache`` maps space-joined unpadded id strings to embeddings; entries
        computed here on a miss are added to it.
        """
        batch_embeddings = []
        for sents in batch:
            key = ' '.join(str(x) for x in sents.data.tolist())
            # Strip the padding zeros so the key matches the unpadded cache keys.
            key = re.sub(r'(\s+0)+\s*', '', key)
            if key in cache:
                sent_embeddings = cache[key]
            else:
                # Previously a miss printed 'Not_found' and returned None, which made
                # forward() fail on `None.shape`; encode the document instead.
                sent_embeddings = self._encode_uncached(sents)
                cache[key] = sent_embeddings
            batch_embeddings.append(sent_embeddings.to(device))
        return torch.stack(batch_embeddings, 0)

    def forward(self, bh_text, ep_text):
        """Average both embedded texts over the sequence axis, concatenate, classify."""
        embedded1 = self.embedding(bh_text, bert_bh_cache)
        embedded2 = self.embedding(ep_text, bert_ep_cache)

        pooled1 = F.avg_pool2d(embedded1, (embedded1.shape[1], 1)).squeeze(1)
        pooled2 = F.avg_pool2d(embedded2, (embedded2.shape[1], 1)).squeeze(1)

        return self.fc(torch.cat((pooled1, pooled2), 1))

In [None]:
# Re-seed all RNGs so the two-input run starts from the same state as the first one.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Two-input classifier, Adam over its trainable parameters, class-weighted BCE loss.
bert_model2 = FastText2BERT(bert, OUTPUT_DIM, BERT_PAD_IDX, BERT_EOS_IDX)

bert_optimizer = optim.Adam(filter(lambda param: param.requires_grad, bert_model2.parameters()))
bert_model2 = bert_model2.to(device)
bert_criterion = nn.BCEWithLogitsLoss(pos_weight = POS_WEIGHT).to(device)

(bert_train_losses2,
 bert_valid_losses2,
 bert_train_accs2,
 bert_valid_accs2) = train_epoch(N_EPOCHS, bert_model2, bert_train_iterator, bert_optimizer,
                                 bert_criterion, 1, 'bert2', bert_valid_iterator,
                                 early_stop = True, period = 30)

In [None]:
# Two stacked panels: loss curves on top, F-score curves below.
fig, (ax1, ax2) = plt.subplots(2, figsize=(15,10))
analysis_plotter(fig, ax1, bert_train_losses2, bert_valid_losses2, 'Training/Validation Loss', {'label': 'Training Loss'}, {'label': 'Validation Loss'})
# The second curve is the validation F-score, so it is labelled as such (was 'Validation Accuracy').
analysis_plotter(fig, ax2, bert_train_accs2, bert_valid_accs2, 'Training/Validation F-score', {'label': 'Training F-score'}, {'label': 'Validation F-score'})

# Evaluate bert_model2 on the test iterator and print precision / recall / F-score.
test_f_scores, predicts = test(bert_model2, bert_test_iterator, bert_criterion, 1)

for f in test_f_scores:
    # Compare by value: `is` on strings only works while both happen to be interned.
    if f in (MICRO, MACRO):
        print(f'{f}-average:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')
    else:
        print(f'{NTUHDataset.diagnosis_types[f]}:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')

In [None]:
# Same report for bert_model2, with the extra 'bert2_fscore' argument passed to test().
# NOTE(review): presumably selects the checkpoint saved on best F-score -- confirm in test().
test_f_scores, predicts = test(bert_model2, bert_test_iterator, bert_criterion, 1, 'bert2_fscore')

for f in test_f_scores:
    # Compare by value: `is` on strings only works while both happen to be interned.
    if f in (MICRO, MACRO):
        print(f'{f}-average:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')
    else:
        print(f'{NTUHDataset.diagnosis_types[f]}:\n\tprecision: {test_f_scores[f]["p"]:0.3f}\n\trecall: {test_f_scores[f]["r"]:0.3f}\n\tf-score: {test_f_scores[f]["f"]:0.3f}\n')