In [1]:
import enum
import glob
import os
from hashlib import new
from pathlib import Path
import time

import functools
from itertools import product

import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from snorkel.labeling.model import LabelModel as LMsnorkel
from snorkel.labeling.model import MajorityLabelVoter

In [2]:
# NOTE(review): UndefinedMetricWarning is imported here but not referenced in the visible cells.
from sklearn.exceptions import UndefinedMetricWarning

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
import warnings
# Monkeypatch: replaces the module attribute ``warnings.warn`` with the no-op above,
# so code that looks up ``warnings.warn`` after this cell runs emits nothing.
warnings.warn = warn

In [3]:
def list2Nested(l, nested_length):
    """Split ``l`` into consecutive slices of at most ``nested_length`` items.

    The slices are returned in order inside a list; the last slice is shorter
    when ``len(l)`` is not a multiple of ``nested_length``.
    """
    chunks = []
    for start in range(0, len(l), nested_length):
        stop = start + nested_length
        chunks.append(l[start:stop])
    return chunks

In [4]:
# Fetch UMLS ranks
#
# Tab-separated labeling-function summary files, one per label column (p / i / o),
# consumed by fetchRank() below.
# NOTE(review): absolute paths on a NAS mount; the notebook only runs where it is mounted.

sum_lf_p = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_p_summary_tuipio2_train.csv'
sum_lf_i = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_i_summary_tuipio2_train.csv'
sum_lf_o = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_o_summary_tuipio2_train.csv'


def fetchRank(sum_lf_d):
    """Rank the UMLS labeling functions listed in a summary file.

    Parameters
    ----------
    sum_lf_d
        Path (or file-like object) of a tab-separated summary table. Column 0
        holds the labeling-function name and column 3 the value to rank by
        (presumably coverage, going by the variable names -- TODO confirm
        against the file that writes these summaries).

    Returns
    -------
    dict
        Maps the part of each ``UMLS_new_fuzzy_*`` name that follows the last
        ``'_lf_'`` to its column-3 value, inserted in descending value order
        (ties keep file order).
    """
    data = pd.read_csv(sum_lf_d, sep='\t')

    umls_coverage_ = dict()

    # Plain tuples give purely positional access; indexing a label-indexed
    # Series with an integer (as iterrows() rows required) is deprecated in pandas.
    for row in data.itertuples(index=False, name=None):
        lf_name = row[0]
        # Missing names are read as NaN (a float); skip them instead of crashing.
        if isinstance(lf_name, str) and lf_name.startswith('UMLS_new_fuzzy_'):
            umls_coverage_[lf_name] = row[3]

    umls_coverage_sorted = sorted(umls_coverage_.items(), key=lambda x: x[1], reverse=True)

    ranked_umls_coverage = dict()
    for lf_name, value in umls_coverage_sorted:
        # Keep only the vocabulary identifier that follows the '_lf_' marker.
        ranked_umls_coverage[lf_name.split('_lf_')[-1]] = value

    return ranked_umls_coverage

# One ranked {UMLS source: value} dict per label column; key order is the ranking.
ranksorted_p_umls = fetchRank(sum_lf_p)
ranksorted_i_umls = fetchRank(sum_lf_i)
ranksorted_o_umls = fetchRank(sum_lf_o)

In [5]:
# Partition LF's

def partitionLFs(umls_d):
    """Build every "top-i singletons + one merged tail" partition of the keys.

    With keys ``k0, k1, ..., k(n-1)`` in the iteration order of ``umls_d``,
    partition ``i`` (``0 <= i < n``) is
    ``[[k0], ..., [k(i-1)], [ki, ..., k(n-1)]]``.
    Partition 0 is therefore one group holding every key, and partition
    ``n-1`` is all singletons.

    Parameters
    ----------
    umls_d : dict
        Only the keys and their order are used.

    Returns
    -------
    list
        ``n`` partitions, each a list of key groups (lists). Empty input
        yields an empty list.
    """
    keys = list(umls_d.keys())

    partitioned_lfs = []

    for i in range(len(keys)):
        # The first i keys stand alone; the remaining keys form one group.
        groups = [[key] for key in keys[:i]]
        groups.append(keys[i:])
        partitioned_lfs.append(groups)

    return partitioned_lfs


# Candidate UMLS groupings per label column, derived from the ranked source dicts.
partitioned_p_umls = partitionLFs(ranksorted_p_umls)
partitioned_i_umls = partitionLFs(ranksorted_i_umls)
partitioned_o_umls = partitionLFs(ranksorted_o_umls)

In [6]:
# NOTE(review): LMutils is not referenced anywhere in the visible cells.
import LMutils

# Gold-label table for the training/validation corpus (tab-separated, header row).
# validation_labels   
# validation_labels_tui_pio2   
file = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/validation_labels_tui_pio2.tsv'
df_data = pd.read_csv(file, sep='\t', header=0)

# Token column; its length is used later to truncate LF outputs and to size the
# seed vector passed to getLFs().
Y_tokens = df_data['tokens']
# Ordered 80/20 split (shuffle=False); train() splits the label matrix with the
# same arguments.
df_data_train, df_data_val = train_test_split(df_data, test_size=0.20, shuffle=False)

In [7]:
# Read candidate labels from multiple LFs (training/validation corpus)
indir = '/mnt/nas2/results/Results/systematicReview/distant_pico/candidate_generation'
# Sorted so that the insertion order of `lfs` (and hence the column order of any
# label matrix built from its .values()) does not depend on filesystem
# enumeration order.
pathlist = sorted(Path(indir).glob('**/*.tsv'))

# Row count a complete LF output file must have to be accepted.
EXPECTED_NUM_TOKENS = 1354953

tokens = []

lfs = dict()

for file in pathlist:

    # LF name: path below candidate_generation/, without extension, '/' -> '_'.
    k = str( file ).split('candidate_generation/')[-1].replace('.tsv', '').replace('/', '_')
    mypath = Path(file)

    # Skip empty files. Without this, `data` still held the previous file's
    # frame (or was undefined on the first iteration) and that stale frame was
    # registered under this file's key.
    if mypath.stat().st_size == 0:
        continue

    data = pd.read_csv(file, sep='\t', header=0)
    if len(tokens) == 0:
        tokens.extend( list(data.tokens) )

    # The last column of each file carries that LF's labels.
    sab = data.columns[-1]
    if len(data[sab]) == EXPECTED_NUM_TOKENS:
        lfs[str(k)] = list( data[sab] )[:len(Y_tokens)]


print( 'Total number of tokens in validation set: ', len(tokens) )
print( 'Total number of LFs in the dictionary', len(lfs) )

Total number of tokens in validation set:  1354953
Total number of LFs in the dictionary 628


In [8]:
# Gold-label table for the EBM test corpus (tab-separated, header row).
file_test_ebm = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/test_ebm_labels_tui_pio2.tsv'
df_data_test_ebm = pd.read_csv(file_test_ebm, sep='\t', header=0)
# Lower-case y_tokens is the TEST token column; upper-case Y_tokens is the
# training/validation one.
y_tokens = df_data_test_ebm['tokens']

In [9]:
# Read candidate labels from multiple LFs (EBM test corpus)
indir_test_ebm = '/mnt/nas2/results/Results/systematicReview/distant_pico/test_ebm_candidate_generation'
# Sorted so that the insertion order of `lfs_test_ebm` (and hence the column
# order of any label matrix built from its .values()) does not depend on
# filesystem enumeration order.
pathlist_test_ebm = sorted(Path(indir_test_ebm).glob('**/*.tsv'))

# Row count a complete test LF output file must have to be accepted.
EXPECTED_NUM_TOKENS_TEST_EBM = 51784

tokens_test_ebm = []

lfs_test_ebm = dict()

for file in pathlist_test_ebm:
    # LF name: path below test_ebm_candidate_generation/, without extension, '/' -> '_'.
    k = str( file ).split('test_ebm_candidate_generation/')[-1].replace('.tsv', '').replace('/', '_')
    mypath = Path(file)

    # Skip empty files. Without this, `data_test_ebm` still held the previous
    # file's frame (or was undefined on the first iteration) and that stale
    # frame was registered under this file's key.
    if mypath.stat().st_size == 0:
        continue

    data_test_ebm = pd.read_csv(file, sep='\t', header=0)
    if len(tokens_test_ebm) == 0:
        tokens_test_ebm.extend( list(data_test_ebm.tokens) )

    # The last column of each file carries that LF's labels.
    sab_test_ebm = data_test_ebm.columns[-1]
    if len(data_test_ebm[sab_test_ebm]) == EXPECTED_NUM_TOKENS_TEST_EBM:
        lfs_test_ebm[str(k)] = list( data_test_ebm[sab_test_ebm] )[:len(y_tokens)]


# Message fixed: these are the test-corpus tokens, not the validation set.
print( 'Total number of tokens in test set: ', len(tokens_test_ebm) )
print( 'Total number of LFs in the dictionary', len(lfs_test_ebm) )

Total number of tokens in validation set:  51784
Total number of LFs in the dictionary 176


In [10]:
def lf_levels(umls_d:dict, pattern:str, picos:str):
    """Select the labeling functions of one level and one label column.

    Entries of ``umls_d`` whose key starts with ``pattern + picos`` are kept.
    Each kept entry is re-keyed by the text after the last ``'_lf_'`` in its
    original key; if two keys share that suffix, the later one wins.
    """
    prefix = pattern + picos
    return {
        str(name).split('_lf_')[-1]: labels
        for name, labels in umls_d.items()
        if name.startswith(prefix)
    }

# Split the loaded LFs by source level and label column, once for the
# training/validation corpus (`lfs`) and once for the EBM test corpus
# (`lfs_test_ebm`). Note the prefix case: UMLS keys use lower-case p/i/o,
# every other level uses upper-case P/I/O.

# Level 1: UMLS
umls_p = lf_levels(lfs, 'UMLS_new_fuzzy_', 'p')
umls_i = lf_levels(lfs, 'UMLS_new_fuzzy_', 'i')
umls_o = lf_levels(lfs, 'UMLS_new_fuzzy_', 'o')
umls_p_test_ebm = lf_levels(lfs_test_ebm, 'UMLS_new_fuzzy_', 'p')
umls_i_test_ebm = lf_levels(lfs_test_ebm, 'UMLS_new_fuzzy_', 'i')
umls_o_test_ebm = lf_levels(lfs_test_ebm, 'UMLS_new_fuzzy_', 'o')

# Level 2: non UMLS
nonumls_p = lf_levels(lfs, 'nonUMLS_fuzzy_', 'P')
nonumls_i = lf_levels(lfs, 'nonUMLS_fuzzy_', 'I')
nonumls_o = lf_levels(lfs, 'nonUMLS_fuzzy_', 'O')
nonumls_p_test_ebm = lf_levels(lfs_test_ebm, 'nonUMLS_fuzzy_', 'P')
nonumls_i_test_ebm = lf_levels(lfs_test_ebm, 'nonUMLS_fuzzy_', 'I')
nonumls_o_test_ebm = lf_levels(lfs_test_ebm, 'nonUMLS_fuzzy_', 'O')

# Level 3: DS
ds_p = lf_levels(lfs, 'DS_fuzzy_', 'P')
ds_i = lf_levels(lfs, 'DS_fuzzy_', 'I')
ds_o = lf_levels(lfs, 'DS_fuzzy_', 'O')
ds_p_test_ebm = lf_levels(lfs_test_ebm, 'DS_fuzzy_', 'P')
ds_i_test_ebm = lf_levels(lfs_test_ebm, 'DS_fuzzy_', 'I')
ds_o_test_ebm = lf_levels(lfs_test_ebm, 'DS_fuzzy_', 'O')

# Level 4: dictionary, rules, heuristics
heur_p = lf_levels(lfs, 'heuristics_direct_', 'P')
heur_i = lf_levels(lfs, 'heuristics_direct_', 'I')
heur_o = lf_levels(lfs, 'heuristics_direct_', 'O')
heur_p_test_ebm = lf_levels(lfs_test_ebm, 'heuristics_direct_', 'P')
heur_i_test_ebm = lf_levels(lfs_test_ebm, 'heuristics_direct_', 'I')
heur_o_test_ebm = lf_levels(lfs_test_ebm, 'heuristics_direct_', 'O')

dict_p = lf_levels(lfs, 'dictionary_direct_', 'P')
dict_i = lf_levels(lfs, 'dictionary_direct_', 'I')
dict_o = lf_levels(lfs, 'dictionary_direct_', 'O')
dict_p_test_ebm = lf_levels(lfs_test_ebm, 'dictionary_direct_', 'P')
dict_i_test_ebm = lf_levels(lfs_test_ebm, 'dictionary_direct_', 'I')
dict_o_test_ebm = lf_levels(lfs_test_ebm, 'dictionary_direct_', 'O')

In [11]:
def sample_param_grid(param_grid, seed):
    """Enumerate a parameter grid and return it in a seeded random order.

    :param param_grid: mapping of parameter name -> list of candidate values;
        the tuples in the result follow the mapping's key order.
    :param seed: seed for the shuffle; the same seed gives the same order.
    :return: list of value tuples, one per point of the Cartesian product.
    """
    params = list(product(*[param_grid[name] for name in param_grid]))
    # A private legacy RandomState yields the same permutation that seeding
    # the global generator did, but never touches global RNG state -- so there
    # is nothing to restore, even if an exception is raised along the way.
    np.random.RandomState(seed).shuffle(params)
    return params

In [12]:
def compare(s, t):
    """Return True when ``s`` and ``t`` contain the same items, ignoring order."""
    left, right = sorted(s), sorted(t)
    return left == right

def getLFs(partition:list, umls_d:dict, seed_len:int):
    """Merge the label vectors of each group in ``partition`` into one vector.

    For every group, merging starts from ``seed_len`` zeros and folds in the
    vector ``umls_d[name]`` of each group member, position by position:

    * pair {-1, 1} or {0, 1}  -> the larger value (1)
    * pair {-1, 0}            -> the smaller value (-1)
    * anything else           -> the value already accumulated

    Positions are paired with ``zip``, so a vector shorter than the
    accumulator truncates the result. Returns one merged list per group, in
    partition order.
    """
    def _merge(current, incoming):
        # Resolve one position; mirrors the three recognised pairs above.
        pair = [current, incoming]
        if compare(pair, [-1, 1]) or compare(pair, [0, 1]):
            return max(current, incoming)
        if compare(pair, [-1, 0]):
            return min(current, incoming)
        return current

    merged_groups = []
    for group in partition:
        votes = [0] * seed_len
        for source in group:
            votes = [_merge(cur, inc) for cur, inc in zip(votes, umls_d[source])]
        merged_groups.append(votes)

    return merged_groups

In [13]:
def grid_search(model_class,
                model_class_init,
                param_grid,
                train=None,
                dev=None,
                other_train=None,
                n_model_search=5,
                val_metric='f1_macro',
                seed=1234,
                checkpoint_gt_mv=False,
                tag_fmt_ckpnt='IO'):
    """Simple grid search helper: fit a label model per sampled config and keep
    the one with the best ``val_metric`` on the labeled dev set.

    Parameters
    ----------
    model_class
        Label model class; instantiated as ``model_class(**model_class_init)``
        and expected to provide ``fit``, ``predict`` and ``score``.
    model_class_init : dict
        Constructor keyword arguments for ``model_class``.
    param_grid : dict
        Parameter name -> list of candidate values; each sampled combination
        is forwarded to ``fit`` as keyword arguments.
    train : tuple
        ``(L_train, Y_train)``. ``Y_train`` is only used to print a training
        score and may be empty or all-zero, in which case nothing is printed.
    dev : tuple
        ``(L_dev, Y_dev)`` used for model selection. ``Y_dev`` is also passed
        as the second positional argument of ``fit``.
    other_train
        Unused; kept for interface compatibility.
    n_model_search : int
        Number of configurations taken from the shuffled grid.
    val_metric : str
        Key of the score dict used for selection.
    seed : int
        Seed for grid sampling; also the default ``seed`` passed to ``fit``
        when the grid does not specify one.
    checkpoint_gt_mv : bool
        When True, configurations that score below a majority-vote baseline on
        the dev set are skipped.
    tag_fmt_ckpnt : str
        ``'IO'`` collapses every non-zero gold label to 1 before scoring; any
        other value scores against the gold labels unchanged.

    Returns
    -------
    tuple
        ``(model, best_config, best_score)``: a model refitted with the best
        configuration, that configuration, and its dev-set score dict.

    Raises
    ------
    RuntimeError
        If no configuration produced a usable model.
    """
    score_metrics = ['accuracy', 'precision', 'recall', 'f1', 'f1_macro']

    def _to_gold(labels):
        # Gold tags for evaluation, collapsed to binary for the 'IO' format.
        if tag_fmt_ckpnt == 'IO':
            return np.array([0 if y == 0 else 1 for y in labels])
        return labels

    L_train, Y_train = train
    L_dev, Y_dev = dev

    # sample configs
    params = sample_param_grid(param_grid, seed)[:n_model_search]

    defaults = {'seed': seed}
    best_score, best_config = None, None
    print(f"Grid search over {len(params)} configs")

    y_gold_dev = _to_gold(Y_dev)

    # Majority-vote baseline on the dev set. It does not depend on the config,
    # so it is computed once. The baseline used to be scored with the label
    # model itself, which made the comparison below a no-op.
    mv_metrics = None
    if checkpoint_gt_mv:
        mv_model = MajorityLabelVoter(cardinality=model_class_init.get('cardinality', 2))
        mv_metrics = mv_model.score(L=L_dev,
                                    Y=y_gold_dev,
                                    metrics=score_metrics,
                                    tie_break_policy='random')

    for i, config in enumerate(params):
        print(f'[{i}] Label Model')
        config = dict(zip(param_grid.keys(), config))
        # update default params if not specified
        config.update({param: value for param, value in defaults.items() if param not in config})

        model = model_class(**model_class_init)
        # NOTE(review): for Snorkel's LabelModel the second positional argument
        # of fit() is the dev label set -- confirm for other model classes.
        model.fit(L_train, Y_dev, **config)

        y_pred = model.predict(L_dev)

        if -1 in y_pred:
            print("Label model predicted -1 (TODO: this happens inconsistently)")
            continue

        # use internal label model scorer to score the prediction
        metrics = model.score(L=L_dev,
                              Y=y_gold_dev,
                              metrics=score_metrics,
                              tie_break_policy='random')

        # compare learned model against MV on same labeled dev set
        # skip if LM less than MV
        if mv_metrics is not None and metrics[val_metric] < mv_metrics[val_metric]:
            continue

        if best_score is None or metrics[val_metric] > best_score[val_metric]:
            print(config)
            best_score = metrics
            best_config = config

            # print training set score if we have labeled data
            if np.any(Y_train):
                train_metrics = model.score(L=L_train,
                                            Y=_to_gold(Y_train),
                                            metrics=score_metrics,
                                            tie_break_policy='random')

                print('[TRAIN] {}'.format(' | '.join([f'{m}: {v * 100:2.2f}' for m, v in train_metrics.items()])))

            print('[DEV]   {}'.format(' | '.join([f'{m}: {v * 100:2.2f}' for m, v in best_score.items()])))
            print('-' * 88)

    if best_config is None:
        # Every config was skipped (abstains in the dev predictions, or below
        # the majority-vote baseline). This used to surface as an opaque
        # TypeError from ``**None`` in the refit below.
        raise RuntimeError('grid_search: no configuration produced a usable model')

    # retrain best model
    print('BEST')
    print(best_config)
    model = model_class(**model_class_init)

    model.fit(L_train, Y_dev, **best_config)
    return model, best_config, best_score

In [14]:
def _build_label_matrix(partition, umls_d, other_levels, n_tokens):
    """Assemble the (tokens x LFs) label matrix for one UMLS partition.

    The merged UMLS vectors from ``getLFs`` come first, followed by the vectors
    of each dict in ``other_levels`` in the order given.
    """
    combined_lf = getLFs(partition, umls_d, n_tokens)
    assert len(partition) == len(combined_lf)

    print( 'Total number of UMLS partitions: ', len(partition) )
    for level in other_levels:
        combined_lf.extend( list(level.values()) )

    # Rows of combined_lf are LFs; transpose so that rows are tokens.
    return np.transpose(np.array(combined_lf))


def train(partitioned_d_umls, train_lfs, test_lfs, df_data_train, df_data_val, df_data_test, picos, paramgrid, n_model_search=25):
    """Sweep the UMLS partitions, grid-search a label model for each, and report
    test-set metrics.

    Parameters
    ----------
    partitioned_d_umls : list
        Partitions as produced by ``partitionLFs``; each is a list of groups of
        UMLS source names.
    train_lfs, test_lfs : tuple
        ``(umls, non_umls, ds, heuristics, dictionary)`` dicts of LF name ->
        label list, for the training/validation corpus and the test corpus.
    df_data_train, df_data_val, df_data_test
        Gold-label frames; column ``picos`` is read from each. The train and
        validation frames together must cover the training LF vectors in order
        (the label matrix is split 80/20 without shuffling).
    picos : str
        Name of the gold-label column.
    paramgrid : dict
        Hyperparameter grid forwarded to ``grid_search``.
    n_model_search : int
        Number of configurations sampled per partition.

    Returns
    -------
    tuple
        ``(best_overall_model, best_overall_config, best_f1_macro)``. Model and
        config are ``''`` when no partition scored above 0.
    """
    umls_d, non_umls_d, ds_d, heur_d, dict_d = train_lfs
    umls_d_test, non_umls_d_test, ds_d_test, heur_d_test, dict_d_test = test_lfs

    best_f1_macro = 0.0
    best_overall_model = ''
    best_overall_config = ''

    model_class_init = {
        'cardinality': 2,
        'verbose': True
    }

    # Uses the `paramgrid` argument; this used to read the notebook-global
    # `param_grid`, silently ignoring what the caller passed in.
    num_hyperparams = functools.reduce(lambda x,y:x*y, [len(x) for x in paramgrid.values()], 1)
    print("Hyperparamater Search Space:", num_hyperparams)

    # Token counts come from the frames passed in, not from notebook globals.
    n_train_tokens = len(df_data_train) + len(df_data_val)
    n_test_tokens = len(df_data_test)

    # Gold labels do not depend on the partition.
    Y_train = df_data_train[picos]
    Y_val = df_data_val[picos]
    Y_test = df_data_test[picos]

    '''#########################################################################
    # Choosing the number of LF's from UMLS all
    #########################################################################'''

    for i, partition in enumerate(partitioned_d_umls):

        L = _build_label_matrix(partition, umls_d,
                                (non_umls_d, ds_d, heur_d, dict_d),  # levels 2, 3, 4, 4
                                n_train_tokens)
        # Same ordered 80/20 split that produced df_data_train / df_data_val.
        L_train, L_val = train_test_split(L, test_size=0.20, shuffle=False)

        best_model, best_config, best_score = grid_search(LMsnorkel,
                                           model_class_init,
                                           paramgrid,
                                           train = (L_train, Y_train),
                                           dev = (L_val, Y_val),
                                           n_model_search=n_model_search,
                                           val_metric='f1_macro',
                                           seed=1234,
                                           tag_fmt_ckpnt='IO')

        # Use the best model to predict on the test set
        L_test = _build_label_matrix(partition, umls_d_test,
                                     (non_umls_d_test, ds_d_test, heur_d_test, dict_d_test),
                                     n_test_tokens)
        # NOTE(review): grid_search collapses gold labels to 0/1 for the 'IO'
        # format, but Y_test is scored as-is -- confirm the test labels are
        # already binary.
        test_metrics = best_model.score(L=L_test,
                              Y=Y_test,
                              metrics=['accuracy', 'precision', 'recall', 'f1', 'f1_macro'],
                              tie_break_policy='random')
        print('[TEST]   {}'.format(' | '.join([f'{m}: {v * 100:2.2f}' for m, v in test_metrics.items()])))

        # NOTE(review): the winning partition is chosen by TEST macro-F1, so
        # best_f1_macro is an optimistic estimate of generalisation.
        if test_metrics['f1_macro'] > best_f1_macro:
            best_f1_macro = test_metrics['f1_macro']
            best_overall_model = best_model
            best_overall_config = best_config

        print('Best overall macro F1 score: ', best_f1_macro)
        print('Best overall configuration: ', best_overall_config)

    return best_overall_model, best_overall_config, best_f1_macro

In [15]:
# Hyperparameter grid for grid_search(): each sampled combination is forwarded
# as keyword arguments to the label model's fit().
# 2 * 2 * 7 * 4 * 3 * 1 = 336 combinations in total.
param_grid = {
    'lr': [0.001, 0.0001],
    'l2': [0.001, 0.0001],
    'n_epochs': [50, 100, 200, 600, 700, 1000, 2000],
    'prec_init': [0.6, 0.7, 0.8, 0.9],
    'optimizer': ["adamax", "adam", "sgd"],
    'lr_scheduler': ['constant'],
}

In [None]:
# Partition sweep for the 'p' label column: level dicts are passed in the order
# (UMLS, non-UMLS, DS, heuristics, dictionary) that train() unpacks.
train_lfs = umls_p, nonumls_p, ds_p, heur_p, dict_p
test_lfs = umls_p_test_ebm, nonumls_p_test_ebm, ds_p_test_ebm, heur_p_test_ebm, dict_p_test_ebm

train(partitioned_p_umls, train_lfs, test_lfs, df_data_train, df_data_val, df_data_test_ebm, 'p', paramgrid = param_grid)

Hyperparamater Search Space: 336
Total number of UMLS partitions:  1
Grid search over 25 configs
[0] Label Model
{'lr': 0.001, 'l2': 0.0001, 'n_epochs': 200, 'prec_init': 0.8, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
[TRAIN] accuracy: 87.89 | precision: 0.00 | recall: 0.00 | f1: 0.00 | f1_macro: 46.78
[DEV]   accuracy: 87.70 | precision: 0.00 | recall: 0.00 | f1: 0.00 | f1_macro: 46.72
----------------------------------------------------------------------------------------
[1] Label Model
[2] Label Model
[3] Label Model
{'lr': 0.001, 'l2': 0.0001, 'n_epochs': 700, 'prec_init': 0.9, 'optimizer': 'adamax', 'lr_scheduler': 'constant', 'seed': 1234}
[TRAIN] accuracy: 86.32 | precision: 28.94 | recall: 8.88 | f1: 13.60 | f1_macro: 53.09
[DEV]   accuracy: 86.23 | precision: 30.16 | recall: 9.07 | f1: 13.95 | f1_macro: 53.23
----------------------------------------------------------------------------------------
[4] Label Model
[5] Label Model
[6] Label Model
[7] Label M

[12] Label Model
[13] Label Model
[14] Label Model
[15] Label Model
[16] Label Model
[17] Label Model
[18] Label Model
[19] Label Model
[20] Label Model
[21] Label Model
[22] Label Model
[23] Label Model
[24] Label Model
BEST
{'lr': 0.001, 'l2': 0.001, 'n_epochs': 600, 'prec_init': 0.6, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
Total number of UMLS partitions:  4
[TEST]   accuracy: 82.65 | precision: 18.14 | recall: 24.38 | f1: 20.80 | f1_macro: 55.53
Best overall macro F1 score:  0.5562154113512056
Best overall configuration:  {'lr': 0.001, 'l2': 0.001, 'n_epochs': 600, 'prec_init': 0.6, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
Total number of UMLS partitions:  5
Grid search over 25 configs
[0] Label Model
{'lr': 0.001, 'l2': 0.0001, 'n_epochs': 200, 'prec_init': 0.8, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
[TRAIN] accuracy: 87.70 | precision: 41.13 | recall: 3.55 | f1: 6.54 | f1_macro: 49.98
[DEV]   accuracy: 87.53 | p

[6] Label Model
[7] Label Model
[8] Label Model
[9] Label Model
[10] Label Model
[11] Label Model
[12] Label Model
[13] Label Model
[14] Label Model
[15] Label Model
[16] Label Model
[17] Label Model
[18] Label Model
[19] Label Model
[20] Label Model
[21] Label Model
[22] Label Model
[23] Label Model
[24] Label Model
BEST
{'lr': 0.001, 'l2': 0.001, 'n_epochs': 600, 'prec_init': 0.6, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
Total number of UMLS partitions:  8
[TEST]   accuracy: 82.60 | precision: 18.31 | recall: 24.89 | f1: 21.10 | f1_macro: 55.66
Best overall macro F1 score:  0.5567791015521988
Best overall configuration:  {'lr': 0.001, 'l2': 0.001, 'n_epochs': 600, 'prec_init': 0.6, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
Total number of UMLS partitions:  9
Grid search over 25 configs
[0] Label Model
{'lr': 0.001, 'l2': 0.0001, 'n_epochs': 200, 'prec_init': 0.8, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
[TRAIN] accuracy

{'lr': 0.001, 'l2': 0.001, 'n_epochs': 600, 'prec_init': 0.6, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
[TRAIN] accuracy: 80.40 | precision: 20.88 | recall: 22.17 | f1: 21.51 | f1_macro: 55.16
[DEV]   accuracy: 80.37 | precision: 21.45 | recall: 22.39 | f1: 21.91 | f1_macro: 55.34
----------------------------------------------------------------------------------------
[5] Label Model
[6] Label Model
[7] Label Model
[8] Label Model
[9] Label Model
[10] Label Model
[11] Label Model
[12] Label Model
[13] Label Model
[14] Label Model
[15] Label Model
[16] Label Model
[17] Label Model
[18] Label Model
[19] Label Model
[20] Label Model
[21] Label Model
[22] Label Model
[23] Label Model
[24] Label Model
BEST
{'lr': 0.001, 'l2': 0.001, 'n_epochs': 600, 'prec_init': 0.6, 'optimizer': 'adam', 'lr_scheduler': 'constant', 'seed': 1234}
Total number of UMLS partitions:  12
[TEST]   accuracy: 82.41 | precision: 18.19 | recall: 25.22 | f1: 21.14 | f1_macro: 55.62
Best overall mac