# LF summaries

In [1]:
import enum
import glob
import os
from hashlib import new
from pathlib import Path
import time
from itertools import product

import functools

import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from snorkel.labeling.model import LabelModel as LMsnorkel
from snorkel.labeling.model import MajorityLabelVoter

from sklearn.model_selection import train_test_split
import itertools
import ast

In [2]:
candgen_version = 'v4' # version = {v3, v4, ...}

In [22]:
entity = 'S'

In [5]:
import joblib
import json
import collections

In [6]:
from sklearn.exceptions import UndefinedMetricWarning

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [7]:
def list2Nested(l, nested_length):
    return [l[i:i+nested_length] for i in range(0, len(l), nested_length)]

In [8]:
import torch
import numpy as np
import scipy.sparse as sparse
from scipy.sparse import issparse
from pandas import DataFrame, Series
from collections import Counter, defaultdict


def arraylike_to_numpy(array_like):
    """Convert a 1d array-like (e.g,. list, tensor, etc.) to an np.ndarray"""
    
    orig_type = type(array_like)
    
    # Convert to np.ndarray
    if isinstance(array_like, np.ndarray):
        pass
    elif isinstance(array_like, list):
        array_like = np.array(array_like)
    elif issparse(array_like):
        array_like = array_like.toarray()
    elif isinstance(array_like, torch.Tensor):
        array_like = array_like.numpy()
    elif not isinstance(array_like, np.ndarray):
        array_like = np.array(array_like)
    else:
        msg = f"Input of type {orig_type} could not be converted to 1d " "np.ndarray"
        raise ValueError(msg)
    
    # Correct shape
    if (array_like.ndim > 1) and (1 in array_like.shape):
        array_like = array_like.flatten()
    if array_like.ndim != 1:
        raise ValueError("Input could not be converted to 1d np.array")
    
    # Convert to ints
    if any(array_like % 1):
        raise ValueError("Input contains at least one non-integer value.")
    array_like = array_like.astype(np.dtype(int))

    return array_like


############################################################
# Label Matrix Diagnostics
############################################################
def _covered_data_points(L):
    """Returns an indicator vector where ith element = 1 if x_i is labeled by at
    least one LF."""
    return np.ravel(np.where(L.sum(axis=1) != 0, 1, 0))


def _overlapped_data_points(L):
    """Returns an indicator vector where ith element = 1 if x_i is labeled by
    more than one LF."""
    return np.where(np.ravel((L != 0).sum(axis=1)) > 1, 1, 0)


def _conflicted_data_points(L):
    """Returns an indicator vector where ith element = 1 if x_i is labeled by
    at least two LFs that give it disagreeing labels."""
    m = sparse.diags(np.ravel(L.max(axis=1).todense()))
    return np.ravel(np.max(m @ (L != 0) != L, axis=1).astype(int).todense())


def label_coverage(L):
    """Returns the **fraction of data points with > 0 (non-zero) labels**
    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith item
    """
    return _covered_data_points(L).sum() / L.shape[0]


def label_overlap(L):
    """Returns the **fraction of data points with > 1 (non-zero) labels**
    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith item
    """
    return _overlapped_data_points(L).sum() / L.shape[0]


def label_conflict(L):
    """Returns the **fraction of data points with conflicting (disagreeing)
    lablels.**
    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith item
    """
    return _conflicted_data_points(L).sum() / L.shape[0]


def lf_polarities(L):
    """Return the polarities of each LF based on evidence in a label matrix.

    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith candidate
    """
    polarities = [sorted(list(set(L[:, i].data))) for i in range(L.shape[1])]
    return [p[0] if len(p) == 1 else p for p in polarities]


def lf_coverages(L):
    """Return the **fraction of data points that each LF labels.**
    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith candidate
    """
    return np.ravel((L != 0).sum(axis=0)) / L.shape[0]


def lf_raw_coverages(L):
    """Raw number of covered instances"""
    return np.ravel((L != 0).sum(axis=0))


def lf_overlaps(L, normalize_by_coverage=False):
    """Return the **fraction of items each LF labels that are also labeled by at
     least one other LF.**

    Note that the maximum possible overlap fraction for an LF is the LF's
    coverage, unless `normalize_by_coverage=True`, in which case it is 1.

    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith candidate
        normalize_by_coverage: Normalize by coverage of the LF, so that it
            returns the percent of LF labels that have overlaps.
    """
    overlaps = (L != 0).T @ _overlapped_data_points(L) / L.shape[0]
    if normalize_by_coverage:
        overlaps /= lf_coverages(L)
    return np.nan_to_num(overlaps)


def lf_conflicts(L, normalize_by_overlaps=False):
    """Return the **fraction of items each LF labels that are also given a
    different (non-abstain) label by at least one other LF.**

    Note that the maximum possible conflict fraction for an LF is the LF's
        overlaps fraction, unless `normalize_by_overlaps=True`, in which case it
        is 1.

    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith candidate
        normalize_by_overlaps: Normalize by overlaps of the LF, so that it
            returns the percent of LF overlaps that have conflicts.
    """
    conflicts = (L != 0).T @ _conflicted_data_points(L) / L.shape[0]
    if normalize_by_overlaps:
        conflicts /= lf_overlaps(L)
    return np.nan_to_num(conflicts)



def lf_empirical_accuracies(L, Y):
    """Return the **empirical accuracy** against a set of labels Y (e.g. dev
    set) for each LF.
    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith candidate
        Y: an [n] or [n, 1] np.ndarray of gold labels
    """
    # Assume labeled set is small, work with dense matrices
    Y = arraylike_to_numpy(Y)
    L = L.toarray()
    X = np.where(
        L == 0,
        0,
        np.where(L == np.vstack([Y] * L.shape[1]).T, 1, -1)
    )
    return 0.5 * (X.sum(axis=0) / (L != 0).sum(axis=0) + 1)


def lf_summary(L, Y=None, lf_names=None, est_accs=None):
    """Returns a pandas DataFrame with the various per-LF statistics.

    Args:
        L: an n x m scipy.sparse matrix where L_{i,j} is the label given by the
            jth LF to the ith candidate
        Y: an [n] or [n, 1] np.ndarray of gold labels.
            If provided, the empirical accuracy for each LF will be calculated
    """
    n, m = L.shape
    if lf_names is not None:
        col_names = ["j"]
        d = {"j": list(range(m))}
    else:
        lf_names = list(range(m))
        col_names = []
        d = {}

    # Default LF stats
    col_names.extend(["Polarity", "Coverage%", "Overlaps%", "Conflicts%", "Coverage"])
    d["Polarity"] = Series(data=lf_polarities(L), index=lf_names)
    d["Coverage%"] = Series(data=lf_coverages(L), index=lf_names)
    d["Overlaps%"] = Series(data=lf_overlaps(L), index=lf_names)
    d["Conflicts%"] = Series(data=lf_conflicts(L), index=lf_names)

    d["Coverage"] = Series(data=lf_raw_coverages(L), index=lf_names)
    

    if Y is not None:
        col_names.extend(["Correct", "Incorrect", "Emp. Acc."])
        confusions = [
            confusion_matrix(Y, L[:, i], pretty_print=False) for i in range(m)
        ]
        corrects = [np.diagonal(conf).sum() for conf in confusions]
        incorrects = [
            conf.sum() - correct for conf, correct in zip(confusions, corrects)
        ]
        accs = lf_empirical_accuracies(L, Y)
        d["Correct"] = Series(data=corrects, index=lf_names)
        d["Incorrect"] = Series(data=incorrects, index=lf_names)
        d["Emp. Acc."] = Series(data=accs, index=lf_names)

    if est_accs is not None:
        col_names.append("Learned Acc.")
        d["Learned Acc."] = Series(est_accs, index=lf_names)

    return DataFrame(data=d, index=lf_names)[col_names]


def single_lf_summary(Y_p, Y=None):
    """Calculates coverage, overlap, conflicts, and accuracy for a single LF

    Args:
        Y_p: a np.array or torch.Tensor of predicted labels
        Y: a np.array or torch.Tensor of true labels (if known)
    """
    L = sparse.csr_matrix(arraylike_to_numpy(Y_p).reshape(-1, 1))
    return lf_summary(L, Y)


def error_buckets(gold, pred, X=None):
    """Group items by error buckets

    Args:
        gold: an array-like of gold labels (ints)
        pred: an array-like of predictions (ints)
        X: an iterable of items
    Returns:
        buckets: A dict of items where buckets[i,j] is a list of items with
            predicted label i and true label j. If X is None, return indices
            instead.

    For a binary problem with (1=positive, 2=negative):
        buckets[1,1] = true positives
        buckets[1,2] = false positives
        buckets[2,1] = false negatives
        buckets[2,2] = true negatives
    """
    buckets = defaultdict(list)
    gold = arraylike_to_numpy(gold)
    pred = arraylike_to_numpy(pred)
    for i, (y, l) in enumerate(zip(pred, gold)):
        buckets[y, l].append(X[i] if X is not None else i)
    return buckets


def confusion_matrix(
    gold, pred, null_pred=False, null_gold=False, normalize=False, pretty_print=True
):
    """A shortcut method for building a confusion matrix all at once.

    Args:
        gold: an array-like of gold labels (ints)
        pred: an array-like of predictions (ints)
        null_pred: If True, include the row corresponding to null predictions
        null_gold: If True, include the col corresponding to null gold labels
        normalize: if True, divide counts by the total number of items
        pretty_print: if True, pretty-print the matrix before returning
    """
    conf = ConfusionMatrix(null_pred=null_pred, null_gold=null_gold)
    gold = arraylike_to_numpy(gold)
    pred = arraylike_to_numpy(pred)
    conf.add(gold, pred)
    mat = conf.compile()

    if normalize:
        mat = mat / len(gold)

    if pretty_print:
        conf.display(normalize=normalize)

    return mat


class ConfusionMatrix(object):
    """
    An iteratively built abstention-aware confusion matrix with pretty printing

    Assumed axes are true label on top, predictions on the side.
    """

    def __init__(self, null_pred=False, null_gold=False):
        """
        Args:
            null_pred: If True, include the row corresponding to null
                predictions
            null_gold: If True, include the col corresponding to null gold
                labels

        """
        self.counter = Counter()
        self.mat = None
        self.null_pred = null_pred
        self.null_gold = null_gold

    def __repr__(self):
        if self.mat is None:
            self.compile()
        return str(self.mat)

    def add(self, gold, pred):
        """
        Args:
            gold: a np.ndarray of gold labels (ints)
            pred: a np.ndarray of predictions (ints)
        """
        self.counter.update(zip(gold, pred))

    def compile(self, trim=True):
        k = max([max(tup) for tup in self.counter.keys()]) + 1  # include 0

        mat = np.zeros((k, k), dtype=int)
        for (y, l), v in self.counter.items():
            mat[l, y] = v

        if trim and not self.null_pred:
            mat = mat[1:, :]
        if trim and not self.null_gold:
            mat = mat[:, 1:]

        self.mat = mat
        return mat

    def display(self, normalize=False, indent=0, spacing=2, decimals=3, mark_diag=True):
        mat = self.compile(trim=False)
        m, n = mat.shape
        tab = " " * spacing
        margin = " " * indent

        # Print headers
        s = margin + " " * (5 + spacing)
        for j in range(n):
            if j == 0 and not self.null_gold:
                continue
            s += f" y={j} " + tab
        print(s)

        # Print data
        for i in range(m):
            # Skip null predictions row if necessary
            if i == 0 and not self.null_pred:
                continue
            s = margin + f" l={i} " + tab
            for j in range(n):
                # Skip null gold if necessary
                if j == 0 and not self.null_gold:
                    continue
                else:
                    if i == j and mark_diag and normalize:
                        s = s[:-1] + "*"
                    if normalize:
                        s += f"{mat[i,j]/sum(mat[i,1:]):>5.3f}" + tab
                    else:
                        s += f"{mat[i,j]:^5d}" + tab
            print(s)

In [9]:
import os
import glob
from pathlib import Path
import pandas as pd
import numpy as np
import scipy

from pathlib import Path

from LabelModelTrain import LMutils
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [10]:
def mapTrueLabels(l):
    
    updated_values = []
    for l_i in l:
        updated_values.append( label_mapper[l_i] )
        
    return updated_values

In [12]:
train_file = f'/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/{candgen_version}/gt/train_ebm_labels_tui_pio3.tsv'
training_data = pd.read_csv(train_file, sep='\t', header=0)
training_data.rename( columns={'Unnamed: 0':'series'}, inplace=True )

val_file = f'/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/{candgen_version}/gt/val_studytype_tui_pio3.tsv'
val_data = pd.read_csv(val_file, sep='\t', header=0)
val_data.rename( columns={'Unnamed: 0':'series'}, inplace=True )

ebm_test_file = f'/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/{candgen_version}/gt/test_ebm_labels_tui_pio3.tsv'
test_ebm_data = pd.read_csv(ebm_test_file, sep='\t', header=0)
test_ebm_data.rename( columns={'Unnamed: 0':'series'}, inplace=True )

physio_test_file = f'/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/{candgen_version}/gt/test_physio_labels_tui_pio3.tsv'
test_physio_data = pd.read_csv(physio_test_file, sep='\t', header=0)
test_physio_data.rename( columns={'Unnamed: 0':'series'}, inplace=True )

ebm_test_corrected_file = f'/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/{candgen_version}/gt/test_ebm_st_corr_tui_pio3.tsv'
test_ebm_corrected_data = pd.read_csv(ebm_test_corrected_file, sep='\t', header=0)
test_ebm_corrected_data.rename( columns={'Unnamed: 0':'series'}, inplace=True )

In [13]:
# Filter Training set to remove Validation set IDs

cond = training_data['pmid'].isin(val_data['pmid'])
training_data.drop(training_data[cond].index, inplace = True)

In [14]:
def flatten_df(df):

    df_series = [ index for index, value in df.tokens.items() for word in ast.literal_eval(value) ]
    df_tokens = [ word for index, value in df.tokens.items() for word in ast.literal_eval(value) ]
    df_pos = [ word for index, value in df.pos.items() for word in ast.literal_eval(value) ]
    df_offsets = [ word for index, value in df.offsets.items() for word in ast.literal_eval(value) ]


    df_p = [ int(lab) for index, value in df.p.items() for lab in ast.literal_eval(value) ]
    df_p_fine = [ int(lab) for index, value in df.p_f.items() for lab in ast.literal_eval(value) ]
    df_i = [ int(lab) for index, value in df.i.items() for lab in ast.literal_eval(value) ]
    df_i_fine = [ int(lab) for index, value in df.i_f.items() for lab in ast.literal_eval(value) ]
    df_o = [ int(lab) for index, value in df.o.items() for lab in ast.literal_eval(value) ]
    df_o_fine = [ int(lab) for index, value in df.o_f.items() for lab in ast.literal_eval(value) ]
    df_s = [ int(lab) for index, value in df.s.items() for lab in ast.literal_eval(value) ]
    df_s_fine = [ int(lab) for index, value in df.s_f.items() for lab in ast.literal_eval(value) ]
    
    df_flattened = pd.DataFrame({ 'series': df_series,
                        'tokens' : df_tokens,
                        'offsets': df_offsets,
                        'pos': df_pos,
                        'p' : df_p,
                        'i' : df_i,
                        'o' : df_o,
                        's' : df_s,
                        'p_f' : df_p_fine,
                        'i_f' : df_i_fine,
                        'o_f' : df_o_fine,
                        's_f' : df_s_fine})
    
    return df_flattened

In [15]:
# Flatten the dataframes (currently only the training dataframe and test ebm dataframe with corrected labels can be flattened)
data_df = flatten_df(training_data)
val_df = flatten_df(val_data)
test_ebm_data = flatten_df(test_ebm_data)
test_ebm_corr_df = flatten_df(test_ebm_corrected_data)

In [16]:
series = [
    data_df.series.to_numpy() ,
    val_df.series.to_numpy() ,
    test_ebm_data.series.to_numpy() ,
    test_physio_data.series.to_numpy(),   
    test_ebm_corr_df.series.to_numpy()
]


sents = [
    data_df.tokens.to_numpy() ,
    val_df.tokens.to_numpy() ,
    test_ebm_data.tokens.to_numpy() ,
    test_physio_data.tokens.to_numpy(),   
    test_ebm_corr_df.tokens.to_numpy()    
]


part_of_speech = [
    data_df.pos.to_numpy() ,
    val_df.pos.to_numpy() ,
    test_ebm_data.pos.to_numpy() ,
    test_physio_data.pos.to_numpy(),   
    test_ebm_corr_df.pos.to_numpy()     
]


offsets = [
    data_df.offsets.to_numpy() ,
    val_df.offsets.to_numpy() ,
    test_ebm_data.offsets.to_numpy() ,
    test_physio_data.offsets.to_numpy(),   
    test_ebm_corr_df.offsets.to_numpy() 
]


Y_p = [
    data_df.p.to_numpy() , # 0 -9
    data_df.p_f.to_numpy() , # 1 -8
    val_df.p.to_numpy() , # 2 -7
    val_df.p_f.to_numpy() , # 3 -6
    test_ebm_data.p.to_numpy() , # 4 -5
    test_ebm_data.p_f.to_numpy() , # 5 -4
    test_physio_data.p.to_numpy(),  # 6 -3
    test_ebm_corr_df.p.to_numpy(),   # 7 -2
    test_ebm_corr_df.p_f.to_numpy() # 8 -1
]


Y_i = [
    data_df.i.to_numpy() , # 0 -9
    data_df.i_f.to_numpy() , # 1 -8
    val_df.i.to_numpy() , # 2 -7
    val_df.i_f.to_numpy() , # 3 -6
    test_ebm_data.i.to_numpy() , # 4 -5
    test_ebm_data.i_f.to_numpy() , # 5 -4
    test_physio_data.i.to_numpy(),  # 6 -3
    test_ebm_corr_df.i.to_numpy(),   # 7 -2
    test_ebm_corr_df.i_f.to_numpy() # 8 -1
]


Y_o = [
    data_df.o.to_numpy() , # 0 -9
    data_df.o_f.to_numpy() , # 1 -8
    val_df.o.to_numpy() , # 2 -7
    val_df.o_f.to_numpy() , # 3 -6
    test_ebm_data.o.to_numpy() , # 4 -5
    test_ebm_data.o_f.to_numpy() , # 5 -4
    test_physio_data.o.to_numpy(),  # 6 -3
    test_ebm_corr_df.o.to_numpy(),   # 7 -2
    test_ebm_corr_df.o_f.to_numpy() # 8 -1
]


Y_s = [
    data_df.s.to_numpy() , # 0 -9
    data_df.s_f.to_numpy() , # 1 -8
    val_df.s.to_numpy() , # 2 -7
    val_df.s_f.to_numpy() , # 3 -6
    test_ebm_data.s.to_numpy() , # 4 -5
    test_ebm_data.s_f.to_numpy() , # 5 -4
    test_physio_data.s.to_numpy(),  # 6 -3
    test_ebm_corr_df.s.to_numpy(),   # 7 -2
    test_ebm_corr_df.s_f.to_numpy() # 8 -1
]

In [25]:
def df_to_list(data_column):
    return [ word for index, value in data_column.items() for word in ast.literal_eval(value) ]

In [26]:
def df_to_array(data_column):
    return np.array( [ word for index, value in data_column.items() for word in ast.literal_eval(value) ] )

In [27]:
def dict_to_array(label_column):
    return np.array( [ labelModel_mapper_LF[int(lab)] for index, value in label_column.items() for k, lab in ast.literal_eval(value).items() ] )

In [17]:
# 1:1 positive to positive
# -1:0 negative cand_gen to negative in label model
# 0:-1 Abstain cand_gen to abstain in label model

# In study type, abstain is actually a negative instance 
#labelModel_mapper_LF = {1:1, 0:0, -1:-1}
#labelModel_mapper_LF = {1:1, -1:0, 0:-1}

label_mapper_GT = {1:1, 0:2}
#labelModel_mapper_LF = {1:1, -1:2, 0:0}
labelModel_mapper_LF = {1:1, 0:0, -1:-1}



#if picos == 'S':
#    labelModel_mapper_LF = {1:1, 0:2, -1:0}
#else:
#    labelModel_mapper_LF = {1:1, -1:2, 0:0}

In [23]:
def get_lfs(indir, invalid_pmids = None):
    
    pathlist = Path(indir).glob('**/*.tsv')

    tokens = ''

    lfs = dict()
    lfs_lm = dict()

    for counter, file in enumerate(pathlist):
        
        if f'/{entity}/' in str(file):

            k = str( file ).split(f'/{candgen_version}/')[-1].replace('.tsv', '').replace('/', '_')
            mypath = Path(file)
            if mypath.stat().st_size != 0:
                data = pd.read_csv(file, sep='\t', header=0)
                
                # Remove validation PMIDs
                if invalid_pmids is not None:
                    cond = data['pmid'].isin(val_data['pmid'])
                    data.drop(data[cond].index, inplace = True)

                data_tokens = data.tokens
                if len(tokens) < 5:
                    tokens = df_to_array(data_tokens)

                data_labels = data.labels
                
                labels = dict_to_array(data_labels)
                
                if len(labels) != len(tokens):
                    print(k, len(labels) , len(tokens) )
                #assert len(labels) == len(tokens)
                lfs[k] = labels


    print( 'Total number of tokens in data set: ', len(tokens) )
    print( 'Total number of LFs in the dictionary', len(lfs) )
    
    return lfs

In [28]:
indir = f'/mnt/nas2/results/Results/systematicReview/distant_pico/tui_pio_v4/training_ebm_candidate_generation/{candgen_version}'
train_ebm_lfs = get_lfs(indir, invalid_pmids = val_data.pmid)

Total number of tokens in data set:  1102837
Total number of LFs in the dictionary 268


In [29]:
indir_val_ebm = f'/mnt/nas2/results/Results/systematicReview/distant_pico/tui_pio_v4/val_studytype_candidate_generation/{candgen_version}'
val_ebm_lfs = get_lfs(indir_val_ebm)

Total number of tokens in data set:  211870
Total number of LFs in the dictionary 268


In [30]:
indir_test_ebm_corr = f'/mnt/nas2/results/Results/systematicReview/distant_pico/tui_pio_v4/test_ebm_anjani_candidate_generation/{candgen_version}'
test_ebm_corr_lfs = get_lfs(indir_test_ebm_corr)

Total number of tokens in data set:  52582
Total number of LFs in the dictionary 268


In [31]:
indir_test_ebm = f'/mnt/nas2/results/Results/systematicReview/distant_pico/tui_pio_v4/test_ebm_candidate_generation/{candgen_version}'
test_ebm_lfs = get_lfs(indir_test_ebm)

Total number of tokens in data set:  51784
Total number of LFs in the dictionary 268


In [32]:
# drop some lfs
def drop_nopositive(lfs_d):
    
    dropped_conditions = dict()

    for k, v in lfs_d.items():
        
        #if '_negs' not in str(k):
        dropped_conditions[k] = v
        
        '''       
        if '_negs' not in str(k):
            

            if 'stdtype_types' in str(k):
                print(k)
                k_mod = str(k) + '_'
                dropped_conditions[k_mod] = v

        else:
            pass
        '''
            
    return dropped_conditions

In [33]:
train_ebm_lfs_dropped = drop_nopositive(train_ebm_lfs)
val_ebm_lfs_dropped = drop_nopositive(val_ebm_lfs)
test_ebm_corr_lfs_dropped = drop_nopositive(test_ebm_corr_lfs)
test_ebm_lfs_dropped = drop_nopositive(test_ebm_lfs)

In [45]:
val_dict_lfs = pd.DataFrame({ key:pd.Series(value) for key, value in val_ebm_lfs.items() })

In [49]:
# Map 
label_mapper_GT = {1:1, 0:2}

val_y_mapped = [ label_mapper_GT[i] for i in val_df.s_f]

In [47]:
val_lf_names = [*val_dict_lfs]

In [46]:
def getSummary_and_write(df, df_col_head, true_labels, file_path):
    
    #convert to sciy matrix
    scipy_mat = scipy.sparse.csr_matrix(df.values)
    
    lf_summary_df = lf_summary(scipy_mat, Y=true_labels, lf_names=df_col_head)
    
    lf_summary_df.to_csv(file_path, sep='\t')
    
    return lf_summary_df

In [52]:
# Changed the label mapper for the validation candidates

getSummary_and_write(df=val_dict_lfs, df_col_head=val_lf_names, true_labels = np.array(val_y_mapped), file_path= f'/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_{picos.lower()}_summary_tuipio3_val.csv')

  return 0.5 * (X.sum(axis=0) / (L != 0).sum(axis=0) + 1)


Unnamed: 0,j,Polarity,Coverage%,Overlaps%,Conflicts%,Coverage,Correct,Incorrect,Emp. Acc.
dictionary_fuzzy_S_lf_dict_s_type_negs,0,"[-1, 1]",0.029636,0.029636,0.029570,6279,6205,74,0.143813
dictionary_fuzzy_S_lf_dict_s_comp_type_negs,1,"[-1, 1]",0.030552,0.030552,0.030283,6473,6181,292,0.137958
dictionary_direct_S_lf_dict_s_type_negs,2,"[-1, 1]",0.029636,0.029636,0.029570,6279,6205,74,0.143813
dictionary_direct_S_lf_dict_s_comp_type_negs,3,"[-1, 1]",0.030552,0.030552,0.030283,6473,6181,292,0.137958
UMLS_fuzzy_S_lf_PCDS,4,"[-1, 1]",0.001529,0.001529,0.001525,324,309,15,0.000000
...,...,...,...,...,...,...,...,...,...
heuristics_direct_S_lf_regex_blinding_negs,263,"[-1, 1]",0.376618,0.376618,0.374550,79794,77159,2635,0.005389
heuristics_direct_S_lf_regex_stdtype_basicplus_negs,264,"[-1, 1]",0.375943,0.375943,0.374555,79651,76990,2661,0.003277
heuristics_direct_S_lf_lf_lf_s_heurpattern_labels_2,265,1,0.025501,0.025501,0.017407,5403,2860,2543,0.529336
heuristics_direct_S_lf_regex_blinding,266,1,0.003804,0.003804,0.001737,806,784,22,0.972705
