In [1]:
import enum
import glob
import os
from hashlib import new
from pathlib import Path
import time

import functools

import numpy as np
import pandas as pd
import scipy
from flyingsquid.label_model import LabelModel as LMsquid
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from snorkel.labeling.model import LabelModel as LMsnorkel
from snorkel.labeling.model import MajorityLabelVoter

In [2]:
from sklearn.exceptions import UndefinedMetricWarning

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
from typing import Tuple, Dict


class Document(object):

    def __init__(self, name, sentences):
        self.name = name
        self.sentences = sentences
        for s in sentences:
            s.document = self
        self.annotations = {i:{} for i in range(len(sentences))}
        self.props = {}
        self._text = None

    @property
    def text(self):
        if not self._text:
            t = ""
            for s in self.sentences:
                if len(t) != s.abs_char_offsets[0]:
                    t += ' ' * (s.abs_char_offsets[0] - len(t))
                t += s.text
            self._text = t
        return self._text
    
    def __repr__(self):
        return "Document({})".format(self.name)
        
        
class Sentence(object):

    def __init__(self, **kwargs):
        self.document = None
        self.__dict__.update(kwargs)
        self._text = None

    @property
    def text(self):
        if not self._text:
            txt = ""
            offset = self.abs_char_offsets[0]
            for i,w in enumerate(self.words):
                if len(txt) != self.abs_char_offsets[i] - offset:
                    txt += ' ' * (self.abs_char_offsets[i] - offset - len(txt))
                txt += w
            self._text = txt
        return self._text

    @property
    def position(self):
        return self.i
    
    @property
    def char_offsets(self):
        offset = self.abs_char_offsets[0]
        return [i - offset for i in self.abs_char_offsets]
               
    def __repr__(self):
        max_len = 25
        s = self.text.strip().replace("\n"," ")
        return "Sentence({})".format(
            s if len(s) < max_len else s[0:max_len] + '...'
        )
        

class Span(object):

    def __init__(self, char_start, char_end, sentence, attrib='words'):
        self.sentence   = sentence
        self.char_start = char_start
        self.char_end   = char_end
        self.attrib     = attrib
        self.props      = {}
        self.normalized = None
    
    @property
    def abs_char_start(self):
        return self.char_start + self.sentence.abs_char_offsets[0]

    @property
    def abs_char_end(self):
        return self.abs_char_start + (self.char_end - self.char_start)
    
    @property
    def text(self):
        return self.sentence.text[self.char_start:self.char_end + 1]

    def get_word_start(self):
        return self.char_to_word_index(self.char_start)

    def get_word_end(self):
        return self.char_to_word_index(self.char_end)

    def get_n(self):
        return self.get_word_end() - self.get_word_start() + 1

    def char_to_word_index(self, ci):
        i = None
        for i, co in enumerate(self.sentence.char_offsets):
            if ci == co:
                return i
            elif ci < co:
                return i-1
        return i

    def word_to_char_index(self, wi):
        return self.sentence.char_offsets[wi]

    def get_attrib_tokens(self, a):
        return self.sentence.__getattribute__(a)[self.get_word_start():self.get_word_end() + 1]
    
    def __repr__(self):
        return "Span({})".format(self.text.replace("\n"," "))
     
    def get_attrib_span(self, a, sep=" "):
        if a == 'words':
            return self.sentence.text[self.char_start:self.char_end + 1]
        else:
            return sep.join(self.get_attrib_tokens(a))

    def get_span(self, sep=" "):
        return self.get_attrib_span('words', sep)

    def __contains__(self, other_span):
        return other_span.abs_char_start >= self.abs_char_start and other_span.abs_char_end <= self.abs_char_end
    
    
class Candidate(object):
    """A collection of spans"""
    def __init__(self, spans):
        self.spans = spans


class Relation(object):

    def __init__(self,
                 type_name:str,
                 args: Dict[str, Span]) -> None:
        self.type_name = type_name
        self.args = args
        self.__dict__.update(args)

    def __iter__(self):
        for span in self.args.values():
            yield span

    def __getitem__(self, item):
        return list(self.args.values())[item]

    def __repr__(self):
        strs = [span.__repr__() for span in self.args.values()]
        return f"Relation[{self.type_name}]({','.join(strs)})"

    def __eq__(self, other):
        hashes = {name:span.__hash__() for name,span in self.args.items()}
        other = {name:span.__hash__() for name,span in other.args.items()}
        return hashes == other

    def __hash__(self):
        return hash(sum([s.__hash__() for s in self.args.values()]))

    @property
    def sentence(self):
        """We assume spans all live in the same sentence"""
        return self.__dict__[self.arg_names[0]].sentence



class Annotation(object):
    def __init__(self, doc_name: str,
                 span: Tuple[Tuple[int,int], ...],
                 etype: str,
                 text: str = None,
                 cid: str = None) -> None:
        """

        :param doc_name:
        :param span:
        :param etype:
        :param text:
        :param cid:
        """
        self.abs_char_start = span[0][0]
        self.abs_char_end = span[0][-1]

        self.doc_name = doc_name
        self.span = tuple([tuple(s) for s in span])
        self.text = text
        self.etype = etype
        self.cid = cid
        
    def __repr__(self):
        text = self.text.replace('\n',' ') + '|' if self.text else ''
        i,j = self.abs_char_start, self.abs_char_end
        sep = '...' if len(self.span) > 1 else '-'
        return f"Annotation[{self.etype}]({text}{i}{sep}{j})"

    @property
    def type(self):
        return self.etype

    def __hash__(self):
        return hash((self.etype, self.doc_name, self.span))

    def __eq__(self, other):
        return False if not isinstance(other, type(self)) else True

In [4]:
import re
import os
import gzip
import json
import glob
import numpy as np
import logging
import itertools
import collections

logger = logging.getLogger(__name__)


def parse_doc(d) -> Document:
    """
    Convert JSON into container objects. Transforming to
    Document/Sentence objects comes at ~13% overhead.
    """
    sents = [Sentence(**s) for s in d['sentences']]
    doc = Document(d['name'], sents)
    if 'metadata' in d:
        for key,value in d['metadata'].items():
            doc.props[key] = value
    return doc


class DocumentLoader:

    def __init__(self, fpath):
        self.fpath = fpath
        self.formatter = parse_doc

    def filelist(self):
        return glob.glob(f'{self.fpath}/*.json') \
            if os.path.isdir(self.fpath) else [self.fpath]

    def __iter__(self):
        for fpath in self.filelist():
            fopen = gzip.open if fpath.split(".")[-1] == 'gz' else open
            with fopen(fpath, 'rb') as fp:
                for line in fp:
                    yield self.formatter(json.loads(line))


def load_json_dataset(fpath,
                      tokenizer,
                      tag_fmt = 'IO',
                      contiguous_only = False):
    """Load JSON dataset and initialize sequence tagged labels.

    Parameters
    ----------
    fpath
        JSON file path
    tokenizer

    tag_fmt
        token tagging scheme with values in {'IO','IOB', 'IOBES'}
    """
    documents, entities = [], {}
    fopen = gzip.open if fpath.split(".")[-1] == 'gz' else open
    with fopen(fpath, 'rb') as fp:
        for line in fp:
            # initialize context objects
            d = json.loads(line)
            doc = Document(d['name'], [Sentence(**s) for s in d['sentences']])
            documents.append(doc)
            # load entities
            entities[doc.name] = set()
            if 'entities' not in d:
                continue
            for entity in d['entities']:
                del entity['abs_char_start']
                del entity['abs_char_end']
                if 'doc_name' not in entity:
                    entity['doc_name'] = doc.name
                anno = Annotation(**entity)
                if len(anno.span) > 1 and contiguous_only:
                    continue
                entities[doc.name].add(Annotation(**entity))

    return NerDocumentDataset(documents,
                              entities,
                              tag_fmt=tag_fmt,
                              tokenizer=tokenizer)


#################################################################################
#
#  Sequence Tag Creation
#
#################################################################################

def entity_tag(length, tag_fmt="IOB"):
    """
    IO, IOB, or IOBES (equiv. to BILOU) tagging

    :param tokens:
    :param is_heads:
    :param tag_fmt:
    :return:
    """
    tags = ['O'] * length
    tag_fmt = set(tag_fmt)

    if tag_fmt == set("IOB"):
        tags[0] = 'B'
        tags[1:] = len(tags[1:]) * "I"

    elif tag_fmt == set("IOBES") or tag_fmt == set("BILOU"):
        if len(tags) == 1:
            tags[0] = 'S'
        else:
            tags[0] = 'B'
            tags[1:-1] = len(tags[1:-1]) * "I"
            tags[-1:] = "E"

    elif tag_fmt == set("IO"):
        tags = ['I'] * len(tags)
    return tags


def map_sent_entities(document, entities, verbose=True):
    """
    Given (1) a document split into sentences and (2) a list of entities
    defined by absolute char offsets, map each entity to it's parent sentence.

    :param:
    :param:
    :return tuple of sentence index and tag,
    """
    errors = 0
    spans = []
    char_index = [s.abs_char_offsets[0] for s in document.sentences]

    for t in entities:
        position = None
        for i in range(len(char_index) - 1):
            if t.abs_char_start >= char_index[i] and t.abs_char_end <= char_index[i + 1]:
                position = i
                break

        if position == None and t.abs_char_start >= char_index[-1]:
            position = len(char_index) - 1

        if position == None:
            values = (document.name, t.abs_char_start, t.abs_char_end)
            if verbose:
                msg = f"{[t.text]} {t.span} {t.doc_name}"
                logger.warning(f"Cross-sentence mention {msg}")
            errors += 1
            continue
        try:
            shift = document.sentences[position].abs_char_offsets[0]
            span = document.sentences[position].text[t.abs_char_start - shift:t.abs_char_end - shift]
            spans.append((position, t, span))
        except Exception as e:
            logger.error(f'{e}')

    idx = collections.defaultdict(list)
    for i, entity, _ in spans:
        idx[i].append(entity)

    return idx, errors


def retokenize(sent, tokenizer, subword='##'):
    """
    Given a default tokenization, compute absolute character offsets for
    a new tokenization (e.g., BPE). By convention, wordpiece tokens are
    prefixed by ##.

    """
    tokens = []
    abs_char_offsets = []

    for i in range(len(sent.words)):
        toks = tokenizer.tokenize(sent.words[i])
        offsets = [sent.abs_char_offsets[i]]
        for w in toks[0:-1]:
            offsets.append(
                len(w if w[:len(subword)] != subword else w[len(subword):]) + offsets[-1]
            )
        abs_char_offsets.extend(offsets)
        tokens.extend(toks)

    return tokens, abs_char_offsets


def tokens_to_tags(sent,
                   entities,
                   tag_fmt='BIO',
                   tokenizer=None,
                   max_seq_len=512):
    """

    :param sent:
    :param entities:
    :param tag_fmt:
    :param tokenizer:
    :param max_seq_len:
    :return:
    """

    toks, abs_char_offsets = retokenize(sent, tokenizer) if tokenizer \
        else (sent.words, sent.abs_char_offsets)

    # truncate long sequences
    if len(toks) > max_seq_len - 2:
        toks = toks[0:max_seq_len - 2]
        abs_char_offsets = abs_char_offsets[0:max_seq_len - 2]

    # use original tokenization to assign token heads
    is_heads = [1 if i in sent.abs_char_offsets else 0 for i in abs_char_offsets]
    tags = ['O'] * len(toks)

    errs = 0
    for entity in entities:

        # currently we only support contiguous entity spans
        if len(entity.span) != 1:
            logger.warning(f"Non-contiguous entities not supported {entity} {sent.document.name}")
            continue

        head = entity.span[0]
        if head[0] in abs_char_offsets:
            start = abs_char_offsets.index(head[0])
            end = len(abs_char_offsets)

            for j, offset in enumerate(abs_char_offsets):
                if head[-1] > offset:
                    continue
                end = j
                break

            # tokenization error
            if is_heads[start] == 0:
                errs += 1
                logger.warning(f"Tokenization Error: Token is not a head token {entity} {sent.document.name}")
                continue

            tok_len = is_heads[start:end].count(1)
            head_tags = entity_tag(tok_len, tag_fmt=tag_fmt)
            head_tags = [f'{t}-{entity.type}' for t in head_tags]
            io_tags = ['O'] * len(toks[start:end])

            for i in range(len(io_tags)):
                if is_heads[start:end][i] == 1:
                    t = head_tags.pop(0)
                io_tags[i] = t

            tags[start:end] = io_tags

            # Error Checking: do spans match?
            s1 = ''.join([w if w[:2] != '##' else w[2:] for w in toks[start:end]]).lower()
            s2 = re.sub(r'''(\s)+''', '', entity.text).lower()


            if s1 != s2:
                if len(entity.span) == 1:
                    msg = f"{s1} != {s2}"
                    logger.error(f"Span does not match {msg}")
                errs += 1
        else:
            errs += 1
            logger.error(f"Tokenization Error: Token head not found in abs_char_offsets {entity}")

    return (toks, tags, is_heads), errs





#################################################################################
#
#  Datasets
#
#################################################################################


class NerDocumentDataset(object):
    """
    Document + Annotation objects
    entities are defined as abs char offsets per document
    """

    def __init__(self, documents: dict,
                 entities: dict,
                 tag_fmt: str = 'IO',
                 tokenizer=None) -> None:
        """
        Convert Document objects with a corresponding
        entity set into tagged sequences

        :param documents:
        :param entities:
        :param tag_fmt:
        :param tokenizer:
        """
        self.documents = documents
        self.entities = entities
        self.tag_fmt = tag_fmt
        self.tokenizer = tokenizer
        self.tag2idx = self._get_tag_index(entities, tag_fmt)

        self._init_sequences(documents)

    def _get_tag_index(self, entities, tag_fmt):
        """
        Given a collection of entity types, initialize an integer tag mapping
        e.g., B-Drug I-Drug O

        :param entities:
        :param tag_fmt:
        :return:
        """
        entity_types = {t.type for doc_name in entities for t in entities[doc_name]}
        tags = [t for t in list(tag_fmt) if t != 'O']
        tags = [f'{tag}-{etype}' for tag, etype in itertools.product(tags, entity_types)]
        tags = ['X', 'O', ] + tags
        return {t: i for i, t in enumerate(tags)}

    def __len__(self) -> int:
        return len(self.data)

    def tagged(self, idx):
        """
        Return tagged words
        :return:
        """
        X, _, _, Y, _, _ = self.__getitem__(idx)
        return X[1:-1], Y[1:-1]

    def _init_sequences(self, documents):
        """
        Transform Documents into labeled sequences.

        :param documents:
        :return:
        """
        self.data = []
        self.sentences = []
        num_errors, num_missing_heads, num_entities = 0, 0, 0

        for doc in documents:
            self.sentences.extend(doc.sentences)
            annotations = self.entities[doc.name] if doc.name in self.entities else {}
            num_entities += len(annotations)
            # tag sentences
            sent_entities, errs = map_sent_entities(doc, annotations)
            num_errors += errs

            for sentence in doc.sentences:
                entities = sent_entities[sentence.i] if sentence.i in sent_entities else []
                seqs, errs = tokens_to_tags(sentence, entities, self.tag_fmt, tokenizer=self.tokenizer)
                num_errors += errs

                x, y, is_heads = seqs
                if not (len(x) == len(y) == len(is_heads)):
                    print(seqs)

                self.data.append(seqs)

        assert len(self.data) == len(self.sentences)
        if num_errors:
            msg = f'Errors: Span Alignment: {num_errors}/{num_entities} ({num_errors / num_entities * 100:2.1f}%)'
            logger.warning(msg)

        print(f'Tagged Entities: {num_entities - num_errors}')

    def __getitem__(self, idx):

        toks, tags, is_heads = self.data[idx]

        words = [w for w in self.sentences[idx].words if w.strip()]
        words = self.sentences[idx].words
        # original tags (head words only)
        tags = [t for i, t in enumerate(tags) if is_heads[i] == 1]

        if len(words) != len(tags):
            print(len(words), len(tags))
            print(words)
            print(tags)
            print('-' * 50)

        words = ['[CLS]'] + words + ['[SEP]']
        toks = ['[CLS]'] + toks + ['[SEP]']
        tags = ['X'] + tags + ['X']

        X = self.tokenizer.convert_tokens_to_ids(toks)
        Y = [self.tag2idx[t] if h == 1 else self.tag2idx['X'] for t, h in zip(tags, is_heads)]

        return words, X, is_heads, tags, Y, len(Y)

In [5]:
import numpy as np


def mv(L, break_ties, abstain=-1):
    """Simple majority vote"""
    from statistics import mode
    y_hat = []
    for row in L:
        # get non abstain votes
        row = row[row != abstain]
        try:
            l = mode(row)
        except:
            l = break_ties
        y_hat.append(l)
    return np.array(y_hat).astype(np.int)

def smv(L, abstain=-1, uncovered=0):
    """Soft majority vote"""
    y_hat = []
    k = np.unique(L[L != abstain]).astype(int)
    k = list(range(min(k), max(k) + 1))
    for row in L:
        # get non abstain votes
        row = list(row[row != abstain])
        N = len(row)
        if not N:
            y_hat.append([1.0, 0])
        else:
            p = []
            for i in k:
                p.append(row.count(i) / N)
            y_hat.append(p)
    return np.array(y_hat).astype(np.float)

In [6]:
import itertools
import numpy as np
from typing import List, Set, Dict, Tuple, Pattern, Match, Iterable
import seqeval.metrics


def split_by_seq_len(X, X_lens) -> np.ndarray:
    """Given a matrix X of M elements, partition into N variable length
    sequences where [xi, ..., xN] lengths are defined by X_lens[i].

    This is used to partition a stacked matrix of words back into sentences.

    Parameters
    ----------
    X
    X_lens

    Returns
    -------

    """
    splits = [np.sum(X_lens[0:i]) for i in range(1, X_lens.shape[0])]
    return np.split(X, splits)


def convert_tag_fmt(
        seq: List[str],
        etype: str,
        tag_fmt: str = 'IOB') -> List[str]:
    """Convert between tagging schemes. This is a lossy conversion
    when converting to IO, i.e., mapping {IOB, IOBES} -> IO
    drops information on adjacent entities.

    IOB -> O B I I B I O
    IO  -> O I I I I I O
    IOB -> O B I I I I O

    Parameters
    ----------
    seq
    etype
    tag_fmt

    Returns
    -------

    """
    # TODO: Only works for IO -> {IOB, IOBES}
    assert set(seq).issubset(set('IO'))
    # divide into contiguous chunks
    chunks = [list(g) for _, g in itertools.groupby(seq)]
    # remap to new tagging scheme
    seq = list(itertools.chain.from_iterable(
        [tags if 'O' in tags else entity_tag(len(tags), tag_fmt)
         for tags in chunks]
    ))
    return [t if t == 'O' else f'{t}-{etype}' for t in seq]


def tokens_to_sequences(y_gold,
                        y_pred,
                        seq_lens,
                        idx2tag=None,
                        tag_fmt=None):
    """Convert token labels to sentences for sequence model evaluation.

    Parameters
    ----------
    y_gold
    y_pred
    seq_lens
    idx2tag
    tag_fmt

    Returns
    -------

    """
    idx2tag = {1: 'I', 0: 'O'} if not idx2tag else idx2tag
    y_gold_seqs = []
    for s in split_by_seq_len(y_gold, seq_lens):
        y = [idx2tag[i] for i in s]
        if tag_fmt is not None:
            y_hat = convert_tag_fmt(y, etype='ENTITY', tag_fmt='IOB')
        else:
            y_hat = y
        y_gold_seqs.append(y_hat)

    y_pred_seqs = []
    for s in split_by_seq_len(y_pred, seq_lens):
        # Sometimes -1 labels make it into evaluation due to Snorkel
        # label model. Just treat these as 'O'
        y = [idx2tag[i] if i in idx2tag else 'O' for i in s]
        if tag_fmt is not None:
            y_hat = convert_tag_fmt(y, etype='ENTITY', tag_fmt='IOB')
        else:
            y_hat = y
        y_pred_seqs.append(y_hat)

    return y_gold_seqs, y_pred_seqs


def score_sequences(y_true: List[List[int]],
                    y_pred: List[List[int]],
                    metrics: Set[str] = None) -> Dict[str, float]:
    """
    Sequence model evaluation using seqeval
    https://github.com/chakki-works/seqeval

    Parameters
    ----------
    y_gold
    y_pred

    Returns
    -------

    """
    scorers = {
        'accuracy': sklearn.metrics.accuracy_score,
        'precision': sklearn.metrics.precision_score,
        'recall': sklearn.metrics.recall_score,
        'f1': sklearn.metrics.f1_score
    }
    metrics = metrics if metrics is not None else scorers
    try:
        return {name: scorers[name](y_true, y_pred, average='macro') for name in metrics}
    except:
        return {name: 0.0 for name in metrics}


def eval_label_model(model, L, Y, seq_lens):

    idx2tag = {0: 'O', 1: 'I-X', 2: 'B-X'}

    # label model
    y_pred = model.predict(L)
    scores = score_sequences(*tokens_to_sequences(Y, y_pred, seq_lens, idx2tag=idx2tag))
    print('[Label Model]   {}'.format(
        ' | '.join([f'{m}: {v * 100:2.2f}' for m, v in scores.items()]))
    )

    # MV baseline
    y_pred = mv(L, 0)
    scores = score_sequences(*tokens_to_sequences(Y, y_pred, seq_lens, idx2tag=idx2tag))
    print('[Majority Vote] {}'.format(
        ' | '.join([f'{m}: {v * 100:2.2f}' for m, v in scores.items()]))
    )

In [7]:
import numpy as np
from itertools import product
from sklearn.metrics import (
    precision_score, recall_score,
    f1_score, accuracy_score,
    precision_recall_fscore_support
)




def sample_param_grid(param_grid, seed):
    """ Sample parameter grid

    :param param_grid:
    :param seed:
    :return:
    """
    rstate = np.random.get_state()
    np.random.seed(seed)
    params = list(product(*[param_grid[name] for name in param_grid]))
    np.random.shuffle(params)
    np.random.set_state(rstate)
    return params


def compute_metrics(y_gold, y_pred, average='binary'):
    """

    :param y_gold:
    :param y_pred:
    :param average:
    :return:
    """
    return {
        'accuracy': accuracy_score(y_gold, y_pred),
        'precision': precision_score(y_gold, y_pred, average=average),
        'recall': recall_score(y_gold, y_pred, average=average),
        'f1': f1_score(y_gold, y_pred, average=average)
    }


def grid_search_span(model_class,
                     model_class_init,
                     param_grid,
                     train=None,
                     dev=None,
                     n_model_search=5,
                     val_metric='f1',
                     seed=1234,
                     verbose=True):
    """Simple grid search helper function

    """
    L_train, Y_train = train if len(train) == 2 else (train[0], None)
    L_dev, Y_dev = dev

    # sample configs
    params = sample_param_grid(param_grid, seed)[:n_model_search]

    defaults = {'seed': seed}
    best_score, best_config = 0.0, None
    # set scoring mode based on the number of classes
    average = 'binary' if np.unique(Y_dev).shape[0] == 2 else 'micro'

    print(f"Grid search over {len(params)} configs")
    print(f'Averaging: {average}')

    for i, config in enumerate(params):
        print(f'[{i}] Label Model')
        config = dict(zip(param_grid.keys(), config))
        # update default params if not specified
        config.update({
            param: value for param, value in defaults.items() \
            if param not in config})

        model = model_class(**model_class_init)
        # fit (estimate class balance with Y_dev)
        model.fit(L_train, Y_dev, **config)

        y_pred = model.predict(L_dev)
        y_gold = Y_dev

        # Snorkel sometimes emits -1 predictions
        if -1 in y_pred:
            continue

        # only evaluate dev score
        mask = []
        for i in range(L_dev.shape[0]):
            if not np.all(L_dev[i] == -1):
                mask.append(i)

        mask = np.array(mask)
        metrics = compute_metrics(Y_dev[mask], model.predict(L_dev[mask]))

        msgs = []
        if not best_score or metrics[val_metric] > best_score[val_metric]:
            print(config)
            best_score = metrics
            best_config = config

            # mask uncovered data points
            mask = [i for i in range(L_train.shape[0]) \
                    if not np.all(L_train[i] == -1)]
            msgs.append(
                f'Coverage: {(len(mask) / L_train.shape[0] * 100):2.1f}%'
            )

            if Y_train is not None:
                # filter out candidate spans without gold labels
                y_mask = [i for i in range(len(Y_train)) if Y_train[i] != -1]
                mask = np.array(sorted(list(set(y_mask).intersection(mask))))
                metrics = compute_metrics(Y_train[mask],
                                          model.predict(L_train[mask]))
                msgs.append(
                    'TRAIN {}'.format(' | '.join(
                        [f'{m}: {v * 100:2.2f}' for m, v in metrics.items()])
                    )
                )

            msgs.append(
                'DEV   {}'.format(' | '.join(
                    [f'{m}: {v * 100:2.2f}' for m, v in best_score.items()]))
            )

        if verbose and msgs:
            print('\n'.join(msgs) + ('\n' + '-' * 80))

        if i % 50 == 0:
            print(f'[{i}] Label Model')

    # retrain best model
    if verbose:
        print('BEST')
        print(best_config)
    model = model_class(**model_class_init)
    model.fit(L_train, Y_dev, **best_config)
    return model, best_config


def grid_search(model_class,
                model_class_init,
                param_grid,
                train=None,
                dev=None,
                other_train=None,
                n_model_search=5,
                val_metric='f1',
                seed=1234,
                seq_eval=True,
                checkpoint_gt_mv=True,
                tag_fmt_ckpnt='BIO'):
    """Simple grid search helper function

    Parameters
    ----------
    model_class
    model_class_init
    param_grid
    train
    dev
    n_model_search
    val_metric
    seed
    seq_eval

    Returns
    -------

    """
    print(f"Using {'TOKEN' if not seq_eval else 'SEQUENCE'} dev checkpointing")
    if seq_eval:
        print(f"Using {tag_fmt_ckpnt} dev checkpointing")

    idx2tag = {0:'O', 1:'I-X', 2:'B-X'}

    L_train, Y_train, X_train_lens = train
    L_dev, Y_dev, X_dev_lens = dev

    # sample configs
    params = sample_param_grid(param_grid, seed)[:n_model_search]

    defaults = {'seed': seed}
    best_score, best_config = 0.0, None
    print(f"Grid search over {len(params)} configs")

    for i, config in enumerate(params):
        print(f'[{i}] Label Model')
        config = dict(zip(param_grid.keys(), config))
        # update default params if not specified
        config.update({param: value for param, value in defaults.items() if param not in config})

        model = model_class(**model_class_init)
        # fit (estimate class balance with Y_dev)
        # HACK for BIO tag evaluation
        if len(np.unique(Y_dev)) != 2:
            Y_dev_hat = np.array([0 if y == 0 else 1 for y in Y_dev])
        else:
            Y_dev_hat = Y_dev
        model.fit(L_train, Y_dev_hat, **config)

        y_pred = model.predict(L_dev)

        # set gold tags for evaluation
        if tag_fmt_ckpnt == 'IO':
            y_gold = np.array([0 if y == 0 else 1 for y in Y_dev])
        else:
            y_gold = Y_dev

        if -1 in y_pred:
            print("Label model predicted -1 (TODO: this happens inconsistently)")
            continue

        # score on dev set (token or sequence-level)
        if seq_eval:
            metrics = score_sequences(*tokens_to_sequences(y_gold, y_pred, X_dev_lens, idx2tag=idx2tag))
        else:
            # use internal label model scorer
            metrics = model.score(L=L_dev,
                                  Y=y_gold,
                                  metrics=['accuracy', 'precision', 'recall', 'f1'],
                                  tie_break_policy=0)

        # compare learned model against MV on same labeled dev set
        # skip if LM less than MV
        if checkpoint_gt_mv:
            if seq_eval:
                mv_y_pred = mv(L_dev, 0)
                mv_metrics = score_sequences(
                    *tokens_to_sequences(y_gold, mv_y_pred, X_dev_lens, idx2tag=idx2tag)
                )
            else:
                metrics = model.score(L=L_dev,
                                      Y=y_gold,
                                      metrics=['accuracy', 'precision', 'recall', 'f1'],
                                      tie_break_policy=0)

            if metrics[val_metric] < metrics[val_metric]:
                continue

        if not best_score or metrics[val_metric] > best_score[val_metric]:
            print(config)
            best_score = metrics
            best_config = config

            # print training set score if we have labeled data
            if np.any(Y_train):
                y_pred = model.predict(L_train)

                if tag_fmt_ckpnt == 'IO':
                    y_gold = np.array([0 if y == 0 else 1 for y in Y_train])
                else:
                    y_gold = Y_train

                if seq_eval:
                    metrics = score_sequences(*tokens_to_sequences(y_gold, y_pred, X_train_lens, idx2tag=idx2tag))
                else:
                    metrics = model.score(L=L_train,
                                          Y=y_gold,
                                          metrics=['accuracy', 'precision', 'recall', 'f1'],
                                          tie_break_policy=0)

                print('[TRAIN] {}'.format(' | '.join([f'{m}: {v * 100:2.2f}' for m, v in metrics.items()])))

            print('[DEV]   {}'.format(' | '.join([f'{m}: {v * 100:2.2f}' for m, v in best_score.items()])))
            print('-' * 88)

    # retrain best model
    print('BEST')
    print(best_config)
    model = model_class(**model_class_init)

    # HACK for BIO tag evaluation
    if len(np.unique(Y_dev)) != 2:
        Y_dev_hat = np.array([0 if y == 0 else 1 for y in Y_dev])
    else:
        Y_dev_hat = Y_dev
    model.fit(L_train, Y_dev_hat, **best_config)
    return model, best_config

In [8]:
def list2Nested(l, nested_length):
    return [l[i:i+nested_length] for i in range(0, len(l), nested_length)]

In [9]:
# Fetch UMLS ranks

sum_lf_p = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_p_summary_train.csv'
sum_lf_i = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_i_summary_train.csv'
sum_lf_o = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/lf_o_summary_train.csv'


def fetchRank(sum_lf_d):
    
    ranked_umls_coverage = dict()    
    umls_coverage_ = dict()
    
    data=pd.read_csv(sum_lf_d, sep='\t')
    
    for index, row in data.iterrows():
        if row[0].startswith('UMLS_fuzzy_'):
            umls_coverage_[row[0]] = row[3]
    
    umls_coverage_sorted = sorted(umls_coverage_.items(), key=lambda x: x[1], reverse=True)
    
    for i in umls_coverage_sorted:
        k = str(i[0]).split('_')[-1]
        ranked_umls_coverage[k] = i[1]

    return ranked_umls_coverage

ranksorted_p_umls = fetchRank(sum_lf_p)
ranksorted_i_umls = fetchRank(sum_lf_i)
ranksorted_o_umls = fetchRank(sum_lf_o)

In [10]:
# Partition LF's

def partitionLFs(umls_d):
    
    keys = list(umls_d.keys())

    partitioned_lfs = [ ]
    
    for i in range( 0, len(keys) ):

        if i == 0 or i == len(keys):
            if i == 0:
                partitioned_lfs.append( [keys] )
            if i ==len(keys):
                temp3 = list2Nested(keys, 1)
                partitioned_lfs.append( temp3 )
        else:
            temp1, temp2 = keys[:i] , keys[i:]
            temp3 = list2Nested( keys[:i], 1)
            temp3.append( keys[i:] )
            partitioned_lfs.append( temp3 )
    
    return partitioned_lfs


partitioned_p_umls = partitionLFs(ranksorted_p_umls)
partitioned_i_umls = partitionLFs(ranksorted_i_umls)
partitioned_o_umls = partitionLFs(ranksorted_o_umls)

In [11]:
import LMutils

# validation_labels   
# validation_labels_tui_pio2   
file = '/mnt/nas2/results/Results/systematicReview/distant_pico/EBM_PICO_GT/validation_labels_tui_pio2.tsv'
df_data = pd.read_csv(file, sep='\t', header=0)

In [12]:
Y_tokens = df_data['tokens']

In [13]:
Y_tokens = df_data['tokens']
#Y_p = df_data['p']
#Y_i = df_data['i']
#Y_o = df_data['o']
df_data_train, df_data_val = train_test_split(df_data, test_size=0.20, shuffle=False)

In [14]:
splits = ['train', 'dev']
X_sents = [
    df_data_train.tokens,
    df_data_val.tokens,
]

In [15]:
X_seq_lens = [
    np.array([len(str(s)) for s in X_sents[i]])
    for i,name in enumerate(splits)
]

In [16]:
X_seq_lens = [
    np.array( [ len(X_sents[i]) ] )
    for i,name in enumerate(splits)
]

In [17]:
# Read Candidate labels from multiple LFs
indir = '/mnt/nas2/results/Results/systematicReview/distant_pico/candidate_generation'
pathlist = Path(indir).glob('**/*.tsv')

tokens = []

lfs = dict()

for file in pathlist:

    k = str( file ).split('candidate_generation/')[-1].replace('.tsv', '').replace('/', '_')
    mypath = Path(file)
    if mypath.stat().st_size != 0:
        data = pd.read_csv(file, sep='\t', header=0)
    if len(tokens) == 0:
        tokens.extend( list(data.tokens) )
    
    sab = data.columns[-1]
    if len(list( data[sab] )) == 1354953:
        lfs[str(k)] = list( data[sab] )[:len(Y_tokens)]


print( 'Total number of tokens in validation set: ', len(tokens) )
print( 'Total number of LFs in the dictionary', len(lfs) )

Total number of tokens in validation set:  1354953
Total number of LFs in the dictionary 613


In [18]:
def lf_levels(umls_d:dict, pattern:str, picos:str):

    umls_level = dict()

    for key, value in umls_d.items():   # iter on both keys and values
        search_pattern = pattern + picos
        if key.startswith(search_pattern):
            k = str(key).split('_')[-1]
            umls_level[ k ] = value

    return umls_level

# Level 1: UMLS
umls_p = lf_levels(lfs, 'UMLS_fuzzy_', 'p')
umls_i = lf_levels(lfs, 'UMLS_fuzzy_', 'i')
umls_o = lf_levels(lfs, 'UMLS_fuzzy_', 'o')

# Level 2: non UMLS
nonumls_p = lf_levels(lfs, 'nonUMLS_fuzzy_', 'P')
nonumls_i = lf_levels(lfs, 'nonUMLS_fuzzy_', 'I')
nonumls_o = lf_levels(lfs, 'nonUMLS_fuzzy_', 'O')

# Level 3: DS
ds_p = lf_levels(lfs, 'DS_fuzzy_', 'P')
ds_i = lf_levels(lfs, 'DS_fuzzy_', 'I')
ds_o = lf_levels(lfs, 'DS_fuzzy_', 'O')

# Level 4: dictionary, rules, heuristics
heur_p = lf_levels(lfs, 'heuristics_direct_', 'P')
heur_i = lf_levels(lfs, 'heuristics_direct_', 'I')
heur_o = lf_levels(lfs, 'heuristics_direct_', 'O')

dict_p = lf_levels(lfs, 'dictionary_direct_', 'P')
dict_i = lf_levels(lfs, 'dictionary_direct_', 'I')
dict_o = lf_levels(lfs, 'dictionary_direct_', 'O')

In [19]:
def compare(s, t):
    return sorted(s) == sorted(t)

def getLFs(partition:list, umls_d:dict, seed_len:int):

    all_lfs_combined = []
    
    for lf in partition: # for each lf in a partition
        
        combine_here = [0] * seed_len

        for sab in lf:
            new_a = umls_d[sab]
            old_a = combine_here
            temp_a = []
            for o_a, n_a in zip(old_a, new_a):
                if compare([o_a, n_a] ,[-1, 1]) == True:
                    replace_a = max( o_a, n_a )
                    temp_a.append( replace_a )
                elif compare([o_a, n_a] ,[0, 1]) == True:
                    replace_a = max( o_a, n_a )
                    temp_a.append( replace_a )
                elif compare([o_a, n_a] ,[-1, 0]) == True:
                    replace_a = min( o_a, n_a )
                    temp_a.append( replace_a )
                else:
                    temp_a.append( o_a )

            combine_here = temp_a

        all_lfs_combined.append( combine_here )

    return all_lfs_combined

In [20]:
def train(partitioned_d_umls, umls_d, non_umls_d, ds_d, heur_d, dict_d, df_data_train, df_data_val, picos):
   
    model_class_init = {
        'cardinality': 2, 
        'verbose': True
    }


    '''#########################################################################
    # Choosing the number of LF's from UMLS all
    #########################################################################'''
    
    for i, partition in enumerate(partitioned_d_umls):

        combined_lf = getLFs(partition, umls_d, len(Y_tokens))
        assert len(partition) == len(combined_lf)

        print( 'Total number of UMLS partitions: ', len(partition) )
        combined_lf.extend( list(non_umls_d.values()) ) # Combine with level 2
        combined_lf.extend( list(ds_d.values()) ) # Combine with level 3
        combined_lf.extend( list(heur_d.values()) ) # Combine with level 4
        combined_lf.extend( list(dict_d.values()) ) # combine with level 4

        L = np.array(combined_lf)
        L = np.transpose(L)
        L_train, L_val = train_test_split(L, test_size=0.20, shuffle=False)

        Y_train = df_data_train[picos]
        Y_val = df_data_val[picos]
      
        model = LMsnorkel(cardinality=2)
        #model.fit( L_train, n_epochs=2000, mu_eps=0.0000001, lr=0.001 )
        model.fit( L_train, n_epochs=2000, mu_eps=0.0000001, lr=0.001, optimizer='adam' )
        
        preds = model.predict( L_val )
        cr = classification_report( preds, Y_val )
        scores = model.score( L_val, Y_val, metrics=['f1_macro'] )
        
        
        print( cr )
        print( scores )

In [21]:
train(partitioned_p_umls, umls_p, nonumls_p, ds_p, heur_p, dict_p, df_data_train, df_data_val, 'p')

Total number of UMLS partitions:  1




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245557
           1       0.14      0.18      0.16     25434

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5274760046581259}
Total number of UMLS partitions:  2




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245451
           1       0.14      0.18      0.16     25540

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5272705067679955}
Total number of UMLS partitions:  3




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245389
           1       0.14      0.18      0.16     25602

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5275549365252}
Total number of UMLS partitions:  4




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245360
           1       0.14      0.18      0.16     25631

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.527806543889889}
Total number of UMLS partitions:  5




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245301
           1       0.14      0.18      0.16     25690

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5277360962143686}
Total number of UMLS partitions:  6




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245256
           1       0.14      0.18      0.16     25735

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5277194079425347}
Total number of UMLS partitions:  7




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245256
           1       0.14      0.18      0.16     25735

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5277194079425347}
Total number of UMLS partitions:  8




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245256
           1       0.14      0.18      0.16     25735

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5277194079425347}
Total number of UMLS partitions:  9




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245165
           1       0.14      0.18      0.16     25826

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5276264870688147}
Total number of UMLS partitions:  10




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245179
           1       0.14      0.18      0.16     25812

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5276612241926623}
Total number of UMLS partitions:  11




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245179
           1       0.14      0.18      0.16     25812

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5276802042929518}
Total number of UMLS partitions:  12




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    244661
           1       0.14      0.18      0.16     26330

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.83      0.82      0.82    270991

{'f1_macro': 0.5281328936590346}
Total number of UMLS partitions:  13




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    244757
           1       0.14      0.18      0.16     26234

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5282963385711005}
Total number of UMLS partitions:  14




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    244852
           1       0.14      0.18      0.16     26139

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5291374377363072}
Total number of UMLS partitions:  15




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245720
           1       0.14      0.19      0.16     25271

    accuracy                           0.82    270991
   macro avg       0.53      0.53      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5295071979426323}
Total number of UMLS partitions:  16




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245399
           1       0.14      0.19      0.16     25592

    accuracy                           0.82    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5302837513561901}
Total number of UMLS partitions:  17




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245493
           1       0.14      0.19      0.16     25498

    accuracy                           0.82    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5304828130747223}
Total number of UMLS partitions:  18




              precision    recall  f1-score   support

           0       0.91      0.88      0.90    245851
           1       0.14      0.19      0.16     25140

    accuracy                           0.82    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.84      0.82      0.83    270991

{'f1_macro': 0.5304116813474122}
Total number of UMLS partitions:  19




              precision    recall  f1-score   support

           0       0.92      0.88      0.90    248169
           1       0.14      0.20      0.16     22822

    accuracy                           0.83    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.86      0.83      0.84    270991

{'f1_macro': 0.532034694021219}
Total number of UMLS partitions:  20




              precision    recall  f1-score   support

           0       0.92      0.88      0.90    247998
           1       0.14      0.20      0.16     22993

    accuracy                           0.83    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.86      0.83      0.84    270991

{'f1_macro': 0.531914144125108}
Total number of UMLS partitions:  21




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    257052
           1       0.12      0.28      0.17     13939

    accuracy                           0.85    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.91      0.85      0.88    270991

{'f1_macro': 0.5438266179699208}
Total number of UMLS partitions:  22




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    257317
           1       0.12      0.28      0.17     13674

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.88    270991

{'f1_macro': 0.5428631013350294}
Total number of UMLS partitions:  23




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258882
           1       0.11      0.30      0.16     12109

    accuracy                           0.86    270991
   macro avg       0.54      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5404392579499879}
Total number of UMLS partitions:  24




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259641
           1       0.10      0.31      0.16     11350

    accuracy                           0.86    270991
   macro avg       0.54      0.60      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5396998809982368}
Total number of UMLS partitions:  25




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    260266
           1       0.10      0.31      0.15     10725

    accuracy                           0.86    270991
   macro avg       0.53      0.60      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5373998252237397}
Total number of UMLS partitions:  26




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259924
           1       0.10      0.30      0.15     11067

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5379491711943838}
Total number of UMLS partitions:  27




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259890
           1       0.10      0.30      0.15     11101

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5379034482705422}
Total number of UMLS partitions:  28




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259836
           1       0.10      0.30      0.15     11155

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5381449164442954}
Total number of UMLS partitions:  29




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259828
           1       0.10      0.30      0.15     11163

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5381225915832561}
Total number of UMLS partitions:  30




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259790
           1       0.10      0.30      0.15     11201

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5383591570085335}
Total number of UMLS partitions:  31




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259550
           1       0.10      0.30      0.15     11441

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5387618550687951}
Total number of UMLS partitions:  32




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259418
           1       0.10      0.30      0.15     11573

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5387086844691252}
Total number of UMLS partitions:  33




              precision    recall  f1-score   support

           0       0.97      0.88      0.92    259416
           1       0.10      0.30      0.15     11575

    accuracy                           0.86    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5387030926886975}
Total number of UMLS partitions:  34




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258342
           1       0.11      0.29      0.16     12649

    accuracy                           0.86    270991
   macro avg       0.54      0.59      0.54    270991
weighted avg       0.92      0.86      0.89    270991

{'f1_macro': 0.5408267916406049}
Total number of UMLS partitions:  35




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258056
           1       0.11      0.28      0.16     12935

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.89    270991

{'f1_macro': 0.5402353464510513}
Total number of UMLS partitions:  36




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258058
           1       0.11      0.28      0.16     12933

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.89    270991

{'f1_macro': 0.5402409582212769}
Total number of UMLS partitions:  37




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258058
           1       0.11      0.28      0.16     12933

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.89    270991

{'f1_macro': 0.5402409582212769}
Total number of UMLS partitions:  38




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258064
           1       0.11      0.28      0.16     12927

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.89    270991

{'f1_macro': 0.5402577952095263}
Total number of UMLS partitions:  39




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    258077
           1       0.11      0.28      0.16     12914

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.89    270991

{'f1_macro': 0.5402942839874324}
Total number of UMLS partitions:  40




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    257788
           1       0.11      0.29      0.16     13203

    accuracy                           0.86    270991
   macro avg       0.54      0.59      0.54    270991
weighted avg       0.92      0.86      0.88    270991

{'f1_macro': 0.5416252386470387}
Total number of UMLS partitions:  41




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    257347
           1       0.12      0.28      0.16     13644

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.88    270991

{'f1_macro': 0.542598860290865}
Total number of UMLS partitions:  42




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    257385
           1       0.12      0.28      0.16     13606

    accuracy                           0.86    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.92      0.86      0.88    270991

{'f1_macro': 0.5425435834655317}
Total number of UMLS partitions:  43




              precision    recall  f1-score   support

           0       0.92      0.88      0.90    246885
           1       0.14      0.19      0.16     24106

    accuracy                           0.82    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.85      0.82      0.83    270991

{'f1_macro': 0.5293070692506696}
Total number of UMLS partitions:  44




              precision    recall  f1-score   support

           0       0.92      0.88      0.90    246986
           1       0.14      0.19      0.16     24005

    accuracy                           0.82    270991
   macro avg       0.53      0.54      0.53    270991
weighted avg       0.85      0.82      0.83    270991

{'f1_macro': 0.5290735284346096}
Total number of UMLS partitions:  45




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255369
           1       0.12      0.26      0.17     15622

    accuracy                           0.85    270991
   macro avg       0.54      0.57      0.54    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5414448995936689}
Total number of UMLS partitions:  46




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255241
           1       0.13      0.26      0.17     15750

    accuracy                           0.85    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5435975015432797}
Total number of UMLS partitions:  47




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255768
           1       0.12      0.27      0.17     15223

    accuracy                           0.85    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.91      0.85      0.88    270991

{'f1_macro': 0.5428351404465072}
Total number of UMLS partitions:  48




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255952
           1       0.12      0.27      0.17     15039

    accuracy                           0.85    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.91      0.85      0.88    270991

{'f1_macro': 0.5428107101147026}
Total number of UMLS partitions:  49




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    256023
           1       0.12      0.27      0.17     14968

    accuracy                           0.85    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.91      0.85      0.88    270991

{'f1_macro': 0.5425567775349975}
Total number of UMLS partitions:  50




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255958
           1       0.12      0.27      0.17     15033

    accuracy                           0.85    270991
   macro avg       0.54      0.58      0.54    270991
weighted avg       0.91      0.85      0.88    270991

{'f1_macro': 0.5445758390814586}
Total number of UMLS partitions:  51




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    255515
           1       0.16      0.34      0.21     15476

    accuracy                           0.86    270991
   macro avg       0.56      0.61      0.57    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5675458578764905}
Total number of UMLS partitions:  52




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    255464
           1       0.16      0.33      0.21     15527

    accuracy                           0.86    270991
   macro avg       0.56      0.61      0.57    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5674464469739096}
Total number of UMLS partitions:  53




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    255547
           1       0.16      0.34      0.21     15444

    accuracy                           0.86    270991
   macro avg       0.56      0.61      0.57    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5675155061072596}
Total number of UMLS partitions:  54




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    254800
           1       0.17      0.35      0.23     16191

    accuracy                           0.86    270991
   macro avg       0.56      0.62      0.58    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5758493106049511}
Total number of UMLS partitions:  55




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    254984
           1       0.17      0.35      0.23     16007

    accuracy                           0.86    270991
   macro avg       0.56      0.62      0.57    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5741588113434045}
Total number of UMLS partitions:  56




              precision    recall  f1-score   support

           0       0.96      0.89      0.92    254725
           1       0.17      0.35      0.23     16266

    accuracy                           0.86    270991
   macro avg       0.56      0.62      0.58    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5756605212593807}
Total number of UMLS partitions:  57




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253990
           1       0.17      0.34      0.23     17001

    accuracy                           0.86    270991
   macro avg       0.56      0.61      0.57    270991
weighted avg       0.90      0.86      0.88    270991

{'f1_macro': 0.5744533852436252}
Total number of UMLS partitions:  58




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253876
           1       0.17      0.34      0.23     17115

    accuracy                           0.86    270991
   macro avg       0.56      0.62      0.58    270991
weighted avg       0.90      0.86      0.88    270991

{'f1_macro': 0.5757319558181513}
Total number of UMLS partitions:  59




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252926
           1       0.18      0.33      0.23     18065

    accuracy                           0.85    270991
   macro avg       0.56      0.61      0.58    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5758463691246715}
Total number of UMLS partitions:  60




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252898
           1       0.18      0.33      0.23     18093

    accuracy                           0.85    270991
   macro avg       0.56      0.61      0.58    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5762679926652328}
Total number of UMLS partitions:  61




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253111
           1       0.18      0.33      0.23     17880

    accuracy                           0.86    270991
   macro avg       0.56      0.61      0.58    270991
weighted avg       0.90      0.86      0.87    270991

{'f1_macro': 0.5764908888690703}
Total number of UMLS partitions:  62




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252866
           1       0.18      0.33      0.24     18125

    accuracy                           0.85    270991
   macro avg       0.57      0.61      0.58    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.577534641451636}
Total number of UMLS partitions:  63




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252761
           1       0.18      0.33      0.24     18230

    accuracy                           0.85    270991
   macro avg       0.57      0.61      0.58    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5777795940545045}
Total number of UMLS partitions:  64




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252480
           1       0.18      0.33      0.24     18511

    accuracy                           0.85    270991
   macro avg       0.57      0.61      0.58    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5775771560308435}
Total number of UMLS partitions:  65




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252311
           1       0.18      0.33      0.24     18680

    accuracy                           0.85    270991
   macro avg       0.57      0.61      0.58    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5776029084722311}
Total number of UMLS partitions:  66




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    252034
           1       0.19      0.33      0.24     18957

    accuracy                           0.85    270991
   macro avg       0.57      0.61      0.58    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5780472050566368}
Total number of UMLS partitions:  67




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    250467
           1       0.19      0.31      0.24     20524

    accuracy                           0.85    270991
   macro avg       0.57      0.60      0.58    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.577144473075367}
Total number of UMLS partitions:  68




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    250499
           1       0.19      0.31      0.24     20492

    accuracy                           0.85    270991
   macro avg       0.57      0.60      0.58    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.5772095948203292}
Total number of UMLS partitions:  69




              precision    recall  f1-score   support

           0       0.94      0.89      0.91    249500
           1       0.20      0.31      0.24     21491

    accuracy                           0.85    270991
   macro avg       0.57      0.60      0.58    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.5772949340826197}
Total number of UMLS partitions:  70




              precision    recall  f1-score   support

           0       0.94      0.89      0.91    249711
           1       0.20      0.31      0.24     21280

    accuracy                           0.85    270991
   macro avg       0.57      0.60      0.58    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.5777494004502668}
Total number of UMLS partitions:  71




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    249793
           1       0.20      0.31      0.24     21198

    accuracy                           0.85    270991
   macro avg       0.57      0.60      0.58    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.5778992254726523}
Total number of UMLS partitions:  72




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    249807
           1       0.20      0.31      0.24     21184

    accuracy                           0.85    270991
   macro avg       0.57      0.60      0.58    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.578190469379257}


In [22]:
train(partitioned_i_umls, umls_i, nonumls_i, ds_i, heur_i, dict_i, df_data_train, df_data_val, 'i')

Total number of UMLS partitions:  1




              precision    recall  f1-score   support

           0       0.78      0.93      0.85    206888
           1       0.41      0.16      0.23     64103

    accuracy                           0.75    270991
   macro avg       0.59      0.54      0.54    270991
weighted avg       0.69      0.75      0.70    270991

{'f1_macro': 0.539295174591835}
Total number of UMLS partitions:  2




              precision    recall  f1-score   support

           0       0.80      0.93      0.86    211470
           1       0.40      0.17      0.24     59521

    accuracy                           0.76    270991
   macro avg       0.60      0.55      0.55    270991
weighted avg       0.71      0.76      0.72    270991

{'f1_macro': 0.5486688976176318}
Total number of UMLS partitions:  3




              precision    recall  f1-score   support

           0       0.80      0.93      0.86    212623
           1       0.39      0.17      0.24     58368

    accuracy                           0.76    270991
   macro avg       0.60      0.55      0.55    270991
weighted avg       0.72      0.76      0.73    270991

{'f1_macro': 0.5499396772710761}
Total number of UMLS partitions:  4




              precision    recall  f1-score   support

           0       0.81      0.93      0.87    215246
           1       0.38      0.17      0.24     55745

    accuracy                           0.77    270991
   macro avg       0.60      0.55      0.55    270991
weighted avg       0.72      0.77      0.74    270991

{'f1_macro': 0.5512573751877873}
Total number of UMLS partitions:  5




              precision    recall  f1-score   support

           0       0.89      0.93      0.91    234549
           1       0.33      0.23      0.27     36442

    accuracy                           0.83    270991
   macro avg       0.61      0.58      0.59    270991
weighted avg       0.81      0.83      0.82    270991

{'f1_macro': 0.5861801582215389}
Total number of UMLS partitions:  6




              precision    recall  f1-score   support

           0       0.90      0.93      0.91    238308
           1       0.31      0.24      0.27     32683

    accuracy                           0.84    270991
   macro avg       0.60      0.58      0.59    270991
weighted avg       0.83      0.84      0.83    270991

{'f1_macro': 0.5911166529653048}
Total number of UMLS partitions:  7




              precision    recall  f1-score   support

           0       0.92      0.92      0.92    245578
           1       0.27      0.27      0.27     25413

    accuracy                           0.86    270991
   macro avg       0.60      0.60      0.60    270991
weighted avg       0.86      0.86      0.86    270991

{'f1_macro': 0.5977665973907892}
Total number of UMLS partitions:  8




              precision    recall  f1-score   support

           0       0.93      0.93      0.93    246091
           1       0.27      0.28      0.27     24900

    accuracy                           0.87    270991
   macro avg       0.60      0.60      0.60    270991
weighted avg       0.87      0.87      0.87    270991

{'f1_macro': 0.599666898871669}
Total number of UMLS partitions:  9




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    246275
           1       0.27      0.27      0.27     24716

    accuracy                           0.87    270991
   macro avg       0.60      0.60      0.60    270991
weighted avg       0.87      0.87      0.87    270991

{'f1_macro': 0.5987193071674344}
Total number of UMLS partitions:  10




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    246293
           1       0.27      0.27      0.27     24698

    accuracy                           0.87    270991
   macro avg       0.60      0.60      0.60    270991
weighted avg       0.87      0.87      0.87    270991

{'f1_macro': 0.5986336342817021}
Total number of UMLS partitions:  11




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    249200
           1       0.20      0.24      0.22     21791

    accuracy                           0.86    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.87      0.86      0.87    270991

{'f1_macro': 0.5721504282843115}
Total number of UMLS partitions:  12




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    249041
           1       0.20      0.23      0.22     21950

    accuracy                           0.86    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.87      0.86      0.87    270991

{'f1_macro': 0.5709378573170726}
Total number of UMLS partitions:  13




              precision    recall  f1-score   support

           0       0.97      0.92      0.94    260073
           1       0.13      0.30      0.18     10918

    accuracy                           0.89    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.93      0.89      0.91    270991

{'f1_macro': 0.5601698394545112}
Total number of UMLS partitions:  14




              precision    recall  f1-score   support

           0       0.97      0.92      0.94    260061
           1       0.13      0.30      0.18     10930

    accuracy                           0.89    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.93      0.89      0.91    270991

{'f1_macro': 0.5607484740521336}
Total number of UMLS partitions:  15




              precision    recall  f1-score   support

           0       0.96      0.92      0.94    258423
           1       0.14      0.29      0.19     12568

    accuracy                           0.89    270991
   macro avg       0.55      0.60      0.57    270991
weighted avg       0.93      0.89      0.90    270991

{'f1_macro': 0.5652911354504772}
Total number of UMLS partitions:  16




              precision    recall  f1-score   support

           0       0.96      0.92      0.94    258294
           1       0.14      0.29      0.19     12697

    accuracy                           0.89    270991
   macro avg       0.55      0.60      0.57    270991
weighted avg       0.92      0.89      0.90    270991

{'f1_macro': 0.5656793722337516}
Total number of UMLS partitions:  17




              precision    recall  f1-score   support

           0       0.97      0.92      0.94    259334
           1       0.13      0.29      0.18     11657

    accuracy                           0.89    270991
   macro avg       0.55      0.60      0.56    270991
weighted avg       0.93      0.89      0.91    270991

{'f1_macro': 0.5599558570736477}
Total number of UMLS partitions:  18




              precision    recall  f1-score   support

           0       0.97      0.92      0.94    259025
           1       0.13      0.28      0.18     11966

    accuracy                           0.89    270991
   macro avg       0.55      0.60      0.56    270991
weighted avg       0.93      0.89      0.91    270991

{'f1_macro': 0.5611322064184675}
Total number of UMLS partitions:  19




              precision    recall  f1-score   support

           0       0.97      0.92      0.94    259040
           1       0.13      0.28      0.18     11951

    accuracy                           0.89    270991
   macro avg       0.55      0.60      0.56    270991
weighted avg       0.93      0.89      0.91    270991

{'f1_macro': 0.5612135310884424}
Total number of UMLS partitions:  20




              precision    recall  f1-score   support

           0       0.96      0.92      0.94    257532
           1       0.14      0.26      0.18     13459

    accuracy                           0.88    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.92      0.88      0.90    270991

{'f1_macro': 0.5595149517745668}
Total number of UMLS partitions:  21




              precision    recall  f1-score   support

           0       0.96      0.92      0.94    257202
           1       0.14      0.26      0.18     13789

    accuracy                           0.88    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.92      0.88      0.90    270991

{'f1_macro': 0.5596106074866305}
Total number of UMLS partitions:  22




              precision    recall  f1-score   support

           0       0.96      0.92      0.94    256324
           1       0.15      0.25      0.19     14667

    accuracy                           0.88    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.91      0.88      0.89    270991

{'f1_macro': 0.5601029532202182}
Total number of UMLS partitions:  23




              precision    recall  f1-score   support

           0       0.96      0.92      0.94    256321
           1       0.15      0.25      0.19     14670

    accuracy                           0.88    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.91      0.88      0.89    270991

{'f1_macro': 0.5605248104476996}
Total number of UMLS partitions:  24




              precision    recall  f1-score   support

           0       0.95      0.92      0.93    254953
           1       0.16      0.25      0.20     16038

    accuracy                           0.88    270991
   macro avg       0.56      0.58      0.56    270991
weighted avg       0.90      0.88      0.89    270991

{'f1_macro': 0.5648170275827026}
Total number of UMLS partitions:  25




              precision    recall  f1-score   support

           0       0.92      0.92      0.92    245667
           1       0.22      0.22      0.22     25324

    accuracy                           0.85    270991
   macro avg       0.57      0.57      0.57    270991
weighted avg       0.85      0.85      0.85    270991

{'f1_macro': 0.5682397787888596}
Total number of UMLS partitions:  26




              precision    recall  f1-score   support

           0       0.92      0.92      0.92    245759
           1       0.22      0.22      0.22     25232

    accuracy                           0.85    270991
   macro avg       0.57      0.57      0.57    270991
weighted avg       0.85      0.85      0.85    270991

{'f1_macro': 0.5682985661669224}
Total number of UMLS partitions:  27




              precision    recall  f1-score   support

           0       0.93      0.92      0.92    248249
           1       0.21      0.23      0.22     22742

    accuracy                           0.86    270991
   macro avg       0.57      0.57      0.57    270991
weighted avg       0.87      0.86      0.86    270991

{'f1_macro': 0.570071316571885}
Total number of UMLS partitions:  28




              precision    recall  f1-score   support

           0       0.93      0.92      0.92    248341
           1       0.20      0.23      0.22     22650

    accuracy                           0.86    270991
   macro avg       0.57      0.57      0.57    270991
weighted avg       0.87      0.86      0.86    270991

{'f1_macro': 0.5699902332250396}
Total number of UMLS partitions:  29




              precision    recall  f1-score   support

           0       0.93      0.92      0.92    248494
           1       0.20      0.23      0.22     22497

    accuracy                           0.86    270991
   macro avg       0.57      0.57      0.57    270991
weighted avg       0.87      0.86      0.87    270991

{'f1_macro': 0.5701814347013603}
Total number of UMLS partitions:  30




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    249399
           1       0.20      0.23      0.22     21592

    accuracy                           0.86    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.87      0.86      0.87    270991

{'f1_macro': 0.5705198356235444}
Total number of UMLS partitions:  31




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    249528
           1       0.20      0.23      0.22     21463

    accuracy                           0.86    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.88      0.86      0.87    270991

{'f1_macro': 0.5706525941760321}
Total number of UMLS partitions:  32




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    249813
           1       0.20      0.24      0.22     21178

    accuracy                           0.87    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.88      0.87      0.87    270991

{'f1_macro': 0.570727773348274}
Total number of UMLS partitions:  33




              precision    recall  f1-score   support

           0       0.93      0.92      0.93    249950
           1       0.20      0.24      0.21     21041

    accuracy                           0.87    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.88      0.87      0.87    270991

{'f1_macro': 0.570651480097438}
Total number of UMLS partitions:  34




              precision    recall  f1-score   support

           0       0.94      0.92      0.93    252488
           1       0.18      0.24      0.21     18503

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5679113180592024}
Total number of UMLS partitions:  35




              precision    recall  f1-score   support

           0       0.94      0.92      0.93    252960
           1       0.18      0.25      0.20     18031

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5679077237443908}
Total number of UMLS partitions:  36




              precision    recall  f1-score   support

           0       0.95      0.92      0.93    253038
           1       0.18      0.25      0.21     17953

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.568879249784466}
Total number of UMLS partitions:  37




              precision    recall  f1-score   support

           0       0.95      0.92      0.93    253014
           1       0.18      0.25      0.21     17977

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5691727996985391}
Total number of UMLS partitions:  38




              precision    recall  f1-score   support

           0       0.94      0.92      0.93    252907
           1       0.18      0.25      0.21     18084

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5693787818292492}
Total number of UMLS partitions:  39




              precision    recall  f1-score   support

           0       0.94      0.92      0.93    252627
           1       0.18      0.25      0.21     18364

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5688110851475349}
Total number of UMLS partitions:  40




              precision    recall  f1-score   support

           0       0.94      0.92      0.93    252612
           1       0.18      0.25      0.21     18379

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5692074794268004}
Total number of UMLS partitions:  41




              precision    recall  f1-score   support

           0       0.94      0.92      0.93    252392
           1       0.18      0.25      0.21     18599

    accuracy                           0.87    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.89      0.87      0.88    270991

{'f1_macro': 0.5700611447793629}


In [23]:
train(partitioned_o_umls, umls_o, nonumls_o, ds_o, heur_o, dict_o, df_data_train, df_data_val, 'o')

Total number of UMLS partitions:  1




              precision    recall  f1-score   support

           0       0.79      0.91      0.85    208169
           1       0.41      0.20      0.27     62822

    accuracy                           0.75    270991
   macro avg       0.60      0.56      0.56    270991
weighted avg       0.70      0.75      0.71    270991

{'f1_macro': 0.5587517796368752}
Total number of UMLS partitions:  2




              precision    recall  f1-score   support

           0       0.79      0.91      0.85    208588
           1       0.40      0.20      0.27     62403

    accuracy                           0.75    270991
   macro avg       0.60      0.56      0.56    270991
weighted avg       0.70      0.75      0.71    270991

{'f1_macro': 0.5569956304812641}
Total number of UMLS partitions:  3




              precision    recall  f1-score   support

           0       0.79      0.91      0.85    208610
           1       0.40      0.20      0.27     62381

    accuracy                           0.75    270991
   macro avg       0.60      0.56      0.56    270991
weighted avg       0.70      0.75      0.71    270991

{'f1_macro': 0.5568876844324367}
Total number of UMLS partitions:  4




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    252614
           1       0.15      0.25      0.18     18377

    accuracy                           0.85    270991
   macro avg       0.54      0.57      0.55    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.550206067551809}
Total number of UMLS partitions:  5




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    252560
           1       0.15      0.25      0.19     18431

    accuracy                           0.85    270991
   macro avg       0.54      0.57      0.55    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5516162524704323}
Total number of UMLS partitions:  6




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253329
           1       0.14      0.26      0.18     17662

    accuracy                           0.85    270991
   macro avg       0.54      0.57      0.55    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5511372883423108}
Total number of UMLS partitions:  7




              precision    recall  f1-score   support

           0       0.94      0.89      0.92    253076
           1       0.14      0.25      0.18     17915

    accuracy                           0.85    270991
   macro avg       0.54      0.57      0.55    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5514148266707836}
Total number of UMLS partitions:  8




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255936
           1       0.12      0.25      0.16     15055

    accuracy                           0.86    270991
   macro avg       0.54      0.57      0.54    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5416416625629974}
Total number of UMLS partitions:  9




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    255841
           1       0.12      0.25      0.17     15150

    accuracy                           0.86    270991
   macro avg       0.54      0.57      0.54    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.543652146692352}
Total number of UMLS partitions:  10




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    256108
           1       0.12      0.26      0.17     14883

    accuracy                           0.86    270991
   macro avg       0.54      0.57      0.54    270991
weighted avg       0.91      0.86      0.88    270991

{'f1_macro': 0.5438300051980101}
Total number of UMLS partitions:  11




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    259195
           1       0.10      0.28      0.15     11796

    accuracy                           0.86    270991
   macro avg       0.53      0.58      0.54    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5391737863960446}
Total number of UMLS partitions:  12




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260121
           1       0.10      0.28      0.14     10870

    accuracy                           0.87    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5360592519363849}
Total number of UMLS partitions:  13




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260235
           1       0.10      0.28      0.14     10756

    accuracy                           0.87    270991
   macro avg       0.53      0.59      0.54    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5350911129909782}
Total number of UMLS partitions:  14




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259884
           1       0.10      0.28      0.14     11107

    accuracy                           0.87    270991
   macro avg       0.53      0.58      0.54    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5354531284485659}
Total number of UMLS partitions:  15




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260355
           1       0.09      0.27      0.14     10636

    accuracy                           0.87    270991
   macro avg       0.53      0.58      0.53    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5318909687402938}
Total number of UMLS partitions:  16




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260335
           1       0.09      0.27      0.14     10656

    accuracy                           0.87    270991
   macro avg       0.53      0.58      0.53    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5313216573640511}
Total number of UMLS partitions:  17




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259767
           1       0.09      0.27      0.14     11224

    accuracy                           0.86    270991
   macro avg       0.53      0.58      0.53    270991
weighted avg       0.93      0.86      0.89    270991

{'f1_macro': 0.5331190373103338}
Total number of UMLS partitions:  18




              precision    recall  f1-score   support

           0       0.98      0.89      0.93    262917
           1       0.08      0.31      0.13      8074

    accuracy                           0.87    270991
   macro avg       0.53      0.60      0.53    270991
weighted avg       0.95      0.87      0.91    270991

{'f1_macro': 0.5289169319918116}
Total number of UMLS partitions:  19




              precision    recall  f1-score   support

           0       0.98      0.89      0.93    262783
           1       0.08      0.32      0.13      8208

    accuracy                           0.87    270991
   macro avg       0.53      0.60      0.53    270991
weighted avg       0.95      0.87      0.91    270991

{'f1_macro': 0.5310112632165932}
Total number of UMLS partitions:  20




              precision    recall  f1-score   support

           0       0.98      0.89      0.93    262312
           1       0.09      0.32      0.14      8679

    accuracy                           0.87    270991
   macro avg       0.53      0.61      0.54    270991
weighted avg       0.95      0.87      0.91    270991

{'f1_macro': 0.5357763578539071}
Total number of UMLS partitions:  21




              precision    recall  f1-score   support

           0       0.98      0.89      0.93    262627
           1       0.09      0.32      0.14      8364

    accuracy                           0.87    270991
   macro avg       0.53      0.61      0.53    270991
weighted avg       0.95      0.87      0.91    270991

{'f1_macro': 0.5332481493701278}
Total number of UMLS partitions:  22




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261914
           1       0.09      0.31      0.14      9077

    accuracy                           0.87    270991
   macro avg       0.53      0.60      0.53    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5349299879492632}
Total number of UMLS partitions:  23




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261485
           1       0.09      0.31      0.14      9506

    accuracy                           0.87    270991
   macro avg       0.53      0.60      0.54    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5368345518964297}
Total number of UMLS partitions:  24




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261038
           1       0.10      0.31      0.15      9953

    accuracy                           0.87    270991
   macro avg       0.54      0.60      0.54    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5400303049365984}
Total number of UMLS partitions:  25




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259745
           1       0.11      0.31      0.16     11246

    accuracy                           0.87    270991
   macro avg       0.54      0.60      0.55    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5460108564974792}
Total number of UMLS partitions:  26




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259753
           1       0.11      0.31      0.16     11238

    accuracy                           0.87    270991
   macro avg       0.54      0.60      0.55    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5460093231802706}
Total number of UMLS partitions:  27




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261142
           1       0.10      0.32      0.15      9849

    accuracy                           0.87    270991
   macro avg       0.54      0.60      0.54    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5406980930449408}
Total number of UMLS partitions:  28




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260444
           1       0.11      0.32      0.16     10547

    accuracy                           0.87    270991
   macro avg       0.54      0.61      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5454349445024316}
Total number of UMLS partitions:  29




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261386
           1       0.10      0.33      0.16      9605

    accuracy                           0.87    270991
   macro avg       0.54      0.61      0.54    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5435452431720595}
Total number of UMLS partitions:  30




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261440
           1       0.10      0.33      0.16      9551

    accuracy                           0.87    270991
   macro avg       0.54      0.61      0.54    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5432832678154913}
Total number of UMLS partitions:  31




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    261461
           1       0.10      0.33      0.16      9530

    accuracy                           0.87    270991
   macro avg       0.54      0.61      0.54    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.54316063270338}
Total number of UMLS partitions:  32




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260426
           1       0.11      0.34      0.17     10565

    accuracy                           0.87    270991
   macro avg       0.54      0.62      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5509845979749698}
Total number of UMLS partitions:  33




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260458
           1       0.11      0.34      0.17     10533

    accuracy                           0.87    270991
   macro avg       0.54      0.62      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5506191180069387}
Total number of UMLS partitions:  34




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260624
           1       0.11      0.34      0.17     10367

    accuracy                           0.87    270991
   macro avg       0.54      0.62      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5489573901322257}
Total number of UMLS partitions:  35




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260480
           1       0.11      0.33      0.16     10511

    accuracy                           0.87    270991
   macro avg       0.54      0.61      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.5470680370552808}
Total number of UMLS partitions:  36




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260738
           1       0.11      0.33      0.16     10253

    accuracy                           0.87    270991
   macro avg       0.54      0.61      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.545382370814633}
Total number of UMLS partitions:  37




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259399
           1       0.12      0.34      0.18     11592

    accuracy                           0.87    270991
   macro avg       0.55      0.62      0.56    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5555141648418773}
Total number of UMLS partitions:  38




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    260548
           1       0.11      0.34      0.17     10443

    accuracy                           0.87    270991
   macro avg       0.54      0.62      0.55    270991
weighted avg       0.94      0.87      0.90    270991

{'f1_macro': 0.551390920878836}
Total number of UMLS partitions:  39




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259767
           1       0.12      0.34      0.18     11224

    accuracy                           0.87    270991
   macro avg       0.55      0.62      0.56    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5555720912412108}
Total number of UMLS partitions:  40




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    258929
           1       0.13      0.34      0.19     12062

    accuracy                           0.87    270991
   macro avg       0.55      0.62      0.56    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5588046284605603}
Total number of UMLS partitions:  41




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259017
           1       0.13      0.34      0.19     11974

    accuracy                           0.87    270991
   macro avg       0.55      0.62      0.56    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5586141406204472}
Total number of UMLS partitions:  42




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    259033
           1       0.13      0.34      0.19     11958

    accuracy                           0.87    270991
   macro avg       0.55      0.62      0.56    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5586158519374662}
Total number of UMLS partitions:  43




              precision    recall  f1-score   support

           0       0.97      0.89      0.93    258885
           1       0.13      0.34      0.19     12106

    accuracy                           0.87    270991
   macro avg       0.55      0.62      0.56    270991
weighted avg       0.93      0.87      0.90    270991

{'f1_macro': 0.5588370276006389}
Total number of UMLS partitions:  44




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    258307
           1       0.13      0.33      0.19     12684

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.93      0.87      0.89    270991

{'f1_macro': 0.5586338960391589}
Total number of UMLS partitions:  45




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257393
           1       0.14      0.32      0.19     13598

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.5595324495095643}
Total number of UMLS partitions:  46




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257237
           1       0.14      0.32      0.19     13754

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.5593459939325899}
Total number of UMLS partitions:  47




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257272
           1       0.14      0.32      0.19     13719

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.5592166011807146}
Total number of UMLS partitions:  48




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257553
           1       0.14      0.32      0.19     13438

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.559002772388485}
Total number of UMLS partitions:  49




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257616
           1       0.14      0.32      0.19     13375

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.5589130119889969}
Total number of UMLS partitions:  50




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257666
           1       0.14      0.32      0.19     13325

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.558854259902748}
Total number of UMLS partitions:  51




              precision    recall  f1-score   support

           0       0.96      0.89      0.93    257641
           1       0.14      0.32      0.19     13350

    accuracy                           0.87    270991
   macro avg       0.55      0.61      0.56    270991
weighted avg       0.92      0.87      0.89    270991

{'f1_macro': 0.5587740812421373}
Total number of UMLS partitions:  52




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253619
           1       0.15      0.27      0.19     17372

    accuracy                           0.85    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.556753785699741}
Total number of UMLS partitions:  53




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253586
           1       0.15      0.27      0.19     17405

    accuracy                           0.85    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.90      0.85      0.87    270991

{'f1_macro': 0.5571250835785295}
Total number of UMLS partitions:  54




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    252757
           1       0.16      0.27      0.20     18234

    accuracy                           0.85    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5594964238197684}
Total number of UMLS partitions:  55




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    252840
           1       0.16      0.27      0.20     18151

    accuracy                           0.85    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5593771889703203}
Total number of UMLS partitions:  56




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    253208
           1       0.15      0.27      0.20     17783

    accuracy                           0.85    270991
   macro avg       0.55      0.58      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5588718650351093}
Total number of UMLS partitions:  57




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    253702
           1       0.15      0.28      0.20     17289

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.90      0.86      0.87    270991

{'f1_macro': 0.5581148229897761}
Total number of UMLS partitions:  58




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    253770
           1       0.15      0.28      0.20     17221

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.90      0.86      0.87    270991

{'f1_macro': 0.5586196479114833}
Total number of UMLS partitions:  59




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    253963
           1       0.15      0.28      0.20     17028

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.90      0.86      0.88    270991

{'f1_macro': 0.5588592977885022}
Total number of UMLS partitions:  60




              precision    recall  f1-score   support

           0       0.95      0.89      0.92    254016
           1       0.15      0.28      0.19     16975

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.90      0.86      0.88    270991

{'f1_macro': 0.5578221647824557}
Total number of UMLS partitions:  61




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    253696
           1       0.16      0.28      0.20     17295

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.90      0.86      0.88    270991

{'f1_macro': 0.5611866809607555}
Total number of UMLS partitions:  62




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    253138
           1       0.16      0.28      0.21     17853

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.89      0.86      0.87    270991

{'f1_macro': 0.562963892455353}
Total number of UMLS partitions:  63




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    253304
           1       0.16      0.28      0.20     17687

    accuracy                           0.86    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.90      0.86      0.87    270991

{'f1_macro': 0.5617459746901445}
Total number of UMLS partitions:  64




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    252934
           1       0.16      0.28      0.20     18057

    accuracy                           0.85    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5616495138116079}
Total number of UMLS partitions:  65




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    252798
           1       0.16      0.28      0.20     18193

    accuracy                           0.85    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5623755325348361}
Total number of UMLS partitions:  66




              precision    recall  f1-score   support

           0       0.95      0.90      0.92    252787
           1       0.16      0.28      0.21     18204

    accuracy                           0.85    270991
   macro avg       0.55      0.59      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5626513594894157}
Total number of UMLS partitions:  67




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    252169
           1       0.17      0.28      0.21     18822

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5643295186380543}
Total number of UMLS partitions:  68




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    251866
           1       0.17      0.28      0.21     19125

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.57    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5656598890664923}
Total number of UMLS partitions:  69




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    251969
           1       0.17      0.28      0.21     19022

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.57    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5654433968708342}
Total number of UMLS partitions:  70




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    251682
           1       0.17      0.28      0.21     19309

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.57    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5659423830280954}
Total number of UMLS partitions:  71




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    251829
           1       0.17      0.28      0.21     19162

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.56    270991
weighted avg       0.89      0.85      0.87    270991

{'f1_macro': 0.5647782469556979}
Total number of UMLS partitions:  72




              precision    recall  f1-score   support

           0       0.94      0.90      0.92    250271
           1       0.18      0.28      0.22     20720

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.57    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.5687381238825907}
Total number of UMLS partitions:  73




              precision    recall  f1-score   support

           0       0.93      0.90      0.92    249483
           1       0.19      0.28      0.22     21508

    accuracy                           0.85    270991
   macro avg       0.56      0.59      0.57    270991
weighted avg       0.88      0.85      0.86    270991

{'f1_macro': 0.5699938510950203}
Total number of UMLS partitions:  74




              precision    recall  f1-score   support

           0       0.93      0.90      0.91    247603
           1       0.20      0.27      0.23     23388

    accuracy                           0.84    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.87      0.84      0.85    270991

{'f1_macro': 0.5704351529533755}
Total number of UMLS partitions:  75




              precision    recall  f1-score   support

           0       0.93      0.90      0.91    247744
           1       0.20      0.27      0.23     23247

    accuracy                           0.84    270991
   macro avg       0.56      0.58      0.57    270991
weighted avg       0.87      0.84      0.85    270991

{'f1_macro': 0.5696449427929444}
Total number of UMLS partitions:  76




              precision    recall  f1-score   support

           0       0.92      0.90      0.91    246362
           1       0.21      0.27      0.24     24629

    accuracy                           0.84    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.86      0.84      0.85    270991

{'f1_macro': 0.5734914440950931}
Total number of UMLS partitions:  77




              precision    recall  f1-score   support

           0       0.92      0.90      0.91    246071
           1       0.21      0.27      0.24     24920

    accuracy                           0.84    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.86      0.84      0.85    270991

{'f1_macro': 0.573191912730997}
Total number of UMLS partitions:  78




              precision    recall  f1-score   support

           0       0.92      0.90      0.91    244278
           1       0.22      0.26      0.24     26713

    accuracy                           0.84    270991
   macro avg       0.57      0.58      0.57    270991
weighted avg       0.85      0.84      0.84    270991

{'f1_macro': 0.5748329579558308}
Total number of UMLS partitions:  79




              precision    recall  f1-score   support

           0       0.90      0.90      0.90    239204
           1       0.26      0.26      0.26     31787

    accuracy                           0.83    270991
   macro avg       0.58      0.58      0.58    270991
weighted avg       0.83      0.83      0.83    270991

{'f1_macro': 0.5813238086069942}
Total number of UMLS partitions:  80




              precision    recall  f1-score   support

           0       0.90      0.90      0.90    237730
           1       0.27      0.25      0.26     33261

    accuracy                           0.82    270991
   macro avg       0.58      0.58      0.58    270991
weighted avg       0.82      0.82      0.82    270991

{'f1_macro': 0.5809819124711737}
Total number of UMLS partitions:  81




              precision    recall  f1-score   support

           0       0.90      0.90      0.90    238061
           1       0.27      0.25      0.26     32930

    accuracy                           0.82    270991
   macro avg       0.58      0.58      0.58    270991
weighted avg       0.82      0.82      0.82    270991

{'f1_macro': 0.5802909573267367}
