# Definitions

Notebook for creating definitions of medical terms in a document

The code here assumes that the document has already been parsed with PAWLS and that the resulting token structure is available as `pdf_structure.json`.

In [13]:
import os
import sys
import pandas as pd
import json
import uuid
import transformers
import datasets
import pandas as pd
from elasticsearch import Elasticsearch
from spacy.tokens import Doc
from wordfreq import word_frequency
from wiktionaryparser import WiktionaryParser
import tqdm
import inflect
import spacy 
import re 

# Project root; the data folder and the local `lib` folder are derived from it.
DIR = ''
DATA_DIR = f'{DIR}/data'

# Make the project-local modules under <DIR>/lib importable before importing them.
sys.path.append(f'{DIR}/lib')
import sauce_defs


# Shared inflect engine instance.
inflect_engine = inflect.engine()

In [3]:
# Load the PAWLS structure of the document (iterated page by page further down).
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default text encoding.
with open('pdf_structure.json', 'r', encoding='utf-8') as f:
    pawls_structure = json.load(f)

# Get the Tokens for a Doc

In [4]:
def find_matching_token(pages, docs):
    """Collect, for every page, the page tokens covered by each entity span.

    Args:
        pages: iterable of page dicts; each is read via ``page['tokens']`` and
            ``page['page']['index']``.
        docs: iterable of objects exposing ``.ents``, whose items carry
            ``start`` and ``end`` token offsets. Paired with ``pages`` by
            position (``zip``), so extra items on either side are ignored.

    Returns:
        dict mapping the page index to a list of
        ``{'tokens': <slice of page tokens>, 'indices': (start, end)}`` dicts,
        one per entity of the corresponding doc.
    """
    tokens_by_page = {}
    for doc, page in zip(docs, pages):
        spans = [(ent.start, ent.end) for ent in doc.ents]
        tokens_by_page[page['page']['index']] = [
            {'tokens': page['tokens'][begin:end], 'indices': (begin, end)}
            for begin, end in spans
        ]
    return tokens_by_page

In [7]:
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from spacy.tokens import Doc


# Load the `en_core_sci_sm` pipeline, then append the abbreviation detector
# and the UMLS entity linker to it.
sci_nlp = spacy.load("en_core_sci_sm")

sci_nlp.add_pipe("abbreviation_detector")
sci_nlp.add_pipe(
    "scispacy_linker",
    config=dict(
        resolve_abbreviations=True,
        linker_name="umls",
        filter_for_definitions=True,
        max_entities_per_mention=5,
        threshold=0.75,
        no_definition_threshold=1.0,
    ),
)




<scispacy.linking.EntityLinker at 0x7fd8c99fb518>

# Link using scispacy NER

In [8]:
# Keep direct handles on the three components so they can be applied one by
# one to Docs that are built from existing tokens.
ner, abbr_det, linker = (
    sci_nlp.get_pipe(name)
    for name in ("ner", "abbreviation_detector", "scispacy_linker")
)

In [9]:
def get_docs(pawls_structure):
    """Build one annotated Doc per page from the page's existing tokens.

    The token texts of each page are passed to ``Doc`` as ready-made words,
    and the NER, abbreviation-detector and linker components are then applied
    in that order.

    Args:
        pawls_structure: iterable of page dicts; ``page['tokens']`` is a list
            of dicts that each carry a ``'text'`` entry.

    Returns:
        ``(tokens, docs)``: the per-page lists of token texts and the matching
        list of processed Docs.
    """
    tokens = [[tok['text'] for tok in page['tokens']] for page in pawls_structure]
    docs = [
        linker(abbr_det(ner(Doc(sci_nlp.vocab, words=words))))
        for words in tokens
    ]
    return tokens, docs

def filter_for_kb_links(e):
    """Return True when the entity has at least one linked KB candidate in ``e._.kb_ents``."""
    candidates = e._.kb_ents
    return len(candidates) != 0

In [10]:
# Run the pipeline over every page of the paper
sle_tokens, sle_docs = get_docs(pawls_structure)

# flatten the per-page entities into one list, then keep those linked to the KB
sle_ents = [ent for doc in sle_docs for ent in doc.ents]
sle_ents_with_defs = [ent for ent in sle_ents if filter_for_kb_links(ent)]

  global_matches = self.global_matcher(doc)
  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


# Get UMLS defs

This needs to be done while we still have the entity spans.

In [11]:

def sing_lower_match(s1, s2):
    """Compare two strings after lower-casing them and passing each through ``sauce_defs.make_singular``."""
    first, second = (sauce_defs.make_singular(s.lower()) for s in (s1, s2))
    return first == second

def get_KB_def(e):
    """Return the UMLS definition of the first linked concept whose name matches the entity text.

    Each candidate in ``e._.kb_ents`` is resolved through
    ``linker.kb.cui_to_entity`` and its canonical name is compared with
    ``e.text`` using ``sing_lower_match``. Only the canonical name is
    compared; aliases are not considered.

    Args:
        e: entity span exposing ``.text`` and ``._.kb_ents`` (a sequence whose
            items hold the concept id at position 0).

    Returns:
        ``{'term': e.text, 'UMLS': <definition>}`` for the first matching
        candidate, or ``{'term': e.text, 'UMLS': None}`` when no candidate
        matches.
    """
    for candidate in e._.kb_ents:
        kb_ent = linker.kb.cui_to_entity[candidate[0]]
        # canonical_name is the second field of the KB entity tuple (formerly kb_ent[1])
        if sing_lower_match(kb_ent.canonical_name, e.text):
            return {'term': e.text, 'UMLS': kb_ent.definition}
    return {'term': e.text, 'UMLS': None}

def get_KB_ents(e):
    """Debug helper: print the entity and each linked KB candidate, then return the candidates.

    For every item of ``e._.kb_ents`` the KB entry looked up from its first
    element is printed next to its second element. A dashed separator line
    closes the output.
    """
    print(e)
    linked = e._.kb_ents
    for candidate in linked:
        print(linker.kb.cui_to_entity[candidate[0]], candidate[1])
    print('---------------------')
    return linked

In [14]:
# One row per linked entity: the surface term and its UMLS definition (None if no match).
# Columns are named explicitly so the frame keeps its schema when there are no entities.
sle_UMLS_defs = pd.DataFrame(
    [get_KB_def(e) for e in sle_ents_with_defs],
    columns=['term', 'UMLS'],
)

# clean in the same way as the sauce lib: lower-case, strip . ; , ( ) and singularize.
# The pattern is a raw string because '\.', '\(' and '\)' are invalid escape
# sequences in a plain string literal.
sle_UMLS_defs['cleaned_term'] = [
    sauce_defs.make_singular(re.sub(r'\.|;|,|\(|\)', '', s.lower()))
    for s in sle_UMLS_defs['term']
]

# Get Wiktionary defs

We have the UMLS defs above, now let's get the wiktionary ones

In [15]:
def get_paper_defs(matching_tokens):
    """Build a definitions frame per page with ``sauce_defs.make_definitions_df`` and concatenate them.

    Args:
        matching_tokens: dict whose values are the per-page matched-token lists.

    Returns:
        The ``pd.concat`` of the per-page frames, in dict order.
    """
    per_page = [
        sauce_defs.make_definitions_df(page_matches)
        for page_matches in matching_tokens.values()
    ]
    return pd.concat(per_page)

def get_paper_terms(matching_tokens):
    """Build a terms-only frame per page with ``sauce_defs.make_term_df`` and concatenate them.

    Counterpart of ``get_paper_defs`` that collects just the terms rather
    than all the definitions.

    Args:
        matching_tokens: dict whose values are the per-page matched-token lists.

    Returns:
        The ``pd.concat`` of the per-page frames, in dict order.
    """
    per_page = [
        sauce_defs.make_term_df(page_matches)
        for page_matches in matching_tokens.values()
    ]
    return pd.concat(per_page)


In [16]:
# Pair every entity span with its page tokens, then build the term table for the paper
sle_matching_tokens = find_matching_token(pages=pawls_structure, docs=sle_docs)
df_terms_lupus = get_paper_terms(matching_tokens=sle_matching_tokens)

## Filtering

In [17]:
def get_quantile(df, quantile, col='freq', side='less'):
    """Keep the rows of ``df`` lying strictly on one side of a quantile of ``col``.

    Args:
        df: frame to filter.
        quantile: quantile passed to ``Series.quantile`` to compute the cut-off.
        col: column the cut-off is computed on and compared against.
        side: ``'less'`` keeps rows strictly below the cut-off, ``'greater'``
            keeps rows strictly above it.

    Returns:
        The filtered frame, or ``None`` (after printing a message) when
        ``side`` is neither ``'less'`` nor ``'greater'``.
    """
    if side not in ('less', 'greater'):
        print('Unknown value for side:', side)
        return None

    values = df[col]
    cutoff = values.quantile(quantile)
    keep = values < cutoff if side == 'less' else values > cutoff
    return df[keep]

def filter_terms(df, min_len=3, max_len=28, freq_quantile=0.8):
    """Filter the term table down to candidate terms worth defining.

    Steps, in order:
      1. drop rows whose ``freq`` is not positive (treated as non-words);
      2. keep rows whose ``str_len`` is within ``[min_len, max_len]``;
      3. drop the most frequent terms by keeping only rows whose ``freq`` is
         strictly below the ``freq_quantile`` quantile (with the default 0.8
         the top ~20% most frequent rows are removed).

    Args:
        df: frame with ``freq``, ``str_len`` and ``cleaned_term`` columns.
        min_len: shortest term length kept (default 3, so "SLE" survives).
        max_len: longest term length kept (default 28, the length of
            "systemic lupus erythematosus").
        freq_quantile: quantile of ``freq`` used as the exclusive upper bound.

    Returns:
        ``(filtered, unique)``: every surviving row, and the same rows
        de-duplicated on ``cleaned_term``. Both are returned because the full
        set is needed to match terms back to the document.
    """
    # first take out any non-words
    df_cleaned = df[df['freq'] > 0].copy()

    # keep terms whose length is within the inclusive bounds
    df_cleaned = df_cleaned[df_cleaned['str_len'].between(min_len, max_len)]

    # remove very common words: keep only rows below the frequency quantile
    df_cleaned = get_quantile(df_cleaned, freq_quantile, side='less')

    # making new df because we want to keep the original terms to match back to the doc
    return df_cleaned, df_cleaned.drop_duplicates(subset=['cleaned_term'])

In [18]:
# add the columns used for filtering: English word frequency (wordfreq) and term length
df_terms_lupus['freq'] = list(
    map(lambda term: word_frequency(term, 'en'), df_terms_lupus['cleaned_term'])
)
df_terms_lupus['str_len'] = df_terms_lupus['cleaned_term'].str.len()

In [19]:
# filter: the first frame keeps every surviving row (to match terms back to the doc),
# the second has one row per distinct cleaned_term (used for the definition lookups)
df_terms_full_lupus, df_terms_defs_lupus = filter_terms(df_terms_lupus)

In [22]:
# Shared Wiktionary client used by the definition lookup.
parser = WiktionaryParser()

# Subject tags of interest (not referenced elsewhere in the visible part of this notebook).
wiktionary_tags = [
    'medicine',
    'organism',
    'pathology',
    'biochemistry',
    'autoantigen',
    'genetics',
    'cytology',
    'physics',
    'chemistry',
    'organic chemistry',
    'immunology',
    'pharmacology',
    'anatomy',
    'neuroanatomy',
]
def get_wikitionary_def(term):
    """Fetch ``term`` with the shared Wiktionary parser and return one definition text.

    Reads ``word[0]['definitions'][0]['text'][1]`` from the fetch result, i.e.
    the second text entry of the first definition of the first result
    (presumably entry 0 is the headword line; verify against the
    WiktionaryParser output).

    Args:
        term: the term to look up.

    Returns:
        The definition string, or ``None`` when the result does not have the
        expected shape (missing entry, empty list, etc.).
    """
    word = parser.fetch(term)
    try:
        return word[0]['definitions'][0]['text'][1]
    # Only "no such entry" style failures mean "no definition"; a bare except
    # would also swallow KeyboardInterrupt/SystemExit.
    except (IndexError, KeyError, TypeError):
        return None

# look up every distinct cleaned term (one parser.fetch call per term, with a progress bar)
df_terms_defs_lupus['wikitionary'] = list(
    map(get_wikitionary_def, tqdm.tqdm(df_terms_defs_lupus['cleaned_term']))
)


  0%|          | 0/1858 [00:00<?, ?it/s][A
  0%|          | 1/1858 [00:01<33:10,  1.07s/it][A
  0%|          | 2/1858 [00:01<19:16,  1.60it/s][A
  0%|          | 3/1858 [00:02<20:08,  1.53it/s][A
  0%|          | 4/1858 [00:02<16:10,  1.91it/s][A
  0%|          | 5/1858 [00:02<13:23,  2.31it/s][A
  0%|          | 6/1858 [00:03<13:35,  2.27it/s][A
  0%|          | 7/1858 [00:03<11:54,  2.59it/s][A
  0%|          | 8/1858 [00:03<12:34,  2.45it/s][A
  0%|          | 9/1858 [00:04<12:21,  2.49it/s][A

KeyboardInterrupt: 

In [None]:
# bring the Wiktionary definition back onto every original term row
# (left join on the singular cleaned term, so all rows of the full table are kept)
df_terms_lupus_cleaned_merged = pd.merge(
    left=df_terms_full_lupus,
    right=df_terms_defs_lupus[['cleaned_term', 'wikitionary']],
    how='left',
    on='cleaned_term',
)


# merge with UMLS 
def lookup_UMLS_def(t, UMLS_lookup):
    """Return the ``'UMLS'`` value stored for term ``t`` in ``UMLS_lookup``.

    Args:
        t: term used as the row label.
        UMLS_lookup: frame indexed by term, with a ``'UMLS'`` column.

    Returns:
        The value found via ``UMLS_lookup.loc[t]['UMLS']``, or ``None`` when
        the label (or column) is not present.
    """
    try:
        return UMLS_lookup.loc[t]['UMLS']
    # A missing label is the expected "no definition" case; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    except (KeyError, TypeError):
        return None
    
    
# UMLS lookup table: the first row per cleaned term, indexed by that term
UMLS_lookup_sle = (
    sle_UMLS_defs[['cleaned_term', 'UMLS']]
    .drop_duplicates(subset=['cleaned_term'])
    .set_index('cleaned_term')
)

# attach the UMLS definition (or None) to every merged term row
df_terms_lupus_cleaned_merged['UMLS'] = list(
    map(
        lambda term: lookup_UMLS_def(term, UMLS_lookup_sle),
        df_terms_lupus_cleaned_merged['cleaned_term'],
    )
)


In [None]:
# save the merged term table (terms with their 'wikitionary' and 'UMLS' columns)
# to a CSV file at a path relative to the working directory
df_terms_lupus_cleaned_merged.to_csv('term_definitions.csv')
