This notebook processes a text file (specified in `INPUT_FILE`), which should contain one search string per line.

It attempts to automatically match each line in the file to a SNOMED code. 
If this is not possible, it will prompt the user to enter a relevant code. 

Mappings are saved incrementally to `.json` files.

# Setup

## Imports

In [3]:
import glob
import os

from IPython.display import display, HTML
import json
import pandas as pd

from snomed import Snomed

snomed = Snomed()


## Function Definitions

### Loading Data

In [4]:
def load_data(file_path: str):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Line terminators are removed by ``str.splitlines``; nothing else is
    stripped or filtered, so a blank line comes back as an empty string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line of the file.
    """
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

### Condition Mapping

In [5]:
def automatically_map_conditions(data: list[str], snomed: Snomed):
    """Map each condition name to a SNOMED-CT CUI where a direct match exists.

    Every entry of ``data`` is looked up with ``snomed.find_cui``; a truthy
    result is stored as ``int(cui)``. Progress and a summary are printed.

    Args:
        data: Condition names (search strings) to map.
        snomed: Lookup object providing ``find_cui`` and
            ``get_primary_concept``.

    Returns:
        A ``(known, unknown)`` tuple. ``known`` maps each matched condition
        name to its integer CUI. ``unknown`` lists the names without a match,
        in input order and without duplicates.
    """
    known = {}
    print('Automatically mapping conditions to SNOMED-CT...')
    for condition_name in data:
        cui = snomed.find_cui(condition_name)
        if cui:
            known[condition_name] = int(cui)
            concept = snomed.get_primary_concept(cui)
            # Double quotes inside the single-quoted f-string keep this line
            # valid on Python versions older than 3.12.
            print(f'\t{condition_name} mapped to {concept["name"]}')

    # dict.fromkeys drops repeated input lines while preserving order, so a
    # repeated condition is only reported (and later prompted for) once.
    unknown = list(dict.fromkeys(name for name in data if name not in known))
    if unknown:
        print(len(known), 'conditions mapped to SNOMED-CT.')
        # Count the list that is actually printed; len(data) - len(known)
        # is wrong whenever the input contains duplicate lines.
        print(len(unknown), 'conditions not mapped to SNOMED-CT:')
        for condition in unknown:
            print(f'\t{condition}')
    else:
        print(f'All {len(known)} conditions mapped to SNOMED-CT')

    return known, unknown

def get_user_input_cui(condition_name: str, snomed: Snomed):
    """Show candidate concepts for one condition and ask the user for a CUI.

    ``snomed.find_concepts`` is queried with the condition name. When it
    returns anything, the rows whose ``name_status`` is ``'P'`` (presumably
    the preferred names - confirm against the Snomed class) are rendered as
    a scrollable HTML table indexed by ``cui``; otherwise a "no matches"
    message is printed.

    Args:
        condition_name: The search string that could not be mapped.
        snomed: Lookup object providing ``find_concepts``.

    Returns:
        Whatever the user typed at the prompt (an empty string means skip).
    """
    print(f'\nSearching for partial matches for {condition_name}...')
    candidates = snomed.find_concepts(condition_name)

    if not len(candidates):
        print(f'\tNo partial matches found for {condition_name}')
    else:
        with pd.option_context("display.max_rows", None):
            preferred = candidates[candidates.name_status == 'P'].set_index('cui')
            table_html = preferred[['name']].style.to_html()
            container_open = "<div style='max-height: 400px; overflow: auto; width: 700px'>"
            container_close = "</div>"
            # clear=True replaces the output left by the previous condition.
            display(HTML(container_open + table_html + container_close), clear=True)

    prompt = (
        f'Enter the CUI for {condition_name}. '
        'Suggested options shown below, but you can enter any CUI. '
        'Press Enter to skip. '
    )
    return input(prompt)

def get_user_input_cuis(unknown_conditions: list[str], snomed: Snomed):
    """Prompt for a CUI for every unmapped condition.

    Each answer is written to the module-level ``MANUALLY_MAPPED_CUIS`` dict
    as soon as it is entered, so answers given before an interruption are
    kept. Nothing is returned.

    Args:
        unknown_conditions: Condition names still lacking a CUI.
        snomed: Lookup object passed through to ``get_user_input_cui``.
    """
    for unmapped_name in unknown_conditions:
        answer = get_user_input_cui(unmapped_name, snomed)
        MANUALLY_MAPPED_CUIS[unmapped_name] = answer
    
def _parse_cui(manual_cui):
    """Return ``manual_cui`` as an int, or None when it is blank or not an integer."""
    if not manual_cui:
        return None
    try:
        return int(manual_cui)
    except ValueError:
        return None


def process_manually_mapped_cuis(manually_mapped_cuis: dict[str, str], 
                                 known_conditions: dict[str, int], 
                                 unknown_conditions: list[str], 
                                 snomed: Snomed):
    """Apply manually entered CUIs to the known/unknown collections.

    Entries whose value parses as an integer are added to
    ``known_conditions`` and removed from ``unknown_conditions``. Blank
    entries and entries that are not integers are left unmapped and listed
    in the "skipped" summary. A report is printed along the way.

    Args:
        manually_mapped_cuis: Condition name -> text typed by the user
            (an empty string means the condition was skipped).
        known_conditions: Condition name -> integer CUI. Updated in place.
        unknown_conditions: Names still lacking a CUI. Updated in place.
        snomed: Lookup object providing ``get_primary_concept``.

    Returns:
        The same ``(known_conditions, unknown_conditions)`` objects that were
        passed in, after the update.
    """
    # Parse everything up front so the header reports how many entries will
    # really be mapped, not how many were merely non-empty.
    parsed = {name: _parse_cui(raw) for name, raw in manually_mapped_cuis.items()}
    mapped_total = sum(cui is not None for cui in parsed.values())

    print(f'Manually mapped {mapped_total} conditions to SNOMED-CT:')
    for raw_name, manual_cui in manually_mapped_cuis.items():
        cui = parsed[raw_name]
        if cui is None:
            if manual_cui:
                print(f'\t{manual_cui} is not a valid CUI. Skipping {raw_name}')
            continue

        # Look the concept up before mutating anything, so a failing lookup
        # leaves this condition untouched.
        concept = snomed.get_primary_concept(cui)
        print(f'\t{raw_name} mapped to {concept["name"]} ({cui})')
        known_conditions[raw_name] = cui
        # Slice assignment keeps the caller's list object, tolerates a name
        # that is not in the list, and removes every duplicate of it.
        unknown_conditions[:] = [name for name in unknown_conditions if name != raw_name]

    skipped = [name for name, cui in parsed.items() if cui is None]
    print(f'{len(skipped)} conditions skipped:')
    for raw_name in skipped:
        print(f'\t{raw_name}')

    return known_conditions, unknown_conditions

### Saving and Loading mapped concepts

In [6]:
def get_mappping_file_path(n: int):
    """Return the path of mapping file number ``n``.

    The path is the module-level ``OUTPUT_LOCATION`` followed by
    ``_mapped_<n>.json``.
    """
    suffix = f'_mapped_{n}.json'
    return f'{OUTPUT_LOCATION}{suffix}'

def get_most_recent_mapping_file() -> int:
    """Return the highest mapping-file number found on disk, or 0 if none.

    Files are searched for as ``<OUTPUT_LOCATION>_mapped_<number>.json``.
    Files that match the glob but whose ``<number>`` part is not made of
    ASCII digits (for example a hand-made ``..._mapped_backup.json``) are
    ignored instead of raising ``ValueError``.
    """
    location = str(OUTPUT_LOCATION)
    # glob.escape makes characters such as "[" or "*" in the configured
    # location match literally instead of acting as wildcards.
    pattern = f'{glob.escape(location)}_mapped_*.json'
    name_prefix = f'{os.path.basename(location)}_mapped_'

    numbers = []
    for path in glob.glob(pattern):
        file_name = os.path.basename(path)
        number_text = file_name[len(name_prefix):-len('.json')]
        if number_text.isascii() and number_text.isdigit():
            numbers.append(int(number_text))
    return max(numbers, default=0)

def save_mapping():
    """Write the current mapping state to the next numbered mapping file.

    The module-level ``KNOWN_CONDITIONS`` and ``UNKNOWN_CONDITIONS`` are
    serialised as ``{'known': ..., 'unknown': ...}`` into the file whose
    number is one above the highest existing mapping file.

    Raises:
        NameError: If either global has not been defined yet.
        TypeError: If the state is not JSON-serialisable.
    """
    # Serialise before touching the file system: if a global is missing or
    # not serialisable, no empty or half-written mapping file is left behind
    # to break a later load.
    payload = json.dumps({'known': KNOWN_CONDITIONS, 'unknown': UNKNOWN_CONDITIONS}, indent=4)

    last_mapping_file_number = get_most_recent_mapping_file()
    output_file = get_mappping_file_path(last_mapping_file_number + 1)

    print(f'Saving mapping to {output_file}... ', end='')
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(payload)
    print('Done')

def load_mapping_file(n: int):
    """Load mapping file number ``n`` and return its contents.

    Args:
        n: Number of the mapping file, as used by ``get_mappping_file_path``.

    Returns:
        A ``(known, unknown)`` tuple taken from the ``'known'`` and
        ``'unknown'`` keys of the JSON document.
    """
    path = get_mappping_file_path(n)
    print(f'Loading mapping file {path}... ', end='')
    with open(path, encoding='utf-8') as handle:
        contents = json.load(handle)
    print('Done')

    known = contents['known']
    unknown = contents['unknown']
    return known, unknown

def load_most_recent_mapping_file():
    """Load the highest-numbered mapping file.

    Returns:
        The ``(known, unknown)`` tuple from ``load_mapping_file``, or
        ``(None, None)`` when ``get_most_recent_mapping_file`` reports 0,
        i.e. no mapping file exists.
    """
    latest = get_most_recent_mapping_file()
    if not latest:
        return None, None
    return load_mapping_file(latest)

def clear_mapping_files():
    """Delete every mapping file for ``OUTPUT_LOCATION`` after confirmation.

    The user is asked to confirm; anything other than ``y`` (case-insensitive,
    surrounding whitespace ignored) leaves all files in place. On
    confirmation, every file matching ``<OUTPUT_LOCATION>_mapped_*.json`` is
    removed.
    """
    response = input('Are you sure you want to clear all mapping files? (y/N) ')
    if response.strip().lower() != 'y':
        return

    # Escape the configured location so that characters such as "[" or "*"
    # in it cannot widen the pattern and delete another dataset's files.
    pattern = f'{glob.escape(str(OUTPUT_LOCATION))}_mapped_*.json'
    for path in glob.glob(pattern):
        os.remove(path)

# Configuration

In [27]:
# Input file with one search string per line. Uncomment exactly one of the options below.
#
# Contains conditions that directly map to SNOMED-CT:
# INPUT_FILE = 'data/inputs_example.txt'
# Contains a condition ('sneez') that does not directly map to SNOMED-CT, but should be a partial match:
# INPUT_FILE = 'data/inputs_example_partial.txt'
# Contains a condition ('abcdefghijklmnopqrstuvwxyz') for which no partial match should be found:
INPUT_FILE = 'data/inputs_example_unmatchable.txt'

# Prefix of the saved mapping files ('<OUTPUT_LOCATION>_mapped_<n>.json'). By default this is
# the input path without its extension, so output is saved next to the input file; it can be
# set manually to save somewhere else.
OUTPUT_LOCATION = os.path.splitext(INPUT_FILE)[0]

# Load Data and Automatic Mapping

In [None]:
# save_mapping() reads both KNOWN_CONDITIONS and UNKNOWN_CONDITIONS, so start
# with an empty mapping: on a fresh kernel the name would otherwise be
# undefined, and on a re-run it would still hold the previous run's results.
KNOWN_CONDITIONS = {}
UNKNOWN_CONDITIONS = load_data(INPUT_FILE)
save_mapping()  # snapshot of the raw input, before any mapping
KNOWN_CONDITIONS, UNKNOWN_CONDITIONS = automatically_map_conditions(UNKNOWN_CONDITIONS, snomed)
save_mapping()  # snapshot after automatic mapping

# Manual Mapping of CUIs

In [None]:
# This cell runs a loop, asking for manual input of CUIs for unmapped conditions.
# To skip a condition, leave the CUI empty and press Enter.
#
# To stop the loop, interrupt the kernel (e.g. press the stop button in Jupyter Notebook), then leave the CUI empty and press Enter.
# Any CUIs entered before the interruption will still be available in MANUALLY_MAPPED_CUIS,
# because get_user_input_cuis stores each answer as soon as it is entered.

MANUALLY_MAPPED_CUIS = {}
get_user_input_cuis(UNKNOWN_CONDITIONS, snomed)
display('Done!', clear=True)

In [None]:
# Fold the manually entered CUIs into the known/unknown collections and save the result
# as a new mapping file.
KNOWN_CONDITIONS, UNKNOWN_CONDITIONS = process_manually_mapped_cuis(MANUALLY_MAPPED_CUIS, known_conditions=KNOWN_CONDITIONS, unknown_conditions=UNKNOWN_CONDITIONS, snomed=snomed)
save_mapping()
# Discard the processed answers; presumably so that re-running this cell cannot
# apply the same answers a second time.
if 'MANUALLY_MAPPED_CUIS' in globals():
    del MANUALLY_MAPPED_CUIS

# Load - continue from previous session

In [None]:
# Restore the state saved in mapping file number 1 ('<OUTPUT_LOCATION>_mapped_1.json').
KNOWN_CONDITIONS, UNKNOWN_CONDITIONS = load_mapping_file(1)

In [None]:
# Restore the state from the highest-numbered mapping file.
# NOTE: both names are set to None when no mapping file exists.
KNOWN_CONDITIONS, UNKNOWN_CONDITIONS = load_most_recent_mapping_file()

In [26]:
# Use with caution... this will delete all mapping files!
# clear_mapping_files() asks for confirmation first and only deletes when the answer is 'y'.
clear_mapping_files()

In [None]:
# Block until the user presses Enter. The bare `a` is the cell's last expression,
# so the notebook echoes whatever was typed as the cell output.
a = input('Press Enter to exit')
a