In [2]:
import os

import pandas as pd
import duckdb

In [3]:
for i in range(10):
    cwd = os.getcwd()
    r, c = os.path.split(cwd)
    if c == 'repo':
        print(f'new working dir: {cwd}')
        break
    os.chdir(r)

new working dir: c:\Users\aknof\Documents\GT\CSE_6250_BD4H\Project\repo


In [8]:
data_file = os.path.join('data', 'NOTEEVENTS.csv')
df = pd.read_csv(data_file)
df.head()

  df = pd.read_csv(data_file)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [7]:
agg_file = os.path.join('data', 'consumed', 'aggregate_note_events-pipe-header.csv')
pd.read_csv(agg_file, sep='|').count()

SUBJECT_ID     46146
PATIENT_DOC    46146
dtype: int64

In [3]:
con = duckdb.connect()

In [8]:
df_agg = con.execute("""
select SUBJECT_ID
,string_agg(TEXT, '\n') as PATIENT_DOC
from df
group by SUBJECT_ID
""").df()
df_agg.head()

Unnamed: 0,SUBJECT_ID,PATIENT_DOC
0,4367,Sinus rhythm. Borderline left ventricular hype...
1,83395,Sinus tachycardia. Possible left atrial abnorm...
2,21740,Atrial fibrillation with a rapid ventricular r...
3,3995,Sinus rhythm\nMultifocal PVCs\nRight bundle br...
4,5680,"Sinus rhythm. T wave inversions in leads I, aV..."


In [12]:
for index, row in df_agg.iterrows():
    ctakes_file = os.path.join('data', 'ctakes_in', f'pat-{row["SUBJECT_ID"]}.txt')
    with open(ctakes_file, 'w') as outfile:
        _ = outfile.write(row['PATIENT_DOC'])

In [177]:
import json
from skr_web_api import Submission, METAMAP_INTERACTIVE_URL

def metamap_nlp(inputtext, email, apikey, # add creds as default here
                params='-I --JSONn --negex --conj'):
    
    inst = Submission(email, apikey)
    inst.set_serviceurl(METAMAP_INTERACTIVE_URL)
    inst.init_mm_interactive(inputtext, args=params)
    response = inst.submit()
    if response.status_code != '200':
        print('response status: {}'.format(response.status_code))
    content = format(response.content.decode())
    try:
        metadata, data = content.split('\n', maxsplit=1)
    except ValueError as e:
        print(content.split('\n'))
        raise ValueError(e)
    data = json.loads(data, strict=False)
    data['metadata'] = metadata
    return data

In [162]:
# d['AllDocuments'][0]['Document']['Utterances'][0]['Phrases'][0]['Mappings'][0]['MappingCandidates'][0]['CandidateCUI']

def parse_response(data):
    # data = json.loads(data)
    cuis = []
    for d in data['AllDocuments']:
        for p in d['Document']['Utterances']:
            for m in p['Phrases']:
                for mc in m['Mappings']:
                    for cand in mc['MappingCandidates']:
                        # print(cand)
                        if 'CandidateCUI' in cand:
                            # print(1)
                            cui = cand['CandidateCUI']
                            if cui not in cuis:
                                cuis.append(cui)
    return cuis

In [252]:
def get_nlp(index, df=df, max_len=10000):
    inputtext = df.loc[index, 'TEXT'].replace('\n', ' ')
    inputtext = ' '.join(inputtext.split())
    # print(inputtext[:100])
    pat = df.loc[index, 'SUBJECT_ID']
    # MAX_LEN = 10000
    # MAX_LEN = int(17228 / 80)
    # MAX_LEN = 250
    chars = len(inputtext)
    chunks = chars // max_len
    chunks = chunks if chunks == chars / max_len else chunks + 1
    chunks = 1 if chunks == 0 else chunks
    # print(f'chars={chars}, chunks={chunks}')
    for c in range(chunks):
        # print(f'chunk params: c={c}, cl={c*max_len}, cu={(c+1)*max_len}')
        try:
            data = metamap_nlp(inputtext[c*max_len:(c+1)*max_len])
            raw_file = os.path.join('data', 'nlp_raw', f'index-{index}_patient-{pat}_chunk-{c}.json')
            with open(raw_file, "w") as outfile:
                json.dump(data, outfile)
        except:
            print(f'failure at index={index}, chunk={c}')
            return c


In [210]:
# df.loc[17, 'TEXT'][1722:3444]
# df.loc[17, 'TEXT'][2583:3444]
# df.loc[17, 'TEXT'][2580:3010]
df.loc[17, 'TEXT'].replace('\n', ' ')[:100]

'Admission Date:  [**2194-7-18**]              Discharge Date:   [**2194-7-25**]  Date of Birth:  [**'

In [228]:
get_nlp(index=17)

Admission Date: [**2194-7-18**] Discharge Date: [**2194-7-25**] Date of Birth: [**2123-12-24**] Sex:
chars=17065, chunks=2
chunk params: c=0, cl=0, cu=10000
chunk params: c=1, cl=10000, cu=20000
response status: 200


In [245]:
from datetime import datetime

def write_exception(index, chunk):
    exc_file = os.path.join('data', 'consumed', 'failed.json')

    if os.path.isfile(exc_file):
        with open(exc_file, 'r') as infile:
            exc_json = json.load(infile)
    else:
        exc_json = {'failed': []}

    fail = {'index': index, 'at_chunk': chunk, 'ts': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}    
    exc_json['failed'].append(fail)

    with open(exc_file, 'w') as outfile:
        json.dump(exc_json, outfile)
        

In [253]:
# from collections import defaultdict
# from uuid import uuid4
# import numpy as np

# patient_cuis = defaultdict(list)

for index, row in df.iloc[17:].iterrows():
    chunk = get_nlp(index=index)
    if chunk is not None:
        write_exception(index, chunk)

        # todo: rolling window chunks

failure at index=17, chunk=0
response status: 200
response status: 200
response status: 200
response status: 200
failure at index=21, chunk=0
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
failure at index=25, chunk=1
response status: 200
failure at index=26, chunk=0
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
failure at index=39, chunk=0
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status: 200
response status

In [None]:
from collections import defaultdict
# from uuid import uuid4
import numpy as np

patient_cuis = defaultdict(list)

# todo: parse raw files 

# cuis = np.array(parse_response(data)).flatten()
#         try:
#             patient_cuis[pat] = np.unique(np.append(cuis, np.array(patient_cuis[pat]).flatten())).tolist()
#         except TypeError as e:
#             print(patient_cuis[pat])
#             print(cuis)
#             raise TypeError(e)

In [None]:
patient_cui_file = os.path.join('data', 'consumed', 'patient_cuis.json')
with open(patient_cui_file, "w") as outfile:
    json.dump(patient_cuis, outfile)

In [None]:
!C:\Users\aknof\Documents\GT\CSE_6250_BD4H\Project\repo\code\ctakes-scut-admin.lnk