# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [3]:
# coding: utf-8
import os
import re
import pandas as pd
import glob
import numpy as np

Helper function to create a directory under the specified path, gracefully handling errors.

In [4]:
def __mkdir(*args):
    """Join ``args`` into one path, make sure that directory exists, return the path.

    Missing parent directories are created as well.  An ``OSError`` raised
    while creating the directory is ignored when the path is a directory
    afterwards (typically because it already existed) and re-raised otherwise.
    """
    target = os.path.join(*args)
    try:
        os.makedirs(target)
    except OSError:
        # Only a pre-existing directory is an acceptable reason for the failure.
        if os.path.isdir(target):
            return target
        raise
    return target

In [5]:
# The directory the notebook is started from doubles as the project root; the
# downloaded data and the serialized dataframes live in its "data" sub-directory.
# (A fixed root could be used instead, e.g.
#  working_dir = __mkdir(os.path.expanduser("~"), "Medframes").)
working_dir = os.getcwd()
data_dir = __mkdir(working_dir, "data")
print("Working directory: %s" % (working_dir,))

Working directory: /Users/Lo/Work/CS109Project


# About MedEx

MedEx is a system for extracting medication and signature information from clinical text, developed by Hua Xu, Josh Denny, and Min Jiang at Vanderbilt University. We use the Java implementation, MedEx-UIMA 1.3, for semantic-based parsing of drug names and signatures in clinical trials data. You can find more information about MedEx at https://sbmi.uth.edu/ccb/resources/medex.htm.

MedEx reads all the files in the specified input folder, automatically splits the text in the files into sentences, and exports the parsing results to the designated output folder.

# MedEx Settings

We configure MedEx tagging settings here.

In [6]:
# Directories MedEx reads from / writes to. The trailing slash is required:
# file names and glob patterns are appended to these by string concatenation.
medex_in = "%s/data/medex_in/" % working_dir
medex_out = "%s/data/medex_out/" % working_dir

# True -> leave the MedEx .txt files on disk once they have been parsed.
keep_txt_files = True

# Load data for MedEx processing

We use the lemmatized criteria sentences from the filtered clinical trial results obtained in the criteria-extraction step.

In [7]:
%time
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
criteria.head()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs


Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((time, NN),), ((prior, RB), (to, TO), (admis...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((frequency, NN), (by, IN)), ((video, NN),), ...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS), (of, IN)), ((any, DT), (age,...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((express, NN), (willingness, NN), (to, TO)),...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


# Preprocess the data by creating criteria text files

Create the MedEx input files by exporting each criterion to its own text file in the "medex_in" directory.

In [8]:
def add_medex_prep(df, medex_in):
    """Write one MedEx input file per criterion.

    For every row of ``df`` a file ``medex_<criteria_id>.txt`` is written to
    ``medex_in``.  It contains the lemma (second element) of each
    ``(token, lemma)`` pair of the row's ``Lemmas`` entry, each followed by a
    single space.  Non-ASCII characters are replaced by ``?``.

    Parameters
    ----------
    df : pandas.DataFrame (or any object with ``criteria_id`` and ``Lemmas``)
        The criteria to export.  This argument is what gets exported; the
        function no longer reads the notebook-level ``criteria`` variable.
    medex_in : str
        Directory prefix the file names are appended to, so it must end with a
        path separator.  The directory is created when it does not exist yet.
    """
    if medex_in and not os.path.isdir(medex_in):
        os.makedirs(medex_in)

    for cid, lemma_pairs in zip(df.criteria_id, df.Lemmas):
        # encode/decode round trip = "replace non-ASCII by '?'" while staying a
        # text string, which works on both Python 2 and Python 3.
        text = ''.join(
            pair[1].encode('ascii', 'replace').decode('ascii') + ' '
            for pair in lemma_pairs
        )
        # `with` closes the handle even when the write fails.
        with open(medex_in + "medex_" + str(cid) + ".txt", "w") as handle:
            handle.write(text)

# Export every criterion in `criteria` as a MedEx input file under `medex_in`.
add_medex_prep(df=criteria, medex_in=medex_in)

# Process the data with MedEx

At the moment, MedEx is run in the system shell, outside of the notebook.

Command Format (in MedEx installation folder):

$ java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i [input directory] -o [output directory]

eg.) $ java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i '/Users/Lo/Work/cs109project/data/medex_in/' -o '/Users/Lo/Work/cs109project/data/medex_out/'

# Parse MedEx data and write to criteria dataframe

For each line in the MedEx output files, the output includes all the drug signatures and their positions (start from 0). Fields are separated by "|".
		  
		  Sentence index (start from 1) 
		  Sentence text
		  Drug name      (e.g. 'simvastatin[0, 11]')
          Brand name     (e.g. 'zocor[12, 17]')
          Drug form      (e.g. 'tablet[19, 25]')
          Strength       (e.g. '10mg[20, 24]')
          Dose amount    (e.g. '2 tablets[2, 11]')
          Route          (e.g. 'by mouth[10, 18]')
          Frequency      (normalized frequency) (e.g. 'b.i.d.(R1P12H)[10, 16]', 
                          'R1P12H' is the TIMEX3 format of 'b.i.d.') 
          Duration       (e.g. 'for 10 days[10, 21]')
          Necessity      (e.g. 'prn[10, 13]')
          UMLS CUI
          RXNORM RxCUI
          RXNORM RxCUI for generic name
		  Generic name   (associated with RXCUI code)

eg. 1	failed standard therapy i.e. refractory to corticotropin at least 40 iu/day for 14 day a follows persistent infantile spasm or recurrent spasm after discontinuation or taper or complication require dose modification|corticotropin[43,56]|||40iu[66,71]||||14 day[80,86]||C0985137|315719|376|corticotropin

We then extract MedEx terms as tags from the Drug Name and the Generic Name field. 

In [12]:
# parse medex output results, add column, delete the generated txt files
def create_medex_df(df, medex_in, medex_out, keeptxt=keep_txt_files):
    """Parse the MedEx output files into a dataframe of drug mentions.

    Steps, in order:

    1. delete empty files below ``medex_out``;
    2. read every ``medex_out + "*.txt"`` file; the number in the file name
       ``medex_<id>.txt`` is the criteria id;
    3. unless ``keeptxt`` is true, delete everything matched by
       ``medex_out + '*'`` and ``medex_in + '*'``;
    4. split each line on ``|``, drop the first field (sentence index and
       text) and strip the ``[start, end]`` position suffixes;
    5. clean the names, turn plural names into singular ones when the
       singular is present too, attach ``NctId`` from ``df`` and derive the
       ``term`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``NctId`` and ``criteria_id``.
    medex_in, medex_out : str
        Directory prefixes (ending with a path separator) of the MedEx input
        and output files.
    keeptxt : bool
        Keep the text files on disk.  The default is the value
        ``keep_txt_files`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``nct_id, criteria_id, drug_name, drug_form, strength, dose,
        route, frequency, duration, cui, generic_name, term``; rows are
        ordered by criteria id of the parsed files.

    Raises
    ------
    ValueError
        If a non-blank output line does not have the expected number of
        ``|``-separated fields, or a ``.txt`` file name does not carry an
        integer id.
    """
    columns = ['drug_name', 'brand_name', 'drug_form', 'strength', 'dose',
               'route', 'frequency', 'duration', 'neccessity', 'cui',
               'rxnorm_cui', 'rxnorm_cui_g', 'generic_name', 'criteria_id']
    n_fields = len(columns) - 1          # everything except criteria_id
    strip_position = re.compile(r'\[.+?\]\s*').sub

    # remove empty output files
    for dirpath, _dirs, filenames in os.walk(medex_out):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if os.stat(path).st_size == 0:
                os.remove(path)

    # read raw data in txt files; file number = criteria id
    raw_by_cid = []
    for path in glob.glob(medex_out + "*.txt"):
        cid = int(os.path.basename(path)[len("medex_"):-len(".txt")])
        with open(path, "r") as handle:
            raw_by_cid.append((cid, handle.read()))
    # glob order depends on the file system; sort for reproducible row order
    raw_by_cid.sort(key=lambda item: item[0])

    # delete all files under the input and output directories
    if not keeptxt:
        for leftover in glob.glob(medex_out + '*') + glob.glob(medex_in + '*'):
            os.remove(leftover)

    rows = []
    for cid, text in raw_by_cid:
        for line in text.split('\n'):
            if not line.strip():
                # a blank line would otherwise become a row holding only the id
                continue
            fields = [strip_position('', field) for field in line.split('|')[1:]]
            if len(fields) != n_fields:
                # a short row would silently shift the criteria id into a drug column
                raise ValueError(
                    "Unexpected MedEx output for criteria %d: expected %d fields "
                    "after the sentence, got %d" % (cid, n_fields, len(fields)))
            rows.append(fields + [cid])
    medex_df = pd.DataFrame(rows, columns=columns)

    # additional filters/modifications
    medex_df = medex_df.drop(columns=['brand_name', 'rxnorm_cui', 'rxnorm_cui_g', 'neccessity'])
    not_a_drug = medex_df['drug_name'].isin(["refusal", "timeline", "impact"])
    medex_df = medex_df[~not_a_drug].copy()   # copy: columns are assigned below
    # regex=False everywhere: these are literal texts, and the default of
    # str.replace differs between pandas versions.
    for old, new in ((' - ', '-'), (' -', '-'), (" ' ", "'")):
        medex_df['drug_name'] = medex_df['drug_name'].str.replace(old, new, regex=False)
    medex_df['generic_name'] = medex_df['generic_name'].str.replace(' (obsolete)', '', regex=False)

    # converting plural tags to singular if the singular tag is also present in dataset
    known = set(medex_df['drug_name']) | set(medex_df['generic_name'])
    plurals = sorted(
        name for name in known
        # `name[:-1]` must be non-empty: the name "s" would otherwise match the
        # empty generic names and strip every "s" from the data.
        if name.endswith("s") and name[:-1] and name[:-1] in known
    )
    for plural in plurals:
        for column in ('drug_name', 'generic_name'):
            medex_df[column] = medex_df[column].str.replace(plural, plural[:-1], regex=False)

    # merge with nctid/criteria id dataframe
    # (the cast keeps the key numeric even when no row was parsed)
    medex_df['criteria_id'] = medex_df['criteria_id'].astype('int64')
    medex_df = pd.merge(df.loc[:, ['NctId', 'criteria_id']], medex_df, on='criteria_id', how='right')
    medex_df = medex_df.rename(columns={'NctId': 'nct_id'})

    # use generic name when possible as term column
    medex_df["term"] = np.where(medex_df['generic_name'] == '', medex_df['drug_name'], medex_df['generic_name'])

    # add alternative names to terms
    decorated = {}
    for name, group in medex_df.groupby('term'):
        drug_names = set(group['drug_name'])
        if not name or len(drug_names) == 1:
            continue
        alternatives = sorted(str(other) for other in drug_names
                              if other != name and "/" not in other)
        if alternatives:                 # avoids producing "name ()"
            decorated[name] = name + " (" + ', '.join(alternatives) + ")"
    # Exact-match lookup: a substring replacement would also rewrite other
    # terms that merely contain `name`, and could decorate a term twice.
    medex_df['term'] = medex_df['term'].map(lambda term: decorated.get(term, term))
    return medex_df
    
# Parse the MedEx output for the (NctId, criteria_id) pairs, store the result
# next to the other serialized dataframes and preview it.
medex_pickle_path = os.path.join(data_dir, 'medex.pckl')
added = create_medex_df(criteria[['NctId', 'criteria_id']], medex_in, medex_out)
added.to_pickle(medex_pickle_path)

added.head(10)

Unnamed: 0,nct_id,criteria_id,drug_name,drug_form,strength,dose,route,frequency,duration,cui,generic_name,term
0,NCT00001205,8,corticosteroid,,,,,,,,corticosteroid,corticosteroid
1,NCT00001205,8,immunosuppressive,,,,,,,,immunosuppressive,immunosuppressive
2,NCT00001205,11,praziquantel,,,,,,,C0032911,praziquantel,praziquantel
3,NCT00001205,11,albendazole,,,,,,,C0001911,albendazole,albendazole
4,NCT00001205,11,methotrexate,,,,,,,C0025677,methotrexate,methotrexate
5,NCT00001205,11,corticosteroid,,,,,,,,corticosteroid,corticosteroid
6,NCT00001205,11,etanercept,,,,,,,C0717758,etanercept,etanercept
7,NCT00001205,14,anthelmintic,,,,,,,,anthelmintic,anthelmintic
8,NCT00001205,14,corticosteroid,,,,,,,,corticosteroid,corticosteroid
9,NCT00001205,14,immunosuppressive,,,,,,,,immunosuppressive,immunosuppressive
