# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [5]:
# coding: utf-8
!pip install tqdm
import csv
import os
import re
import shutil
import string
import zipfile
import sys
from collections import defaultdict
from lxml import objectify
import codecs
import nltk
import pandas as pd
import requests
import tarfile
import subprocess
import platform
import time
from tqdm import tqdm as progressbar # pandas df usage: 'for row in progressbar(df.itertuples(), total=df.shape[0])'

import glob



Helper function to create a directory under the specified path, gracefully handling errors.

In [3]:
def __mkdir(*args):
    path = os.path.join(*args)
    try: 
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
    return path

In [10]:
# Create the project directory holding the downloaded data, serialized dataframes and MetaMap install.
# working_dir = __mkdir(os.path.expanduser("~"), "Medframes")

# Set working directory as the current directory of the ipython notebook
working_dir = os.getcwd()
data_dir = __mkdir(working_dir, "data")
print("Working directory: %s" % working_dir)

Working directory: /Users/Lo/Work/CS109Project


# Load data for MedEx processing

In [17]:
%time
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
criteria.head()

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11.9 µs


Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((seizure, NNS),), ((presen...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((histo...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((age, NN),), ((patient, N...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((parent, NNS),), ((guardi...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


# MedEx Settings

Please configure MedEx tagging settings here.

In [43]:
#medex input % output directory

medex_in = working_dir + "/data/medex_in/"
medex_out = working_dir + "/data/medex_out/"

keep_txt_files = True

# Preprocess the data by creating criteria text files

Create MedEx input files by exporting each criteria to a text file to "medex_in" directory.

In [66]:
def add_medex_prep(df, medex_in):
    for (count, row) in enumerate(criteria.Tokens):
        f = open(medex_in + "medex_" + str(count) + ".txt", "w")
        for word in row:
            f.write(word.encode('ascii', 'replace') + u' ')
        f.close()

add_medex_prep(criteria, medex_in)

# Process the data with MedEx

At this moment MedEx is ran in the system shell outside of the notebook.

Command Format (in MedEx installation folder):

$ java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i [input directory] -o [output directory]

eg.) $ java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i '/Users/Lo/Work/cs109project/data/medex_in/' -o '/Users/Lo/Work/cs109project/data/medex_out/'

MedEx scans all text files in the input directory and export the NLP results of each file to the output directory in this format:
{sentence order}-{sentence}
{

# Parse MedEx data and write to dataframe


In [189]:
#parse medex output results, add column, delete the generated txt files
def add_medex_column(df, medex_in, medex_out, keeptxt=keep_txt_files):
    
    #remove empty output files
    for dirpath, dirs, files in os.walk(medex_out):
        for file in files: 
            path = os.path.join(dirpath, file)
            if os.stat(path).st_size == 0:
                os.remove(path)
                
    #read raw data in txt files
    rowlist = []
    raw_drug_data = []
    for file in glob.glob(medex_out + "*.txt"):
        rowlist.append(int(file.split('/')[-1][6:-4])) # row index in dataframe
        
        with open (file, "r") as raw:
            raw_drug_data.append(raw.read().rstrip('\n'))
            
    #parse raw drug data
    processed_drug_data = []
    for i in range(len(raw_drug_data)):
        #processed = [re.sub(r'\[.+?\]\s*', '', s).replace('\n', '') for s in raw_drug_data[i].split('\n')[0].split('|')[1:]]
        processed = [[re.sub(r'\[.+?\]\s*', '', i.split('|')[1]), i.split('|')[-1]] for i in raw_drug_data[i].split('\n')]
        # medex identify refusal as a drug. Remove these results.
        # may need to look at other ones
        #if "refusal" in [col.lower() for col in processed]:
        #    rowlist.remove(rowlist[i])
        #else:
        #    processed_drug_data.append(processed)
        processed_drug_data.append(processed)
            
    #create MedEx column and add parsed medex data to df
    df['MedEx']=''
    for (count, i) in enumerate(rowlist):
        df.MedEx[i] = processed_drug_data[count]
    
    #delete all files under the input and output directories 
    if not keeptxt:
        files = glob.glob(medex_out+'*') + glob.glob(medex_in+'*')
        for f in files:
            os.remove(f)
    
    return df

added = add_medex_column(criteria, medex_in, medex_out)
added.to_pickle(os.path.join(data_dir, 'ct_medex.pckl'))

added.head(100)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id,MedEx
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((seizure, NNS),), ((presen...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0,
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((histo...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1,
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((age, NN),), ((patient, N...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2,
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((parent, NNS),), ((guardi...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3,
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4,


In [192]:
# show all extracted MedEx data
for i in added.index:
    if len(added.MedEx[i]) != 0:
        print i, added.MedEx[i]

8 [['corticosteroids', 'corticosteroids'], ['immunosuppressives', '']]
11 [['praziquantel', 'praziquantel'], ['albendazole', 'albendazole'], ['methotrexate', 'methotrexate'], ['corticosteroids', 'corticosteroids'], ['etanercept', 'etanercept']]
14 [['anthelmintics', 'anthelmintics'], ['corticosteroids', 'corticosteroids'], ['immunosuppressives', '']]
15 [['anthelmintics', 'anthelmintics'], ['immunosuppressive', 'immunosuppressive']]
16 [['albendazole', 'albendazole'], ['praziquantel', 'praziquantel']]
29 [['refusal', '']]
42 [['MPA', '']]
47 [['tegretol', 'carbamazepine']]
49 [['vigabatrin', 'vigabatrin']]
52 [['convulsant', ''], ['convulsants', '']]
53 [['convulsant', '']]
62 [['neuroleptic', 'antipsychotic drug'], ['antidepressant', 'antidepressant']]
73 [['Norplant', 'levonorgestrel'], ['oral contraceptives', 'oral contraceptives']]
81 [['topiramate', 'topiramate'], ['dextromethorphan', 'dextromethorphan']]
84 [['experimental drug', 'experimental drug']]
87 [['immunosuppressant', 'i

In [190]:
#list of drugs with different drug name and generic name in MedEx
# considering new rule: use generic name when available
for i in added.index:
    for k in added.MedEx[i]:
        if k[0].lower() != k[1].lower():
            print i, [k[0], k[1]]

8 ['immunosuppressives', '']
14 ['immunosuppressives', '']
29 ['refusal', '']
42 ['MPA', '']
47 ['tegretol', 'carbamazepine']
52 ['convulsant', '']
52 ['convulsants', '']
53 ['convulsant', '']
62 ['neuroleptic', 'antipsychotic drug']
73 ['Norplant', 'levonorgestrel']
120 ['SSRIs', '']
169 ['multivitamin', 'multiple vitamins']
169 ['vitamin E', 'tocopherol (vit e)']
181 ['Refusal', '']
197 ['Antipsychotics', 'antipsychotic drug']
246 ['neuroserpin', '']
247 ['neuroserpin', '']
250 ['DPA', 'valproate sodium']
283 ['dextrose', 'glucose']
285 ['VPA', 'valproate']
285 ['VPA', 'valproate']
292 ['albumin', 'albumin human']
306 ['neuroleptics', '']
381 ['Prempro', 'conjugated estrogens']
386 ['Cystagon', 'mercaptamine']
386 ['Cystagon', 'mercaptamine']
386 ['Cystagon', 'mercaptamine']
429 ['Carbatrol', 'carbamazepine']
429 ['Tegretol', 'carbamazepine']
429 ['Dilantin', 'phenytoin']
429 ['Trileptal', 'oxcarbazepine']
473 ['nitroglycerin', 'glyceryl trinitrate']
479 ['G - CSF', 'g-csf']
495 ['AC