# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [1]:
# coding: utf-8
!pip install tqdm
import csv
import os
import re
import shutil
import string
import zipfile
import sys
from collections import defaultdict
from lxml import objectify
import codecs
import nltk
import pandas as pd
import requests
import tarfile
import subprocess
import platform
import time
from tqdm import tqdm as progressbar # pandas df usage: 'for row in progressbar(df.itertuples(), total=df.shape[0])'

import glob
import numpy as np



Helper function to create a directory under the specified path, gracefully handling errors.

In [2]:
def __mkdir(*args):
    """Join the given path components and make sure that directory exists.

    Missing intermediate directories are created as well. An ``OSError``
    raised by ``os.makedirs`` is swallowed only when the directory is
    already present afterwards; otherwise it is re-raised.

    Returns the joined path.
    """
    target = os.path.join(*args)
    try:
        os.makedirs(target)
    except OSError:
        # makedirs also fails when the directory already exists; that case is fine.
        already_present = os.path.isdir(target)
        if already_present:
            return target
        raise
    return target

In [3]:
# Alternative (disabled): keep downloaded data, serialized dataframes and the MetaMap install
# in a dedicated "Medframes" folder under the user's home directory.
# working_dir = __mkdir(os.path.expanduser("~"), "Medframes")

# Use the directory the notebook is running from as the working directory.
working_dir = os.getcwd()
# Ensure <working_dir>/data exists; later cells read and write pickles there.
data_dir = __mkdir(working_dir, "data")
print("Working directory: %s" % working_dir)

Working directory: /Users/Lo/Work/CS109Project


# Load data for MedEx processing

In [7]:
%time
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
criteria.head()

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((time, NN),), ((prior, RB), (to, TO), (admis...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((frequency, NN), (by, IN)), ((video, NN),), ...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS), (of, IN)), ((any, DT), (age,...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((express, NN), (willingness, NN), (to, TO)),...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


# MedEx Settings

Please configure MedEx tagging settings here.

In [5]:
# MedEx input & output directories.
# Both are created here so that the export cell and the external MedEx run do not fail
# on a fresh checkout. The trailing "/" is kept because later cells build file paths by
# plain string concatenation.

medex_in = __mkdir(data_dir, "medex_in") + "/"
medex_out = __mkdir(data_dir, "medex_out") + "/"

# When False, create_medex_df deletes the files under medex_in / medex_out.
keep_txt_files = True

# Preprocess the data by creating criteria text files

Create the MedEx input files by exporting each criterion to its own text file in the "medex_in" directory.

In [133]:
def add_medex_prep(df, medex_in):
    """Write one MedEx input file per criterion.

    For every row of ``df`` a file ``medex_<criteria_id>.txt`` is written to
    ``medex_in``. It contains the lemma (second element) of each pair in the
    row's ``Lemmas`` list, each followed by a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``criteria_id`` and ``Lemmas``.
    medex_in : str
        Target directory (with or without trailing separator). It is created
        when it does not exist yet.
    """
    if not os.path.isdir(medex_in):
        os.makedirs(medex_in)

    # Iterate over the frame that was passed in, not over a notebook global.
    for cid, lemma_pairs in zip(df['criteria_id'], df['Lemmas']):
        # Characters outside ASCII are replaced by '?'. Decoding back to text keeps
        # the concatenation valid on Python 3 as well as Python 2.
        words = [pair[1].encode('ascii', 'replace').decode('ascii') for pair in lemma_pairs]
        path = os.path.join(medex_in, "medex_" + str(cid) + ".txt")
        # The context manager closes the file even if a write fails.
        with open(path, "w") as out:
            out.write(''.join(word + ' ' for word in words))

# Export the loaded criteria to text files under medex_in.
add_medex_prep(criteria, medex_in)

# Process the data with MedEx

At the moment, MedEx is run in the system shell, outside of the notebook.

Command Format (in MedEx installation folder):

$ java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i [input directory] -o [output directory]

eg.) $ java -Xmx1024m -cp lib/*:bin org.apache.medex.Main -i '/Users/Lo/Work/cs109project/data/medex_in/' -o '/Users/Lo/Work/cs109project/data/medex_out/'

MedEx scans all text files in the input directory and exports the NLP results of each file to the output directory in this format:
{sentence order}-{sentence}
{

# Parse MedEx data and write to criteria dataframe


In [8]:
#parse medex output results, add column, delete the generated txt files
def create_medex_df(df, medex_in, medex_out, keeptxt=keep_txt_files):
    """Parse the MedEx result files in ``medex_out`` into a dataframe.

    Every non-blank line of a ``medex_<criteria_id>.txt`` file is split on
    ``|``; the first piece is dropped, bracketed annotations (``[...]``) are
    stripped from the remaining pieces, and the pieces become one row. The
    rows are then right-merged with ``df`` on ``criteria_id`` so each row
    carries its trial id (``NctId``, renamed to ``nct_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``NctId`` and ``criteria_id``.
    medex_in, medex_out : str
        MedEx input and output directories (with or without trailing separator).
    keeptxt : bool
        When false, the files directly under both directories are deleted
        once parsing and merging have succeeded.

    Returns
    -------
    pandas.DataFrame
        Columns ``nct_id``, ``criteria_id`` and the thirteen MedEx fields,
        ordered by criteria id.

    Raises
    ------
    ValueError
        If a line carries fields but not exactly thirteen of them. Such rows
        used to be padded silently, shifting ``criteria_id`` into another column.

    Side effects: zero-byte files anywhere below ``medex_out`` are removed.
    """
    # NOTE: 'neccessity' is misspelled but is part of the output schema; left unchanged.
    field_names = ['drug_name', 'brand_name', 'drug_form', 'strength', 'dose',
                   'route', 'frequency', 'duration', 'neccessity', 'cui',
                   'rxnorm_cui', 'rxnorm_cui_generic', 'generic_name']

    # remove empty output files
    for dirpath, _dirs, filenames in os.walk(medex_out):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if os.stat(path).st_size == 0:
                os.remove(path)

    # Collect (criteria id, path) pairs; the file number is the criteria id.
    # basename + regex instead of split('/') keeps this independent of the path separator,
    # and unrelated *.txt files are ignored rather than crashing int().
    result_files = []
    for path in glob.glob(os.path.join(medex_out, "*.txt")):
        match = re.match(r'^medex_(\d+)\.txt$', os.path.basename(path))
        if match:
            result_files.append((int(match.group(1)), path))
    # glob order is arbitrary; sorting makes the resulting row order reproducible.
    result_files.sort()

    processed = []
    for cid, path in result_files:
        with open(path, "r") as raw:
            lines = raw.read().split('\n')
        for line in lines:
            fields = [re.sub(r'\[.+?\]\s*', '', part) for part in line.split('|')[1:]]
            if not fields:
                # Blank line or a line without any '|': nothing was extracted.
                continue
            if len(fields) != len(field_names):
                raise ValueError("%s: expected %d fields but found %d in line %r"
                                 % (path, len(field_names), len(fields), line))
            processed.append(fields + [cid])

    medex_df = pd.DataFrame(processed, columns=field_names + ['criteria_id'])
    # An empty frame would otherwise have an object-typed key column, which can break the merge.
    medex_df['criteria_id'] = medex_df['criteria_id'].astype('int64')
    medex_df = pd.merge(df.loc[:, ['NctId', 'criteria_id']], medex_df, on='criteria_id', how='right')
    medex_df = medex_df.rename(columns={'NctId': 'nct_id'})

    # Delete the text files only after everything above succeeded, so a parse
    # error never costs the MedEx results.
    if not keeptxt:
        leftovers = glob.glob(os.path.join(medex_out, '*')) + glob.glob(os.path.join(medex_in, '*'))
        for leftover in leftovers:
            if os.path.isfile(leftover):
                os.remove(leftover)

    return medex_df
    
# Only the trial id and criteria id columns are handed over; the parsed MedEx rows are merged onto them.
added = create_medex_df(criteria.loc[:, ['NctId', 'criteria_id']], medex_in, medex_out)
# Persist the result in data_dir alongside the input pickle.
added.to_pickle(os.path.join(data_dir, 'medex.pckl'))

added.head(100)

Unnamed: 0,nct_id,criteria_id,drug_name,brand_name,drug_form,strength,dose,route,frequency,duration,neccessity,cui,rxnorm_cui,rxnorm_cui_generic,generic_name
0,NCT00001205,8,corticosteroid,,,,,,,,,,354,354,corticosteroid
1,NCT00001205,8,immunosuppressive,,,,,,,,,,5748,5748,immunosuppressive
2,NCT00001205,11,praziquantel,,,,,,,,,C0032911,8628,8628,praziquantel
3,NCT00001205,11,albendazole,,,,,,,,,C0001911,430,430,albendazole
4,NCT00001205,11,methotrexate,,,,,,,,,C0025677,6851,6851,methotrexate
5,NCT00001205,11,corticosteroid,,,,,,,,,,354,354,corticosteroid
6,NCT00001205,11,etanercept,,,,,,,,,C0717758,214555,214555,etanercept
7,NCT00001205,14,anthelmintic,,,,,,,,,,868,868,anthelmintic
8,NCT00001205,14,corticosteroid,,,,,,,,,,354,354,corticosteroid
9,NCT00001205,14,immunosuppressive,,,,,,,,,,5748,5748,immunosuppressive


In [None]:
# rules:
# remove refusal
# replace " - " with "-"
# some similar ones: ['neuroleptic', 'antipsychotic drug'] and ['neuroleptic drugs', 'antipsychotic drug']

In [10]:
# Number of extracted rows whose drug name is exactly "antiepileptic".
antiepileptic_rows = added[added.drug_name == "antiepileptic"]
len(antiepileptic_rows)

248