# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [2]:
# coding: utf-8
!pip install tqdm
import csv
import os
import re
import shutil
import string
import zipfile
import sys
from collections import defaultdict
from lxml import objectify
import codecs
import nltk
import pandas as pd
import requests
import tarfile
import subprocess
import platform
import time
from tqdm import tqdm as progressbar # pandas df usage: 'for row in progressbar(df.itertuples(), total=df.shape[0])'



You are using pip version 7.0.3, however version 7.1.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Helper function to create a directory under the specified path, gracefully handling errors.

In [8]:
def __mkdir(*args):
    """Join ``args`` into a path, make sure that directory exists, and return the path.

    Intermediate directories are created as needed. An ``OSError`` raised by
    ``os.makedirs`` is ignored when the directory is already there and
    re-raised otherwise.
    """
    target = os.path.join(*args)
    try:
        os.makedirs(target)
    except OSError:
        # The directory already existing is fine; any other failure is re-raised.
        if os.path.isdir(target):
            return target
        raise
    return target

In [4]:
# Root directory under which this notebook creates its "download" and "data" folders.
# Alternative (disabled): keep everything under the user's home directory instead.
# working_dir = __mkdir(os.path.expanduser("~"), "Medframes")

# Use the directory the notebook kernel is currently running in.
working_dir = os.getcwd()

print("Working directory: %s" % working_dir)

Working directory: C:\Study\CS102\project\project2\repro\CS109Project


# Data download

Download the search results from clinicaltrials.gov as a zipped archive of XML files. The archive is saved as `download_ctgov.zip` in the `download` folder under the working directory specified above and is extracted into that same folder.

For clinicaltrials.gov, a search term needs to be specified. In this example, we'll download search results for the term "seizure".

In [3]:
def download_ctgov(dest_dir, search_term):
    """Download the clinicaltrials.gov search results for ``search_term`` and unzip them.

    The zipped XML export is saved as ``download_ctgov.zip`` inside ``dest_dir``
    and then extracted into ``dest_dir``.

    Parameters:
        dest_dir: existing directory that receives the archive and its contents.
        search_term: free-text search term; it is URL-encoded by ``requests``.

    Returns:
        ``dest_dir``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipfile: if the downloaded payload is not a valid zip archive.
    """
    print("Downloading clinicaltrials.gov results for '%s' to %s" % (search_term, dest_dir))
    dl_url = "https://clinicaltrials.gov/ct2/results/download"
    # Passing the query as parameters (rather than formatting it into the URL)
    # lets requests escape spaces and special characters in the search term.
    query = [("down_stds", "all"),
             ("down_typ", "results"),
             ("down_flds", "all"),
             ("down_fmt", "xml"),
             ("term", search_term),
             ("show_down", "Y")]

    # Download the zipped data and extract it to the output directory
    out_path = os.path.join(dest_dir, "download_ctgov.zip")
    # stream=True makes iter_content read the body chunk by chunk instead of
    # loading the whole archive into memory first.
    r = requests.get(dl_url, params=query, stream=True)
    try:
        # Fail early on an HTTP error instead of saving an error page as a .zip.
        r.raise_for_status()
        with open(out_path, 'wb') as fh:
            for block in r.iter_content(1024):
                fh.write(block)
    finally:
        r.close()
    with zipfile.ZipFile(out_path, 'r') as z:
        z.extractall(dest_dir)
    return dest_dir

In [7]:
# Fetch the results for the search term "seizure" into <working_dir>/download.
download_dir = __mkdir(working_dir, "download")
download_ctgov(download_dir, "seizure")

# Pandas import

Convert the downloaded XML data to a Pandas dataframe. The function reads every XML file in the given directory and returns one dataframe row per file; the dataframe is serialized to "ctgov.pckl" in the next section.

In [6]:
def ctgov_to_dataframe(src_dir):
    """Build a dataframe with one row per ``.xml`` file found in ``src_dir``.

    For every element that has text, the column name is the element's path
    with positional indices (``[n]``) removed, the ``/clinical_study/`` prefix
    dropped and ``/`` replaced by ``.``. A path that occurs once in a file
    yields a single stripped string; a path that occurs several times yields
    the list of its stripped texts.
    """
    print("Transforming cliniclatrials download (%s) to dataframe" % (src_dir))
    records = []
    xml_files = [name for name in os.listdir(src_dir) if name.endswith('.xml')]
    for file_name in xml_files:
        tree = objectify.parse(os.path.join(src_dir, file_name))
        root = tree.getroot()
        collected = defaultdict(list)
        for element in root.iter():
            if not element.text:
                continue
            # Drop the positional "[n]" suffixes so repeated elements share one key.
            path = re.sub(r'\[\d+\]', '', tree.getpath(element))
            column = path.replace('/clinical_study/', '').replace('/', '.')
            collected[column].append(element.text.strip())
        # Unwrap single-valued columns; keep multi-valued ones as lists.
        record = {}
        for column, values in collected.items():
            record[column] = values[0] if len(values) == 1 else values
        records.append(record)
    return pd.DataFrame(records)

# Writing dataframes
Transform the downloaded data to a Pandas dataframe and serialize it as a Python pickle ("ctgov.pckl" in the "data" folder).

In [7]:
# Parse the XML files in <working_dir>/download and pickle the resulting
# dataframe to <working_dir>/data/ctgov.pckl.
download_dir = __mkdir(working_dir, "download")
data_dir = __mkdir(working_dir, "data")
ct_df = ctgov_to_dataframe(download_dir)
ct_df.to_pickle(os.path.join(data_dir, 'ctgov.pckl'))

Transforming cliniclatrials download (C:\Study\CS102\project\project2\repro\CS109Project\download) to dataframe


# Reading dataframes

Read the pickled data back into Pandas and display the first 5 records. In this example, the pickled dataframe is serialized to "ctgov.pckl" in the "data" folder under the working directory.

In [5]:
# NOTE: the cell defining `__mkdir` must have been run first; on a fresh
# kernel this cell otherwise fails with a NameError.
data_dir = __mkdir(working_dir, "data")
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
# Display the first five records.
ctgov_data.head()

NameError: name '__mkdir' is not defined

# Extract criteria

Read in the serialized data from clinicaltrials.gov and extract the inclusion/exclusion criteria, one per row. The output is a dataframe with the columns criteria_id, NctId, Criteria, Include, Tokens and TokenCount.

In [9]:
def id_generator(first_val=0, inc_func=lambda val: val + 1):
    """Endless generator of ids.

    Yields ``first_val`` first; every following id is ``inc_func`` applied to
    the id yielded before it. With the defaults this produces 0, 1, 2, ...
    """
    current = first_val
    while True:
        yield current
        # The next id is only computed once the consumer asks for another value.
        current = inc_func(current)
        
        
def __process_criteria(data, get_criteria_id):
    """Yield one record per eligibility criterion found in a study row.

    Parameters:
        data: row tuple where ``data[1]`` is the NCT id and ``data[2]`` is the
            eligibility text block.
        get_criteria_id: iterator that supplies the next criteria id.

    The text is split on blank lines. A paragraph matching the
    "... inclusion criteria ...:" or "... exclusion criteria ...:" header
    pattern only switches the current inclusion flag and is not emitted; every
    other paragraph is tokenized and yielded as a dict with the keys
    ``criteria_id``, ``NctId``, ``Criteria``, ``Include``, ``Tokens`` and
    ``TokenCount``. Paragraphs before the first header count as inclusion
    criteria.

    Any exception (for example a NaN text block, which has no ``split``) is
    reported with ``print`` and ends processing of this row.
    """
    # `unicode` only exists on Python 2. Without this fallback the NameError
    # would be swallowed by the broad except below and every row silently lost.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    pat = r"^([\w\-]*\s*){0,5}%s criteria[\s\w\(\),]*:"
    inpat = re.compile(pat % 'inclusion', re.UNICODE)
    expat = re.compile(pat % 'exclusion', re.UNICODE)
    try:
        incl = True
        nct_id = data[1]
        txt = [_.strip() for _ in data[2].split(u'\n\n')]
        for l in txt:
            lowered = l.lower()  # headers are matched case-insensitively
            if inpat.match(lowered):
                incl = True
            elif expat.match(lowered):
                incl = False
            else:
                toks = nltk.word_tokenize(l)
                cri_id = next(get_criteria_id)
                s = {'criteria_id': cri_id, 'NctId': nct_id, 'Criteria': text_type(l), 'Include': incl, 'Tokens': toks, 'TokenCount': len(toks)}
                yield s
    except Exception as e:
        print("Error processing row %s: %s" % (data[2], e))

        
def extract_criteria(data):
    """Return a dataframe with one row per criterion extracted from ``data``.

    ``data`` must provide the columns ``id_info.nct_id`` and
    ``eligibility.criteria.textblock``. Criteria ids come from a single
    ``id_generator()`` shared across all rows.
    """
    print("Transforming data (extracting criteria)")
    criteria_ids = id_generator()
    source = data[['id_info.nct_id', 'eligibility.criteria.textblock']]
    records = []
    for row in source.itertuples():
        # row = (index, nct id, criteria text block)
        records.extend(__process_criteria(row, criteria_ids))
    return pd.DataFrame(records)

Transform the data and write the result to a file. (You'll notice that the script logs an error for one row. This is expected and results from that row being a "NaN".)

In [10]:
# Load the pickled clinicaltrials.gov dataframe from <working_dir>/data.
data_dir = __mkdir(working_dir, "data")
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
# Split each study's eligibility text into criteria and pickle the result.
criteria = extract_criteria(ctgov_data)
criteria.to_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))

Transforming data (extracting criteria)
Error processing row nan: 'float' object has no attribute 'split'


Read back the data and display a record selected by column value.

In [11]:
# Reload the extracted criteria and show the record whose criteria_id is 4.
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
criteria.loc[criteria['criteria_id'] == 4]

Unnamed: 0,Criteria,Include,NctId,TokenCount,Tokens,criteria_id
4,Patients.,True,NCT00001192,2,"[Patients, .]",4


In [12]:
# Show every criterion extracted for a single study.
criteria.loc[criteria['NctId'] == 'NCT01373190']

Unnamed: 0,Criteria,Include,NctId,TokenCount,Tokens,criteria_id
9611,1. Diagnosis of Partial/Focal Onset Epilepsy (...,True,NCT01373190,13,"[1, ., Diagnosis, of, Partial/Focal, Onset, Ep...",9611
9612,2. Ages 18-70,True,NCT01373190,4,"[2, ., Ages, 18-70]",9612
9613,1. Pregnancy,False,NCT01373190,3,"[1, ., Pregnancy]",9613
9614,2. Recent trauma such as motor vehicle acciden...,False,NCT01373190,15,"[2, ., Recent, trauma, such, as, motor, vehicl...",9614
9615,"3. Currently on medication, other than for epi...",False,NCT01373190,22,"[3, ., Currently, on, medication, ,, other, th...",9615
9616,4. If diagnosed with a condition which could a...,False,NCT01373190,15,"[4, ., If, diagnosed, with, a, condition, whic...",9616
9617,1. Irritable bowel syndrome,False,NCT01373190,5,"[1, ., Irritable, bowel, syndrome]",9617
9618,2. Crohn's disease,False,NCT01373190,5,"[2, ., Crohn, 's, disease]",9618
9619,3. Ulcerative colitis,False,NCT01373190,4,"[3, ., Ulcerative, colitis]",9619
9620,4. Migraine headache with abdominal manifestation,False,NCT01373190,7,"[4, ., Migraine, headache, with, abdominal, ma...",9620


# Tag, lemmatize, ngrammize

Processes the extracted criteria with the NLTK POS tagger and lemmatizer and generates ngrams of 1-3 words (note: while unigrams are technically duplicated as 'Tokens', it will be more convenient to allow this and keep them in one column with bigrams and trigrams). Preprocesses the tokens by removing special characters and punctuation. Lemmata and ngrams are lowercased.

In [13]:
def __lemmatise(lemmatizer, r):
    """Return ``(token, lemma)`` pairs for the ``(token, tag)`` pairs in ``r``.

    The first two characters of each tag select the WordNet part of speech
    (NN, JJ, VB, RB); any other tag is lemmatized as a noun. The token is
    lowercased before lemmatizing and the lemma is lowercased again afterwards.
    """
    wordnet = nltk.corpus.wordnet
    pos_by_prefix = {'NN': wordnet.NOUN, 'JJ': wordnet.ADJ, 'VB': wordnet.VERB,
                     'RB': wordnet.ADV}
    pairs = []
    for tagged in r:
        pos = pos_by_prefix.get(tagged[1][:2], wordnet.NOUN)
        lemma = lemmatizer.lemmatize(tagged[0].lower(), pos=pos).lower()
        pairs.append((tagged[0], lemma))
    return pairs


def tag_and_stem(data):
    """POS-tag, lemmatize and n-gram the tokens of every criterion in ``data``.

    ``data`` must provide the columns ``NctId``, ``Tokens`` and ``criteria_id``.
    Only tokens matching the filter pattern are kept: at least one word
    character, optionally mixed with ``string.punctuation`` characters.

    Returns a dataframe with one row per input row and the columns
    ``criteria_id``, ``NctId``, ``Tokens`` (the kept tokens), ``Tags``
    (``(token, tag)`` pairs), ``Lemmas`` (``(token, lemma)`` pairs) and
    ``Ngrams`` (all 1-, 2- and 3-grams over ``(lemma, tag)`` pairs).
    """
    print("Transforming data (tagging and lemmatising)")
    series = []
    lemmatizer = nltk.stem.WordNetLemmatizer()
    punct = '[%s]*' % re.escape(string.punctuation)
    pat = re.compile(r"^(%(p)s[\w\d]+%(p)s)+$" % {'p': punct}, re.UNICODE)
    # Itertuples is 50% faster than df.apply()
    for row in progressbar(data[['NctId', 'Tokens', 'criteria_id']].itertuples(), total=data.shape[0]):
        nct_id = row[1]
        # A real list (not filter()): on Python 3 filter() is a one-shot iterator
        # that the tagger would exhaust, leaving nothing usable for 'Tokens'.
        toks = [t for t in row[2] if pat.match(t)]
        cri_id = row[3]
        tags = nltk.pos_tag(toks)
        lemmas = __lemmatise(lemmatizer, tags)
        # (lemma, tag) pairs are the same for every n, so build them once.
        lemma_tags = [(lemma[1], tags[idx][1]) for idx, lemma in enumerate(lemmas)]
        ngrams = []
        for n in (1, 2, 3):
            ngrams += list(nltk.ngrams(lemma_tags, n))
        s = {'criteria_id': cri_id, 'NctId': nct_id, 'Tokens': toks, 'Tags': tags, 'Lemmas': lemmas, 'Ngrams': ngrams}
        series.append(s)
    df = pd.DataFrame(series)
    return df

Read in the extracted criteria (stored in "ct_criteria.pckl" in the previous step), tag, lemmatize and ngrammize the data and store it as "ct_tagged.pckl".

In [14]:
data_dir = __mkdir(working_dir, "data")

# Tag/lemmatize the extracted criteria and pickle the result as ct_tagged.pckl.
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
tagged = tag_and_stem(criteria)
tagged.to_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))

                                                                                                                                                               

Transforming data (tagging and lemmatising)




In [15]:
# Reload the tagged criteria and display the first five records.
tagged = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))
tagged.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((of, IN),), ((uncontrolled...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((by, I...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((of, IN),), ((any, DT),),...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((and, CC),), ((parent, NN...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


# Filter criteria
Filters out criteria composed entirely of function words and stopwords. Strips ngrams composed entirely of stop words/tags from the ngram list. By default this function uses the NLTK stopword list and all PTB tags except nouns. Additional lists of stop words and stop tags can be supplied with the keyword arguments "user_stop_words" and "user_stop_tags". Returns a tuple of dataframes, (filtered_criteria, excluded_criteria).

(Note: depending on the pandas version, this step may emit a `SettingWithCopyWarning`. This is a known false positive and does not affect the results.)

In [16]:
def __filter(values, idx, stops):
    """Return True when at least one item of ``values`` has an ``idx``-th field outside ``stops``.

    ``stops`` must be a set. An empty ``values`` returns False.
    """
    observed = set()
    for item in values:
        observed.add(item[idx])
    all_stops = observed <= stops
    return not all_stops


def filter_criteria(data, user_stop_words=None, user_stop_tags=None):
    """Split criteria into informative rows and rows made only of stop words/tags.

    Two passes are made, first on lemmas against the stop words, then on tags
    against the stop tags. Each pass (1) removes from every row's ``Ngrams``
    the ngrams whose lemmas (pass 1) or tags (pass 2) are all stops, and
    (2) moves rows whose ``Lemmas`` / ``Tags`` consist only of stops to the
    excluded set.

    Parameters:
        data: dataframe with the columns ``Ngrams``, ``Lemmas`` and ``Tags``.
            It is not modified; the function works on a copy.
        user_stop_words: optional iterable of extra stop words, added to the
            NLTK English stopword list.
        user_stop_tags: optional iterable of extra stop tags, added to the
            default tag list.

    Returns:
        ``(filtered, excluded)``: two dataframes that keep the original index
        labels. ``excluded`` is an empty ``DataFrame()`` when no row was
        excluded; ``filtered`` is empty when every row was excluded.
    """
    print("Filtering criteria")
    default_stop_words = nltk.corpus.stopwords.words('english')
    default_stop_tags = ["$", "''", "(", ")", ",", "--", ".", ":", "CC", "CD", "DT",
                         "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                         "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP",
                         "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                         "WDT", "WP", "WP$", "WRB", "``"]
    print("Filtering stops")
    stop_words = set(default_stop_words) | set(user_stop_words or [])
    stop_tags = set(default_stop_tags) | set(user_stop_tags or [])

    # Work on a private copy: assigning to 'Ngrams' must neither change the
    # caller's frame nor write into a slice (the SettingWithCopyWarning source).
    data = data.copy()
    excluded_parts = []
    for col, idx, stops in (
            ('Lemmas', 0, stop_words),
            ('Tags', 1, stop_tags)):  # Lemma filtering excludes 18 rows, tag filtering excludes 205
        # Ngram items are (lemma, tag) pairs, so idx picks the lemma or the tag.
        data['Ngrams'] = data['Ngrams'].apply(
            lambda row, idx=idx, stops=stops: [ngram for ngram in row if __filter(ngram, idx, stops)])
        # 'Lemmas' holds (token, lemma) and 'Tags' holds (token, tag) pairs,
        # so field 1 is the value compared against the stops in both passes.
        keep = data[col].apply(lambda pairs, stops=stops: __filter(pairs, 1, stops)).astype(bool)
        if (~keep).any():
            excluded_parts.append(data[~keep])
        data = data[keep].copy()
    excluded = pd.concat(excluded_parts) if excluded_parts else pd.DataFrame()
    return (data, excluded)

Read in the tagged criteria (stored in "ct_tagged.pckl" in the previous step), filter out noise and write the results to "ct_filtered.pckl" (the included criteria) and "ct_excluded.pckl" (the excluded criteria).

In [17]:
data_dir = __mkdir(working_dir, "data")

# Filter the tagged criteria; pickle the kept rows and the excluded rows separately.
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))
incl, excl = filter_criteria(criteria)
incl.to_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
excl.to_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))

Filtering criteria
Filtering stops


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
# Reload the kept criteria and display the first five records.
incl = pd.read_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
incl.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((seizure, NNS),), ((presen...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((histo...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((age, NN),), ((patient, N...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((parent, NNS),), ((guardi...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


In [19]:
# Reload the excluded criteria and display the first five records.
excl = pd.read_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))
excl.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
149,"[(Other, other)]",NCT00004399,[],"[(Other, JJ)]",[Other],149
610,"[(Other, other)]",NCT00047073,[],"[(Other, JJ)]",[Other],610
636,"[(Other, other)]",NCT00047073,[],"[(Other, JJ)]",[Other],636
959,"[(Other, other)]",NCT00068770,[],"[(Other, JJ)]",[Other],959
993,"[(Other, other)]",NCT00068770,[],"[(Other, JJ)]",[Other],993


# Test MetaMap installation

This will start the server, run a simple query in interactive mode and stop the server.

In [23]:
def start_metamap(mm_dir):
    """Launch the MetaMap ``skrmedpostctl`` start script from ``mm_dir``.

    The script is started through the shell with ``mm_dir`` as its working
    directory; the call returns immediately without waiting for it to finish.
    The notebook's own working directory is left unchanged.

    Raises:
        KeyError: if ``platform.system()`` is not one of the known platforms.
    """
    # platform.system() reports macOS as "Darwin"; the "MacOS" key is kept only
    # for backward compatibility.
    start_scripts = {"Linux": "bin/skrmedpostctl start",
                     "Windows": r"bin\skrmedpostctl_start.bat",
                     "Darwin": "bin/skrmedpostctl start",
                     "MacOS": "bin/skrmedpostctl start"}
    start_script = start_scripts[platform.system()]
    # cwd= runs the script from the MetaMap folder without os.chdir(), which
    # would silently change what os.getcwd() returns for the rest of the notebook.
    subprocess.Popen(start_script, shell=True, cwd=mm_dir)
    # NOTE(review): a call starting `bin/wsdserverctl` was commented out here
    # originally and remains disabled.

In [24]:
def stop_metamap(mm_dir):
    """Launch the MetaMap ``skrmedpostctl`` stop script from ``mm_dir``.

    The script is started through the shell with ``mm_dir`` as its working
    directory; the call returns immediately without waiting for it to finish.
    The notebook's own working directory is left unchanged.

    Raises:
        KeyError: if ``platform.system()`` is not one of the known platforms.
    """
    # platform.system() reports macOS as "Darwin"; the "MacOS" key is kept only
    # for backward compatibility.
    stop_scripts = {"Linux": "bin/skrmedpostctl stop",
                    "Windows": r"bin\skrmedpostctl_stop.bat",
                    "Darwin": "bin/skrmedpostctl stop",
                    "MacOS": "bin/skrmedpostctl stop"}
    stop_script = stop_scripts[platform.system()]
    # cwd= runs the script from the MetaMap folder without os.chdir(), which
    # would silently change what os.getcwd() returns for the rest of the notebook.
    subprocess.Popen(stop_script, shell=True, cwd=mm_dir)

In [20]:
def test_metamap(mm_dir):
    """Pipe the phrase "common flu" through MetaMap (``-I``) and print its output.

    The command runs through the shell with ``mm_dir`` as its working
    directory; the notebook's own working directory is left unchanged.

    Raises:
        KeyError: if ``platform.system()`` is not one of the known platforms.
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    # platform.system() reports macOS as "Darwin"; the "MacOS" key is kept only
    # for backward compatibility.
    mm_scripts = {"Linux": 'echo "common flu" | ./bin/metamap -I',
                  "Windows": r'echo "common flu" | bin\metamap.bat -I',
                  "Darwin": 'echo "common flu" | ./bin/metamap -I',
                  "MacOS": 'echo "common flu" | ./bin/metamap -I'}
    mm_script = mm_scripts[platform.system()]
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(subprocess.check_output(mm_script, shell=True, cwd=mm_dir))

##### Set here the path to the MetaMap installation folder:

In [34]:
# Path to the MetaMap installation folder (the one containing "bin").
# Set the METAMAP_DIR environment variable to override the default below
# without editing the notebook.
mm_dir = os.environ.get("METAMAP_DIR", "C:\\Study\\CS102\\project\\public_mm_win32_main_2014\\public_mm\\")

In [35]:
# Smoke test: run a sample phrase through the MetaMap installation at mm_dir.
test_metamap(mm_dir)


C:\Study\CS102\project\public_mm_win32_main_2014\public_mm>set path=C:\Anaconda\lib\site-packages\numpy\core;C:\ProgramData\Oracle\Java\javapath;C:\Program Files\Common Files\Microsoft Shared\Windows Live;C:\Program Files (x86)\Common Files\Microsoft Shared\Windows Live;C:\Program Files (x86)\Intel\iCLS Client\;C:\Program Files\Intel\iCLS Client\;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Program Files (x86)\Intel\OpenCL SDK\3.0\bin\x86;C:\Program Files (x86)\Intel\OpenCL SDK\3.0\bin\x64;C:\Program Files\Intel\Intel(R) Management Engine Components\DAL;C:\Program Files\Intel\Intel(R) Management Engine Components\IPT;C:\Program Files (x86)\Intel\Intel(R) Management Engine Components\DAL;C:\Program Files (x86)\Intel\Intel(R) Management Engine Components\IPT;c:\Program Files (x86)\Microsoft SQL Server\100\Tools\Binn\;c:\Program Files\Microsoft SQL Server\100\Tools\Binn\;c:\Program Files\Microsoft SQL Server\100\DTS\Binn\;C:\Program Files (x86)\Microsoft ASP.NET\ASP.NET We

# Process the data with MetaMap

#### If you did not install metamap, skip to section "Process MetaMap results"

Dump the ngrams to a text file in the "list of terms with IDs" format (id: "{criteria id}-{ngram index in criteria}") and tag it with MetaMap. Outputs the "fielded MMI" format.

In [23]:
def prepare_mm_input(df, dest_file, verbose=True):
    """Write the ngrams of ``df`` to ``dest_file``, one ``id|term`` line per ngram.

    The id is ``<criteria_id>-<index of the ngram within its row>`` and the
    term is the first field of each ngram item joined with spaces. The file is
    written as ASCII; characters that cannot be encoded are dropped. Lines end
    with ``os.linesep``.

    Parameters:
        df: dataframe with the columns ``Ngrams`` and ``criteria_id``.
        dest_file: path of the file to (over)write.
        verbose: print a progress message when true.

    Returns:
        ``dest_file``.
    """
    if verbose:
        print("Generating MetaMap input at %s" % dest_file)
    out = codecs.open(dest_file, 'w', encoding='ascii', errors='ignore')
    with out:
        for _, row_ngrams, criteria_id in df[['Ngrams', 'criteria_id']].itertuples():
            for position, ngram in enumerate(row_ngrams):
                term_id = '-'.join((str(criteria_id), str(position)))
                words = ' '.join(item[0] for item in ngram)
                out.write(term_id + '|' + words + os.linesep)
    return dest_file

In [24]:
def run_metamap(mm_dir, src_file, dest_file, verbose=True):
    """Run MetaMap on ``src_file`` and have it write its results to ``dest_file``.

    The command runs through the shell with ``mm_dir`` as its working
    directory (the notebook's own working directory is left unchanged). Its
    standard output is consumed until it ends; with ``verbose`` it drives a
    progress bar.

    Parameters:
        mm_dir: MetaMap installation folder (the one containing "bin").
        src_file: input file in the "list of terms with IDs" format.
        dest_file: file MetaMap should write to.
        verbose: print a status line and show a progress bar when true.

    Returns:
        ``dest_file``, or ``None`` if an exception occurred. The exception is
        printed so that failures are visible.
    """
    try:
        # Count input lines with a context manager so the handle is closed.
        with open(src_file, 'r') as src:
            num_lines = sum(1 for _ in src)
        # Progress-bar estimate of how many lines MetaMap prints.
        # NOTE(review): presumably ~2 lines per term plus a fixed banner - TODO confirm.
        total = 2*num_lines + 23
        if verbose:
            print("Running MetaMap on file %s, writing to %s" % (src_file, dest_file))
        # platform.system() reports macOS as "Darwin"; the "MacOS" key is kept
        # only for backward compatibility.
        mm_scripts = {"Linux": './bin/metamap --sldiID -z -i -N %s %s',
                      "Windows": r'bin\metamap.bat --sldiID -z -i -N %s %s',
                      "Darwin": './bin/metamap -I --sldiID -z -i -N %s %s',
                      "MacOS": './bin/metamap -I --sldiID -z -i -N %s %s'}
        mm_script = mm_scripts[platform.system()] % (src_file, dest_file)
        process = subprocess.Popen(mm_script, stdout=subprocess.PIPE, shell=True, cwd=mm_dir)
        # b'' is the end-of-stream sentinel on both Python 2 and 3; with ''
        # the loop would never terminate on Python 3, where readline returns bytes.
        output_lines = iter(process.stdout.readline, b'')
        if verbose:
            output_lines = progressbar(output_lines, total=total)
        for _ in output_lines:
            pass  # output is only consumed to drive the progress bar
        process.stdout.close()
        returncode = process.wait()
        if returncode != 0:
            print("MetaMap exited with status %s for %s" % (returncode, src_file))
        return dest_file
    except Exception as e:
        # Keep the best-effort contract (return None) but report the failure.
        print("MetaMap run failed for %s: %s" % (src_file, e))
        return None

In [25]:
def process_mm_serial(data):
    """Run MetaMap over all of ``data`` in a single pass and print the elapsed time.

    Uses the notebook-level ``data_dir`` and ``mm_dir``. The input is written
    to ``mm_in.txt`` and MetaMap is asked to write to ``mm_out.txt``, both in
    ``data_dir``.
    """
    start = time.time()

    mm_in = prepare_mm_input(data, os.path.join(data_dir, 'mm_in.txt'))
    run_metamap(mm_dir, mm_in, os.path.join(data_dir, 'mm_out.txt'))

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Done. Serial processing time: %.1f sec' % round(time.time() - start, 1))

In [26]:
def process_mm_parallel(data, n_jobs):
    """Run MetaMap over ``data`` split into ``n_jobs`` chunks processed as background jobs.

    Uses the notebook-level ``data_dir`` and ``mm_dir``. Chunk ``i`` is written
    to ``mm_in_<i>.txt`` and MetaMap is asked to write to ``mm_out_<i>.txt``,
    both in ``data_dir``. Job status is polled every 10 seconds until all jobs
    have completed or at least one has died; the job results and the elapsed
    time are then printed.
    """
    import numpy as np
    from IPython.lib.backgroundjobs import BackgroundJobManager

    jobs = BackgroundJobManager()

    # each job will get df chunk, prepare mm input and give it to mm, returning path to mm output file
    def mm_job(i, _df):
        mm_in = prepare_mm_input(_df, os.path.join(data_dir, 'mm_in_{}.txt'.format(i)), verbose=False)
        m_out = run_metamap(mm_dir, mm_in, os.path.join(data_dir, 'mm_out_{}.txt'.format(i)), verbose=False)
        return m_out

    start = time.time()

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Splitting dataframe..')
    dfs = np.array_split(data, n_jobs)

    print('Done. Starting jobs..')
    for i, _df in enumerate(dfs):
        jobs.new(mm_job, i, _df, daemon=True)

    # Ping jobs status each 10 seconds while we dont complete all jobs or have error.
    while len(jobs.dead) == 0 and len(jobs.completed) < n_jobs:
        time.sleep(10)
        print('%d/%d jobs completed..' % (len(jobs.completed), n_jobs))

    print('%d/%d jobs completed. Jobs results:' % (len(jobs.completed), n_jobs))
    for job_n in jobs.all.keys():
        print(jobs.result(job_n))

    jobs.flush()
    print('Done. Parallel processing time: %.1f sec' % round(time.time() - start, 1))

In [27]:
def process_mm(data, conf):
    """Run MetaMap over ``data`` according to the settings in ``conf``.

    Parameters:
        data: dataframe of criteria (needs ``Ngrams`` and ``criteria_id``).
        conf: dict with the keys
            ``parallel``: true to process in parallel, false for a serial run;
            ``n_jobs``: number of parallel jobs (only read when ``parallel``);
            ``data_limit``: optional; when truthy, only the first
            ``data_limit`` rows are processed.
    """
    # split data if we have data_limit in config (a missing key means "no limit")
    limit = conf.get('data_limit')
    _data = data if not limit else data[:limit]

    # print(...) with a single argument behaves the same on Python 2 and 3.
    if conf['parallel']:
        print('Processing %d records in parallel with %d jobs..' % (_data.shape[0], conf['n_jobs']))
        process_mm_parallel(_data, conf['n_jobs'])
    else:
        print('Processing %d records in serial..' % _data.shape[0])
        process_mm_serial(_data)

# Settings
Here you can configure metamap processing options.

In [36]:
# Settings read by process_mm() and process_mm_results().
MM_CONFIG = {
    'parallel': True, # True: split the data across background jobs (about 2x faster); False: single serial run
    'n_jobs': 2, # number of parallel jobs; 2 jobs processed all the data in ~1.5 hours on the author's machine
    'data_limit': None # process only the first N records; None processes all data
}

In [37]:
data_dir = __mkdir(working_dir, "data")

# Load the filtered criteria that will be sent to MetaMap.
criteria = pd.read_pickle(os.path.join(data_dir, "ct_filtered.pckl"))



In [38]:
# Preview the first two records of the MetaMap input data.
criteria.head(2)

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((seizure, NNS),), ((presen...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((histo...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1


In [39]:
# Run MetaMap over the filtered criteria using the settings in MM_CONFIG (long-running).
process_mm(criteria, MM_CONFIG)

Processing 17144 records in parallel with 2 jobs..
Splitting dataframe..
Done. Starting jobs..
Starting job # 0 in a separate thread.
Starting job # 2 in a separate thread.
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs completed..
0/2 jobs

# Process MetaMap results

Convert the MetaMap output files to a Pandas dataframe, keeping only the concepts whose score is at least 5.5, and add the NCT id of the study each criterion belongs to.

In [40]:
def pass_threshold(score, threshold=5.5):
    """Return True when ``score`` is at least ``threshold`` (5.5 by default)."""
    return score >= threshold


def metamap_to_dataframe(src_file):
    """Parse a pipe-delimited MetaMap output file into a dataframe.

    Each non-blank line is split on ``|``. Field 0 is the
    ``<criteria id>-<ngram index>`` id, field 2 the score, field 3 the term,
    field 4 the CUI, field 5 the semantic type (surrounding brackets removed)
    and the last field a ``;``-separated list stored as ``cid``. Only rows
    whose score passes ``pass_threshold`` are kept. Blank lines are skipped.

    Returns:
        Dataframe with the columns ``criteria_id``, ``ngram_index``, ``score``,
        ``term``, ``cui``, ``stype`` and ``cid``.

    Raises:
        ValueError / IndexError: if a non-blank line does not have the expected layout.
    """
    print("Converting MetaMap results in %s to a dataframe." % src_file)
    data = []
    with open(src_file, 'r') as fh:
        for line in fh:
            # A stray empty line (e.g. at the end of the file) carries no record.
            if not line.strip():
                continue
            row = line.split("|")
            term_id = row[0]
            cri_id, ngrami = map(int, term_id.split("-"))
            score = float(row[2])
            term = row[3]
            cui = row[4]
            stype = row[5].strip("[]")
            cid = row[-1].strip().split(";")
            if pass_threshold(score):
                data.append([cri_id, ngrami, score, term, cui, stype, cid])
    df = pd.DataFrame(columns=["criteria_id", "ngram_index", "score", "term", "cui", "stype", "cid"], data=data)
    return df

def process_mm_results(conf):
    """Load the MetaMap output file(s) from the notebook-level ``data_dir`` as one dataframe.

    With ``conf['parallel']`` true, the files ``mm_out_<i>.txt`` for
    ``i`` in ``range(conf['n_jobs'])`` are converted and concatenated;
    otherwise the single file ``mm_out.txt`` is converted.

    The returned dataframe always has a fresh 0..n-1 index, so the parallel
    result is indexed like the serial one (no duplicate labels).
    """
    if conf['parallel']:
        # collect mm_out_... files, transform them to dataframes & concat in resulting mm:
        mm_dfs = []
        for i in range(conf['n_jobs']):
            mm_df = metamap_to_dataframe(os.path.join(data_dir, "mm_out_{}.txt".format(i)))
            mm_dfs.append(mm_df)
        # Each chunk is numbered from 0; renumber so labels are unique.
        mm = pd.concat(mm_dfs, ignore_index=True)
        return mm
    else:
        mm = metamap_to_dataframe(os.path.join(data_dir, "mm_out.txt"))
        return mm
    
def extend_with_nctids(mm_df, criteria_df):
    """Insert an ``nct_id`` column as the first column of ``mm_df``.

    For every row of ``mm_df`` the NCT id is looked up by ``criteria_id`` in
    ``criteria_df`` (columns ``criteria_id`` and ``NctId``). If a criteria id
    occurs more than once in ``criteria_df``, its first occurrence wins.

    ``mm_df`` is modified in place and also returned.

    Raises:
        KeyError: if a criteria id of ``mm_df`` is missing from ``criteria_df``.
        ValueError: if ``mm_df`` already has an ``nct_id`` column.
    """
    # Build the id -> NCT id mapping once instead of scanning criteria_df for
    # every MetaMap row.
    nct_by_criteria = {}
    for cri_id, nct_id in zip(criteria_df['criteria_id'], criteria_df['NctId']):
        nct_by_criteria.setdefault(cri_id, nct_id)  # keep the first occurrence
    nctids = [nct_by_criteria[cri_id] for cri_id in mm_df['criteria_id']]
    mm_df.insert(0, 'nct_id', nctids)
    return mm_df

In [41]:
data_dir = __mkdir(working_dir, "data")

# Collect the MetaMap output written by the run configured in MM_CONFIG.
mm = process_mm_results(MM_CONFIG)

# Look up the NCT id of each criterion in the extracted-criteria dataframe.
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
mm = extend_with_nctids(mm, criteria)

mm.to_pickle(os.path.join(data_dir, "mm.pckl"))

Converting MetaMap results in C:\Study\CS102\project\project2\repro\CS109Project\data\mm_out_0.txt to a dataframe.
Converting MetaMap results in C:\Study\CS102\project\project2\repro\CS109Project\data\mm_out_1.txt to a dataframe.


Load the serialized MetaMap results and display sample data.

In [10]:
# Reload the pickled MetaMap results and display up to the first 1000 rows.
data_dir = __mkdir(working_dir, "data")
mm = pd.read_pickle(os.path.join(data_dir, "mm.pckl"))
mm.head(1000)

Unnamed: 0,nct_id,criteria_id,ngram_index,score,term,cui,stype,cid
0,NCT00001149,0,0,11.49,Historical aspects qualifier,C0019665,inpr,[x.x.x]
1,NCT00001149,0,0,8.34,History,C0019664,ocdi,[K01.400]
2,NCT00001149,0,1,17.80,Seizures,C0036572,sosy,"[C10.228.140.490.631, C10.597.742, C23.888.592..."
3,NCT00001149,0,3,8.34,Time,C0040223,tmco,[G01.910]
4,NCT00001149,0,7,17.80,Seizures,C0036572,sosy,"[C10.228.140.490.631, C10.597.742, C23.888.592..."
5,NCT00001149,0,10,9.95,Historical aspects qualifier,C0019665,inpr,[x.x.x]
6,NCT00001149,0,10,6.79,History,C0019664,ocdi,[K01.400]
7,NCT00001149,0,11,16.26,Seizures,C0036572,sosy,"[C10.228.140.490.631, C10.597.742, C23.888.592..."
8,NCT00001149,0,12,16.26,Seizures,C0036572,sosy,"[C10.228.140.490.631, C10.597.742, C23.888.592..."
9,NCT00001149,0,14,6.79,Time,C0040223,tmco,[G01.910]
