# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [2]:
# coding: utf-8
!pip install tqdm
import csv
import os
import re
import shutil
import string
import zipfile
import sys
from collections import defaultdict
from lxml import objectify
import codecs
import nltk
import pandas as pd
import requests
import tarfile
import subprocess
import platform
import time
from tqdm import tqdm as progressbar # pandas df usage: 'for row in progressbar(df.itertuples(), total=df.shape[0])'



Helper function to create a directory under the specified path, gracefully handling errors.

In [4]:
def __mkdir(*args):
    path = os.path.join(*args)
    try: 
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
    return path

In [6]:
# Create the project directory holding the downloaded data, serialized dataframes and MetaMap install.
# working_dir = __mkdir(os.path.expanduser("~"), "Medframes")

# Set working directory as the current directory of the ipython notebook
working_dir = os.getcwd()
download_dir = __mkdir(working_dir, "download")
print("Working directory: %s" % working_dir)
print("Download directory: %s" % download_dir)

Working directory: C:\Study\CS102\project\project2\repro\CS109Project
Download directory: C:\Study\CS102\project\project2\repro\CS109Project\download


# Data download

Download CSV data from clinicaltrials.gov. The data will be written in the working directory specified above as  [data_dir]/study_fields.csv.

For clinicaltrials.gov, a search term needs to be specified. In this example, we'll download search results for the term "seizure".

In [4]:
def download_ctgov(dest_dir, search_term):
    print("Downloading clinicaltrials.gov results for '%s' to %s" % (search_term, dest_dir))
    dl_url = "https://clinicaltrials.gov/ct2/results/download?down_stds=all&down_typ=results&down_flds=all&down_fmt=xml&term=%s&show_down=Y" % search_term

    # Download the zipped data and extract it to the output directory
    out_path = os.path.join(dest_dir, "download_ctgov.zip")
    with open(out_path, 'wb') as fh:
        r = requests.get(dl_url)
        for block in r.iter_content(1024):
            fh.write(block)
    with zipfile.ZipFile(out_path, 'r') as z:
        z.extractall(dest_dir)
    return dest_dir

In [5]:
download_ctgov(download_dir, "seizure")

Downloading clinicaltrials.gov results for 'seizure' to C:\Study\CS102\project\project2\repro\CS109Project\download


'C:\\Study\\CS102\\project\\project2\\repro\\CS109Project\\download'

# Pandas import

Convert the downloaded CSV data to Pandas dataframes and serialize them as Python pickles. The function reads XML files from the working directory and writes to "ctgov.pckl"

In [6]:
def ctgov_to_dataframe(src_dir):
    # Get all XML files in the data directory
    print("Transforming cliniclatrials download (%s) to dataframe" % (src_dir))
    data = []
    for f in [_ for _ in os.listdir(src_dir) if _.endswith('.xml')]:
        xml = objectify.parse(os.path.join(src_dir, f))
        root = xml.getroot()
        d = defaultdict(list)
        for t in root.iter():
            if t.text:
                key = re.sub(r'\[\d+\]', '', xml.getpath(t)).replace('/clinical_study/', '').replace('/', '.')
                val = t.text.strip()
                d[key].append(val)
        d = {k: v[0] if len(v) == 1 else v for k, v in d.items()}
        data.append(d)
    data_frame = pd.DataFrame(data)
    return data_frame

# Writing dataframes
Transform the downloaded data to Pandas dataframes and seialize them as Python pickles.

In [7]:
download_dir = __mkdir(working_dir, "download")
data_dir = __mkdir(working_dir, "data")
ct_df = ctgov_to_dataframe(download_dir)
ct_df.to_pickle(os.path.join(data_dir, 'ctgov.pckl'))

Transforming cliniclatrials download (C:\Study\CS102\project\project2\repro\CS109Project\download) to dataframe


# Reading dataframes

Read the pickled data back into Pandas and display the first 5 records. In this example, the pickled dataframe is serialized to "ctgov.pckl" in the working directory.

In [8]:
data_dir = __mkdir(working_dir, "data")
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
ctgov_data.head()

Unnamed: 0,acronym,arm_group.arm_group_label,arm_group.arm_group_type,arm_group.description,biospec_descr.textblock,biospec_retention,brief_summary.textblock,brief_title,clinical_results.baseline.group_list.group.description,clinical_results.baseline.group_list.group.title,...,sponsors.collaborator.agency,sponsors.collaborator.agency_class,sponsors.lead_sponsor.agency,sponsors.lead_sponsor.agency_class,start_date,study_design,study_type,target_duration,verification_date,why_stopped
0,,,,,,,This study is designed to evaluate patients wi...,Monitoring Patients With Uncontrolled Epilepsy,,,...,,,National Institute of Neurological Disorders a...,NIH,November 1975,,Observational,,January 2002,
1,,,,,,,This study will allow researchers to use vario...,Neuropsychological Evaluation of Psychiatric a...,,,...,,,National Institute of Mental Health (NIMH),NIH,October 1983,,Observational,,December 2006,
2,,,,,,,"The purpose of this study is to evaluate, trea...",Treatment of Patients With Cysticercosis With ...,,,...,,,National Institute of Allergy and Infectious D...,NIH,July 1985,Time Perspective: Prospective,Observational,,December 2014,
3,,,,,,,Our past ultrasound research has indicated a n...,Development of Normative Ultrasound Databases ...,,,...,,,National Institutes of Health Clinical Center ...,NIH,February 1987,,Observational,,January 2000,
4,,Copper histidine,Experimental,,,,Menkes Disease is a genetic disorder affecting...,Copper Histidine Therapy for Menkes Diseases,[Classic Menkes disease: Copper histidine trea...,"[Early, Late, Mild, Total]",...,,,Eunice Kennedy Shriver National Institute of C...,NIH,June 1990,Endpoint Classification: Safety/Efficacy Study...,Interventional,,September 2015,


# Extract criteria

Read in the serialized data from clinicaltrials.gov and extract inclusion/exlcusion criteria, one per row. Output a Series(id_info.nct_id, Criteria, Inclusion, TokenCount).

In [9]:
def id_generator(first_val=0, inc_func=lambda val: val + 1):
    """
        Simple id generator. It takes first val & increase function and yields ids as needed.
        Will return integers starting from 0 by default.
    """
    id = first_val
    while True:
        yield id
        id = inc_func(id)
        
        
def __process_criteria(data, get_criteria_id):
    """
        Extract inclusion and exclusion criteria from the clinical trials data.
        Then - Tokenize and write the extracted data to a frame.
    """
    pat = r"^([\w\-]*\s*){0,5}%s criteria[\s\w\(\),]*:"
    inpat = re.compile(pat % 'inclusion', re.UNICODE)
    expat = re.compile(pat % 'exclusion', re.UNICODE)
    try:
        incl = True
        nct_id = data[1]
        txt = [_.strip() for _ in data[2].split(u'\n\n')]
        for l in txt:
            if re.match(inpat, l.lower()):
                incl = True
            elif re.match(expat, l.lower()):
                incl = False
            else:
                toks = nltk.word_tokenize(l)
                cri_id = next(get_criteria_id)
                s = {'criteria_id': cri_id, 'NctId': nct_id, 'Criteria': unicode(l), 'Include': incl, 'Tokens': toks, 'TokenCount': len(toks)}
                yield s
    except Exception as e:
        print("Error processing row %s: %s" % (data[2], e))

        
def extract_criteria(data):
    """
        Extract inclusion and exclusion criteria from each clinical trial into
    """
    print("Transforming data (extracting criteria)")
    criteria_id_generator = id_generator()
    transformed = [s for row in data[['id_info.nct_id', 'eligibility.criteria.textblock']].itertuples() for s in
                   __process_criteria(row, criteria_id_generator)]
    df = pd.DataFrame(transformed)
    return df

Transform the data and write the result to a file. (You'll notice that the script logs an error for one row. This is expected and results from that row being a "NaN".)

In [10]:
# Read in the data
data_dir = __mkdir(working_dir, "data")
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
# Extract criteria
criteria = extract_criteria(ctgov_data)
criteria.to_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))

Transforming data (extracting criteria)
Error processing row nan: 'float' object has no attribute 'split'


Read back the data and display a record selected by column value.

In [11]:
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
criteria.loc[criteria['criteria_id'] == 4]

Unnamed: 0,Criteria,Include,NctId,TokenCount,Tokens,criteria_id
4,Patients.,True,NCT00001192,2,"[Patients, .]",4


In [12]:
criteria.loc[criteria['NctId'] == 'NCT01373190']

Unnamed: 0,Criteria,Include,NctId,TokenCount,Tokens,criteria_id
9611,1. Diagnosis of Partial/Focal Onset Epilepsy (...,True,NCT01373190,13,"[1, ., Diagnosis, of, Partial/Focal, Onset, Ep...",9611
9612,2. Ages 18-70,True,NCT01373190,4,"[2, ., Ages, 18-70]",9612
9613,1. Pregnancy,False,NCT01373190,3,"[1, ., Pregnancy]",9613
9614,2. Recent trauma such as motor vehicle acciden...,False,NCT01373190,15,"[2, ., Recent, trauma, such, as, motor, vehicl...",9614
9615,"3. Currently on medication, other than for epi...",False,NCT01373190,22,"[3, ., Currently, on, medication, ,, other, th...",9615
9616,4. If diagnosed with a condition which could a...,False,NCT01373190,15,"[4, ., If, diagnosed, with, a, condition, whic...",9616
9617,1. Irritable bowel syndrome,False,NCT01373190,5,"[1, ., Irritable, bowel, syndrome]",9617
9618,2. Crohn's disease,False,NCT01373190,5,"[2, ., Crohn, 's, disease]",9618
9619,3. Ulcerative colitis,False,NCT01373190,4,"[3, ., Ulcerative, colitis]",9619
9620,4. Migraine headache with abdominal manifestation,False,NCT01373190,7,"[4, ., Migraine, headache, with, abdominal, ma...",9620


# Tag, lemmatize, ngrammize

Processes the extracted criteria with the NLTK POS tagger and lemmatizer and generates ngrams of 1-3 words (note: while unigrams are technically duplicated as 'Tokens', it will be more convenient to allow this and keep them in one column with bigrams and trigrams). Preprocesses the tokens by removing special characters and punctuation. Lemmata and ngrams are lowercased.

In [13]:
def __lemmatise(lemmatizer, r):
    wn_tags = {'NN': nltk.corpus.wordnet.NOUN, 'JJ': nltk.corpus.wordnet.ADJ, 'VB': nltk.corpus.wordnet.VERB,
               'RB': nltk.corpus.wordnet.ADV}
    return [(t[0], lemmatizer.lemmatize(t[0].lower(), pos=wn_tags.get(t[1][:2], nltk.corpus.wordnet.NOUN)).lower()) for
            t in r]


def tag_and_stem(data):
    print("Transforming data (tagging and lemmatising)")
    series = []
    lemmatizer = nltk.stem.WordNetLemmatizer()
    punct = '[%s]*' % re.escape(string.punctuation)
    pat = re.compile(r"^(%(p)s[\w\d]+%(p)s)+$" % {'p': punct}, re.UNICODE)
    # Itertuples is 50% faster than df.apply()
    for row in progressbar(data[['NctId', 'Tokens', 'criteria_id']].itertuples(), total=data.shape[0]):
        nct_id = row[1]
        toks = filter(lambda t: re.match(pat, t), row[2])
        cri_id = row[3]
        tags = nltk.pos_tag(toks)
        lemmas = __lemmatise(lemmatizer, tags)
        ngrams = []
        for n in (1, 2, 3):
            ngrams += list(nltk.ngrams([(lemma[1], tags[idx][1]) for idx, lemma in enumerate(lemmas)], n))
        s = {'criteria_id': cri_id, 'NctId': nct_id, 'Tokens': toks, 'Tags': tags, 'Lemmas': lemmas, 'Ngrams': ngrams}
        series.append(s)
    df = pd.DataFrame(series)
    return df

Read in the extracted criteria (stored in "ct_criteria.pckl" in the previous step), tag, lemmatize and ngrammize the data and store it as "ct_tagged.pckl".

In [14]:
data_dir = __mkdir(working_dir, "data")

criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
tagged = tag_and_stem(criteria)
tagged.to_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))

                                                                                                                                                               

Transforming data (tagging and lemmatising)




In [15]:
tagged = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))
tagged.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((history, NN),), ((of, IN),), ((uncontrolled...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((seizure, NN),), ((frequency, NN),), ((by, I...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS),), ((of, IN),), ((any, DT),),...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((patient, NNS),), ((and, CC),), ((parent, NN...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


# Filter criteria
Filters out criteria composed entirely of function words and stopwords. Strips ngrams composed entirely of stop words/tags from the ngram list. By default this function uses the NTLK stopword list and all PTB tags except nouns. Additional lists of stop words and stop tags can be supplied with keyword arguments ("stop_words", "stop_tags"). Returns a tuple of dataframes, (filtered_criteria, excluded_criteria).

(Note: this step generates a SettingWithCopyWarning. This is known and is a false positive.)

In [16]:
def __filter(values, idx, stops):
    return not set([t[idx] for t in values]) <= stops


def filter_criteria(data, user_stop_words=[], user_stop_tags=[]):
    print("Filtering criteria")
    default_stop_words = nltk.corpus.stopwords.words('english')
    default_stop_tags = ["$", "''", "(", ")", ",", "--", ".", ":", "CC", "CD", "DT",
                         "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                         "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP",
                         "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                         "WDT", "WP", "WP$", "WRB", "``"]
    print("Filtering stops")
    stop_words = set(default_stop_words + user_stop_words)
    stop_tags = set(default_stop_tags + user_stop_tags)
    excluded = pd.DataFrame()
    for col, idx, stops in (
            ('Lemmas', 0, stop_words),
            ('Tags', 1, stop_tags)):  # Lemma filtering excludes 18 rows, tag filtering excludes 205
        data['Ngrams'] = data['Ngrams'].apply(lambda row: [ngram for ngram in row if __filter(ngram, idx, stops)])
        groups = data.groupby(lambda r: __filter(data[col].loc[r], 1, stops))
        data = groups.get_group(True)
        excluded = excluded.append(groups.get_group(False)) if groups.groups.has_key(False) else excluded
    return (data, excluded)

Read in the tagged criteria (stored in "ct_tagged.pckl" in the previous step), filter out noise and write the results to "ct_filtered.pckl" (the included criteria) and "ct_excluded.pckl" (the excluded criteria).

In [17]:
data_dir = __mkdir(working_dir, "data")

criteria = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))
incl, excl = filter_criteria(criteria)
incl.to_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
excl.to_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))

Filtering criteria
Filtering stops


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Remove ngram duplicates

In [8]:
data_dir = __mkdir(working_dir, "data")
incl = pd.read_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
incl['Ngrams'] = incl['Ngrams'].apply(lambda ngrams: list(set(ngrams)))
incl.to_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
incl.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(History, history), (of, of), (uncontrolled, ...",NCT00001149,"[((time, NN),), ((prior, RB), (to, TO), (admis...","[(History, NN), (of, IN), (uncontrolled, VBN),...","[History, of, uncontrolled, seizures, at, the,...",0
1,"[(Seizure, seizure), (frequency, frequency), (...",NCT00001149,"[((frequency, NN), (by, IN)), ((video, NN),), ...","[(Seizure, NN), (frequency, NN), (by, IN), (hi...","[Seizure, frequency, by, history, must, be, su...",1
2,"[(Patients, patient), (of, of), (any, any), (a...",NCT00001149,"[((patient, NNS), (of, IN)), ((any, DT), (age,...","[(Patients, NNS), (of, IN), (any, DT), (age, N...","[Patients, of, any, age, may, be, accepted]",2
3,"[(Patients, patient), (and, and), (parents, pa...",NCT00001149,"[((express, NN), (willingness, NN), (to, TO)),...","[(Patients, NNS), (and, CC), (parents, NNS), (...","[Patients, and, parents, or, guardians, if, ap...",3
4,"[(Patients, patient)]",NCT00001192,"[((patient, NNS),)]","[(Patients, NNS)]",[Patients],4


In [9]:
excl = pd.read_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))
excl.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
149,"[(Other, other)]",NCT00004399,[],"[(Other, JJ)]",[Other],149
610,"[(Other, other)]",NCT00047073,[],"[(Other, JJ)]",[Other],610
636,"[(Other, other)]",NCT00047073,[],"[(Other, JJ)]",[Other],636
959,"[(Other, other)]",NCT00068770,[],"[(Other, JJ)]",[Other],959
993,"[(Other, other)]",NCT00068770,[],"[(Other, JJ)]",[Other],993
