# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [2]:
# coding: utf-8
!pip install tqdm
import csv
import os
import re
import shutil
import string
import zipfile
import sys
from collections import defaultdict
from lxml import objectify
import codecs
import nltk
import pandas as pd
import requests
import tarfile
import subprocess
import platform
import time
from tqdm import tqdm as progressbar # pandas df usage: 'for row in progressbar(df.itertuples(), total=df.shape[0])'



Helper function to create a directory under the specified path, gracefully handling errors.

In [3]:
def __mkdir(*args):
    path = os.path.join(*args)
    try: 
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
    return path

In [4]:
# Create the project directory holding the downloaded data, serialized dataframes and MetaMap install.
# working_dir = __mkdir(os.path.expanduser("~"), "Medframes")

# Set working directory as the current directory of the ipython notebook
working_dir = os.getcwd()

download_dir = __mkdir(working_dir, "download", "diab")


print("Working directory: %s" % working_dir)
print("Download directory: %s" % download_dir)

Working directory: C:\Study\CS102\project\project2\repro2\CS109Project
Download directory: C:\Study\CS102\project\project2\repro2\CS109Project\download\diab


# Data download

Download CSV data from clinicaltrials.gov. The data will be written in the working directory specified above as  [data_dir]/study_fields.csv.

For clinicaltrials.gov, a search term needs to be specified. In this example, we'll download search results for the term "seizure".

In [7]:
def download_ctgov(dest_dir, search_term):
    print("Downloading clinicaltrials.gov results for '%s' to %s" % (search_term, dest_dir))
    dl_url = "https://clinicaltrials.gov/ct2/results/download?down_stds=all&down_typ=results&down_flds=all&down_fmt=xml&term=%s&show_down=Y" % search_term

    # Download the zipped data and extract it to the output directory
    out_path = os.path.join(dest_dir, "download_ctgov.zip")
    with open(out_path, 'wb') as fh:
        r = requests.get(dl_url)
        for block in r.iter_content(1024):
            fh.write(block)
    with zipfile.ZipFile(out_path, 'r') as z:
        z.extractall(dest_dir)
    return dest_dir

In [8]:
download_ctgov(download_dir, "type 2 diabetes")

Downloading clinicaltrials.gov results for 'type 2 diabetes' to C:\Study\CS102\project\project2\repro2\CS109Project\download\diab


'C:\\Study\\CS102\\project\\project2\\repro2\\CS109Project\\download\\diab'

# Pandas import

Convert the downloaded CSV data to Pandas dataframes and serialize them as Python pickles. The function reads XML files from the working directory and writes to "ctgov.pckl"

In [9]:
def ctgov_to_dataframe(src_dir):
    # Get all XML files in the data directory
    print("Transforming cliniclatrials download (%s) to dataframe" % (src_dir))
    data = []
    for f in [_ for _ in os.listdir(src_dir) if _.endswith('.xml')]:
        xml = objectify.parse(os.path.join(src_dir, f))
        root = xml.getroot()
        d = defaultdict(list)
        for t in root.iter():
            if t.text:
                key = re.sub(r'\[\d+\]', '', xml.getpath(t)).replace('/clinical_study/', '').replace('/', '.')
                val = t.text.strip()
                d[key].append(val)
        d = {k: v[0] if len(v) == 1 else v for k, v in d.items()}
        data.append(d)
    data_frame = pd.DataFrame(data)
    return data_frame

# Writing dataframes
Transform the downloaded data to Pandas dataframes and seialize them as Python pickles.

In [13]:
download_dir = __mkdir(working_dir, "download", "diab")
data_dir = __mkdir(working_dir, "data", "diab")
ct_df = ctgov_to_dataframe(download_dir)
ct_df.to_pickle(os.path.join(data_dir, 'ctgov.pckl'))

Transforming cliniclatrials download (C:\Study\CS102\project\project2\repro2\CS109Project\download\diab) to dataframe


# Reading dataframes

Read the pickled data back into Pandas and display the first 5 records. In this example, the pickled dataframe is serialized to "ctgov.pckl" in the working directory.

In [14]:
data_dir = __mkdir(working_dir, "data", "diab")
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
ctgov_data.head()

Unnamed: 0,acronym,arm_group.arm_group_label,arm_group.arm_group_type,arm_group.description,biospec_descr.textblock,biospec_retention,brief_summary.textblock,brief_title,clinical_results.baseline.group_list.group.description,clinical_results.baseline.group_list.group.title,...,sponsors.collaborator.agency,sponsors.collaborator.agency_class,sponsors.lead_sponsor.agency,sponsors.lead_sponsor.agency_class,start_date,study_design,study_type,target_duration,verification_date,why_stopped
0,ACCORD,"[Glycemia Trial: intensive control, Glycemia T...","[Experimental, Active Comparator, Experimental...",[Open label administration of oral anti-hyperg...,,,The purpose of this study is to prevent major ...,Action to Control Cardiovascular Risk in Diabe...,[Open label administration of oral anti-hyperg...,"[Glycemia Trial: Intensive Control, Glycemia T...",...,[National Institute of Diabetes and Digestive ...,"[NIH, NIH, NIH, U.S. Fed]","National Heart, Lung, and Blood Institute (NHLBI)",NIH,September 1999,"Allocation: Randomized, Endpoint Classificatio...",Interventional,,November 2014,
1,,,,,,,Patients with Zollinger-Ellison Syndrome suffe...,Combination Chemotherapy in Patients With Zoll...,,,...,,,National Institute of Diabetes and Digestive a...,NIH,September 1978,Endpoint Classification: Safety/Efficacy Study...,Interventional,,August 2003,
2,,,,,,,Patients with Zollinger-Ellison Syndrome suffe...,The Use of Oral Omeprazole and Intravenous Pan...,,,...,,,National Institute of Diabetes and Digestive a...,NIH,February 1983,Primary Purpose: Treatment,Interventional,,December 2007,
3,,,,,,,This study will examine the safety and effecti...,Interferon and Octreotide to Treat Zollinger-E...,,,...,,,National Institute of Diabetes and Digestive a...,NIH,October 1988,Primary Purpose: Treatment,Interventional,,September 2007,
4,,,,,,,In patients with Zollinger-Ellison Syndrome th...,Treatment of Zollinger-Ellison Syndrome,,,...,,,National Institute of Diabetes and Digestive a...,NIH,January 1989,,Observational,,December 2007,


# Extract criteria

Read in the serialized data from clinicaltrials.gov and extract inclusion/exlcusion criteria, one per row. Output a Series(id_info.nct_id, Criteria, Inclusion, TokenCount).

In [34]:
def id_generator(first_val=0, inc_func=lambda val: val + 1):
    """
        Simple id generator. It takes first val & increase function and yields ids as needed.
        Will return integers starting from 0 by default.
    """
    id = first_val
    while True:
        yield id
        id = inc_func(id)
        

def __tokenize(text):
    # extract only alphanumerics
    tokenizer=nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    # If first token is a number - remove it
    if len(tokens) > 0 and tokens[0].isdigit():
        tokens.pop(0)
    return tokens
        
def __process_criteria(data, get_criteria_id):
    """
        Extract inclusion and exclusion criteria from the clinical trials data.
        Then - Tokenize and write the extracted data to a frame.
    """
    pat = r"^([\w\-]*\s*){0,5}%s criteria[\s\w\(\),]*"
    inpat = re.compile(pat % 'inclusion', re.UNICODE)
    expat = re.compile(pat % 'exclusion', re.UNICODE)
    try:
        incl = True
        nct_id = data[1]
        txt = [_.strip() for _ in data[2].split(u'\n\n')]
        # Inclusion OR Exclusion
        for l in txt:
            if re.match(inpat, l.lower()):
                incl = True
                continue
            elif re.match(expat, l.lower()):
                incl = False
                continue
            else:
                toks = __tokenize(l)
                cri_id = next(get_criteria_id)
                s = {'criteria_id': cri_id, 'NctId': nct_id, 'Criteria': unicode(l), 'Include': incl, 'Tokens': toks, 'TokenCount': len(toks)}
                yield s
    except Exception as e:
        print("Error processing row %s: %s" % (data[2], e))

        
def extract_criteria(data):
    """
        Extract inclusion and exclusion criteria from each clinical trial into
    """
    print("Transforming data (extracting criteria)")
    criteria_id_generator = id_generator()
    transformed = [s for row in data[['id_info.nct_id', 'eligibility.criteria.textblock']].itertuples() for s in
                   __process_criteria(row, criteria_id_generator)]
    df = pd.DataFrame(transformed)
    return df

Transform the data and write the result to a file. (You'll notice that the script logs an error for one row. This is expected and results from that row being a "NaN".)

In [35]:
# Read in the data
data_dir = __mkdir(working_dir, "data", "diab")
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
# Extract criteria
criteria = extract_criteria(ctgov_data)
criteria.to_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))

Transforming data (extracting criteria)
Error processing row nan: 'float' object has no attribute 'split'
Error processing row nan: 'float' object has no attribute 'split'


Read back the data and display a record selected by column value.

In [36]:
criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
criteria.loc[criteria['criteria_id'] == 5]

Unnamed: 0,Criteria,Include,NctId,TokenCount,Tokens,criteria_id
5,Histologically proven gastrinoma;,True,NCT00001165,3,"[Histologically, proven, gastrinoma]",5


In [38]:
criteria.head(100)

Unnamed: 0,Criteria,Include,NctId,TokenCount,Tokens,criteria_id
0,"- Diagnosed with type 2 diabetes mellitus, as...",True,NCT00000620,54,"[Diagnosed, with, type, 2, diabetes, mellitus,...",0
1,"- For participants aged 40 years or older, hi...",True,NCT00000620,26,"[For, participants, aged, 40, years, or, older...",1
2,"- For participants aged 55 years or older, a ...",True,NCT00000620,40,"[For, participants, aged, 55, years, or, older...",2
3,- HbA1c 7.5%-9% (if on more drugs) or 7.5%-11...,True,NCT00000620,16,"[HbA1c, 7, 5, 9, if, on, more, drugs, or, 7, 5...",3
4,Subjects selected for this study will be patie...,True,NCT00001165,33,"[Subjects, selected, for, this, study, will, b...",4
5,Histologically proven gastrinoma;,True,NCT00001165,3,"[Histologically, proven, gastrinoma]",5
6,Evidence of metastatic disease or locally inva...,True,NCT00001165,19,"[Evidence, of, metastatic, disease, or, locall...",6
7,Progression of the tumor during the preceding ...,True,NCT00001165,9,"[Progression, of, the, tumor, during, the, pre...",7
8,The following pre-existing conditions will exc...,False,NCT00001165,11,"[The, following, pre, existing, conditions, wi...",8
9,Congestive heart failure;,False,NCT00001165,3,"[Congestive, heart, failure]",9


# Tag, lemmatize, ngrammize

Processes the extracted criteria with the NLTK POS tagger and lemmatizer and generates ngrams of 1-3 words (note: while unigrams are technically duplicated as 'Tokens', it will be more convenient to allow this and keep them in one column with bigrams and trigrams). Preprocesses the tokens by removing special characters and punctuation. Lemmata and ngrams are lowercased.

In [39]:
def __lemmatise(lemmatizer, r):
    wn_tags = {'NN': nltk.corpus.wordnet.NOUN, 'JJ': nltk.corpus.wordnet.ADJ, 'VB': nltk.corpus.wordnet.VERB,
               'RB': nltk.corpus.wordnet.ADV}
    return [(t[0], lemmatizer.lemmatize(t[0].lower(), pos=wn_tags.get(t[1][:2], nltk.corpus.wordnet.NOUN)).lower()) for
            t in r]


def tag_and_stem(data):
    print("Transforming data (tagging and lemmatising)")
    series = []
    lemmatizer = nltk.stem.WordNetLemmatizer()
    punct = '[%s]*' % re.escape(string.punctuation)
    pat = re.compile(r"^(%(p)s[\w\d]+%(p)s)+$" % {'p': punct}, re.UNICODE)
    # Itertuples is 50% faster than df.apply()
    for row in progressbar(data[['NctId', 'Tokens', 'criteria_id']].itertuples(), total=data.shape[0]):
        nct_id = row[1]
        toks = filter(lambda t: re.match(pat, t), row[2])
        cri_id = row[3]
        tags = nltk.pos_tag(toks)
        lemmas = __lemmatise(lemmatizer, tags)
        ngrams = []
        for n in (1, 2, 3):
            ngrams += list(nltk.ngrams([(lemma[1], tags[idx][1]) for idx, lemma in enumerate(lemmas)], n))
        s = {'criteria_id': cri_id, 'NctId': nct_id, 'Tokens': toks, 'Tags': tags, 'Lemmas': lemmas, 'Ngrams': ngrams}
        series.append(s)
    df = pd.DataFrame(series)
    return df

Read in the extracted criteria (stored in "ct_criteria.pckl" in the previous step), tag, lemmatize and ngrammize the data and store it as "ct_tagged.pckl".

In [40]:
data_dir = __mkdir(working_dir, "data", "diab")

criteria = pd.read_pickle(os.path.join(data_dir, 'ct_criteria.pckl'))
tagged = tag_and_stem(criteria)
tagged.to_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))

                                                                                                                                                               

Transforming data (tagging and lemmatising)




In [41]:
tagged = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))
tagged.head()

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(Diagnosed, diagnose), (with, with), (type, t...",NCT00000620,"[((diagnose, VBN),), ((with, IN),), ((type, NN...","[(Diagnosed, VBN), (with, IN), (type, NN), (2,...","[Diagnosed, with, type, 2, diabetes, mellitus,...",0
1,"[(For, for), (participants, participant), (age...",NCT00000620,"[((for, IN),), ((participant, NNS),), ((age, V...","[(For, IN), (participants, NNS), (aged, VBD), ...","[For, participants, aged, 40, years, or, older...",1
2,"[(For, for), (participants, participant), (age...",NCT00000620,"[((for, IN),), ((participant, NNS),), ((age, V...","[(For, IN), (participants, NNS), (aged, VBD), ...","[For, participants, aged, 55, years, or, older...",2
3,"[(HbA1c, hba1c), (7, 7), (5, 5), (9, 9), (if, ...",NCT00000620,"[((hba1c, NNP),), ((7, CD),), ((5, CD),), ((9,...","[(HbA1c, NNP), (7, CD), (5, CD), (9, CD), (if,...","[HbA1c, 7, 5, 9, if, on, more, drugs, or, 7, 5...",3
4,"[(Subjects, subject), (selected, select), (for...",NCT00001165,"[((subject, NNS),), ((select, VBN),), ((for, I...","[(Subjects, NNS), (selected, VBN), (for, IN), ...","[Subjects, selected, for, this, study, will, b...",4


# Filter criteria
Filters out criteria composed entirely of function words and stopwords. Strips ngrams composed entirely of stop words/tags from the ngram list. By default this function uses the NTLK stopword list and all PTB tags except nouns. Additional lists of stop words and stop tags can be supplied with keyword arguments ("stop_words", "stop_tags"). Returns a tuple of dataframes, (filtered_criteria, excluded_criteria).

(Note: this step generates a SettingWithCopyWarning. This is known and is a false positive.)

In [42]:
def __filter(values, idx, stops):
    return not set([t[idx] for t in values]) <= [x for x in stops]


def filter_criteria(data, user_stop_words=[], user_stop_tags=[]):
    print("Filtering criteria")
    default_stop_words = nltk.corpus.stopwords.words('english')
    default_stop_tags = ["$", "''", "(", ")", ",", "--", ".", ":", "CC", "CD", "DT",
                         "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                         "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP",
                         "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                         "WDT", "WP", "WP$", "WRB", "``"]
    print("Filtering stops")
    stop_words = set(default_stop_words + user_stop_words)
    stop_tags = set(default_stop_tags + user_stop_tags)
    excluded = pd.DataFrame()
    for col, idx, stops in (
            ('Lemmas', 0, stop_words),
            ('Tags', 1, stop_tags)):  
        data['Ngrams'] = data['Ngrams'].apply(lambda row: [ngram for ngram in row if __filter(ngram, idx, stops)])
        groups = data.groupby(lambda r: __filter(data[col].loc[r], 1, stops))
        data = groups.get_group(True)
        excluded = excluded.append(groups.get_group(False)) if groups.groups.has_key(False) else excluded
    return (data, excluded)

Read in the tagged criteria (stored in "ct_tagged.pckl" in the previous step), filter out noise and write the results to "ct_filtered.pckl" (the included criteria) and "ct_excluded.pckl" (the excluded criteria).

In [46]:
data_dir = __mkdir(working_dir, "data", "diab")

criteria = pd.read_pickle(os.path.join(data_dir, 'ct_tagged.pckl'))

In [48]:

incl, excl = filter_criteria(criteria)
incl.to_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
excl.to_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))

Filtering criteria
Filtering stops


### Remove ngram duplicates

In [50]:
data_dir = __mkdir(working_dir, "data", "diab")
incl = pd.read_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
incl['Ngrams'] = incl['Ngrams'].apply(lambda ngrams: list(set(ngrams)))
incl.to_pickle(os.path.join(data_dir, 'ct_filtered.pckl'))
incl.head(100)

Unnamed: 0,Lemmas,NctId,Ngrams,Tags,Tokens,criteria_id
0,"[(Diagnosed, diagnose), (with, with), (type, t...",NCT00000620,"[((association, NNP), (guideline, NNS), (which...","[(Diagnosed, VBN), (with, IN), (type, NN), (2,...","[Diagnosed, with, type, 2, diabetes, mellitus,...",0
1,"[(For, for), (participants, participant), (age...",NCT00000620,"[((or, CC), (carotid, JJ), (revascularization,...","[(For, IN), (participants, NNS), (aged, VBD), ...","[For, participants, aged, 40, years, or, older...",1
2,"[(For, for), (participants, participant), (age...",NCT00000620,"[((be, VB), (at, IN), (high, JJ)), ((to, TO),)...","[(For, IN), (participants, NNS), (aged, VBD), ...","[For, participants, aged, 55, years, or, older...",2
3,"[(HbA1c, hba1c), (7, 7), (5, 5), (9, 9), (if, ...",NCT00000620,"[((5, CD), (9, CD), (if, IN)), ((on, IN), (mor...","[(HbA1c, NNP), (7, CD), (5, CD), (9, CD), (if,...","[HbA1c, 7, 5, 9, if, on, more, drugs, or, 7, 5...",3
4,"[(Subjects, subject), (selected, select), (for...",NCT00001165,"[((suspected, NNP), (abnormality, NNPS), (of, ...","[(Subjects, NNS), (selected, VBN), (for, IN), ...","[Subjects, selected, for, this, study, will, b...",4
5,"[(Histologically, histologically), (proven, pr...",NCT00001165,"[((histologically, RB), (prove, VBN)), ((prove...","[(Histologically, RB), (proven, VBN), (gastrin...","[Histologically, proven, gastrinoma]",5
6,"[(Evidence, evidence), (of, of), (metastatic, ...",NCT00001165,"[((of, IN), (metastatic, JJ)), ((invasive, JJ)...","[(Evidence, NN), (of, IN), (metastatic, JJ), (...","[Evidence, of, metastatic, disease, or, locall...",6
7,"[(Progression, progression), (of, of), (the, t...",NCT00001165,"[((6, CD),), ((tumor, NN), (during, IN), (the,...","[(Progression, NN), (of, IN), (the, DT), (tumo...","[Progression, of, the, tumor, during, the, pre...",7
8,"[(The, the), (following, follow), (pre, pre), ...",NCT00001165,"[((condition, NNS), (will, MD), (exclude, VB))...","[(The, DT), (following, VBG), (pre, NN), (exis...","[The, following, pre, existing, conditions, wi...",8
9,"[(Congestive, congestive), (heart, heart), (fa...",NCT00001165,"[((congestive, JJ), (heart, NN), (failure, NN)...","[(Congestive, JJ), (heart, NN), (failure, NN)]","[Congestive, heart, failure]",9


In [53]:
excl = pd.read_pickle(os.path.join(data_dir, 'ct_excluded.pckl'))
excl.head()