# Initial setup

Let's import the required libraries and set up global variables for the rest of the script.

In [1]:
import requests
import zipfile
import tempfile
import shutil
import os
import pandas as pd
import pickle
import sys
import time
import csv
from lxml import objectify
import re
from collections import defaultdict

In [4]:
# Working directory for the downloaded data and the serialized Pandas dataframes.
# By default a fresh temporary directory is created on every run; tempfile.mkdtemp()
# does not remove it afterwards, so delete it manually when it is no longer needed.

data_dir = tempfile.mkdtemp()

# To reuse a fixed directory instead, assign its path here, e.g.:
# data_dir = r'C:\project\ClinicalTrials\data-dir'
print("Working directory: %s" % data_dir)



Working directory: C:\Study\CS102\project\data-dir


# Data download

Download the trial data from DRKS (as CSV) and clinicaltrials.gov (as XML). The data will be written in the working directory specified above as:

* For DRKS: as [data_dir]/trials.csv
* For clinicaltrials.gov: as XML files extracted directly into [data_dir]

The DRKS script downloads all data available on the page. In the case of clinicaltrials.gov, a search term needs to be specified. In this example, we'll download search results for the term "seizure".

In [3]:
def download_drks(dest_dir):
    """Download the DRKS trial export and extract ``trials.csv`` into ``dest_dir``.

    The results page is requested first so that the session receives the
    cookie expected by the download servlet. The servlet's response (a zip
    archive) is saved as ``download_drks.zip`` in ``dest_dir`` and
    ``trials.csv`` is then extracted from it.

    Args:
        dest_dir: Existing directory that receives the archive and the CSV.

    Raises:
        requests.exceptions.HTTPError: If the download request returns an
            HTTP error status.
        zipfile.BadZipfile: If the saved response is not a valid zip archive.
        KeyError: If the archive has no member named ``trials.csv``.
    """
    print("Downloading DRKS data to %s" % (os.path.join(dest_dir, 'trials.csv')))
    LIMIT = 10000  # Maximum number of trials requested from the servlet
    CHUNK_SIZE = 64 * 1024

    dl_url = 'http://drks-neu.uniklinik-freiburg.de/drks_web/DownloadTrialsServlet'

    # Set up the download request.
    # No explicit Content-Length: requests computes it from the encoded payload.
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
               "Accept-Encoding": "gzip, deflate",
               "Accept-Language": "en-GB,en;q=0.8,en-US;q=0.6,pl;q=0.4,fr;q=0.2",
               "Cache-Control": "no-cache",
               "Connection": "keep-alive",
               "Content-Type": "application/x-www-form-urlencoded",
               "Host": "drks-neu.uniklinik-freiburg.de",
               "Origin": "http://drks-neu.uniklinik-freiburg.de",
               "Pragma": "no-cache",
               "Referer": "http://drks-neu.uniklinik-freiburg.de/drks_web/navigate.do?navigationId=results",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36"}
    payload = {'QUANTITY': LIMIT, 'FORMAT': 'csv'}
    out_path = os.path.join(dest_dir, "download_drks.zip")

    with requests.Session() as session:
        # Initiate the HTTP session so that we have the auth cookie
        session.get('http://drks-neu.uniklinik-freiburg.de/drks_web/navigate.do?navigationId=results')
        # Re-send the cookies collected so far explicitly with the download request
        cookies = requests.utils.cookiejar_from_dict(requests.utils.dict_from_cookiejar(session.cookies))

        # Download the zipped data
        r = session.post(dl_url, headers=headers, data=payload, cookies=cookies, stream=True)
        try:
            # Fail loudly instead of saving an HTML error page as a ".zip"
            r.raise_for_status()
            with open(out_path, 'wb') as f:
                # iter_content() undoes gzip/deflate transfer encoding, which we
                # explicitly accept above; copying r.raw would store the
                # still-compressed bytes and corrupt the archive.
                for chunk in r.iter_content(CHUNK_SIZE):
                    f.write(chunk)
        finally:
            r.close()

    # Extract the CSV to the output directory
    with zipfile.ZipFile(out_path, 'r') as z:
        z.extract('trials.csv', dest_dir)

In [5]:
def download_ctgov(dest_dir, search_term):
    """Download clinicaltrials.gov search results as XML and unpack them.

    The zipped search results are saved as ``download_ctgov.zip`` in
    ``dest_dir`` and every member of the archive is extracted into
    ``dest_dir``.

    Args:
        dest_dir: Existing directory that receives the archive and its contents.
        search_term: Plain (not URL-encoded) search term; it is encoded here,
            so terms containing spaces or characters such as ``&`` are safe.

    Raises:
        requests.exceptions.HTTPError: If the server returns an HTTP error status.
        zipfile.BadZipfile: If the saved response is not a valid zip archive.
    """
    print("Downloading clinicaltrials.gov results for '%s' to %s" % (search_term, dest_dir))
    dl_url = "https://clinicaltrials.gov/ct2/results/download"
    # Passed as a sequence of pairs to keep the original parameter order;
    # requests takes care of URL-encoding the search term.
    query = [("down_stds", "all"),
             ("down_typ", "results"),
             ("down_flds", "all"),
             ("down_fmt", "xml"),
             ("term", search_term),
             ("show_down", "Y")]

    # Download the zipped data and extract it to the output directory
    out_path = os.path.join(dest_dir, "download_ctgov.zip")
    r = requests.get(dl_url, params=query, stream=True)
    try:
        # Check the status before creating the file so that a failed request
        # does not leave an empty or bogus archive behind.
        r.raise_for_status()
        with open(out_path, 'wb') as fh:
            for block in r.iter_content(1024):
                fh.write(block)
    finally:
        r.close()
    with zipfile.ZipFile(out_path, 'r') as z:
        z.extractall(dest_dir)

In [5]:
# Fetch the DRKS export into the working directory.
download_drks(data_dir)

Downloading DRKS data to /tmp/tmpwgkCAf/trials.csv


In [6]:
# Fetch the clinicaltrials.gov search results for the term "seizure" into the working directory.
download_ctgov(data_dir, "seizure")

Downloading clinicaltrials.gov results for 'seizure' to C:\Study\CS102\project\data-dir


# Pandas import

Convert the downloaded data (CSV for DRKS, XML for clinicaltrials.gov) to Pandas dataframes and serialize them as Python pickles.

You can run the functions providing the paths to the input and output files or specifying only the working directory. In the latter case, they will assume the following values:

* For DRKS: the function will expect the input file to be named "trials.csv" and will write the pickle to "drks.pckl"
* For clinicaltrials.gov: the function will read all XML files from the working directory and write to "ctgov.pckl"

In [7]:
def drks_to_dataframe(src_dir, src_file='trials.csv', dest_file='drks.pckl'):
    """Convert the DRKS CSV export to a pickled Pandas dataframe.

    The first CSV row is used as the column header. Rows whose number of
    fields differs from the header's are considered misformatted and are
    skipped. All values are kept as strings, exactly as read by ``csv``.

    Args:
        src_dir: Directory containing the CSV; the pickle is written here too.
        src_file: Name of the semicolon-delimited CSV file inside ``src_dir``.
        dest_file: Name of the pickle file to create inside ``src_dir``.

    Raises:
        ValueError: If the CSV file contains no rows at all.
    """
    # Set up input and output paths
    src_path = os.path.join(src_dir, src_file)
    dest_path = os.path.join(src_dir, dest_file)
    print("Serializing file %s to %s" % (src_path, dest_path))

    # Read in the CSV and convert it to a Pandas dataframe
    with open(src_path, 'r') as fh:
        reader = csv.reader(fh, delimiter=';', quotechar='"')
        header = next(reader, None)
        if header is None:
            raise ValueError("No header row found in %s" % src_path)
        # The expected field count comes from the header itself (322 for the
        # DRKS export at the time of writing) rather than a hard-coded number,
        # so the function keeps working if the export format gains columns.
        n_fields = len(header)
        # Some lines are misformatted. Ignore them.
        rows = [row for row in reader if len(row) == n_fields]
    data_frame = pd.DataFrame(data=rows, columns=header)
    data_frame.to_pickle(dest_path)

In [8]:
def ctgov_to_dataframe(src_dir, dest_file='ctgov.pckl'):
    """Convert the clinicaltrials.gov XML files in ``src_dir`` to a pickled dataframe.

    Every file in ``src_dir`` whose name ends with ``.xml`` becomes one row.
    Column names are the element paths with positional indices and the
    leading ``/clinical_study/`` removed and ``/`` replaced by ``.``
    (for example ``id_info.nct_id``). A path that occurs once in a file
    stores its stripped text; a path that occurs several times stores the
    list of stripped texts.

    Args:
        src_dir: Directory containing the XML files; the pickle is written here too.
        dest_file: Name of the pickle file to create inside ``src_dir``.
    """
    # Set up input and output paths
    dest_path = os.path.join(src_dir, dest_file)
    print("Serializing files in %s to %s" % (src_dir, dest_path))

    # Get all XML files in the data directory. os.listdir() returns names in
    # arbitrary order, so sort them to make the row order reproducible.
    xml_names = sorted(name for name in os.listdir(src_dir) if name.endswith('.xml'))

    records = []
    for name in xml_names:
        tree = objectify.parse(os.path.join(src_dir, name))
        texts_by_path = defaultdict(list)
        for node in tree.getroot().iter():
            # Only nodes that carry text contribute a value
            if node.text:
                # Drop positional predicates such as "[2]" so that repeated
                # elements share a single key
                path = re.sub(r'\[\d+\]', '', tree.getpath(node))
                key = path.replace('/clinical_study/', '').replace('/', '.')
                texts_by_path[key].append(node.text.strip())
        # Unwrap single-valued keys; keep lists for repeated elements
        record = {k: v[0] if len(v) == 1 else v for k, v in texts_by_path.items()}
        records.append(pd.Series(record))
    data_frame = pd.DataFrame(records)
    data_frame.to_pickle(dest_path)

In [9]:
# Convert [data_dir]/trials.csv into the pickled dataframe [data_dir]/drks.pckl (default file names).
drks_to_dataframe(data_dir) #If you need custom paths, run drks_to_dataframe(data_dir, [csv_file], [pickle_file])

Serializing file /tmp/tmpwgkCAf/trials.csv to /tmp/tmpwgkCAf/drks.pckl


In [9]:
# Convert all XML files in [data_dir] into the pickled dataframe [data_dir]/ctgov.pckl (default file name).
ctgov_to_dataframe(data_dir) #If you need a custom output file, run ctgov_to_dataframe(data_dir, [pickle_file])

Serializing files in C:\Study\CS102\project\data-dir to C:\Study\CS102\project\data-dir\ctgov.pckl


# Reading data

Read the pickled data back into Pandas and display the first 5 records. In this example, the pickled dataframes are in "drks.pckl" and "ctgov.pckl" in the working directory.

In [11]:
# Load the pickled DRKS dataframe from the working directory and preview its first rows.
drks_data = pd.read_pickle(os.path.join(data_dir, 'drks.pckl'))
drks_data.head()

Unnamed: 0,drksId,firstDrksPublishDate,firstPartnerPublishDate,investorInitiated,ethicCommitteeVote,ethic-commission.id,ethic-commission.name,secId.type0,secId.id0,secId.name0,...,publication.key3,publication.type3,publication.value3,publication.category4,publication.key4,publication.type4,publication.value4,url,last-update,Unnamed: 21
0,DRKS00000002,2008/08/08,2005/09/13,no,Approved,238/02,Ethik-Kommission der Albert-Ludwigs-Universitä...,EudraCT-No.<br/>(for studies acc. to Drug Law),2004-000232-91,[---]*,...,,,,,,,,http://drks-neu.uniklinik-freiburg.de//drks_we...,2015-05-18T10:24:14.565+02:00,
1,DRKS00000003,2008/08/08,2008/02/01,no,Approved,034/08,Ethik-Kommission der Albert-Ludwig-Universität...,EudraCT-Number,2007-005376-13,[---]*,...,,,,,,,,http://drks-neu.uniklinik-freiburg.de//drks_we...,2012-09-05T10:10:33.027+02:00,
2,DRKS00000005,2008/08/08,2006/07/18,yes,Approved,245/2006,Ethik-Kommission I der Medizinischen Fakultät ...,Primary Registry-ID,18452029,ISRCTN Register,...,,,,,,,,http://drks-neu.uniklinik-freiburg.de//drks_we...,2014-05-21T13:57:01.539+02:00,
3,DRKS00000006,2008/08/19,2005/09/13,yes,Approved,337/01,Ethik-Kommission der Albert-Ludwig-Universität...,Primary Registry-ID,NCT00515151,ClinicalTrials.gov,...,,,,,,,,http://drks-neu.uniklinik-freiburg.de//drks_we...,2012-09-05T10:16:17.217+02:00,
4,DRKS00000008,2008/09/05,[---]*,yes,Approved,0255.6,Medizinische Ethik-Kommission II Medizinische ...,,,,...,,,,,,,,http://drks-neu.uniklinik-freiburg.de//drks_we...,[---]*,


In [10]:
# Load the pickled clinicaltrials.gov dataframe from the working directory and preview its first rows.
ctgov_data = pd.read_pickle(os.path.join(data_dir, 'ctgov.pckl'))
ctgov_data.head()

Unnamed: 0,acronym,arm_group.arm_group_label,arm_group.arm_group_type,arm_group.description,biospec_descr.textblock,biospec_retention,brief_summary.textblock,brief_title,clinical_results.baseline.group_list.group.description,clinical_results.baseline.group_list.group.title,...,sponsors.collaborator.agency,sponsors.collaborator.agency_class,sponsors.lead_sponsor.agency,sponsors.lead_sponsor.agency_class,start_date,study_design,study_type,target_duration,verification_date,why_stopped
0,,,,,,,This study is designed to evaluate patients wi...,Monitoring Patients With Uncontrolled Epilepsy,,,...,,,National Institute of Neurological Disorders a...,NIH,November 1975,,Observational,,January 2002,
1,,,,,,,This study will allow researchers to use vario...,Neuropsychological Evaluation of Psychiatric a...,,,...,,,National Institute of Mental Health (NIMH),NIH,October 1983,,Observational,,December 2006,
2,,,,,,,"The purpose of this study is to evaluate, trea...",Treatment of Patients With Cysticercosis With ...,,,...,,,National Institute of Allergy and Infectious D...,NIH,July 1985,Time Perspective: Prospective,Observational,,December 2014,
3,,,,,,,Our past ultrasound research has indicated a n...,Development of Normative Ultrasound Databases ...,,,...,,,National Institutes of Health Clinical Center ...,NIH,February 1987,,Observational,,January 2000,
4,,Copper histidine,Experimental,,,,Menkes Disease is a genetic disorder affecting...,Copper Histidine Therapy for Menkes Diseases,[Classic Menkes disease: Copper histidine trea...,"[Early, Late, Mild, Total]",...,,,Eunice Kennedy Shriver National Institute of C...,NIH,June 1990,Endpoint Classification: Safety/Efficacy Study...,Interventional,,September 2015,


In [13]:
# Show the eligibility criteria text of the record with index label 1.
ctgov_data['eligibility.criteria.textblock'][1]

'-  INCLUSION CRITERIA:\n\n        Patients.\n\n        Normal volunteers.\n\n        EXCLUSION CRITERIA:\n\n        Subjects over 95 years of age.\n\n        Subjects with a history of alcohol or drug abuse, psychopathology, or central nervous\n        system pathology, other than that which defines group membership, may be excluded.'

In [14]:
# List the names of all columns in the clinicaltrials.gov dataframe.
ctgov_data.columns.values

array(['acronym', 'arm_group.arm_group_label', 'arm_group.arm_group_type',
       'arm_group.description', 'biospec_descr.textblock',
       'biospec_retention', 'brief_summary.textblock', 'brief_title',
       'clinical_results.baseline.group_list.group.description',
       'clinical_results.baseline.group_list.group.title',
       'clinical_results.baseline.measure_list.measure.category_list.category.measurement_list.measurement',
       'clinical_results.baseline.measure_list.measure.category_list.category.sub_title',
       'clinical_results.baseline.measure_list.measure.description',
       'clinical_results.baseline.measure_list.measure.dispersion',
       'clinical_results.baseline.measure_list.measure.param',
       'clinical_results.baseline.measure_list.measure.title',
       'clinical_results.baseline.measure_list.measure.units',
       'clinical_results.baseline.population',
       'clinical_results.certain_agreements.pi_employee',
       'clinical_results.certain_agreement

In [18]:
# Show the NCT identifier column of the clinicaltrials.gov dataframe.
ctgov_data['id_info.nct_id']

0       NCT00001149
1       NCT00001192
2       NCT00001205
3       NCT00001218
4       NCT00001262
5       NCT00001289
6       NCT00001325
7       NCT00001366
8       NCT00001404
9       NCT00001489
10      NCT00001666
11      NCT00001725
12      NCT00001726
13      NCT00001912
14      NCT00001932
15      NCT00003625
16      NCT00004297
17      NCT00004399
18      NCT00004403
19      NCT00004437
20      NCT00004637
21      NCT00004729
22      NCT00004730
23      NCT00004758
24      NCT00004773
25      NCT00004776
26      NCT00004807
27      NCT00004817
28      NCT00005772
29      NCT00005925
           ...     
1304    NCT02540603
1305    NCT02541474
1306    NCT02544750
1307    NCT02544763
1308    NCT02545816
1309    NCT02547389
1310    NCT02550028
1311    NCT02551146
1312    NCT02551731
1313    NCT02552511
1314    NCT02554032
1315    NCT02555410
1316    NCT02555462
1317    NCT02556008
1318    NCT02560597
1319    NCT02561013
1320    NCT02564952
1321    NCT02565108
1322    NCT02565316
