diff --git a/bin/load.js b/bin/load.js index e1b6d02c..57576a4d 100644 --- a/bin/load.js +++ b/bin/load.js @@ -36,7 +36,7 @@ const cosmicResistance = require('../src/cosmic/resistance'); const cosmicFusions = require('../src/cosmic/fusions'); const API_MODULES = { - asco, clinicaltrialsgov, dgidb, docm, fdaApprovals, moa, oncotree + asco, dgidb, docm, fdaApprovals, moa, oncotree }; const FILE_MODULES = { @@ -45,7 +45,6 @@ const FILE_MODULES = { cancerhotspots, cgi, cgl, - clinicaltrialsgov, diseaseOntology, drugbank, ensembl, @@ -70,11 +69,12 @@ const ALL_MODULES = { ...FILE_MODULES, ...COSMIC_MODULES, civic, + clinicaltrialsgov, }; const parser = createOptionsMenu(); -const subparsers = parser.add_subparsers({ help: 'Sub-command help', required: true }); +const subparsers = parser.add_subparsers({ help: 'Sub-command help', required: true, dest: 'subparser_name' }); const apiParser = subparsers.add_parser('api'); apiParser.add_argument('module', { choices: Object.keys(API_MODULES), @@ -103,6 +103,11 @@ civicParser.add_argument('--trustedCurators', { nargs: '+', }); +const clinicaltrialsgovParser = subparsers.add_parser('clinicaltrialsgov'); +clinicaltrialsgovParser.add_argument('--days', { + help: 'Load new and existing studies added or modified (last update posted) in the last # of days', + type: Number, +}); const cosmicParser = subparsers.add_parser('cosmic'); cosmicParser.add_argument('module', { @@ -118,36 +123,22 @@ cosmicParser.add_argument('classification', { type: fileExists, }); -const { module: moduleName, input, ...options } = parser.parse_args(); +const { subparser_name, moduleName, input, ...options } = parser.parse_args(); let loaderFunction; if (input) { - loaderFunction = ALL_MODULES[moduleName || 'civic'].uploadFile; + loaderFunction = ALL_MODULES[moduleName || subparser_name].uploadFile; } else { debugger; - loaderFunction = ALL_MODULES[moduleName || 'civic'].upload; + loaderFunction = ALL_MODULES[moduleName || subparser_name].upload; } const loaderOptions = { ...options }; if (input) { debugger; - if (moduleName === 'clinicaltrialsgov') { - if (fs.lstatSync(input).isDirectory()) { - const files = fs.readdirSync(input) - .map(filename => path.join(input, filename)); - loaderOptions.files = files; - } else { - loaderOptions.files = [input]; - } - } else { - loaderOptions.filename = input; - - if (options.module === 'cosmic') { - loaderOptions.classification = options.classification; - } - } + loaderOptions.filename = input; } runLoader(options, loaderFunction, loaderOptions) diff --git a/src/clinicaltrialsgov/README.md b/src/clinicaltrialsgov/README.md index 3ecddbb1..da708c71 100644 --- a/src/clinicaltrialsgov/README.md +++ b/src/clinicaltrialsgov/README.md @@ -5,8 +5,19 @@ This module loads clinical trials data into GraphKB from [https://www.clinicaltr > :warning: Since this loader produces statements, ontology and vocabulary data should be loaded first -Uses REST API to load clinical trials data +Uses REST API to load clinical trials data. By default this loader loads all studies that related to cancer, which will be a huge number of records. ```bash -node bin/load.js api clinicaltrialsgov +node bin/load.js clinicaltrialsgov ``` + +Using `--maxRecords` can specify the maximum number of loaded studies. +```bash +node bin/load.js --maxRecords 100 clinicaltrialsgov +``` + +Using `--days` can load the new and existing studies added or modified (last update posted) in the last # of days. +```bash +node bin/load.js clinicaltrialsgov --days 7 +``` +Loading the studies added or modified in the last week. \ No newline at end of file diff --git a/src/clinicaltrialsgov/index.js b/src/clinicaltrialsgov/index.js index 795edc78..cebb4265 100644 --- a/src/clinicaltrialsgov/index.js +++ b/src/clinicaltrialsgov/index.js @@ -1,21 +1,11 @@ /** * Module to import clinical trials data exported from clinicaltrials.gov - * - * 1. Perform a search on their site, for example https://clinicaltrials.gov/ct2/results?cond=Cancer&cntry=CA&Search=Apply&recrs=b&recrs=a&age_v=&gndr=&type=Intr&rslt= - * 2. Click their Download link/Button - * 3. Adjust the settings in the Pop up dialog (Include all studies, all columns, and export as XML) - * 4. Download and save the file - * 5. Upload the file to GraphKB using this module - * * @module importer/clinicaltrialsgov */ const path = require('path'); const Ajv = require('ajv'); -const fs = require('fs'); const { - loadXmlToJson, - parseXmlToJson, checkSpec, requestWithRetry, } = require('../util'); @@ -28,7 +18,6 @@ const { clinicalTrialsGov: SOURCE_DEFN } = require('../sources'); const { api: apiSpec, rss: rssSpec } = require('./specs.json'); const BASE_URL = 'https://clinicaltrials.gov/api/v2/studies'; -const RSS_URL = 'https://clinicaltrials.gov/ct2/results/rss.xml'; const CACHE = {}; const ajv = new Ajv(); @@ -124,11 +113,11 @@ const processPhases = (phaseList) => { /** - * Process the XML trial record. Attempt to link the drug and/or disease information + * Process the record. Attempt to link the drug and/or disease information * * @param {object} opt * @param {ApiConnection} opt.conn the GraphKB connection object - * @param {object} opt.record the XML record (pre-parsed into JSON) + * @param {object} opt.record the record (pre-parsed into JSON) * @param {object|string} opt.source the 'source' record for clinicaltrials.gov * * @todo: handle updates to existing clinical trial records @@ -306,47 +295,23 @@ const fetchAndLoadById = async (conn, nctID, { upsert = false } = {}) => { return trial; }; +const formatDate = (date) => { + return `${date.getFullYear()}-${date.getMonth()+1}-${date.getDate()}`; +} + /** - * Uploads a file exported from clinicaltrials.gov as XML - * @param {object} opt - * @param {ApiConnection} opt.conn the GraphKB connection object - * @param {string} opt.filename the path to the XML export + * Loading all clinical trials related to cancer */ -const uploadFiles = async ({ conn, files }) => { +const upload = async ({ conn, maxRecords, days }) => { const source = await conn.addSource(SOURCE_DEFN); - logger.info(`loading ${files.length} records`); - const counts = { - error: 0, success: 0, - }; - - for (const filepath of files) { - const filename = path.basename(filepath); - - if (!filename.endsWith('.xml')) { - logger.warn(`ignoring non-xml file: ${filename}`); - continue; - } + let options = {}; - try { - const xml = await loadXmlToJson(filepath); - const record = convertAPIRecord(xml); - await processRecord({ - conn, record, source, upsert: true, - }); - counts.success++; - } catch (err) { - logger.error(`[${filename}] ${err}`); - counts.error++; - } + if (days) { + const startDate = new Date(Date.now() - days * 24 * 60 * 60 * 1000); + options = {'query.term': `AREA[LastUpdatePostDate]RANGE[${formatDate(startDate)},MAX]`}; + logger.info(`loading records updated from ${formatDate(startDate)} to ${formatDate(new Date())}`); } - logger.info(JSON.stringify(counts)); -}; - -const upload = async ({ conn }) => { - const source = await conn.addSource(SOURCE_DEFN); - - let trials = await requestWithRetry({ json: true, @@ -355,7 +320,9 @@ const upload = async ({ conn }) => { aggFilters: 'studyType:int', countTotal: true, pageSize: 1000, + sort: 'LastUpdatePostDate', 'query.cond': 'cancer', + ...options }, uri: BASE_URL, }); @@ -366,9 +333,22 @@ const upload = async ({ conn }) => { error: 0, success: 0, }; + let processCount = 1, + total; + + if (maxRecords) { + total = maxRecords; + } else { + total = trials.totalCount; + } + for (const trial of trials.studies) { + if (processCount > total) { + break; + } try { const record = convertAPIRecord(trial); + logger.info(`processing (${processCount++}/${total}) record: ${record.sourceId}`); await processRecord({ conn, record, source, upsert: true, }); @@ -382,6 +362,9 @@ const upload = async ({ conn }) => { let next = trials.nextPageToken; while (next) { + if (processCount > total) { + break; + } trials = await requestWithRetry({ json: true, method: 'GET', @@ -390,14 +373,20 @@ const upload = async ({ conn }) => { countTotal: true, pageSize: 1000, pageToken: next, + sort: 'LastUpdatePostDate', 'query.cond': 'cancer', + ...options }, uri: BASE_URL, }); for (const trial of trials.studies) { + if (processCount > total) { + break; + } try { const record = convertAPIRecord(trial); + logger.info(`processing (${processCount++}/${total}) record: ${record.sourceId}`); await processRecord({ conn, record, source, upsert: true, }); @@ -413,57 +402,10 @@ const upload = async ({ conn }) => { logger.info(JSON.stringify(counts)); }; - - -/** - * Parses clinical trial RSS Feed results for clinical trials in Canada and the US - * which were updated in the last 2 weeks - */ -const loadNewTrials = async ({ conn }) => { - // ping them both to get the list of recently updated trials - const recentlyUpdatedTrials = []; - - const resp = await requestWithRetry({ - method: 'GET', - qs: { - cond: 'cancer', // cancer related trials - count: 10000, - lup_d: 14, - rcv_d: '', - recrs: 'abdef', - sel_rss: 'mod14', // mod14 for last 2 weeks updated - type: 'Intr', // interventional only - }, - uri: RSS_URL, - }); - const xml = await parseXmlToJson(resp); - fs.writeFileSync('output.json', JSON.stringify(xml, null, 2)); - checkSpec(validateRssFeed, xml); - recentlyUpdatedTrials.push( - ...xml.rss.channel[0].item.map(item => item.guid[0]._), - ); - - logger.info(`loading ${recentlyUpdatedTrials.length} recently updated trials`); - const counts = { error: 0, success: 0 }; - - for (const trialId of recentlyUpdatedTrials) { - try { - await fetchAndLoadById(conn, trialId, { upsert: true }); - counts.success++; - } catch (err) { - counts.error++; - logger.error(`[${trialId}] ${err}`); - } - } - logger.info(JSON.stringify(counts)); -}; - module.exports = { SOURCE_DEFN, convertAPIRecord, fetchAndLoadById, kb: true, - loadNewTrials, upload, - uploadFiles, }; diff --git a/src/clinicaltrialsgov/specs.json b/src/clinicaltrialsgov/specs.json index d37883d0..521f2f61 100644 --- a/src/clinicaltrialsgov/specs.json +++ b/src/clinicaltrialsgov/specs.json @@ -148,62 +148,5 @@ "protocolSection" ], "type": "object" - }, - "rss": { - "properties": { - "rss": { - "properties": { - "channel": { - "items": { - "properties": { - "item": { - "items": { - "properties": { - "guid": { - "items": { - "properties": { - "_": { - "pattern": "^NCT\\d+$", - "type": "string" - } - }, - "required": [ - "_" - ], - "type": "object" - }, - "maxItems": 1, - "minItems": 1, - "type": "array" - } - }, - "required": [ - "guid" - ], - "type": "object" - }, - "type": "array" - } - }, - "required": [ - "item" - ], - "type": "object" - }, - "maxItems": 1, - "minItems": 1, - "type": "array" - } - }, - "required": [ - "channel" - ], - "type": "object" - } - }, - "required": [ - "rss" - ], - "type": "object" } }