bcgsc · sshugsc · Apr 4, 2024 · Mar 28, 2024 · Mar 28, 2024 · Mar 28, 2024
diff --git a/bin/load.js b/bin/load.js
@@ -36,7 +36,7 @@ const cosmicResistance = require('../src/cosmic/resistance');
 const cosmicFusions = require('../src/cosmic/fusions');
 
 const API_MODULES = {
-    asco, clinicaltrialsgov, dgidb, docm, fdaApprovals, moa, oncotree
+    asco, dgidb, docm, fdaApprovals, moa, oncotree,
 };
 
 const FILE_MODULES = {
@@ -45,7 +45,6 @@ const FILE_MODULES = {
     cancerhotspots,
     cgi,
     cgl,
-    clinicaltrialsgov,
     diseaseOntology,
     drugbank,
     ensembl,
@@ -54,8 +53,8 @@ const FILE_MODULES = {
     ncit,
     ncitFdaXref,
     ontology,
-    sources,
     refseq,
+    sources,
     uberon,
     variants,
 };
@@ -70,11 +69,12 @@ const ALL_MODULES = {
     ...FILE_MODULES,
     ...COSMIC_MODULES,
     civic,
+    clinicaltrialsgov,
 };
 
 const parser = createOptionsMenu();
 
-const subparsers = parser.add_subparsers({ help: 'Sub-command help', required: true });
+const subparsers = parser.add_subparsers({ dest: 'subparser_name', help: 'Sub-command help', required: true });
 const apiParser = subparsers.add_parser('api');
 apiParser.add_argument('module', {
     choices: Object.keys(API_MODULES),
@@ -103,6 +103,11 @@ civicParser.add_argument('--trustedCurators', {
     nargs: '+',
 });
 
+const clinicaltrialsgovParser = subparsers.add_parser('clinicaltrialsgov'); // dedicated sub-command for the clinicaltrials.gov loader
+clinicaltrialsgovParser.add_argument('--days', {
+    help: 'Load new and existing studies added or modified (last update posted) in the last # of days',
+    type: Number, // converted with Number(); when omitted the loader receives no day window
+});
 
 const cosmicParser = subparsers.add_parser('cosmic');
 cosmicParser.add_argument('module', {
@@ -118,36 +123,24 @@ cosmicParser.add_argument('classification', {
     type: fileExists,
 });
 
-const { module: moduleName, input, ...options } = parser.parse_args();
+const {
+    subparser_name, module: moduleName, input, ...options
+} = parser.parse_args(); // subparser_name is the `dest` given to add_subparsers, i.e. the sub-command chosen on the command line
 
 let loaderFunction;
 
 if (input) {
-    loaderFunction = ALL_MODULES[moduleName || 'civic'].uploadFile;
+    loaderFunction = ALL_MODULES[moduleName || subparser_name].uploadFile;
 } else {
     debugger;
-    loaderFunction = ALL_MODULES[moduleName || 'civic'].upload;
+    loaderFunction = ALL_MODULES[moduleName || subparser_name].upload;
 }
 
 const loaderOptions = { ...options };
 
 if (input) {
     debugger;
-    if (moduleName === 'clinicaltrialsgov') {
-        if (fs.lstatSync(input).isDirectory()) {
-            const files = fs.readdirSync(input)
-                .map(filename => path.join(input, filename));
-            loaderOptions.files = files;
-        } else {
-            loaderOptions.files = [input];
-        }
-    } else {
-        loaderOptions.filename = input;
-
-        if (options.module === 'cosmic') {
-            loaderOptions.classification = options.classification;
-        }
-    }
+    loaderOptions.filename = input;
 }
 
 runLoader(options, loaderFunction, loaderOptions)

diff --git a/src/clinicaltrialsgov/README.md b/src/clinicaltrialsgov/README.md
@@ -5,8 +5,14 @@ This module loads clinical trials data into GraphKB from [https://www.clinicaltr
 > :warning: Since this loader produces statements, ontology and vocabulary data should be loaded first
 
 
-Uses REST API to load clinical trials data
+Uses REST API to load clinical trials data.
+```bash
+node bin/load.js clinicaltrialsgov
+```
 
+By default this loader loads all studies that are related to cancer, which is a very large number of records.
+Use `--days` to load only the new and existing studies added or modified (last update posted) within the last given number of days.
 ```bash
-node bin/load.js api clinicaltrialsgov
+node bin/load.js clinicaltrialsgov --days 7
 ```
+The command above loads the studies added or modified in the last week.
diff --git a/src/clinicaltrialsgov/index.js b/src/clinicaltrialsgov/index.js
@@ -1,21 +1,10 @@
 /**
  * Module to import clinical trials data exported from clinicaltrials.gov
- *
- * 1. Perform a search on their site, for example https://clinicaltrials.gov/ct2/results?cond=Cancer&cntry=CA&Search=Apply&recrs=b&recrs=a&age_v=&gndr=&type=Intr&rslt=
- * 2. Click their Download link/Button
- * 3. Adjust the settings in the Pop up dialog (Include all studies, all columns, and export as XML)
- * 4. Download and save the file
- * 5. Upload the file to GraphKB using this module
- *
  * @module importer/clinicaltrialsgov
  */
-const path = require('path');
 const Ajv = require('ajv');
-const fs = require('fs');
 
 const {
-    loadXmlToJson,
-    parseXmlToJson,
     checkSpec,
     requestWithRetry,
 } = require('../util');
@@ -25,15 +14,13 @@ const {
 } = require('../graphkb');
 const { logger } = require('../logging');
 const { clinicalTrialsGov: SOURCE_DEFN } = require('../sources');
-const { api: apiSpec, rss: rssSpec } = require('./specs.json');
+const { studies: studiesSpecs } = require('./specs.json');
 
 const BASE_URL = 'https://clinicaltrials.gov/api/v2/studies';
-const RSS_URL = 'https://clinicaltrials.gov/ct2/results/rss.xml';
 const CACHE = {};
 
 const ajv = new Ajv();
-const validateAPITrialRecord = ajv.compile(apiSpec);
-const validateRssFeed = ajv.compile(rssSpec);
+const validateAPITrialRecord = ajv.compile(studiesSpecs);
 
 
 /**
@@ -124,11 +111,11 @@ const processPhases = (phaseList) => {
 
 
 /**
- * Process the XML trial record. Attempt to link the drug and/or disease information
+ * Process the record. Attempt to link the drug and/or disease information
  *
  * @param {object} opt
  * @param {ApiConnection} opt.conn the GraphKB connection object
- * @param {object} opt.record the XML record (pre-parsed into JSON)
+ * @param {object} opt.record the record (pre-parsed into JSON)
  * @param {object|string} opt.source the 'source' record for clinicaltrials.gov
  *
  * @todo: handle updates to existing clinical trial records
@@ -306,98 +293,66 @@ const fetchAndLoadById = async (conn, nctID, { upsert = false } = {}) => {
     return trial;
 };
 
+// Format a Date as zero-padded yyyy-MM-dd (local time) for the LastUpdatePostDate range filter and log messages.
+const formatDate = (date) => [date.getFullYear(), date.getMonth() + 1, date.getDate()].map((part) => String(part).padStart(2, '0')).join('-');
 /**
- * Uploads a file exported from clinicaltrials.gov as XML
- * @param {object} opt
- * @param {ApiConnection} opt.conn the GraphKB connection object
- * @param {string} opt.filename the path to the XML export
+ * Load clinical trials related to cancer; `days` restricts to recently updated studies and `maxRecords` caps how many are processed
  */
-const uploadFiles = async ({ conn, files }) => {
+const upload = async ({ conn, maxRecords, days }) => {
     const source = await conn.addSource(SOURCE_DEFN);
 
-    logger.info(`loading ${files.length} records`);
-    const counts = {
-        error: 0, success: 0,
-    };
-
-    for (const filepath of files) {
-        const filename = path.basename(filepath);
-
-        if (!filename.endsWith('.xml')) {
-            logger.warn(`ignoring non-xml file: ${filename}`);
-            continue;
-        }
+    let options = {};
 
-        try {
-            const xml = await loadXmlToJson(filepath);
-            const record = convertAPIRecord(xml);
-            await processRecord({
-                conn, record, source, upsert: true,
-            });
-            counts.success++;
-        } catch (err) {
-            logger.error(`[${filename}] ${err}`);
-            counts.error++;
-        }
+    if (days) {
+        const startDate = new Date(Date.now() - days * 24 * 60 * 60 * 1000);
+        options = { 'query.term': `AREA[LastUpdatePostDate]RANGE[${formatDate(startDate)},MAX]` };
+        logger.info(`loading records updated from ${formatDate(startDate)} to ${formatDate(new Date())}`);
     }
-    logger.info(JSON.stringify(counts));
-};
-
-const upload = async ({ conn }) => {
-    const source = await conn.addSource(SOURCE_DEFN);
-
-
 
-    let trials = await requestWithRetry({
-        json: true,
-        method: 'GET',
-        qs: {
-            aggFilters: 'studyType:int',
-            countTotal: true,
-            pageSize: 1000,
-            'query.cond': 'cancer',
-        },
-        uri: BASE_URL,
-    });
-
-
-    logger.info(`loading ${trials.totalCount} records`);
     const counts = {
         error: 0, success: 0,
     };
 
-    for (const trial of trials.studies) {
-        try {
-            const record = convertAPIRecord(trial);
-            await processRecord({
-                conn, record, source, upsert: true,
-            });
-            counts.success++;
-        } catch (err) {
-            counts.error++;
-            logger.error(`[${trial}] ${err}`);
-        }
-    }
-
-    let next = trials.nextPageToken;
+    let processCount = 1,
+        next = true,
+        nextToken,
+        total = maxRecords;
 
     while (next) {
-        trials = await requestWithRetry({
+        if (nextToken) {
+            options = { ...options, pageToken: nextToken }; // spread first so the new token overrides the previous page's token
+        }
+        const trials = await requestWithRetry({
             json: true,
             method: 'GET',
             qs: {
                 aggFilters: 'studyType:int',
                 countTotal: true,
                 pageSize: 1000,
-                pageToken: next,
                 'query.cond': 'cancer',
+                sort: 'LastUpdatePostDate',
+                ...options,
             },
             uri: BASE_URL,
         });
 
+        if (!total) {
+            total = trials.totalCount;
+        }
+
+        if (processCount > total) {
+            break;
+        }
+
         for (const trial of trials.studies) {
+            if (processCount > total) {
+                break;
+            }
+
             try {
                 const record = convertAPIRecord(trial);
+                logger.info(`processing (${processCount}/${total}) record: ${record.sourceId}`);
+                processCount++;
                 await processRecord({
                     conn, record, source, upsert: true,
                 });
@@ -408,52 +363,8 @@ const upload = async ({ conn }) => {
             }
         }
 
-        next = trials.nextPageToken;
-    }
-    logger.info(JSON.stringify(counts));
-};
-
-
-
-/**
- * Parses clinical trial RSS Feed results for clinical trials in Canada and the US
- * which were updated in the last 2 weeks
- */
-const loadNewTrials = async ({ conn }) => {
-    // ping them both to get the list of recently updated trials
-    const recentlyUpdatedTrials = [];
-
-    const resp = await requestWithRetry({
-        method: 'GET',
-        qs: {
-            cond: 'cancer', // cancer related trials
-            count: 10000,
-            lup_d: 14,
-            rcv_d: '',
-            recrs: 'abdef',
-            sel_rss: 'mod14', // mod14 for last 2 weeks updated
-            type: 'Intr', // interventional only
-        },
-        uri: RSS_URL,
-    });
-    const xml = await parseXmlToJson(resp);
-    fs.writeFileSync('output.json', JSON.stringify(xml, null, 2));
-    checkSpec(validateRssFeed, xml);
-    recentlyUpdatedTrials.push(
-        ...xml.rss.channel[0].item.map(item => item.guid[0]._),
-    );
-
-    logger.info(`loading ${recentlyUpdatedTrials.length} recently updated trials`);
-    const counts = { error: 0, success: 0 };
-
-    for (const trialId of recentlyUpdatedTrials) {
-        try {
-            await fetchAndLoadById(conn, trialId, { upsert: true });
-            counts.success++;
-        } catch (err) {
-            counts.error++;
-            logger.error(`[${trialId}] ${err}`);
-        }
+        nextToken = trials.nextPageToken;
+        next = nextToken !== undefined;
     }
     logger.info(JSON.stringify(counts));
 };
@@ -463,7 +374,5 @@ module.exports = {
     convertAPIRecord,
     fetchAndLoadById,
     kb: true,
-    loadNewTrials,
     upload,
-    uploadFiles,
 };