Skip to content

Commit

Permalink
update clinicaltrialsgov to load last updated data
Browse files Browse the repository at this point in the history
  • Loading branch information
sshugsc committed Mar 28, 2024
1 parent d1c3f6e commit 1a67141
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 175 deletions.
33 changes: 12 additions & 21 deletions bin/load.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ const cosmicResistance = require('../src/cosmic/resistance');
const cosmicFusions = require('../src/cosmic/fusions');

const API_MODULES = {
asco, clinicaltrialsgov, dgidb, docm, fdaApprovals, moa, oncotree
asco, dgidb, docm, fdaApprovals, moa, oncotree
};

const FILE_MODULES = {
Expand All @@ -45,7 +45,6 @@ const FILE_MODULES = {
cancerhotspots,
cgi,
cgl,
clinicaltrialsgov,
diseaseOntology,
drugbank,
ensembl,
Expand All @@ -70,11 +69,12 @@ const ALL_MODULES = {
...FILE_MODULES,
...COSMIC_MODULES,
civic,
clinicaltrialsgov,
};

const parser = createOptionsMenu();

const subparsers = parser.add_subparsers({ help: 'Sub-command help', required: true });
const subparsers = parser.add_subparsers({ help: 'Sub-command help', required: true, dest: 'subparser_name' });
const apiParser = subparsers.add_parser('api');
apiParser.add_argument('module', {
choices: Object.keys(API_MODULES),
Expand Down Expand Up @@ -103,6 +103,11 @@ civicParser.add_argument('--trustedCurators', {
nargs: '+',
});

// Dedicated sub-command for the clinicaltrials.gov loader; it takes no positional `module` argument.
const clinicaltrialsgovParser = subparsers.add_parser('clinicaltrialsgov');
// Optional look-back window in days; the raw command-line value is converted with `Number`.
clinicaltrialsgovParser.add_argument('--days', {
help: 'Load new and existing studies added or modified (last update posted) in the last # of days',
type: Number,
});

const cosmicParser = subparsers.add_parser('cosmic');
cosmicParser.add_argument('module', {
Expand All @@ -118,36 +123,22 @@ cosmicParser.add_argument('classification', {
type: fileExists,
});

const { module: moduleName, input, ...options } = parser.parse_args();
// The positional argument is registered under the name `module`, so it has to be
// renamed while destructuring; a bare `moduleName` key never exists on the parsed
// namespace. `subparser_name` is the `dest` given to add_subparsers.
const { subparser_name, module: moduleName, input, ...options } = parser.parse_args();

// Resolve the loader module: sub-commands that take a positional `module` use it,
// otherwise the sub-command name itself is the key into ALL_MODULES.
const selectedLoader = ALL_MODULES[moduleName || subparser_name];

// An input path selects the module's file-based entry point, otherwise the plain upload.
const loaderFunction = input
    ? selectedLoader.uploadFile
    : selectedLoader.upload;

// Options handed to the loader: every remaining parsed argument, plus the input location(s).
const loaderOptions = { ...options };

if (input) {
    if (moduleName === 'clinicaltrialsgov') {
        // a directory expands to the paths of its entries, a single path is wrapped in a list
        loaderOptions.files = fs.lstatSync(input).isDirectory()
            ? fs.readdirSync(input).map(filename => path.join(input, filename))
            : [input];
    }
    // `filename` is set for every module whenever an input path was given
    loaderOptions.filename = input;
}

runLoader(options, loaderFunction, loaderOptions)
Expand Down
15 changes: 13 additions & 2 deletions src/clinicaltrialsgov/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,19 @@ This module loads clinical trials data into GraphKB from [https://www.clinicaltr
> :warning: Since this loader produces statements, ontology and vocabulary data should be loaded first

Uses REST API to load clinical trials data
Uses the REST API to load clinical trials data. By default, this loader loads all studies related to cancer, which is a very large number of records.

```bash
node bin/load.js api clinicaltrialsgov
node bin/load.js clinicaltrialsgov
```

Use `--maxRecords` to specify the maximum number of studies to load.
```bash
node bin/load.js --maxRecords 100 clinicaltrialsgov
```

Use `--days` to load only the new and existing studies added or modified (last update posted) within the given number of days.
```bash
node bin/load.js clinicaltrialsgov --days 7
```
This example loads the studies added or modified in the last week.
132 changes: 37 additions & 95 deletions src/clinicaltrialsgov/index.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
/**
* Module to import clinical trials data exported from clinicaltrials.gov
*
* 1. Perform a search on their site, for example https://clinicaltrials.gov/ct2/results?cond=Cancer&cntry=CA&Search=Apply&recrs=b&recrs=a&age_v=&gndr=&type=Intr&rslt=
* 2. Click their Download link/Button
* 3. Adjust the settings in the Pop up dialog (Include all studies, all columns, and export as XML)
* 4. Download and save the file
* 5. Upload the file to GraphKB using this module
*
* @module importer/clinicaltrialsgov
*/
const path = require('path');

Check failure on line 5 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-12

'path' is assigned a value but never used

Check failure on line 5 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-14

'path' is assigned a value but never used

Check failure on line 5 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-16

'path' is assigned a value but never used
const Ajv = require('ajv');
const fs = require('fs');

const {
loadXmlToJson,
parseXmlToJson,
checkSpec,
requestWithRetry,
} = require('../util');
Expand All @@ -28,7 +18,6 @@ const { clinicalTrialsGov: SOURCE_DEFN } = require('../sources');
const { api: apiSpec, rss: rssSpec } = require('./specs.json');

const BASE_URL = 'https://clinicaltrials.gov/api/v2/studies';
const RSS_URL = 'https://clinicaltrials.gov/ct2/results/rss.xml';
const CACHE = {};

const ajv = new Ajv();
Expand Down Expand Up @@ -124,11 +113,11 @@ const processPhases = (phaseList) => {


/**
* Process the XML trial record. Attempt to link the drug and/or disease information
* Process the record. Attempt to link the drug and/or disease information
*
* @param {object} opt
* @param {ApiConnection} opt.conn the GraphKB connection object
* @param {object} opt.record the XML record (pre-parsed into JSON)
* @param {object} opt.record the record (pre-parsed into JSON)
* @param {object|string} opt.source the 'source' record for clinicaltrials.gov
*
* @todo: handle updates to existing clinical trial records
Expand Down Expand Up @@ -306,47 +295,23 @@ const fetchAndLoadById = async (conn, nctID, { upsert = false } = {}) => {
return trial;
};

/**
 * Format a date as a zero-padded `YYYY-MM-DD` string, built from the
 * local-time calendar fields of the given Date.
 *
 * @param {Date} date the date to format
 * @returns {string} the formatted date, e.g. `2024-03-05`
 */
const formatDate = (date) => {
    const year = date.getFullYear();
    // getMonth() is zero-based; pad so single-digit months/days keep the ISO width
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const day = String(date.getDate()).padStart(2, '0');
    return `${year}-${month}-${day}`;
};

Check failure on line 300 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-12

Missing semicolon

Check failure on line 300 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-14

Missing semicolon

Check failure on line 300 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-16

Missing semicolon

/**
* Uploads a file exported from clinicaltrials.gov as XML
* @param {object} opt
* @param {ApiConnection} opt.conn the GraphKB connection object
* @param {string} opt.filename the path to the XML export
* Loading all clinical trials related to cancer
*/
const uploadFiles = async ({ conn, files }) => {
const upload = async ({ conn, maxRecords, days }) => {
const source = await conn.addSource(SOURCE_DEFN);

logger.info(`loading ${files.length} records`);
const counts = {
error: 0, success: 0,
};

for (const filepath of files) {
const filename = path.basename(filepath);

if (!filename.endsWith('.xml')) {
logger.warn(`ignoring non-xml file: ${filename}`);
continue;
}
let options = {};

try {
const xml = await loadXmlToJson(filepath);
const record = convertAPIRecord(xml);
await processRecord({
conn, record, source, upsert: true,
});
counts.success++;
} catch (err) {
logger.error(`[${filename}] ${err}`);
counts.error++;
}
if (days) {
const startDate = new Date(Date.now() - days * 24 * 60 * 60 * 1000);
options = {'query.term': `AREA[LastUpdatePostDate]RANGE[${formatDate(startDate)},MAX]`};

Check failure on line 312 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-12

A space is required after '{'

Check failure on line 312 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-12

A space is required before '}'

Check failure on line 312 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-14

A space is required after '{'

Check failure on line 312 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-14

A space is required before '}'

Check failure on line 312 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-16

A space is required after '{'

Check failure on line 312 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-16

A space is required before '}'
logger.info(`loading records updated from ${formatDate(startDate)} to ${formatDate(new Date())}`);
}
logger.info(JSON.stringify(counts));
};

const upload = async ({ conn }) => {
const source = await conn.addSource(SOURCE_DEFN);



let trials = await requestWithRetry({
json: true,
Expand All @@ -355,7 +320,9 @@ const upload = async ({ conn }) => {
aggFilters: 'studyType:int',
countTotal: true,
pageSize: 1000,
sort: 'LastUpdatePostDate',
'query.cond': 'cancer',
...options

Check failure on line 325 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-12

Missing trailing comma

Check failure on line 325 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-14

Missing trailing comma

Check failure on line 325 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-16

Missing trailing comma
},
uri: BASE_URL,
});
Expand All @@ -366,9 +333,22 @@ const upload = async ({ conn }) => {
error: 0, success: 0,
};

let processCount = 1,
total;

Check failure on line 338 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-12

Trailing spaces not allowed

Check failure on line 338 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-14

Trailing spaces not allowed

Check failure on line 338 in src/clinicaltrialsgov/index.js

View workflow job for this annotation

GitHub Actions / node-16

Trailing spaces not allowed
if (maxRecords) {
total = maxRecords;
} else {
total = trials.totalCount;
}

for (const trial of trials.studies) {
if (processCount > total) {
break;
}
try {
const record = convertAPIRecord(trial);
logger.info(`processing (${processCount++}/${total}) record: ${record.sourceId}`);
await processRecord({
conn, record, source, upsert: true,
});
Expand All @@ -382,6 +362,9 @@ const upload = async ({ conn }) => {
let next = trials.nextPageToken;

while (next) {
if (processCount > total) {
break;
}
trials = await requestWithRetry({
json: true,
method: 'GET',
Expand All @@ -390,14 +373,20 @@ const upload = async ({ conn }) => {
countTotal: true,
pageSize: 1000,
pageToken: next,
sort: 'LastUpdatePostDate',
'query.cond': 'cancer',
...options
},
uri: BASE_URL,
});

for (const trial of trials.studies) {
if (processCount > total) {
break;
}
try {
const record = convertAPIRecord(trial);
logger.info(`processing (${processCount++}/${total}) record: ${record.sourceId}`);
await processRecord({
conn, record, source, upsert: true,
});
Expand All @@ -413,57 +402,10 @@ const upload = async ({ conn }) => {
logger.info(JSON.stringify(counts));
};



/**
 * Loads the clinical trials listed in the clinicaltrials.gov RSS feed of
 * cancer-related, interventional trials updated in the last 2 weeks.
 *
 * Each trial ID from the feed is fetched and upserted individually; a failure
 * on one trial is logged and counted without stopping the remaining trials.
 *
 * @param {object} opt
 * @param {ApiConnection} opt.conn the GraphKB connection object
 */
const loadNewTrials = async ({ conn }) => {
    // fetch the feed of recently updated trials
    const resp = await requestWithRetry({
        method: 'GET',
        qs: {
            cond: 'cancer', // cancer related trials
            count: 10000,
            lup_d: 14,
            rcv_d: '',
            recrs: 'abdef',
            sel_rss: 'mod14', // mod14 for last 2 weeks updated
            type: 'Intr', // interventional only
        },
        uri: RSS_URL,
    });
    const xml = await parseXmlToJson(resp);
    // validate the feed shape before reaching into it
    checkSpec(validateRssFeed, xml);
    const recentlyUpdatedTrials = xml.rss.channel[0].item.map(item => item.guid[0]._);

    logger.info(`loading ${recentlyUpdatedTrials.length} recently updated trials`);
    const counts = { error: 0, success: 0 };

    for (const trialId of recentlyUpdatedTrials) {
        try {
            await fetchAndLoadById(conn, trialId, { upsert: true });
            counts.success++;
        } catch (err) {
            counts.error++;
            logger.error(`[${trialId}] ${err}`);
        }
    }
    logger.info(JSON.stringify(counts));
};

module.exports = {
SOURCE_DEFN,
convertAPIRecord,
fetchAndLoadById,
kb: true,
loadNewTrials,
upload,
uploadFiles,
};
57 changes: 0 additions & 57 deletions src/clinicaltrialsgov/specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -148,62 +148,5 @@
"protocolSection"
],
"type": "object"
},
"rss": {
"properties": {
"rss": {
"properties": {
"channel": {
"items": {
"properties": {
"item": {
"items": {
"properties": {
"guid": {
"items": {
"properties": {
"_": {
"pattern": "^NCT\\d+$",
"type": "string"
}
},
"required": [
"_"
],
"type": "object"
},
"maxItems": 1,
"minItems": 1,
"type": "array"
}
},
"required": [
"guid"
],
"type": "object"
},
"type": "array"
}
},
"required": [
"item"
],
"type": "object"
},
"maxItems": 1,
"minItems": 1,
"type": "array"
}
},
"required": [
"channel"
],
"type": "object"
}
},
"required": [
"rss"
],
"type": "object"
}
}

0 comments on commit 1a67141

Please sign in to comment.