Skip to content

Commit

Permalink
Merge pull request #142 from bcgsc/release/v7.0.1
Browse files Browse the repository at this point in the history
Release/v7.0.1
  • Loading branch information
sshugsc committed Mar 15, 2024
2 parents f8d1b6f + a6dd81b commit fe8e1fd
Show file tree
Hide file tree
Showing 7 changed files with 14 additions and 394 deletions.
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@bcgsc-pori/graphkb-loader",
"main": "src/index.js",
"version": "7.0.0",
"version": "7.0.1",
"repository": {
"type": "git",
"url": "https://github.com/bcgsc/pori_graphkb_loader.git"
Expand Down
4 changes: 3 additions & 1 deletion src/ensembl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ This loader loads both a BioMart export TSV file or individual records by ID. It
to batch load Ensembl data but you can do so if you would like it to appear for users who
will use the auto-complete when adding variants through the GraphKB client

Link for archived Ensembl versions: https://useast.ensembl.org/info/website/archives/index.html

First download the batch export from BioMart

```bash
query_string='<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "transcript_biotype" value = "protein_coding"/><Attribute name = "ensembl_gene_id" /><Attribute name = "ensembl_gene_id_version" /><Attribute name = "ensembl_transcript_id" /><Attribute name = "ensembl_transcript_id_version" /><Attribute name = "ensembl_peptide_id" /><Attribute name = "ensembl_peptide_id_version" /><Attribute name = "hgnc_id" /><Attribute name = "refseq_mrna" /><Attribute name = "description" /><Attribute name = "external_gene_name" /><Attribute name = "external_gene_source" /></Dataset></Query>'
query_string='<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "transcript_biotype" value = "protein_coding"/><Attribute name = "ensembl_gene_id" /><Attribute name = "ensembl_gene_id_version" /><Attribute name = "ensembl_transcript_id" /><Attribute name = "ensembl_transcript_id_version" /><Attribute name = "ensembl_peptide_id" /><Attribute name = "ensembl_peptide_id_version" /><Attribute name = "hgnc_id" /><Attribute name = "description" /><Attribute name = "external_gene_name" /><Attribute name = "external_gene_source" /></Dataset></Query>'
wget -O biomart_export.tsv "http://www.ensembl.org/biomart/martservice?query=$query_string"
```

Expand Down
54 changes: 3 additions & 51 deletions src/ensembl/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@

const { loadDelimToJson, requestWithRetry, convertRowFields } = require('../util');
const {
rid, orderPreferredOntologyTerms, generateCacheKey,
rid, generateCacheKey,
} = require('../graphkb');
const { logger } = require('../logging');
const _hgnc = require('../hgnc');
const _entrez = require('../entrez/gene');
const _refseq = require('../entrez/refseq');
const { ensembl: SOURCE_DEFN, refseq: refseqSourceDefn } = require('../sources');
const { ensembl: SOURCE_DEFN } = require('../sources');

const BASE_URL = 'http://rest.ensembl.org';

Expand Down Expand Up @@ -182,7 +181,6 @@ const uploadFile = async (opt) => {
geneIdVersion: 'Gene stable ID version',
hgncId: 'HGNC ID',
proteinIdVersion: 'Protein stable ID version',
refseqId: 'RefSeq mRNA ID',
transcriptIdVersion: 'Transcript stable ID version',
};
const { filename, conn } = opt;
Expand All @@ -198,12 +196,9 @@ const uploadFile = async (opt) => {

const source = await conn.addSource(SOURCE_DEFN);

const refseqSource = await conn.addSource(refseqSourceDefn);


const visited = {}; // cache genes to speed up adding records
const hgncMissingRecords = new Set();
const refseqMissingRecords = new Set();

logger.info('pre-load the entrez cache to avoid unecessary requests');
await _entrez.preLoadCache(conn);
Expand Down Expand Up @@ -263,18 +258,6 @@ const uploadFile = async (opt) => {
}


logger.info('pre-fetching refseq entries');
await _refseq.preLoadCache(conn);
const missingRefSeqIds = new Set();
rows.map(r => r.refseqId || '').forEach((id) => {
if (!_refseq.cacheHas(id) && id) {
missingRefSeqIds.add(id);
}
});

logger.info(`fetching ${missingRefSeqIds.size} missing refseq entries`);
await _refseq.fetchAndLoadByIds(conn, Array.from(missingRefSeqIds));

logger.info(`processing ${rows.length} records`);

for (let index = 0; index < rows.length; index++) {
Expand Down Expand Up @@ -481,35 +464,6 @@ const uploadFile = async (opt) => {



// transcript -> crossreferenceof -> refseq
if (record.refseqId) {
skip--;

try {
const refseq = await conn.getUniqueRecordBy({
filters: {
AND: [
{ source: rid(refseqSource) },
{ sourceId: record.refseqId },
{ sourceIdVersion: null },
],
},
sort: orderPreferredOntologyTerms,
target: 'Feature',
});
await conn.addRecord({
content: {
in: rid(refseq), out: rid(transcript), source: rid(source),
},
existsOk: true,
fetchExisting: false,
target: 'crossreferenceof',
});
} catch (err) {
logger.log('error', `failed cross-linking from ${record.transcriptId} to ${record.refseqId}`);
refseqMissingRecords.add(record.refseqId);
}
}
// gene -> crossreferenceof -> hgnc
if (record.hgncId && newGene) {
skip--;
Expand Down Expand Up @@ -539,9 +493,7 @@ const uploadFile = async (opt) => {
if (hgncMissingRecords.size) {
logger.warn(`Unable to retrieve ${hgncMissingRecords.size} hgnc records for linking`);
}
if (refseqMissingRecords.size) {
logger.warn(`Unable to retrieve ${refseqMissingRecords.size} refseq records for linking`);
}

logger.info(JSON.stringify(counts));
};

Expand Down
10 changes: 5 additions & 5 deletions test/data/ensembl_biomart_export_ENSG00000139618.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Gene stable ID Gene stable ID version Transcript stable ID Transcript stable ID version Protein stable ID Protein stable ID version HGNC ID RefSeq mRNA ID Gene description Gene name Source of gene name
ENSG00000139618 ENSG00000139618.17 ENST00000544455 ENST00000544455.6 ENSP00000439902 ENSP00000439902.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
ENSG00000139618 ENSG00000139618.17 ENST00000530893 ENST00000530893.6 ENSP00000499438 ENSP00000499438.2 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
ENSG00000139618 ENSG00000139618.17 ENST00000380152 ENST00000380152.8 ENSP00000369497 ENSP00000369497.3 HGNC:1101 NM_000059 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
ENSG00000139618 ENSG00000139618.17 ENST00000680887 ENST00000680887.1 ENSP00000505508 ENSP00000505508.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
Gene stable ID Gene stable ID version Transcript stable ID Transcript stable ID version Protein stable ID Protein stable ID version HGNC ID Gene description Gene name Source of gene name
ENSG00000139618 ENSG00000139618.17 ENST00000544455 ENST00000544455.6 ENSP00000439902 ENSP00000439902.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
ENSG00000139618 ENSG00000139618.17 ENST00000530893 ENST00000530893.6 ENSP00000499438 ENSP00000499438.2 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
ENSG00000139618 ENSG00000139618.17 ENST00000380152 ENST00000380152.8 ENSP00000369497 ENSP00000369497.3 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
ENSG00000139618 ENSG00000139618.17 ENST00000680887 ENST00000680887.1 ENSP00000505508 ENSP00000505508.1 HGNC:1101 BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101] BRCA2 HGNC Symbol
Loading

0 comments on commit fe8e1fd

Please sign in to comment.