Skip to content

Commit

Permalink
Create UMLS parser for Entrez genes
Browse files Browse the repository at this point in the history
Convert Entrez gene ids into UMLS CUIs using the mapping provided
by HGNC.
  • Loading branch information
Tong Shu Li authored and sirloon committed Sep 5, 2017
1 parent ad45ee5 commit 15cc71d
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 0 deletions.
Empty file.
61 changes: 61 additions & 0 deletions src/dataload/sources/umls/umls_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Determine UMLS CUI to Entrez Gene id mappings for genes
# 1. Parse UMLS to determine HGNC ids for each CUI
# 2. Use HGNC to convert HGNC ids to Entrez Gene ids

from collections import defaultdict

import pandas as pd

def parse_hgnc(
    file_url=(
        "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/"
        "hgnc_complete_set.txt"
    )
):
    """Determine the HGNC to Entrez Gene id mapping.

    Args:
        file_url: Location of the tab-separated HGNC complete set. Anything
            accepted by ``pandas.read_csv`` works (URL or local path).
            Defaults to the copy hosted on the EBI FTP server.

    Returns:
        A DataFrame with the columns ``hgnc_id``, ``symbol``, ``status`` and
        ``entrez_id`` (as integers). Genes without an Entrez Gene id are
        dropped.
    """
    columns = ["hgnc_id", "symbol", "status", "entrez_id"]

    hgnc = pd.read_csv(file_url, sep="\t", low_memory=False)[columns]

    # A gene without an Entrez id cannot be mapped, so discard it here.
    hgnc = hgnc.dropna(axis=0, how="any", subset=["entrez_id"])

    # Missing values force the column to be read as floats; now that the
    # NaNs are gone the ids can be converted back to integers.
    return hgnc.assign(entrez_id=lambda df: df["entrez_id"].astype(int))

def parse_umls(mrconso_path="MRCONSO.RRF"):
    """Parse the UMLS to determine the HGNC identifier of each gene CUI.

    The relevant files are in the archive <version>-1-meta.nlm (a zip file)
    within <version>/META/MRCONSO.RRF.*.gz
    Concatenate the unzipped versions of the MRCONSO files together to get the
    final MRCONSO.RRF file, which is a | delimited text file without a header.

    Args:
        mrconso_path: Path to the concatenated MRCONSO.RRF file. Defaults to
            ``MRCONSO.RRF`` in the current working directory.

    Returns:
        A DataFrame with the columns ``cui`` and ``hgnc_id`` holding every
        distinct (CUI, HGNC id) pair found. The columns are present even
        when the file contains no HGNC ids, so the result can always be
        merged on ``hgnc_id``.
    """
    pairs = []

    # Open explicitly as UTF-8 so that non-ASCII terms do not make parsing
    # depend on the locale's default encoding.
    with open(mrconso_path, "r", encoding="utf-8") as fin:
        for line in fin:
            # Cheap substring test to skip lines without any HGNC id
            # before paying for the split.
            if "HGNC:" not in line:
                continue

            cui, *fields = line.rstrip("\n").split("|")
            pairs.extend(
                (cui, field) for field in fields if field.startswith("HGNC:")
            )

    # The same id can sit in several fields of one line and on several
    # lines of one CUI, hence the de-duplication.
    return pd.DataFrame(pairs, columns=["cui", "hgnc_id"]).drop_duplicates()

def load_data():
    """Yield one document per (Entrez Gene id, UMLS CUI) pair.

    The HGNC table and the UMLS CUI table are joined on ``hgnc_id``; only
    HGNC ids present in both contribute documents. Each document has the
    Entrez Gene id as ``_id`` and the CUI under ``umls.cui``.
    """
    genes = parse_hgnc()
    cuis = parse_umls()

    joined = genes.merge(cui_map := cuis, how="inner", on="hgnc_id") if False else genes.merge(cuis, how="inner", on="hgnc_id")

    # Walk the two needed columns in lockstep; row order matches the join.
    for entrez_id, cui in zip(joined["entrez_id"], joined["cui"]):
        document = {"_id": entrez_id, "umls": {"cui": cui}}
        yield document

0 comments on commit 15cc71d

Please sign in to comment.