Skip to content

Commit

Permalink
adjustments for mm39 and mm10
Browse files Browse the repository at this point in the history
  • Loading branch information
zcqian committed Nov 2, 2021
1 parent cae10a8 commit abc11d7
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/config_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

TAXONOMY = {
"human": {"tax_id": "9606", "assembly": "hg38"},
"mouse": {"tax_id": "10090", "assembly": "mm10"},
"mouse": {"tax_id": "10090", "assembly": "mm39"},
"rat": {"tax_id": "10116", "assembly": "rn6"},
"fruitfly": {"tax_id": "7227", "assembly": "dm6"},
"nematode": {"tax_id": "6239", "assembly": "ce11"},
Expand Down
3 changes: 3 additions & 0 deletions src/hub/dataload/sources/ucsc/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,13 @@ def get_file_list(self):
fixed_files = ['../hg38/database/refFlat.txt.gz',
'../hg19/database/refFlat.txt.gz',
'../mm9/database/refFlat.txt.gz',
'../mm10/database/refFlat.txt.gz',
'../mm39/database/refFlat.txt.gz',
'../hgFixed/database/refLink.txt.gz']
# hg38 is downloaded twice but having the directory 'Homo_sapiens'
# is essential for the parser/uploader to trigger the upload for hg19 and hg38
# and the tradeoff is only less than 10MB wasted disk space
# same goes for mouse genome assemblies
for file_path in [os.path.join(genome, "database/refFlat.txt.gz") for genome in genome_li] + fixed_files:
lastmodified = self.get_ftpfile_lastmodified(file_path)
if lastmodified:
Expand Down
47 changes: 35 additions & 12 deletions src/hub/dataload/sources/ucsc/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import time
from biothings.utils.common import timesofar
from biothings.utils.dataload import tab2dict, tabfile_feeder, list2dict
import copy


def load_exons_for_species(data_folder, species, exons_key='exons'):
Expand Down Expand Up @@ -41,35 +42,57 @@ def load_exons_for_species(data_folder, species, exons_key='exons'):
return gene2exons


def _merge_exons(base_assembly_exons: dict, other_assembly_exons: dict) -> dict:
"""
Merge exons from two assemblies
Given exons from two assemblies, perform a full outer join on identical gene id.
A copy is returned (not an in-place merge).
For example:
>>> e1 = {1: {'exons_a1': ...}, 2: {'exons_a1': ...} }
>>> e2 = {2: {'exons_a2': ...}, 3: {'exons_a2': ...} }
>>> e3 = _merge_exons(e1, e2)
>>> print(e3)
{1: {'exons_a1': ...}, 2: {'exons_a1': ..., 'exons_a2': ...}, 3: {'exons_a3':...}}
"""
merged_exons = copy.deepcopy(base_assembly_exons)
for gene_id in other_assembly_exons:
ot_exons = copy.deepcopy(other_assembly_exons[gene_id])
if gene_id in merged_exons:
merged_exons[gene_id].update(ot_exons)
else:
merged_exons[gene_id] = ot_exons
return merged_exons


def load_exons_for_human(data_folder):
    '''Load human exons from both the hg38 and hg19 assemblies.

    Human genes carry exons for two assemblies, so they are loaded
    separately from other species. hg38 is the base and hg19 is merged
    into it per gene id:
        exons --> hg38
        exons_hg19 --> hg19

    :param data_folder: folder passed through to load_exons_for_species;
                        the assembly folders are addressed relative to it
    :return: the merged mapping produced by _merge_exons
    '''
    gene2exons_hg19 = load_exons_for_species(data_folder, '../hg19', exons_key='exons_hg19')
    # Each assembly is loaded exactly once; the merge itself is delegated to
    # _merge_exons, which returns a fresh dict and leaves both inputs intact.
    gene2exons_hg38 = load_exons_for_species(data_folder, '../hg38', exons_key='exons')
    gene2exons = _merge_exons(gene2exons_hg38, gene2exons_hg19)
    return gene2exons


def load_exons_for_mouse(data_folder):
    '''Load mouse exons from the mm39, mm10 and mm9 assemblies.

    Mouse genes carry exons for three assemblies, so they are loaded
    separately from other species. mm39 (latest) is the base; entries
    from the older assemblies are merged into it per gene id:
        exons --> mm39
        exons_mm10 --> mm10
        exons_mm9 --> mm9

    :param data_folder: folder passed through to load_exons_for_species;
                        the assembly folders are addressed relative to it
    :return: the merged mapping produced by _merge_exons
    '''
    gene2exons_mm39 = load_exons_for_species(data_folder, '../mm39', exons_key='exons')
    gene2exons_mm9 = load_exons_for_species(data_folder, '../mm9', exons_key='exons_mm9')
    gene2exons_mm10 = load_exons_for_species(data_folder, '../mm10', exons_key='exons_mm10')

    # Merge in two steps: mm10 into the mm39 base first, then mm9.
    # _merge_exons returns a new dict each time and never mutates its inputs.
    gene2exons = _merge_exons(gene2exons_mm39, gene2exons_mm10)
    gene2exons = _merge_exons(gene2exons, gene2exons_mm9)
    return gene2exons


Expand Down

0 comments on commit abc11d7

Please sign in to comment.