Merge branch 'master' into dbgrowth
Daniel Standage committed Aug 18, 2023
2 parents 0a6112c + 292ef21 commit f404e01
Showing 31 changed files with 110,055 additions and 100,551 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -1,2 +1,4 @@
 microhapdb/_version.py export-subst
 notebooks/*.ipynb linguist-documentation
+dbbuild/sources/auton2015/scripts/infocalc linguist-vendored
+dbbuild/legacy/sources/1kgp/infocalc linguist-vendored
638 changes: 342 additions & 296 deletions dbbuild/build-summary.txt

Large diffs are not rendered by default.

Binary file modified dbbuild/frequency.csv.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions dbbuild/indels.csv
@@ -3,7 +3,7 @@ mh01ZBF-002,0,T,TT
 mh02FHL-003,3,A,AC;ACC
 mh02FHL-006,33,T,TA
 mh02ZBF-001,2,C,CC
-mh03FHL-003.v2,1,AT,A
+mh03FHL-003.v1,1,AT,A
 mh03ZBF-001,0,CAATCAATCAA,CAATCAA
 mh03ZBF-001,1,CT,C
 mh03ZBF-002,0,TGTTGTT,TGTT
@@ -38,9 +38,9 @@ mh11KK-091,0,TG,T
 mh11ZBF-001,1,TAAA,T
 mh13FHL-002,6,AT,A;ATT
 mh13ZBF-001,0,CAT,C
-mh17FHL-005.v2,6,CT,C
-mh17FHL-005.v2,10,C,CCA
-mh17FHL-005.v2,12,GC,G
+mh17FHL-005.v1,6,CT,C
+mh17FHL-005.v1,10,C,CCA
+mh17FHL-005.v1,12,GC,G
 mh17ZBF-001,1,CAG,C
 mh20ZBF-001,0,TTTTT,TT
 mh22KK-064,3,AATAATT,A
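
Each row of indels.csv appears to pair a marker with one indel: marker name, the variant's index within the marker definition, the reference allele, and semicolon-delimited alternate alleles. A minimal sketch of loading the file with pandas, assuming those column semantics; the column names are invented here, not taken from MicroHapDB, and the hidden first rows of the file may contain a real header.

# Hypothetical reader for dbbuild/indels.csv; column names are assumptions
# inferred from the rows shown above.
import pandas as pd

indels = pd.read_csv("dbbuild/indels.csv", names=["Marker", "VariantIndex", "Refr", "Alt"])
indels["Alt"] = indels["Alt"].str.split(";")  # e.g. "AC;ACC" -> ["AC", "ACC"]
print(indels.head())
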
2 changes: 1 addition & 1 deletion dbbuild/lib/interval.py
@@ -29,7 +29,7 @@ def check(self):
             loci = set([m.locus for m in interval.data])
             if len(loci) == 1:
                 continue
-            markers = sorted(interval.data, key=lambda m: (m.sources[0].year, m.sources[0].name))
+            markers = sorted(interval.data, key=lambda m: m.sources[0].sortkey)
             for marker in markers[1:]:
                 if marker.name != markers[0].name and marker.name not in self.mergeables:
                     self.mergeables[marker.name] = markers[0].name
11 changes: 6 additions & 5 deletions dbbuild/lib/locus.py
@@ -41,22 +41,23 @@ def resolve(self):
             yield marker
             return
         self.check_overlap()
-        for marker in sorted(self.markers, key=lambda m: (m.sources[0].year, m.name.lower())):
+        for marker in sorted(self.markers, key=lambda m: (m.source.sortkey, m.sortkey)):
             if marker.posstr() in self.definition_names:
-                message = f"Marker {marker.name} as defined in {marker.sources[0].name} was defined previously and is redundant"
+                assert len(marker.sources) == 1, (marker.name, marker.sources)
+                message = f"Marker {marker.name} as defined in {marker.source.name} was defined previously and is redundant"
                 print(message)
-                self.source_name_map[marker.sources[0].name][marker.name] = self.definition_names[marker.posstr()]
+                self.source_name_map[marker.source.name][marker.name] = self.definition_names[marker.posstr()]
                 continue
             else:
                 new_name = marker.name
                 if len(self.markers_by_definition) > 1:
                     new_name = f"{marker.name}.v{len(self.definition_names) + 1}"
                 self.definition_names[marker.posstr()] = new_name
-                self.source_name_map[marker.sources[0].name][marker.name] = new_name
+                self.source_name_map[marker.source.name][marker.name] = new_name
                 marker.name = new_name
             for othermarker in self.markers_by_definition[marker.posstr()]:
                 if othermarker != marker:
-                    marker.sources.append(othermarker.sources[0])
+                    marker.sources.append(othermarker.source)
             yield marker
 
     def check_overlap(self):
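
The renaming branch above is what produces suffixed names like mh17FHL-005.v1: the first distinct definition registered for a locus gets .v1, the next .v2, and so on, so reordering the sources (via the new sortkey) changes which definition claims each suffix — consistent with the .v2 to .v1 renames in indels.csv above. A stripped-down sketch of that logic; the positions below are invented for illustration, and the real method also tracks sources and the per-source name map.

# Minimal sketch of the ".v{n}" naming in Locus.resolve().
definition_names = {}  # position string -> versioned name

def register(name, posstr):
    if posstr in definition_names:
        return definition_names[posstr]  # redundant re-definition, reuse name
    new_name = f"{name}.v{len(definition_names) + 1}"
    definition_names[posstr] = new_name
    return new_name

print(register("mh17FHL-005", "1000;1010;1020"))  # mh17FHL-005.v1
print(register("mh17FHL-005", "1000;1015;1020"))  # mh17FHL-005.v2
print(register("mh17FHL-005", "1000;1010;1020"))  # mh17FHL-005.v1 (reused)
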
20 changes: 19 additions & 1 deletion dbbuild/lib/marker.py
@@ -11,6 +11,7 @@
 # -------------------------------------------------------------------------------------------------
 
 from .variant import VariantList
+from itertools import chain
 import pandas as pd
 
 
@@ -154,9 +155,13 @@ def span(self):
     def sourcename(self):
         if len(self.sources) == 0:
             return None
-        names = [s.name for s in sorted(self.sources, key=lambda x: (x.year, x.name))]
+        names = [s.name for s in sorted(self.sources, key=lambda s: s.sortkey)]
         return ";".join(names)
 
+    @property
+    def source(self):
+        return self.sources[0]
+
     def posstr(self, refr="GRCh38"):
         return ";".join(map(str, self.positions[refr]))
 
@@ -174,6 +179,19 @@ def overlaps(self, other):
         same_chrom = self.chrom_num == other.chrom_num
         return same_chrom and self.start <= other.end and self.end >= other.start
 
+    def rsid_union(self, *others):
+        rsids = set(self.rsids)
+        for other in others:
+            rsids |= set(other.rsids)
+        for rsid in rsids:
+            for marker in chain([self], others):
+                if rsid not in marker.rsids:
+                    marker.rsids.append(rsid)
+
+    @property
+    def sortkey(self):
+        return self.chrom_num, self.span, self.name
+
 
 class MarkerFromPositions(Marker):
     def __init__(self, name, positions, rsids, index, xrefs=None, source=None):
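
The new rsid_union method synchronizes rsID lists across markers describing the same locus: it collects the union of all rsIDs, then back-fills whatever each marker is missing. A self-contained usage sketch with invented marker data; the Marker class here is a stand-in that models only the rsids attribute.

from itertools import chain

class Marker:
    # Simplified stand-in for dbbuild/lib/marker.py, modeling only rsids.
    def __init__(self, name, rsids):
        self.name = name
        self.rsids = list(rsids)

    def rsid_union(self, *others):
        # Union of rsIDs across self and all other markers...
        rsids = set(self.rsids)
        for other in others:
            rsids |= set(other.rsids)
        # ...then append any rsID a marker is missing.
        for rsid in rsids:
            for marker in chain([self], others):
                if rsid not in marker.rsids:
                    marker.rsids.append(rsid)

m1 = Marker("mh03FHL-003.v1", ["rs111", "rs222"])
m2 = Marker("mh03FHL-003.v2", ["rs222", "rs333"])
m1.rsid_union(m2)
print(sorted(m1.rsids) == sorted(m2.rsids))  # True: both now carry all three
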
23 changes: 14 additions & 9 deletions dbbuild/lib/source.py
@@ -14,16 +14,11 @@
 from .locus import Locus
 from .marker import Marker
 from .variant import VariantIndex
-from collections import Counter, defaultdict
+from collections import defaultdict
 from io import StringIO
 import json
 import pandas as pd
 from pathlib import Path
-from pyfaidx import Fasta as FastaIdx
-import rsidx
-import sqlite3
-import subprocess
-from tempfile import TemporaryDirectory
 
 
 class DataSource:
@@ -69,6 +64,16 @@ def name(self):
     def year(self):
         return self.metadata["year"]
 
+    @property
+    def order(self):
+        if "order" in self.metadata:
+            return self.metadata["order"]
+        return 0
+
+    @property
+    def sortkey(self):
+        return self.year, self.order, self.name.lower()
+
     @property
     def description(self):
         return self.metadata["description"]
@@ -175,7 +180,7 @@ def update_marker_names(self):
             self._markers.append(marker)
             for sourcename, namedict in locus.source_name_map.items():
                 source_name_map[sourcename].update(namedict)
-        for source in sorted(self.sources, key=lambda s: (s.year, s.name)):
+        for source in sorted(self.sources, key=lambda s: s.sortkey):
             source.rename_markers(source_name_map[source.name])
 
     def interval_check(self):
@@ -186,7 +191,7 @@
     @property
     def markers(self):
         table = list()
-        for marker in sorted(self._markers, key=lambda m: (m.chrom_num, m.span, m.name)):
+        for marker in sorted(self._markers, key=lambda m: m.sortkey):
             table.append(marker.fields)
         return pd.DataFrame(table, columns=Marker.field_names)
 
@@ -218,6 +223,6 @@ def merges(self):
 
     def __str__(self):
         output = StringIO()
-        for source in sorted(self.sources, key=lambda s: (s.year, s.name.lower())):
+        for source in sorted(self.sources, key=lambda s: s.sortkey):
             print(source, file=output)
         return output.getvalue()
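
The new sortkey properties replace the ad hoc (year, name) tuples used throughout the build code, adding an explicit order field from each source's metadata as a tie-breaker for sources published in the same year. A minimal sketch of the resulting ordering; the metadata dicts below are invented, and the real class exposes year, order, and name as separate properties.

# Sketch of DataSource.sortkey ordering with invented metadata.
class DataSource:
    def __init__(self, metadata):
        self.metadata = metadata

    @property
    def sortkey(self):
        # Year first, then an optional explicit tie-breaker, then name.
        order = self.metadata.get("order", 0)
        return self.metadata["year"], order, self.metadata["name"].lower()

sources = [
    DataSource({"name": "Zhu2019", "year": 2019}),
    DataSource({"name": "Chen2019", "year": 2019, "order": 1}),
    DataSource({"name": "Kidd2018", "year": 2018}),
]
for src in sorted(sources, key=lambda s: s.sortkey):
    print(src.metadata["name"])  # Kidd2018, Zhu2019, Chen2019
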
33 changes: 18 additions & 15 deletions dbbuild/lib/variant.py
@@ -12,6 +12,7 @@
 
 from collections import defaultdict
 from dataclasses import dataclass
+import json
 import pandas as pd
 from pathlib import Path
 import rsidx
@@ -83,7 +84,7 @@ def load_merged_rsids(self, updateint=1e6):
         merged_file = self.dbsnp_path / "refsnp-merged.csv.gz"
         if not merged_file.is_file():
             merged_file = self.dbsnp_path / "refsnp-merged.csv"
-        if merged_file:
+        if merged_file.is_file():
             table = pd.read_csv(merged_file)
             self.merged_rsids = dict(zip(table.Source, table.Target))
         else:
@@ … @@
                 raise FileNotFoundError(merged_file)
             self.merged_rsids = dict()
             threshold = updateint
-            for n, line in enumerate(instream):
-                try:
-                    data = json.loads(line)
-                except:
-                    warn(f"Could not parse line {n+1}, skipping: {line}")
-                source = data["refsnp_id"]
-                targets = data["merged_snapshot_data"]["merged_into"]
-                for target in targets:
-                    self.merged_rsids[f"rs{source}"] = f"rs{target}"
-                if n >= threshold:
-                    threshold += updateint
-                    if threshold == updateint * 10:
-                        updateint = threshold
-                    print(f"processed {n} rows")
+            with open(merged_file, "r") as instream:
+                for n, line in enumerate(instream):
+                    try:
+                        data = json.loads(line)
+                    except Exception:
+                        warn(f"Could not parse line {n+1}, skipping: {line}")
+                        continue
+                    source = data["refsnp_id"]
+                    targets = data["merged_snapshot_data"]["merged_into"]
+                    for target in targets:
+                        self.merged_rsids[f"rs{source}"] = f"rs{target}"
+                    if n >= threshold:
+                        threshold += updateint
+                        if threshold == updateint * 10:
+                            updateint = threshold
+                        print(f"processed {n} rows")
             table = pd.DataFrame(self.merged_rsids.items(), columns=["Source", "Target"])
             table.to_csv(self.dbsnp_path / "refsnp-merged.csv", index=False)

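
The reworked load_merged_rsids wraps the line-by-line parse in a with block, skips unparseable lines with continue instead of silently reusing stale data, and caches the resulting source-to-target map as refsnp-merged.csv so later builds skip the slow JSON-lines parse. A condensed sketch of that parse-once-then-cache pattern; the dbsnp directory and the refsnp-merged.json file name are placeholders, since the real input file is not shown in this hunk.

# Sketch of the caching pattern in load_merged_rsids(); paths are placeholders.
import json
from pathlib import Path
from warnings import warn

import pandas as pd

dbsnp_path = Path("dbsnp")  # placeholder directory
cache = dbsnp_path / "refsnp-merged.csv"
if cache.is_file():
    table = pd.read_csv(cache)
    merged_rsids = dict(zip(table.Source, table.Target))
else:
    merged_rsids = {}
    with open(dbsnp_path / "refsnp-merged.json", "r") as instream:
        for n, line in enumerate(instream):
            try:
                data = json.loads(line)
            except Exception:
                warn(f"Could not parse line {n+1}, skipping: {line}")
                continue  # skip bad lines rather than reuse stale data
            source = data["refsnp_id"]
            for target in data["merged_snapshot_data"]["merged_into"]:
                merged_rsids[f"rs{source}"] = f"rs{target}"
    # Persist the mapping so subsequent builds load the CSV directly.
    table = pd.DataFrame(merged_rsids.items(), columns=["Source", "Target"])
    table.to_csv(cache, index=False)
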