Skip to content

Commit

Permalink
clean up dbbuild
Browse files (browse the repository at this point in the history)
  • Loading branch information
Daniel Standage committed Jul 18, 2023
1 parent 5592dbb commit 534a5ae
Show file tree
Hide file tree
Showing 8 changed files with 1,981 additions and 1,967 deletions.
2,168 changes: 1,084 additions & 1,084 deletions dbbuild/build-summary.txt

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions dbbuild/indels.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Marker,VariantIndex,Refr,Alt
mh01ZBF-002,0,T,TT
mh02FHL-003,3,A,AC;ACC
mh02FHL-006,33,T,TA
mh02FHL-003.v3,3,A,AC;ACC
mh02FHL-006.v2,33,T,TA
mh02ZBF-001,2,C,CC
mh03FHL-003.v2,1,AT,A
mh03FHL-003.v1,1,AT,A
mh03ZBF-001,0,CAATCAATCAA,CAATCAA
mh03ZBF-001,1,CT,C
mh03ZBF-002,0,TGTTGTT,TGTT
Expand Down Expand Up @@ -38,9 +38,9 @@ mh11KK-091,0,TG,T
mh11ZBF-001,1,TAAA,T
mh13FHL-002,6,AT,A;ATT
mh13ZBF-001,0,CAT,C
mh17FHL-005.v2,6,CT,C
mh17FHL-005.v2,10,C,CCA
mh17FHL-005.v2,12,GC,G
mh17FHL-005.v1,6,CT,C
mh17FHL-005.v1,10,C,CCA
mh17FHL-005.v1,12,GC,G
mh17ZBF-001,1,CAG,C
mh20ZBF-001,0,TTTTT,TT
mh22KK-064,3,AATAATT,A
Expand Down
2 changes: 1 addition & 1 deletion dbbuild/lib/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def check(self):
loci = set([m.locus for m in interval.data])
if len(loci) == 1:
continue
markers = sorted(interval.data, key=lambda m: (m.sources[0].year, m.sources[0].name))
markers = sorted(interval.data, key=lambda m: m.sortkey)
for marker in markers[1:]:
if marker.name != markers[0].name and marker.name not in self.mergeables:
self.mergeables[marker.name] = markers[0].name
Expand Down
11 changes: 6 additions & 5 deletions dbbuild/lib/locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,23 @@ def resolve(self):
yield marker
return
self.check_overlap()
for marker in sorted(self.markers, key=lambda m: (m.sources[0].year, m.name.lower())):
for marker in sorted(self.markers, key=lambda m: (m.source.sortkey, m.sortkey)):
if marker.posstr() in self.definition_names:
message = f"Marker {marker.name} as defined in {marker.sources[0].name} was defined previously and is redundant"
assert len(marker.sources) == 1, (marker.name, marker.sources)
message = f"Marker {marker.name} as defined in {marker.source.name} was defined previously and is redundant"
print(message)
self.source_name_map[marker.sources[0].name][marker.name] = self.definition_names[marker.posstr()]
self.source_name_map[marker.source.name][marker.name] = self.definition_names[marker.posstr()]
continue
else:
new_name = marker.name
if len(self.markers_by_definition) > 1:
new_name = f"{marker.name}.v{len(self.definition_names) + 1}"
self.definition_names[marker.posstr()] = new_name
self.source_name_map[marker.sources[0].name][marker.name] = new_name
self.source_name_map[marker.source.name][marker.name] = new_name
marker.name = new_name
for othermarker in self.markers_by_definition[marker.posstr()]:
if othermarker != marker:
marker.sources.append(othermarker.sources[0])
marker.sources.append(othermarker.source)
yield marker

def check_overlap(self):
Expand Down
10 changes: 9 additions & 1 deletion dbbuild/lib/marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,13 @@ def span(self):
def sourcename(self):
if len(self.sources) == 0:
return None
names = [s.name for s in sorted(self.sources, key=lambda x: (x.year, x.name))]
names = [s.name for s in sorted(self.sources, key=lambda s: s.sortkey)]
return ";".join(names)

@property
def source(self):
return self.sources[0]

def posstr(self, refr="GRCh38"):
return ";".join(map(str, self.positions[refr]))

Expand All @@ -174,6 +178,10 @@ def overlaps(self, other):
same_chrom = self.chrom_num == other.chrom_num
return same_chrom and self.start <= other.end and self.end >= other.start

@property
def sortkey(self):
return self.chrom_num, self.span, self.name


class MarkerFromPositions(Marker):
def __init__(self, name, positions, rsids, index, xrefs=None, source=None):
Expand Down
23 changes: 14 additions & 9 deletions dbbuild/lib/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,11 @@
from .locus import Locus
from .marker import Marker
from .variant import VariantIndex
from collections import Counter, defaultdict
from collections import defaultdict
from io import StringIO
import json
import pandas as pd
from pathlib import Path
from pyfaidx import Fasta as FastaIdx
import rsidx
import sqlite3
import subprocess
from tempfile import TemporaryDirectory


class DataSource:
Expand Down Expand Up @@ -69,6 +64,16 @@ def name(self):
def year(self):
return self.metadata["year"]

@property
def order(self):
if "order" in self.metadata:
return self.metadata["order"]
return 0

@property
def sortkey(self):
return self.year, self.order, self.name.lower()

@property
def description(self):
return self.metadata["description"]
Expand Down Expand Up @@ -175,7 +180,7 @@ def update_marker_names(self):
self._markers.append(marker)
for sourcename, namedict in locus.source_name_map.items():
source_name_map[sourcename].update(namedict)
for source in sorted(self.sources, key=lambda s: (s.year, s.name)):
for source in sorted(self.sources, key=lambda s: s.sortkey):
source.rename_markers(source_name_map[source.name])

def interval_check(self):
Expand All @@ -186,7 +191,7 @@ def interval_check(self):
@property
def markers(self):
table = list()
for marker in sorted(self._markers, key=lambda m: (m.chrom_num, m.span, m.name)):
for marker in sorted(self._markers, key=lambda m: m.sortkey):
table.append(marker.fields)
return pd.DataFrame(table, columns=Marker.field_names)

Expand Down Expand Up @@ -218,6 +223,6 @@ def merges(self):

def __str__(self):
output = StringIO()
for source in sorted(self.sources, key=lambda s: (s.year, s.name.lower())):
for source in sorted(self.sources, key=lambda s: s.sortkey):
print(source, file=output)
return output.getvalue()
Loading

0 comments on commit 534a5ae

Please sign in to comment.