Skip to content

Commit

Permalink
Clean up variant index code
Browse files: browse the repository at this point in the history
  • Loading branch information
Daniel Standage committed Aug 18, 2023
1 parent 534a5ae commit b8098a2
Showing 1 changed file with 18 additions and 15 deletions.
33 changes: 18 additions & 15 deletions dbbuild/lib/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from collections import defaultdict
from dataclasses import dataclass
import json
import pandas as pd
from pathlib import Path
import rsidx
Expand Down Expand Up @@ -83,7 +84,7 @@ def load_merged_rsids(self, updateint=1e6):
merged_file = self.dbsnp_path / "refsnp-merged.csv.gz"
if not merged_file.is_file():
merged_file = self.dbsnp_path / "refsnp-merged.csv"
if merged_file:
if merged_file.is_file():
table = pd.read_csv(merged_file)
self.merged_rsids = dict(zip(table.Source, table.Target))
else:
Expand All @@ -92,20 +93,22 @@ def load_merged_rsids(self, updateint=1e6):
raise FileNotFoundError(merged_file)
self.merged_rsids = dict()
threshold = updateint
for n, line in enumerate(instream):
try:
data = json.loads(line)
except:
warn(f"Could not parse line {n+1}, skipping: {line}")
source = data["refsnp_id"]
targets = data["merged_snapshot_data"]["merged_into"]
for target in targets:
self.merged_rsids[f"rs{source}"] = f"rs{target}"
if n >= threshold:
threshold += updateint
if threshold == updateint * 10:
updateint = threshold
print(f"processed {n} rows")
with open(merged_file, "r") as instream:
for n, line in enumerate(instream):
try:
data = json.loads(line)
except Exception:
warn(f"Could not parse line {n+1}, skipping: {line}")
continue
source = data["refsnp_id"]
targets = data["merged_snapshot_data"]["merged_into"]
for target in targets:
self.merged_rsids[f"rs{source}"] = f"rs{target}"
if n >= threshold:
threshold += updateint
if threshold == updateint * 10:
updateint = threshold
print(f"processed {n} rows")
table = pd.DataFrame(self.merged_rsids.items(), columns=["Source", "Target"])
table.to_csv(self.dbsnp_path / "refsnp-merged.csv", index=False)

Expand Down

0 comments on commit b8098a2

Please sign in to comment.