Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Formatting python scripts with black auto-formatter #73

Open
wants to merge 1 commit into
base: accelerate_preprocess_new
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
597 changes: 385 additions & 212 deletions neusomatic/python/call.py

Large diffs are not rendered by default.

210 changes: 137 additions & 73 deletions neusomatic/python/dataloader.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion neusomatic/python/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
NUM_ST_FEATURES = 26
VCF_HEADER = "##fileformat=VCFv4.2"
TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3}
VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP']
VARTYPE_CLASSES = ["DEL", "INS", "NONE", "SNP"]
MAT_DTYPES = ["uint8", "uint16"]
597 changes: 418 additions & 179 deletions neusomatic/python/extend_features.py

Large diffs are not rendered by default.

169 changes: 112 additions & 57 deletions neusomatic/python/extract_postprocess_targets.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env python
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# extract_postprocess_targets.py
# Extract variants that need postprocessing (like larege INDELs)
# from the output predictions of 'call.py'.
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
import argparse
import traceback
import logging
Expand All @@ -19,34 +19,31 @@ def check_rep(ref_seq, left_right, w):
if len(ref_seq) < 2 * w:
return False
if left_right == "left":
return ref_seq[0:w] == ref_seq[w:2 * w]
return ref_seq[0:w] == ref_seq[w : 2 * w]
elif left_right == "right":
return ref_seq[-w:] == ref_seq[-2 * w:-w]
return ref_seq[-w:] == ref_seq[-2 * w : -w]
else:
logger.error("Wrong left/right value: {}".format(left_right))
raise Exception


def extend_region_repeat(chrom, start, end, ref_fasta,
chrom_length, pad):
def extend_region_repeat(chrom, start, end, ref_fasta, chrom_length, pad):
logger = logging.getLogger(extend_region_repeat.__name__)
new_start = start
new_end = end
w = 3
while True:
changed = False
new_start = max(new_start - pad - w, 1)
ref_seq = ref_fasta.fetch(
chrom, new_start, new_end + 1).upper()
ref_seq = ref_fasta.fetch(chrom, new_start, new_end + 1).upper()
while True:
cnt_s = 0
for rep_len in [1, 2, 3, 4]:
if cnt_s > 0:
continue
while check_rep(ref_seq, "left", rep_len) and new_start > rep_len:
new_start -= rep_len
ref_seq = ref_fasta.fetch(
chrom, new_start, new_end + 1).upper()
ref_seq = ref_fasta.fetch(chrom, new_start, new_end + 1).upper()
cnt_s += rep_len
changed = True
if cnt_s > 0:
Expand All @@ -58,17 +55,18 @@ def extend_region_repeat(chrom, start, end, ref_fasta,
while True:
changed = False
new_end = min(new_end + pad + w, chrom_length - 2)
ref_seq = ref_fasta.fetch(
chrom, new_start, new_end + 1).upper()
ref_seq = ref_fasta.fetch(chrom, new_start, new_end + 1).upper()
while True:
cnt_e = 0
for rep_len in [1, 2, 3, 4]:
if cnt_e > 0:
continue
while check_rep(ref_seq, "right", rep_len) and new_end < chrom_length - rep_len - 1:
while (
check_rep(ref_seq, "right", rep_len)
and new_end < chrom_length - rep_len - 1
):
new_end += rep_len
ref_seq = ref_fasta.fetch(
chrom, new_start, new_end + 1).upper()
ref_seq = ref_fasta.fetch(chrom, new_start, new_end + 1).upper()
cnt_e += rep_len
changed = True
if cnt_e > 0:
Expand All @@ -80,7 +78,9 @@ def extend_region_repeat(chrom, start, end, ref_fasta,
return new_start, new_end


def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, extend_repeats, pad):
def extract_postprocess_targets(
reference, input_vcf, min_len, max_dist, extend_repeats, pad
):
logger = logging.getLogger(extract_postprocess_targets.__name__)

logger.info("--------------Extract Postprocessing Targets---------------")
Expand All @@ -107,16 +107,53 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, extend_
if not record_set:
record_set.append(record)
continue
chrom_, pos_, ref_, alt_ = push_left_var(
ref_fasta, chrom, pos, ref, alt)
if len(list(filter(lambda x: (chrom == x[0] and
(min(abs(x[1] + len(x[2]) - (pos + len(ref))),
abs(x[1] - pos),
abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos))) <= max_dist)), record_set))) > 0 or len(
list(filter(lambda x: (chrom_ == x[0] and
(min(abs(x[1] + len(x[2]) - (pos_ + len(ref_))),
abs(x[1] - pos_),
abs(min(x[1] + len(x[2]), pos_ + len(ref_)) - max(x[1], pos_))) <= max_dist)), record_set))) > 0:
chrom_, pos_, ref_, alt_ = push_left_var(ref_fasta, chrom, pos, ref, alt)
if (

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pavanstrand the formatting seems to have gone crazy here. Would be better with a bit of refactoring here.

len(
list(
filter(
lambda x: (
chrom == x[0]
and (
min(
abs(x[1] + len(x[2]) - (pos + len(ref))),
abs(x[1] - pos),
abs(
min(x[1] + len(x[2]), pos + len(ref))
- max(x[1], pos)
),
)
<= max_dist
)
),
record_set,
)
)
)
> 0
or len(
list(
filter(
lambda x: (
chrom_ == x[0]
and (
min(
abs(x[1] + len(x[2]) - (pos_ + len(ref_))),
abs(x[1] - pos_),
abs(
min(x[1] + len(x[2]), pos_ + len(ref_))
- max(x[1], pos_)
),
)
<= max_dist
)
),
record_set,
)
)
)
> 0
):
record_set.append(record)
continue

Expand All @@ -138,16 +175,21 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, extend_
if len(varid_pos[vid]) > 1:
multi_allelic = True

if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)) or multi_allelic:
if (
list(filter(lambda x: len(x[2]) != len(x[3]), record_set))
or multi_allelic
):
for x in record_set:
fields = x[-1].strip().split()
# fields[2] = str(ii)
if ii not in redo_vars:
redo_vars[ii] = []
redo_vars[ii].append(fields)
redo_regions[ii] = [record_set[0][0], max(0,
min(map(lambda x:x[1], record_set)) - pad),
max(map(lambda x:x[1] + len(x[2]), record_set)) + pad]
redo_regions[ii] = [
record_set[0][0],
max(0, min(map(lambda x: x[1], record_set)) - pad),
max(map(lambda x: x[1] + len(x[2]), record_set)) + pad,
]
else:
for x in record_set:
o_f.write(x[-1])
Expand All @@ -160,30 +202,32 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, extend_
redo_vars[ii] = []
redo_vars[ii].append(fields)
chrom_, pos_, ref_, alt_ = record_set[0][0:4]
redo_regions[ii] = [chrom_, max(
0, pos_ - pad), pos_ + len(ref_) + pad]
redo_regions[ii] = [
chrom_,
max(0, pos_ - pad),
pos_ + len(ref_) + pad,
]
else:
o_f.write(record_set[0][-1])


if extend_repeats:
chrom_lengths = dict(
zip(ref_fasta.references, ref_fasta.lengths))
chrom_lengths = dict(zip(ref_fasta.references, ref_fasta.lengths))
tmp_ = get_tmp_file()
with open(tmp_, "w") as o_f:
for ii in redo_regions:
chrom, st, en = redo_regions[ii]
st, en = extend_region_repeat(
chrom, st, en, ref_fasta, chrom_lengths[chrom], 0)
chrom, st, en, ref_fasta, chrom_lengths[chrom], 0
)
o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n")
tmp_=bedtools_sort(tmp_,run_logger=logger)
tmp_=bedtools_merge(tmp_,args="-c 4 -o collapse", run_logger=logger)
tmp_ = bedtools_sort(tmp_, run_logger=logger)
tmp_ = bedtools_merge(tmp_, args="-c 4 -o collapse", run_logger=logger)
else:
tmp_ = get_tmp_file()
with open(tmp_, "w") as o_f:
for ii in redo_regions:
chrom, st, en = redo_regions[ii]
o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n")
o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n")
j = 0
with open(tmp_) as i_f, open(redo_vcf, "w") as r_f, open(redo_bed, "w") as r_b:
r_f.write("{}\n".format(VCF_HEADER))
Expand All @@ -198,35 +242,46 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, extend_
j += 1


if __name__ == '__main__':
if __name__ == "__main__":

FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
FORMAT = "%(levelname)s %(asctime)-15s %(name)-20s %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser(
description='infer genotype by ao and ro counts')
parser.add_argument('--reference', type=str,
help='reference fasta filename', required=True)
parser.add_argument('--input_vcf', type=str,
help='input vcf', required=True)
parser.add_argument('--min_len', type=int,
help='minimum INDEL len to resolve', default=4)
parser.add_argument('--max_dist', type=int,
help='max distance to neighboring variant', default=5)
parser.add_argument('--extend_repeats',
help='extend resolve regions to repeat boundaries',
action='store_true')
parser = argparse.ArgumentParser(description="infer genotype by ao and ro counts")
parser.add_argument(
"--reference", type=str, help="reference fasta filename", required=True
)
parser.add_argument("--input_vcf", type=str, help="input vcf", required=True)
parser.add_argument(
"--min_len", type=int, help="minimum INDEL len to resolve", default=4
)
parser.add_argument(
"--max_dist", type=int, help="max distance to neighboring variant", default=5
)
parser.add_argument(
"--extend_repeats",
help="extend resolve regions to repeat boundaries",
action="store_true",
)
parser.add_argument(
'--pad', type=int, help='padding to bed region for extracting reads', default=10)
"--pad", type=int, help="padding to bed region for extracting reads", default=10
)
args = parser.parse_args()
logger.info(args)
try:
extract_postprocess_targets(
args.reference, args.input_vcf, args.min_len, args.max_dist, args.extend_repeats, args.pad)
args.reference,
args.input_vcf,
args.min_len,
args.max_dist,
args.extend_repeats,
args.pad,
)
except Exception as e:
logger.error(traceback.format_exc())
logger.error("Aborting!")
logger.error(
"extract_postprocess_targets.py failure on arguments: {}".format(args))
"extract_postprocess_targets.py failure on arguments: {}".format(args)
)
raise e
Loading