Skip to content

Commit

Permalink
Fix to QC function (#137)
Browse files Browse the repository at this point in the history
* Fix to QC function

* Return non-unique names in database

* Fix identification of duplicate names

* More succinct duplicate finding

Co-authored-by: John Lees <lees.john6@gmail.com>
  • Loading branch information
nickjcroucher and johnlees committed Dec 29, 2020
1 parent 5a7eb12 commit c7719c6
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 12 deletions.
26 changes: 14 additions & 12 deletions PopPUNK/sketchlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,21 +519,22 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
raw_fit = fitKmerCurve(raw, klist, jacobian)
corrected_fit = fitKmerCurve(corrected, klist, jacobian)
plot_fit(klist,
raw,
raw_fit,
corrected,
corrected_fit,
dbPrefix + "/" + dbPrefix + "_fit_example_" + str(plot_idx + 1),
"Example fit " + str(plot_idx + 1) + " - " + example[0] + " vs. " + example[1])
raw,
raw_fit,
corrected,
corrected_fit,
dbPrefix + "/" + dbPrefix + "_fit_example_" + str(plot_idx + 1),
"Example fit " + str(plot_idx + 1) + " - " + example[0] + " vs. " + example[1])
else:
query_db = queryPrefix + "/" + os.path.basename(queryPrefix)

if len(set(rNames).intersection(set(qNames))) > 0:
sys.stderr.write("Sample names in query are contained in reference database\n")
duplicated = set(rNames).intersection(set(qNames))
if len(duplicated) > 0:
sys.stderr.write("Sample names in query are contained in reference database:\n")
sys.stderr.write("\n".join(duplicated))
sys.stderr.write("Unique names are required!\n")
exit(0)
sys.exit(1)

# Calls to library
query_db = queryPrefix + "/" + os.path.basename(queryPrefix)
distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist,
True, False, threads, use_gpu, deviceid)

Expand Down Expand Up @@ -645,6 +646,7 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads):
remove = True
qc_file.write(dataset + '\tBelow lower length threshold\n')
elif seq_length[dataset] > upper_length:
remove = True
qc_file.write(dataset + '\tAbove upper length threshold\n')
if qc_dict['upper_n'] is not None and seq_ambiguous[dataset] > qc_dict['upper_n']:
remove = True
Expand Down Expand Up @@ -691,7 +693,7 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads):
elif qc_dict['qc_filter'] == 'continue':
retained = retained + failed

# calculate random matches if any sequences pass QC filters
# stop if no sequences pass QC
if len(retained) == 0:
sys.stderr.write('No sequences passed QC filters - please adjust your settings\n')
sys.exit(1)
Expand Down
3 changes: 3 additions & 0 deletions PopPUNK/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,10 @@ def readRfile(rFile, oneSeq=False):
sequences.append(sample_files)

if len(set(names)) != len(names):
seen = set()
dupes = set(x for x in names if x in seen or seen.add(x))
sys.stderr.write("Input contains duplicate names! All names must be unique\n")
sys.stderr.write("Non-unique names are " + ",".join(dupes) + "\n")
sys.exit(1)

return (names, sequences)
Expand Down

0 comments on commit c7719c6

Please sign in to comment.