Skip to content

Commit

Permalink
Further fixes to update-db
Browse files Browse the repository at this point in the history
  • Loading branch information
johnlees committed Aug 9, 2018
1 parent 81c7aec commit 7ce656e
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 8 deletions.
10 changes: 7 additions & 3 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,19 +420,23 @@ def main():
if args.full_db is False:
newRepresentativesNames, newRepresentativesFile = extractReferences(genomeNetwork, args.output, refList)
genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(newRepresentativesNames))
newQueries = set(newRepresentativesNames).intersection(ordered_queryList)
newQueries = [x for x in ordered_queryList if x in frozenset(newRepresentativesNames)] # intersection that maintains order
else:
newQueries = ordered_queryList
nx.write_gpickle(genomeNetwork, args.output + "/" + os.path.basename(args.output) + '_graph.gpickle')

# Update the mash database
tmpRefFile = writeTmpFile(newQueries)
constructDatabase(tmpRefFile, kmers, sketch_sizes, args.output, args.threads, args.mash, True) # overwrite old db
joinDBs(args.output, args.ref_db, kmers)
joinDBs(args.ref_db, args.output, args.output, kmers)
os.remove(tmpRefFile)

# Update distance matrices with all calculated distances
refList, refList_copy, self, ref_distMat = readPickle(args.distances)
if args.distances == None:
distanceFiles = args.ref_db + "/" + os.path.basename(args.ref_db) + ".dists"
else:
distanceFiles = args.distances
refList, refList_copy, self, ref_distMat = readPickle(distanceFiles)
combined_seq, core_distMat, acc_distMat = update_distance_matrices(refList, ref_distMat,
ordered_queryList, distMat, query_distMat)
complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat)
Expand Down
12 changes: 7 additions & 5 deletions PopPUNK/mash.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,16 @@ def getSeqsInDb(mashSketch, mash_exec = 'mash'):

return seqs

def joinDBs(db1, db2, klist, mash_exec = 'mash'):
def joinDBs(db1, db2, output, klist, mash_exec = 'mash'):
"""Join two mash sketch databases with ``mash paste``
Args:
db1 (str)
Prefix for db1. Used for new joined database
Prefix for db1
db2 (str)
Prefix for db2
output (str)
Prefix for joined output
klist (list)
List of k-mer sizes to sketch
mash_exec (str)
Expand All @@ -191,13 +193,13 @@ def joinDBs(db1, db2, klist, mash_exec = 'mash'):
"""
for kmer in klist:
try:
join_name = db1 + "/" + os.path.basename(db1) + "." + str(kmer) + ".joined"
join_name = output + "/" + os.path.basename(output) + "." + str(kmer) + ".joined"
db1_name = db1 + "/" + os.path.basename(db1) + "." + str(kmer) + ".msh"
db2_name = db2 + "/" + os.path.basename(db2) + "." + str(kmer) + ".msh"

mash_cmd = mash_exec + " paste " + join_name + " " + db1_name + " " + db2_name
subprocess.run(mash_cmd, shell=True, check=True)
os.rename(join_name + ".msh", db1_name)
os.rename(join_name + ".msh", output + "/" + os.path.basename(output) + "." + str(kmer) + ".msh")
except subprocess.CalledProcessError as e:
sys.stderr.write("Could not run command " + mash_cmd + "; returned: " + e.message + "\n")
sys.exit(1)
Expand Down Expand Up @@ -520,7 +522,7 @@ def fitKmerBlock(idxRanges, distMat, raw, klist, jacobian):
idxRanges (int, int)
Tuple of first and last row of slice to calculate
distMat (numpy.array)
sharedmem object to store core and accessory distances in(altered in place)
sharedmem object to store core and accessory distances in (altered in place)
raw (numpy.array)
sharedmem object with proportion of k-mer matches for each query-ref pair
by row, columns are at k-mer lengths in klist
Expand Down

0 comments on commit 7ce656e

Please sign in to comment.