diff --git a/app/MainBase.py b/app/MainBase.py index 3312217..16e7250 100644 --- a/app/MainBase.py +++ b/app/MainBase.py @@ -6,6 +6,7 @@ import app.Parser import app.load import app.clean +import app.build_kmer_sets import app.card_annotation import app.wildcard_annotation import app.remove_duplicates @@ -30,11 +31,12 @@ def __init__(self, api=False): baits_annotation create fasta files with annotations from baits (Experimental) remove_duplicates removes duplicate sequences (Experimental) heatmap heatmap for multiple analysis (Experimental) + kmer Build kmer sets (Experimental) database Information on installed card database''' parser = argparse.ArgumentParser(prog="rgi", description='{} - {}'.format(APP_NAME, SOFTWARE_VERSION), epilog=SOFTWARE_SUMMARY, usage=USAGE) parser.add_argument('command', choices=['main', 'tab', 'parser', 'load', - 'clean', 'galaxy', 'database', 'bwt', 'card_annotation', 'wildcard_annotation', 'baits_annotation', 'remove_duplicates', 'heatmap'], + 'clean', 'galaxy', 'database', 'bwt', 'card_annotation', 'wildcard_annotation', 'baits_annotation', 'remove_duplicates', 'heatmap', 'kmer'], help='Subcommand to run') if api == False: @@ -121,6 +123,18 @@ def load_args(self): def load_run(self, args): app.load.main(args) + def kmer(self): + parser = self.kmer_args() + args = parser.parse_args(sys.argv[2:]) + self.kmer_run(args) + + def kmer_args(self): + parser = app.build_kmer_sets.create_parser() + return parser + + def kmer_run(self, args): + app.build_kmer_sets.main(args) + def card_annotation(self): parser = self.card_annotation_args() args = parser.parse_args(sys.argv[2:]) diff --git a/app/build_kmer_sets.py b/app/build_kmer_sets.py new file mode 100644 index 0000000..f1e958d --- /dev/null +++ b/app/build_kmer_sets.py @@ -0,0 +1,317 @@ +import os, sys, json, csv, argparse +import app.make_kmer_json +from Bio import SeqIO, Seq + +""" +This script builds the CARD*k-mer sets. +Please provide location to the CARD*Resistomes&Variants nucleotide FASTAs + and the location to the index file (index-for-model-sequences.txt). +Please ensure that scripts "v3_kmer.py", "v3_kmer_species.py", and "plasmids.py" + are all located in your working directory. +""" + +working_directory = os.path.join(os.getcwd()) + +def combine_variant_sequences(f1, f2, f3, f4): + os.system( + "cat {fasta_one} {fasta_two} {fasta_three} {fasta_four} \ + > nucleotide_prevalence_all.fasta" + .format(fasta_one=f1, fasta_two=f2, + fasta_three=f3, fasta_four=f4) + ) + +def split_variant_sequences(index, fasta): + # Plasmids/Data type + print('Getting CARD*Resistomes&Variants sequences metadata...') + id_plas = set() + id_chr = set() + id_contig = set() + id_path = {} + with open(index, 'r') as index: + i = csv.reader(index, delimiter='\t') + for line in i: + # Genomic + if line[7] == 'ncbi_chromosome': + if line[0] not in id_chr: + id_chr.add(line[0]) + elif line[7] == 'ncbi_plasmid': + if line[0] not in id_plas: + id_plas.add(line[0]) + elif line[7] == 'ncbi_contig': + if line[0] not in id_contig: + id_contig.add(line[0]) + + # Taxonomy + if line[0] not in id_path: + id_path[line[0]] = [line[5]] + elif line[0] in id_path: + if line[5] not in id_path[line[0]]: + id_path[line[0]].append(line[5]) + else: + print('Error') + + # Discard shared kmers in genomic sets + p_and_chr = id_plas.intersection(id_chr) + tplas = id_plas.difference(id_chr) # plasmid sequqences not in chromosomes + tchr = id_chr.difference(id_plas) + tcontig = id_contig.difference(tchr) # contig sequences not in chr + tcontig = tcontig.difference(tplas) + tcontig = tcontig.difference(p_and_chr) + print('Writing genomic outputs...') + with open('both.fasta', 'w') as both: + with open('plasmid.fasta', 'w') as plasmids: + with open('chr.fasta', 'w') as chrs: + with open('contig.fasta', 'w') as contigs: + for entry in SeqIO.parse(fasta, 'fasta'): + prev_id = entry.id.split(":")[1].split("|")[0] + if prev_id in p_and_chr: + SeqIO.write(entry, both, 'fasta') + elif prev_id in tchr: + SeqIO.write(entry, chrs, 'fasta') + elif prev_id in tplas: + SeqIO.write(entry, plasmids, 'fasta') + elif prev_id in tcontig: + SeqIO.write(entry, contigs, 'fasta') + else: + print('Error') + print('# of sequences in chr + plasmid:', len(p_and_chr)) + print('# of sequences in just plasmids', len(tplas)) + print('# of sequences in just chromosomes:', len(tchr)) + print('# of sequences in just contigs:', len(tcontig)) + + # Discard prev sequences in multiple pathogens + print('Writing taxonomic outputs...') + species_count = 0 + genus_count = 0 + multi_count = 0 + with open('species.fasta', 'w') as single_path: + with open('genus.fasta', 'w') as single_genus: + with open('multi.fasta', 'w') as multi_path: + for entry in SeqIO.parse(fasta, 'fasta'): + prev_id = entry.id.split(":")[1].split("|")[0] + if len(id_path[prev_id]) == 1: + # single[prev_id] = entry.seq + SeqIO.write(entry, single_path, 'fasta') + species_count += 1 + else: + genus = [p.split()[0] for p in id_path[prev_id]] + if len(set(genus)) == 1: + SeqIO.write(entry, single_genus, 'fasta') + genus_count += 1 + else: + # multi[prev_id] = entry.seq + SeqIO.write(entry, multi_path, 'fasta') + multi_count += 1 + print('# of sequences in single species:', species_count) + print('# of sequences in single genus:', genus_count) + print('# of sequences in multiple genus:', multi_count) + +def count_kmers(k): + # Species + print("Counting single species kmers...") + # Counts kmers + os.system( + "jellyfish count -m {k} -t 24 -s 2G {fasta}" + .format(k=k, fasta=os.path.join(working_directory, "species.fasta")) + ) + # Writes jellyfish output to text file + species_kmers = "species_{k}.out".format(k=k) + os.system( + "jellyfish dump -c -t -o {name} mer_counts.jf" + .format(name=os.path.join(working_directory, species_kmers)) + ) + os.system( + "rm mer_counts.jf" + ) + + # Genus + print("Counting single genus kmers...") + # Counts kmers + os.system( + "jellyfish count -m {k} -t 24 -s 2G {fasta}" + .format(k=k, fasta=os.path.join(working_directory, "genus.fasta")) + ) + # Writes jellyfish output to text file + genus_kmers = "genus_{k}.out".format(k=k) + os.system( + "jellyfish dump -c -t -o {name} mer_counts.jf" + .format(name=os.path.join(working_directory, genus_kmers)) + ) + os.system( + "rm mer_counts.jf" + ) + + # Multi genus + print("Counting multi genus kmers...") + # Counts kmers + os.system( + "jellyfish count -m {k} -t 24 -s 2G {fasta}" + .format(k=k, fasta=os.path.join(working_directory, "multi.fasta")) + ) + # Writes jellyfish output to text file + multi_kmers = "multi_{k}.out".format(k=k) + os.system( + "jellyfish dump -c -t -o {name} mer_counts.jf" + .format(name=os.path.join(working_directory, multi_kmers)) + ) + os.system( + "rm mer_counts.jf" + ) + + # Chr + Plasmid kmers + print("Counting chr + plasmid kmers...") + # Counts kmers + os.system( + "jellyfish count -m {k} -t 24 -s 2G {fasta}" + .format(k=k, fasta=os.path.join(working_directory, "both.fasta")) + ) + # Writes jellyfish output to text file + both_kmers = "both_{k}.out".format(k=k) + os.system( + "jellyfish dump -c -t -o {name} mer_counts.jf" + .format(name=os.path.join(working_directory, both_kmers)) + ) + os.system( + "rm mer_counts.jf" + ) + + # Plasmid kmers + print("Counting plasmid kmers...") + # Counts kmers + os.system( + "jellyfish count -m {k} -t 24 -s 2G {fasta}" + .format(k=k, fasta=os.path.join(working_directory, "plasmid.fasta")) + ) + # Writes jellyfish output to text file + plasmid_kmers = "plasmid_{k}.out".format(k=k) + os.system( + "jellyfish dump -c -t -o {name} mer_counts.jf" + .format(name=os.path.join(working_directory, plasmid_kmers)) + ) + os.system( + "rm mer_counts.jf" + ) + + # Chr kmers + print("Counting chromosome kmers...") + # Counts kmers + os.system( + "jellyfish count -m {k} -t 24 -s 2G {fasta}" + .format(k=k, fasta=os.path.join(working_directory, "chr.fasta")) + ) + # Writes jellyfish output to text file + chr_kmers = "chr_{k}.out".format(k=k) + os.system( + "jellyfish dump -c -t -o {name} mer_counts.jf" + .format(name=os.path.join(working_directory, chr_kmers)) + ) + os.system( + "rm mer_counts.jf" + ) + + return species_kmers, genus_kmers, multi_kmers, both_kmers, plasmid_kmers, chr_kmers + +def create_kmer_db(sk, gk, mk, pk, ck, bk, variants, index, k): + os.system( + "python3 v3_kmer_species.py -s {species_kmers} -m {multi_kmers} \ + -prev {species_fasta} -k {k}" + .format(species_kmers=os.path.join(working_directory, species_kmers), + multi_kmers=os.path.join(working_directory, multi_kmers), + species_fasta=os.path.join(working_directory, "species.fasta"), + k=k) + ) + +def is_tool(name): + import distutils.spawn + if distutils.spawn.find_executable(name) is not None: + return True + else: + return False + +def main(args): + # check if jellyfish is installed + if is_tool("jellyfish") is False: + print("Missing dependency: jellyfish.\nPlease install from https://github.com/gmarcais/Jellyfish/releases") + exit("Debug") + + # some code + f1 = args.fasta_one + f2 = args.fasta_two + f3 = args.fasta_three + f4 = args.fasta_four + index = args.index + k = args.k + + + if not args.skip: + if all(a is None for a in [f1,f2,f3,f4]): + print("ERROR: CARD*R&V fastas needed.") + exit() + else: + print("-- CONCATENATING ALL CARD*R&V SEQUENCES --") + combine_variant_sequences(f1, f2, f3, f4) + print("Done \n") + + print("-- SPLITTING CARD*R&V SEQUENCES --") + split_variant_sequences(index, os.path.join( + working_directory, "nucleotide_prevalence_all.fasta")) + print("DONE \n") + else: + pass + + print("-- COUNTING KMERS --") + species_kmers, genus_kmers, multi_kmers, both_kmers, plasmid_kmers, chr_kmers = count_kmers(k) + print(species_kmers) + print("DONE \n") + + # """DEBUG""" + # species_kmers = "species_temp.out" + # genus_kmers = "genus_temp.out" + # multi_kmers = "multi_temp.out" + # plasmid_kmers = "plasmid_temp.out" + # chr_kmers = "chr_temp.out" + # both_kmers = "both_temp.out" + + species_file = os.path.join(working_directory, species_kmers) + genus_file = os.path.join(working_directory, genus_kmers) + multi_file = os.path.join(working_directory, multi_kmers) + both_file = os.path.join(working_directory, both_kmers) + plasmid_file = os.path.join(working_directory, plasmid_kmers) + chr_file = os.path.join(working_directory, chr_kmers) + variant_sequences = os.path.join(working_directory, \ + "nucleotide_prevalence_all.fasta") + + print("-- CREATING KMER SETS --") + app.make_kmer_json.make_json(plasmid_file, chr_file, both_file, genus_file, \ + species_file, multi_file, variant_sequences, index, k) + print("DONE \n") + + print("Finished creating CARD*kmers set.") + +def create_parser(): + parser = argparse.ArgumentParser( + description="Builds the kmer sets for CARD*kmers") + parser.add_argument('-1', dest="fasta_one", + help="nucleotide_fasta_protein_homolog_model_variants.fasta") + parser.add_argument('-2', dest="fasta_two", + help="nucleotide_fasta_protein_overexpression_model_variants.fasta") + parser.add_argument('-3', dest="fasta_three", + help="nucleotide_fasta_protein_variant_model_variants.fasta") + parser.add_argument('-4', dest="fasta_four", + help="nucleotide_fasta_rRNA_gene_variant_model_variants.fasta") + parser.add_argument('-i', '--index', dest="index", required=True, + help="CARD*R&V Variants index file (index-for-model-sequences.txt)") + parser.add_argument('-k', dest="k", required=True, + help="k-mer size (e.g., 61)") + parser.add_argument('--skip', dest="skip", action='store_true', + help="Skips the concatenation and splitting of the CARD*R*V sequences.") + + return parser + +def run(): + parser = create_parser() + args = parser.parse_args() + main(args) + +if __name__ == '__main__': + run() diff --git a/app/make_kmer_json.py b/app/make_kmer_json.py new file mode 100644 index 0000000..e30a3e7 --- /dev/null +++ b/app/make_kmer_json.py @@ -0,0 +1,253 @@ +import csv, re, argparse, multiprocessing, math, json, ahocorasick, os +from Bio import Seq, SeqIO + +""" +This scripts creates the JSON to hold all kmer sets. +""" + +def split_list(l,n): + for i in range(0, len(l), n): + yield l[i:i+n] + +def query_kmers(l, t, fasta, o): + """Finds kmers in variant sequences""" + try: + print('THREAD {t}: Adding kmers'.format(t=t)) + A = ahocorasick.Automaton() + for k in l: + A.add_word(k,k) + print('THREAD {t}: Making automaton'.format(t=t)) + A.make_automaton() + + f = {} + r = {} + seq_num = 0 + for entry in SeqIO.parse(fasta, "fasta"): + seq_num += 1 + if seq_num % 25000 == 0: + print('THREAD {t}: Done querying {n} sequences'.format(t=t, n=seq_num)) + prev_id = entry.id.split(":")[1].split("|")[0] + pathogens = id_path[prev_id] # get a list of pathogens that prev sequence is found in + for tup in A.iter(str(entry.seq)): + kmer = tup[1] + rev_kmer = str(Seq.Seq(kmer).reverse_complement()) + if kmer not in f: + f[kmer] = [] + if rev_kmer not in r: + r[rev_kmer] = [] + for p in pathogens: + if p not in f[kmer]: + f[kmer].append(p) + if p not in r[rev_kmer]: + r[rev_kmer].append(p) + print(t, 'Done') + + except Exception as e: + print(e) + o.put((t, f, r)) + +def get_genomic_kmers(plasmid_file, chr_file, both_file): + with open(plasmid_file, 'r') as f1: + print('Getting plasmid kmers...') + p = csv.reader(f1, delimiter='\t') + pkmers = {row[0] for row in p} + with open(chr_file, 'r') as f2: + print('Getting chromosome kmers...') + np = csv.reader(f2, delimiter='\t') + nkmers = {row[0] for row in np} + with open(both_file, 'r') as f3: + print('Getting both (plasmid + chr) kmers...') + b = csv.reader(f3, delimiter='\t') + bkmers = {row[0] for row in b} + + # Discard itnersect + both_separate = pkmers.intersection(nkmers) + # Remove non specific plasmid kmers + pf = pkmers.difference(nkmers) + pf = pf.difference(bkmers) + # Remove non specific chromosome kmers + cf = nkmers.difference(pkmers) + cf = cf.difference(bkmers) + for kmer in both_separate: + bkmers.add(kmer) + + print('Reverse complementing kmers...') + # Reverse complement non-plasmid kmers + rev_chr = {str(Seq.Seq(ck).reverse_complement()) for ck in cf} + rev_plas = {str(Seq.Seq(pk).reverse_complement()) for pk in pf} + rev_both = {str(Seq.Seq(bk).reverse_complement()) for bk in bkmers} + + # print(rev_chr) + for false_hit in pf.intersection(rev_chr): + # F kmer in plasmid is a R kmer in chr + pf.remove(false_hit) + bkmers.add(false_hit) + for false_hit in pf.intersection(rev_both): + pf.remove(false_hit) + bkmers.add(false_hit) + for false_hit in cf.intersection(rev_plas): + cf.remove(false_hit) + bkmers.add(false_hit) + for false_hit in cf.intersection(rev_both): + cf.remove(false_hit) + bkmers.add(false_hit) + + return list(pf), list(cf), list(bkmers) # type: sets + +def get_taxon_kmers(single_file, multi_file, variant_sequences, index_file, k, type): + """Gets taxonomic kmer sets""" + global id_path + id_path = {} + print('Getting pathogen metadata...') + with open(index_file, 'r') as index: + i = csv.reader(index, delimiter='\t') + for line in i: + if line[0] not in id_path: + id_path[line[0]] = [line[5]] + elif line[0] in id_path: + if line[5] not in id_path[line[0]]: + id_path[line[0]].append(line[5]) + else: + print('Error') + + with open(single_file, 'r') as f1: + print('Getting single species kmers...') + single = csv.reader(f1, delimiter='\t') + skmers = {row[0] for row in single} + with open(multi_file, 'r') as f2: + print('Getting multi species kmers...') + multi = csv.reader(f2, delimiter='\t') + mkmers = {row[0] for row in multi} + + print('Discarding shared single- and multi-species kmers...') + sf = list(skmers.difference(mkmers)) # kmers in only the single set + print('Remaining unique kmers:', len(sf)) + list_size = math.ceil(len(sf)/24) # For 24 threads + l = list(split_list(sf, list_size)) + + print('Querying kmers...') + # Threading + output = multiprocessing.Queue() + processes = [] + for ind in range(len(l)): + process = multiprocessing.Process(target=query_kmers, args=(l[ind], ind, variant_sequences, output)) + process.start() + processes.append(process) + # print(processes) + results = [output.get() for process in processes] + for process in processes: + process.join() + + print('Concatenating all data from all threads') + f = {} + r = {} + for i in range(len(results)): + f = {**f, **results[i][1]} + r = {**r, **results[i][2]} + + # Filters kmers that are not unique + to_delete = [] + for k,v in f.items(): + if type == "species": + if len(v) > 1: + to_delete.append(k) + elif type == "genus": + genus = [p.split()[0] for p in v] + if len(set(genus)) > 1: + to_delete.append(k) + else: + print("Error") + + print('before:', len(f)) + print('to delete:', len(to_delete)) + for k in to_delete: + del f[k] + del r[Seq.Seq(k).reverse_complement()] + print('after:', len(f)) + + rev_multi = {Seq.Seq(mk).reverse_complement() for mk in mkmers} + rev_temp = {Seq.Seq(tk).reverse_complement() for tk in to_delete} + + shared = 0 + same = 0 + single = 0 + forward = 0 + reverse = 0 + to_also_delete = [] + for k in r: + if k in f: + shared += 1 + if r[k] == f[k]: + same += 1 + else: # Shared kmers to different pathogens in diff orientations + single += 1 + to_also_delete.append(k) + for k in to_also_delete: + del f[k] + del r[k] + + ts = set(f.keys()) + # print(len(ts.intersection(rev_temp))) + for x in ts.intersection(rev_multi): + del f[x] + forward += 1 + for x2 in ts.intersection(rev_temp): + del f[x2] + forward += 1 + + return f + +def make_json(plasmid_file, chr_file, both_file, genus_file, species_file, \ + multi_file, variant_sequences, index_file, k): + print(plasmid_file, chr_file, both_file, genus_file, species_file, multi_file, variant_sequences, index_file, k) + exit() + p, c, b = get_genomic_kmers(plasmid_file, chr_file, both_file) + s = get_taxon_kmers(species_file, multi_file, variant_sequences, index_file, k, "species") + g = get_taxon_kmers(genus_file, multi_file, variant_sequences, index_file, k, "genus") + + final = {"p": p, "c": c, "b": b, "s": s, "g": g} + + with open(os.path.join(os.path.join(os.getcwd()), '{k}_kmer_db.json'.format(k=k)), 'w') as out: + json.dump(final, out) + + +def main(args): + plasmid_file = args.plasmid + chr_file = args.chromosome + both_file = args.both + species_file = args.species + genus_file = args.genus + multi_file = args.multi + variant_sequences = args.variants + k = int(args.k) + index_file = args.index + + make_json(plasmid_file, chr_file, both_file, genus_file, species_file, \ + multi_file, variant_sequences, index_file, k) + +def run(): + parser = argparse.ArgumentParser( + description='Creates a kmer catalogue') + parser.add_argument('-p', dest="plasmid", required=True, + help="Plasmid sequence kmers output from Jellyfish") + parser.add_argument('-c', dest="chromosome", required=True, + help="Chromosome sequence kmers output from Jellyfish") + parser.add_argument('-b', dest="both", required=True, + help="Plasmid + chromosome sequence kmers output from Jellyfish") + parser.add_argument('-s', dest="species", required=True, + help="Single species kmer output from Jellyfish") + parser.add_argument('-g', dest="genus", required=True, + help="Single genus kmer output from Jellyfish") + parser.add_argument('-m', dest="multi", required=True, + help="Multi species kmer output from Jellyfish") + parser.add_argument('-v', dest="variants", required=True, + help="CARD*Resistomes&Variants sequences (FASTA)") + parser.add_argument('-i', dest="index", required=True, + help="CARD*Resitomes&Variants index (index-for-model-sequences.txt)") + parser.add_argument('-k', dest="k", required=True, + help="kmer length") + args = parser.parse_args() + main(args) + +if __name__ == '__main__': + run() diff --git a/requirements.txt b/requirements.txt index 3c10c0a..bed15a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ pandas>=0.15.0 matplotlib>=2.1.2 seaborn>=0.8.1 pyfaidx>=0.5.4.1 +pyahocorasick==1.1.8 \ No newline at end of file diff --git a/test-requirements.txt b/test-requirements.txt index 81fddc2..3f2cad6 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -7,4 +7,5 @@ requests==2.18.2 six==1.10.0 matplotlib>=2.1.2 seaborn>=0.8.1 -pyfaidx>=0.5.4.1 \ No newline at end of file +pyfaidx>=0.5.4.1 +pyahocorasick==1.1.8 \ No newline at end of file