In [48]:
import os
import pandas as pd

# Project root. Every path used below (data/, reads/, assemblies/, ...) is relative to it.
# Set the MAC_EVO_DIR environment variable to run on another machine; the fallback is
# the original hard-coded checkout location, so existing runs are unchanged.
os.chdir(os.environ.get('MAC_EVO_DIR', '/Users/aaron/Desktop/MAC_paper/mac_evo/'))

def run_snippy_wf(out_dir, use_ref, threads=os.cpu_count()):
    """Write ``run_snippy_wf.sh`` with one shell command line per fastbaps cluster.

    For every cluster id listed in ``data/fastbaps_clusters.txt`` the isolates
    assigned to it in ``data/fastbaps.tsv`` are looked up, and a single line of
    ``&&``-chained commands is emitted that runs, in order: snippy for each
    query isolate, snippy-core, snippy-clean_full_aln, run_gubbins.py,
    snp-sites and snp-dists. A step is left out when its output file already
    exists, so re-running the generator only emits the missing work.

    All paths are relative to the current working directory.

    Parameters
    ----------
    out_dir : str
        Base output directory. Each cluster gets the sub-directory
        ``{out_dir}/fastbaps_cluster_NN``, which is created here if missing.
    use_ref : bool
        If true, every isolate of a cluster is aligned against the fixed
        GCF_009741445.1 genome. If false, the alphabetically first isolate of
        the cluster is used as the reference (its assembly under
        ``assemblies/``) and only the remaining isolates are aligned.
    threads : int, optional
        Thread count passed to snippy (``--cpus``) and Gubbins (``--threads``).
        Defaults to the machine's CPU count, evaluated when the function is
        defined.

    Returns
    -------
    None
        The result is the script file ``run_snippy_wf.sh`` (overwritten).
    """
    snippy = 'docker run -v $PWD:/data staphb/snippy snippy --cpus {threads} --outdir {out_dir} --ref {ref} --R1 reads/processed/{isolate}_R1_001.qcd.fastq.gz --R2 reads/processed/{isolate}_R2_001.qcd.fastq.gz'

    df = pd.read_csv('data/fastbaps.tsv', sep='\t')
    # Close the handle deterministically and tolerate blank lines in the list.
    with open('data/fastbaps_clusters.txt', 'r') as fh:
        clusters = [int(x) for x in fh.read().splitlines() if x.strip()]

    with open('run_snippy_wf.sh', 'w') as f:
        for cluster in clusters:
            isolates = sorted(df[df['cluster'] == cluster]['isolate'].to_list())
            if not isolates:
                # Cluster id with no rows in the table: nothing to run.
                continue

            if use_ref:
                ref = 'databases/GCF_009741445.1/GCF_009741445.1_ASM974144v1_genomic.fna'
                # The reference is external, so no isolate may be dropped.
                queries = isolates
            else:
                # The first isolate serves as the reference and is therefore
                # not aligned against itself.
                ref = 'assemblies/{}.contigs.fasta'.format(isolates[0])
                queries = isolates[1:]
            if not queries:
                # Only the representative is present: no query isolates, so no
                # per-cluster pipeline is emitted.
                continue

            name = 'fastbaps_cluster_' + str(cluster).zfill(2)
            # Honour the caller's out_dir instead of a hard-coded base directory.
            cluster_dir = f'{out_dir}/{name}'
            os.makedirs(cluster_dir, exist_ok=True)

            cmds = []
            subdirs = []
            for isolate in queries:
                subdir = f'{cluster_dir}/{isolate}'
                subdirs.append(subdir)
                # snps.csv marks a finished snippy run for this isolate.
                if not os.path.exists(f'{subdir}/snps.csv'):
                    cmds.append(snippy.format(threads=threads, out_dir=subdir, ref=ref, isolate=isolate))

            ## run snippy-core + gubbins + snp-sites + snp-dists
            core_full_aln = f'{cluster_dir}/core.full.aln'
            clean_full_aln = f'{cluster_dir}/gubbins.core.full.aln'
            gubbins_sites = f'{cluster_dir}/gubbins.filtered_polymorphic_sites.fasta'
            gubbins_core_aln = f'{cluster_dir}/gubbins.core.aln'
            snp_dists_txt = f'{cluster_dir}/snp_dists.txt'

            if not os.path.exists(core_full_aln):
                inp = ' '.join(subdirs)
                cmds.append(f'docker run -v $PWD:/data staphb/snippy snippy-core --ref {ref} --prefix {cluster_dir}/core {inp}')
            if not os.path.exists(clean_full_aln):
                cmds.append(f'docker run -v $PWD:/data staphb/snippy snippy-clean_full_aln {core_full_aln} > {clean_full_aln}')
            if not os.path.exists(gubbins_sites):
                # Use the caller's thread budget here too, not os.cpu_count().
                cmds.append(f'run_gubbins.py --prefix {cluster_dir}/gubbins {clean_full_aln} --threads {threads}')
            if not os.path.exists(gubbins_core_aln):
                cmds.append(f'docker run -v $PWD:/data staphb/snp-sites snp-sites -c {gubbins_sites} > {gubbins_core_aln}')
            if not os.path.exists(snp_dists_txt):
                cmds.append(f'docker run -v $PWD:/data staphb/snp-dists snp-dists {gubbins_core_aln} > {snp_dists_txt}')

            if cmds:
                # One line per cluster; '&&' stops the chain at the first failure.
                f.write(' && '.join(cmds) + '\n')


In [49]:
# Regenerate run_snippy_wf.sh; use_ref=False selects a per-cluster isolate assembly
# as the reference instead of the GCF_009741445.1 genome.
run_snippy_wf(
    out_dir='snippy_x_fastbaps',
    use_ref=False,
    threads=os.cpu_count(),
)

In [45]:
# Scratch check: a slice whose stop index lies past the end of the list is clamped
# to the list length instead of raising.
['a', 'b', 'c'][1:4]

['b', 'c']

In [2]:
import pandas as pd
import os

# Project root. Set the MAC_EVO_DIR environment variable to run on another machine;
# the fallback is the original hard-coded checkout location, so existing runs are unchanged.
os.chdir(os.environ.get('MAC_EVO_DIR', '/Users/aaron/Desktop/MAC_paper/mac_evo'))

# Tab-separated table with the columns `isolate` and `cluster`.
df = pd.read_csv('data/fastbaps.tsv', sep='\t')
print(df)

              isolate  cluster
0          ERR4022296       23
1          ERR4022330       23
2          ERR4022304       23
3           Reference        6
4    23IE176_S15_L001        6
..                ...      ...
281        ERR3468814       22
282        ERR3468806       22
283        ERR3468956       22
284        ERR3468926       22
285        ERR4022334       23

[286 rows x 2 columns]


In [None]:
def run_snippy_wf(out_dir, use_ref):
    """Draft of the workflow generator: only resolves the reference per cluster.

    NOTE(review): this re-defines ``run_snippy_wf`` and therefore shadows the
    earlier definition in this notebook once the cell is executed. As written
    it opens ``run_snippy_wf.sh`` for writing (truncating any existing script)
    but never writes to it, and ``ref`` is computed without being used.
    Presumably unfinished - confirm whether this cell should be completed or
    removed.

    NOTE(review): the representative here is the first isolate in table order
    and the path ends in ``.contigs.fasta.gz``; the other definition sorts the
    isolates and uses ``.contigs.fasta``. Confirm which is intended.

    Parameters
    ----------
    out_dir : str
        Currently unused.
    use_ref : bool
        If true, the fixed GCF_009741445.1 genome is the reference; otherwise
        the first isolate listed for the cluster in ``data/fastbaps.tsv``.

    Returns
    -------
    None
    """
    ## run snippy on fastbaps clusters
    df = pd.read_csv('data/fastbaps.tsv', sep='\t')
    # Read the cluster ids through a context manager so the handle is closed.
    with open('data/fastbaps_clusters.txt', 'r') as fh:
        clusters = [int(x) for x in fh.read().splitlines()]
    with open('run_snippy_wf.sh', 'w') as f:
        for cluster in clusters:
            if use_ref:
                ref = 'databases/GCF_009741445.1/GCF_009741445.1_ASM974144v1_genomic.fna'
            else:
                rep = df[df['cluster'] == cluster]['isolate'].to_list()[0]
                ref = 'assemblies/{}.contigs.fasta.gz'.format(rep)
                

In [6]:
# use_ref=False: take a per-cluster isolate assembly as the reference rather than
# the GCF_009741445.1 genome.
run_snippy_wf(
    out_dir='snippy_x_fastbaps_2',
    use_ref=False,
)

assemblies/ERR12861631.contigs.fasta.gz
assemblies/ERR6057609.contigs.fasta.gz
assemblies/ERR6057651.contigs.fasta.gz
assemblies/ERR12861674.contigs.fasta.gz
assemblies/ERR3566356.contigs.fasta.gz
assemblies/ERR6057683.contigs.fasta.gz
assemblies/ERR3468976.contigs.fasta.gz
assemblies/ERR4022294.contigs.fasta.gz
assemblies/ERR2988423.contigs.fasta.gz
assemblies/ERR4022412.contigs.fasta.gz
assemblies/ERR4339054.contigs.fasta.gz
