In [1]:
from collections import defaultdict
from Bio.SeqIO.FastaIO import SimpleFastaParser as sfp
import subprocess as sp
from Bio import SeqIO, SearchIO, Entrez
import seaborn as sns
import pandas as pd
import os, glob, math, re, gzip, skbio, time, pysam
import matplotlib.pyplot as plt
import warnings
from Bio import Entrez
warnings.filterwarnings('ignore')

In [2]:
def cmdir(path):
    """Create the directory `path` if it does not already exist.

    Missing parent directories are created as well, and an existing
    directory is left untouched, so the call can safely be repeated
    (e.g. when a cell is re-run).
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(path, exist_ok=True)

def scaffold(gene):
    """Return the scaffold part of a gene identifier.

    The identifier is expected to end in ``_<digits>``; everything before
    that final suffix is returned. The literal string "None" yields None.
    Identifiers that do not fit the pattern (or are not strings) are
    printed so they can be inspected, and None is returned.
    """
    if gene == "None":
        return None

    # non-string values cannot be searched; treat them like a failed match
    match = re.search("(.+?)_[0-9]+$", gene) if isinstance(gene, str) else None
    if match is None:
        print(gene)
        return None

    return match.group(1)

def sbatch(name, cpus, cmd):
    """Build an `sbatch --wrap` submission line for the shell command `cmd`.

    name -- job name passed to -J
    cpus -- number of cores passed to -c; memory requested is cpus*8 GB
    cmd  -- shell command to run inside the job

    Returns the full command line as a string.
    """
    # cmd is placed inside single quotes, so an embedded single quote would
    # end the --wrap argument early; close, escape and reopen the quoting
    escaped = cmd.replace("'", "'\\''")
    return "sbatch -J %s -p serc -t 1- -c %d --mem %dG --wrap '%s'" % (name, cpus, cpus * 8, escaped)

In [3]:
# Root of the project; every path used below is built from it.
rootdir = "/scratch/users/ajaffe/photoeco/"
# Working directory for the genome-recovery steps.
cmdir(os.path.join(rootdir, "recover"))

### choose query genomes

In [4]:
# Holds the query genomes and the branchwater result tables.
cmdir(os.path.join(rootdir, "recover/branchwater"))

In [4]:
# Read the genome table, then drop rows whose name contains "RR"
# (removes recovered bins if an updated version of the table is read in).
genome_table_path = rootdir + "genome_table.tsv"
genomes = pd.read_csv(genome_table_path, sep="\t")
is_recovered = genomes["genome_name"].str.contains("RR")
genomes = genomes.loc[~is_recovered]
genomes.head()

Unnamed: 0,genome_name,lineage,checkm_completeness,checkm_redundancy,cluster95,cluster99
0,2626541517,AMZ IB,56.79,6.11,5_1,5_5
1,2626541518,AMZ IB,60.34,6.9,5_1,5_10
2,2626541532,AMZ IB,65.62,2.72,5_1,5_14
3,2626541536,AMZ IB,50.14,5.1,5_1,5_17
4,2818991511,AMZ II,74.64,0.27,4_1,4_4


In [16]:
# For every AMZ lineage, take the genome with the highest CheckM completeness
# and write it to recover/branchwater/ as the query for that lineage.
for lineage in genomes.lineage.unique():

    if "AMZ" not in lineage:
        continue

    # most complete genome of the lineage comes first
    table = genomes[genomes["lineage"] == lineage].sort_values("checkm_completeness", ascending=False)
    gpath = rootdir + "genomes/filtered/" + table["genome_name"].iloc[0] + ".fna"
    if not os.path.isfile(gpath):
        raise FileNotFoundError("no filtered genome found for %s: %s" % (lineage, gpath))

    # records are re-written through the parser (one sequence line per record);
    # both handles are closed when the block exits
    with open(gpath) as src, \
         open(rootdir + "recover/branchwater/" + os.path.basename(gpath), "w") as out:
        for header, seq in sfp(src):
            out.write(">%s\n%s\n" % (header, seq))

    print(lineage, os.path.basename(gpath))

AMZ IA GCA_030741055.1_ASM3074105v1_genomic.fna
AMZ IB GCA_030740755.1_ASM3074075v1_genomic.fna
AMZ II 2818991511.fna
AMZ III 2818991522.fna


Then query branchwater with each of the genomes written above, and save the resulting CSV files in `recover/branchwater/`.

### process branchwater output

In [5]:
# Stack all branchwater result tables found in recover/branchwater/.
branchwater_csvs = glob.glob(rootdir + "recover/branchwater/*csv")
branch = pd.concat([pd.read_csv(csv_path) for csv_path in branchwater_csvs])
# filter for quality matches at species-level
passes_ani = branch["cANI"] >= 0.97
branch = branch[passes_ani].drop_duplicates("acc")
# number of retained accessions per bioproject
counts = pd.DataFrame(branch.value_counts("bioproject").reset_index())

# One line per bioproject with at least 4 accessions:
# <bioproject> TAB <comma-separated accessions>
with open(rootdir + "recover/sample_list.tsv", "w") as out:
    for key, row in counts.iterrows():
        if row["count"] < 4:
            continue
        accessions = branch.loc[branch["bioproject"] == row["bioproject"], "acc"].unique()
        out.write("%s\t%s\n" % (row["bioproject"], ",".join(accessions)))

Then run the `recover` Snakemake workflow (outside this notebook); the cells below expect its output under `recover/metabat/`.

### check

In [7]:
# Accessions of every bioproject with at least 4 retained matches.
checks = []
selected_projects = counts[counts["count"] >= 4]

for bioproject in selected_projects["bioproject"]:
    checks.extend(branch.loc[branch["bioproject"] == bioproject, "acc"].to_list())
    print(bioproject)

# Print each accession that has no metabat bin directory.
for check in checks:
    bins_dir = rootdir + "recover/metabat/" + check + "_bins/"
    if not glob.glob(bins_dir):
        print(check)

PRJNA704804
PRJEB1787
PRJNA350692
PRJNA1003508
PRJNA634212
PRJEB4352
PRJNA632347


### combine/rename output

In [5]:
# Destination for the combined, renamed bins.
cmdir(os.path.join(rootdir, "recover/bins"))

In [6]:
# Gather the metabat bins of both workflow runs in recover/bins/.
# Each bin becomes <sample>.<tag>.<bin number>.fa, with tag 100 for the
# "recover" run and 10 for the "sub10" run, and its contigs are renamed
# <sample>.<tag>.<bin number>_contig_<i>.
# (Loop variables deliberately avoid the names `dir` and `bin`, which would
# shadow the builtins for every later cell.)
for workflow_dir, tag in [("recover", 100), ("sub10", 10)]:

    for bin_path in glob.glob(rootdir + workflow_dir + "/metabat/*bins/*fa"):

        sam, num = os.path.basename(bin_path).split(".")[0:2]
        bin_name = "%s.%s.%s" % (sam, tag, num)
        newname = rootdir + "recover/bins/" + bin_name + ".fa"

        # both handles are closed when the block exits
        with open(bin_path) as src, open(newname, "w") as out:
            for i, (_, seq) in enumerate(sfp(src)):
                out.write(">%s_contig_%d\n%s\n" % (bin_name, i, seq))

### identify output bins

In [6]:
# Working directory for GTDB-Tk batch files and results.
cmdir(os.path.join(rootdir, "recover/gtdbtk/"))

In [9]:
# Split the recovered bins into at most 25 batch files for GTDB-Tk.
# Each line is <path to bin> TAB <bin name without .fa>.
# NOTE: the list is kept in its own variable; `genomes` is the genome table
# DataFrame and is still needed by later cells.
bin_paths = sorted(glob.glob(rootdir + "recover/bins/*fa"))
# bins per batch; at least 1 so range() never receives a zero step
n = max(1, math.ceil(len(bin_paths) / 25))

for a, i in enumerate(range(0, len(bin_paths), n)):
    with open(rootdir + "recover/gtdbtk/batch%d.txt" % (a), "w") as out:
        for bin_path in bin_paths[i:i + n]:
            out.write(bin_path + "\t" + os.path.basename(bin_path).split(".fa")[0] + "\n")

In [10]:
# Write classify.sh: one sbatch line per batch file, each running
# `gtdbtk classify_wf` into an output directory named after the batch.
gtdbtk_dir = rootdir + "recover/gtdbtk/"

with open(gtdbtk_dir + "classify.sh", "w+") as out:
    for batchfile in glob.glob(gtdbtk_dir + "batch*.txt"):
        batch_file_name = os.path.basename(batchfile)
        dirname = gtdbtk_dir + batch_file_name.replace(".txt", "")
        base = "gtdbtk classify_wf --cpus 20 -x .fa --batchfile %s --out_dir %s" % (batchfile, dirname)
        job_name = batch_file_name.split(".")[0]
        out.write(sbatch(job_name, 20, base) + "\n")

In [29]:
# parse: stack the per-batch GTDB-Tk summaries and keep rows whose
# classification string contains "Cyano"
summary_paths = glob.glob(rootdir + "recover/gtdbtk/batch*/gtdbtk.*.summary.tsv")
gtresults = pd.concat([pd.read_csv(summary, sep="\t") for summary in summary_paths])
is_cyano = gtresults["classification"].str.contains("Cyano")
cyanos = gtresults[is_cyano]
cyanos.head(2)

Unnamed: 0,user_genome,classification,fastani_reference,fastani_reference_radius,fastani_taxonomy,fastani_ani,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,pplacer_taxonomy,classification_method,note,"other_related_references(genome_id,species_name,radius,ANI,AF)",msa_percent,translation_table,red_value,warnings
11,ERR598981.100.8,d__Bacteria;p__Cyanobacteria;c__Cyanobacteriia...,,,,,,,,,,,d__Bacteria;p__Cyanobacteria;c__Cyanobacteriia...,taxonomic novelty determined using RED,classification based on placement in class-lev...,,13.42,11.0,0.890766,
12,ERR598987.100.37,d__Bacteria;p__Cyanobacteria;c__Cyanobacteriia...,,,,,,,,,,,d__Bacteria;p__Cyanobacteria;c__Cyanobacteriia...,taxonomic novelty determined using RED,classification based on placement in class-lev...,,79.81,11.0,0.876367,


### quality

In [26]:
# Parent first, then the checkm subdirectory.
for quality_subdir in ("recover/quality", "recover/quality/checkm"):
    cmdir(rootdir + quality_subdir)

In [38]:
# Names of the bins classified as cyanobacteria; built once as a set so the
# membership test below is not a list scan repeated for every line.
cyano_names = set(cyanos["user_genome"])

with open(rootdir + "recover/quality/checkm/run_checkm.sh", "w") as out:

    for batchfile in glob.glob(rootdir + "recover/gtdbtk/batch*.txt"):

        # create subdir for each batch
        basename = os.path.basename(batchfile).split(".")[0]
        batchdir = rootdir + "recover/quality/checkm/" + basename + "/"
        cmdir(batchdir)

        # rewrite batch file to work with checkm:
        # columns are swapped to <name> TAB <path>,
        # subsetting to cyanobacteria
        with open(batchfile) as src, open(batchdir + "batchfile.txt", "w") as bf:
            for line in src:
                fields = line.strip().split("\t")
                path, name = fields[0], fields[1]
                if name in cyano_names:
                    bf.write("%s\t%s\n" % (name, path))

        # construct checkm calls
        call = "checkm lineage_wf -t 20 -x .fa --pplacer_threads 20 %s %s" %(batchdir + "batchfile.txt", batchdir)
        call2 = "checkm qa -t 20 -o 1 -f %s --tab_table %s %s" %(batchdir + "output_table.txt",
            batchdir + "lineage.ms", batchdir)
        # qa only runs if lineage_wf succeeded
        cmd = sbatch(basename, 20, "%s && %s" %(call, call2))
        out.write(cmd + "\n")

In [6]:
# read in all results: stack the per-batch CheckM tables, keep the three
# columns of interest and rename them to match the genome table
checkm_paths = glob.glob(rootdir + "recover/quality/checkm/batch*/output_table.txt")
checkm_tables = [pd.read_csv(table_path, sep="\t") for table_path in checkm_paths]
checkm_df = pd.concat(checkm_tables).reset_index()
checkm_df = checkm_df.loc[:, ["Bin Id", "Completeness", "Contamination"]]
checkm_df = checkm_df.rename(columns={
    "Bin Id": "genome_name",
    "Completeness": "checkm_completeness",
    "Contamination": "checkm_redundancy",
})
checkm_df.head(2)

Unnamed: 0,genome_name,checkm_completeness,checkm_redundancy
0,ERR598981.100.8,12.07,0.34
1,ERR598987.100.37,91.58,9.42


### compare

In [14]:
# Parent first, then its two subdirectories.
for compare_subdir in ("recover/compare", "recover/compare/fasta", "recover/compare/drep"):
    cmdir(rootdir + compare_subdir)

In [31]:
# Stage the reference genomes (.fna) and the recovered bins that have CheckM
# results (.fa) together in recover/compare/fasta/.
if not isinstance(genomes, pd.DataFrame):
    # guards against `genomes` having been rebound by another cell
    raise TypeError("`genomes` must be the genome table DataFrame; "
                    "re-run the cell that reads genome_table.tsv")

fasta_sources = (
    [rootdir + "genomes/filtered/" + name + ".fna" for name in genomes["genome_name"]]
    + [rootdir + "recover/bins/" + name + ".fa" for name in checkm_df["genome_name"]]
)

for gpath in fasta_sources:
    # both handles are closed when the block exits
    with open(gpath) as src, \
         open(rootdir + "recover/compare/fasta/" + os.path.basename(gpath), "w") as out:
        for header, seq in sfp(src):
            out.write(">%s\n%s\n" % (header, seq))

In [7]:
def _staged_fasta_name(genome_name):
    """Return the file name in recover/compare/fasta/ for exactly `genome_name`.

    The extension is matched explicitly (".f*a" covers .fa and .fna). A bare
    prefix match would let e.g. "X.100.8" pick up "X.100.80.fa" and attach
    the quality values to the wrong file.
    """
    hits = sorted(glob.glob(rootdir + "recover/compare/fasta/" + glob.escape(genome_name) + ".f*a"))
    if not hits:
        raise FileNotFoundError("no staged fasta for genome %s" % genome_name)
    return os.path.basename(hits[0])

# Quality table for dRep (--genomeInfo): genome file name, completeness, contamination.
quality = pd.concat([genomes[["genome_name", "checkm_completeness", "checkm_redundancy"]], checkm_df])
quality["genome"] = quality["genome_name"].apply(_staged_fasta_name)
quality = quality[["genome", "checkm_completeness", "checkm_redundancy"]].drop_duplicates()
quality.columns = ["genome", "completeness", "contamination"]
quality.to_csv(rootdir + "recover/compare/drep/genomeInformation.csv", sep=",", index=False)

In [37]:
# Assemble the dRep dereplication command from its three path arguments,
# wrap it in an sbatch submission line and show it.
drep_workdir = rootdir + "/recover/compare/drep"
drep_inputs = rootdir + "/recover/compare/fasta/*f*a"
drep_info = rootdir + "recover/compare/drep/genomeInformation.csv"
call = "dRep dereplicate %s -pa 0.70 -sa 0.95 -p 20 -comp 40 -con 25 -d -g %s --genomeInfo %s" % (
    drep_workdir, drep_inputs, drep_info)
cmd = sbatch("drep", 20, call)
print(cmd)

sbatch -J drep -p serc -t 1- -c 20 --mem 160G --wrap 'dRep dereplicate /scratch/users/ajaffe/photoeco//recover/compare/drep -pa 0.70 -sa 0.95 -p 20 -comp 40 -con 25 -d -g /scratch/users/ajaffe/photoeco//recover/compare/fasta/*f*a --genomeInfo /scratch/users/ajaffe/photoeco/recover/compare/drep/genomeInformation.csv'


### evaluate similarity + export

In [175]:
# Destination for the recovered bins selected for export.
cmdir(os.path.join(rootdir, "recover/genomes"))

In [8]:
# Load the dRep cluster table, derive genome names from the file names,
# attach the lineage of known genomes (missing values become "None") and
# keep the secondary cluster under the name cluster95.
cdb = pd.read_csv(rootdir + "recover/compare/drep/data_tables/Cdb.csv")
cdb["genome_name"] = [fname.replace(".fna", "").replace(".fa", "") for fname in cdb["genome"]]
lineage_lookup = genomes[["genome_name", "lineage"]]
cdb = cdb.merge(lineage_lookup, how="left")
cdb = cdb.fillna("None")
cdb = cdb[["genome_name", "lineage", "secondary_cluster"]]
cdb = cdb.rename(columns={"secondary_cluster": "cluster95"})

In [9]:
# Add quality values to the cluster table, then show the two most complete
# members of cluster 5_1.
quality["genome_name"] = [fname.replace(".fna", "").replace(".fa", "") for fname in quality["genome"]]
cdb = cdb.merge(quality, how="left")
in_cluster = cdb["cluster95"] == "5_1"
cdb[in_cluster].sort_values("completeness", ascending=False).head(2)

Unnamed: 0,genome_name,lineage,cluster95,genome,completeness,contamination
225,SRR25584950.100.49,,5_1,SRR25584950.100.49.fa,87.38,2.63
216,2818991511,AMZ II,5_1,2818991511.fna,74.64,0.27


In [36]:
# Export recovered bins from the selected 95% clusters: rows without a
# lineage label ("None") whose name contains "RR" are written from
# recover/bins/<name>.fa to recover/genomes/<name>.fna.
target_clusters = ["5_1", "6_3", "6_4", "12_1"]
unlabelled = cdb[cdb["cluster95"].isin(target_clusters) & (cdb["lineage"] == "None")]

for genome_name in unlabelled["genome_name"]:

    if "RR" not in genome_name:
        continue

    newname = rootdir + "recover/genomes/" + genome_name + ".fna"
    gpath = rootdir + "recover/bins/" + genome_name + ".fa"

    # both handles are closed when the block exits
    with open(gpath) as src, open(newname, "w") as out:
        for header, seq in sfp(src):
            out.write(">%s\n%s\n" % (header, seq))

### try re-assembly for select bins

In [16]:
# Working directory for the per-bin re-assemblies.
cmdir(os.path.join(rootdir, "reassembly"))

In [None]:
# For every exported bin that has not been re-assembled yet:
#   1. collect the names of all reads mapped to its contigs,
#   2. write seqtk commands that pull those reads from the trimmed read files,
#   3. write a metaspades command that assembles the pulled reads.
# Commands go to reassembly/wrapper.sh; nothing is executed here.
with open(rootdir + "reassembly/wrapper.sh", "w") as wrapper:

    for genome in glob.glob(rootdir + "recover/genomes/*fna"):

        # bin name is <sample>.<tag>.<bin number>
        parts = os.path.basename(genome).split(".")[0:3]
        sample, tag, binnum = parts
        label = ".".join(parts)
        dire = "recover" if tag == "100" else "sub10"

        # already assembled: skip before doing any of the BAM work
        output = rootdir + "reassembly/%s" % (label)
        if os.path.isfile(output + "/scaffolds.fasta"):
            continue

        # locate original bin file
        gpath = rootdir + "%s/metabat/%s_bins/%s.%s.fa" % (dire, sample, sample, binnum)

        # locate the relevant bam and reads file
        bpath = rootdir + "%s/mapping/%s_%s.sorted.bam" % (dire, sample, sample)
        rpath = sorted(glob.glob(rootdir + "%s/trimmed_reads/%s*.fastq.gz" % (dire, sample)))
        if len(rpath) < 2:
            print("skipping %s: expected paired read files, found %d" % (genome, len(rpath)))
            continue

        # extract reads mapping to every contig in the genome
        # (BAM and FASTA handles are closed when the block exits)
        reads = set()
        with pysam.AlignmentFile(bpath, "rb") as aln, open(gpath) as contigs:
            for header, _ in sfp(contigs):
                # contig id is the header up to the first space
                for read in aln.fetch(header.split(" ")[0]):
                    reads.add(read.query_name)

        readnames = rootdir + "reassembly/%s_reads.txt" % (label)
        with open(readnames, "w") as rfile:
            for read in sorted(reads):
                rfile.write(read + "\n")

        # one subsetting command per read file
        cmds = []
        for direction in rpath:
            nfile = rootdir + "reassembly/%s" % (os.path.basename(direction).replace(sample, label))
            cmds.append("seqtk subseq %s %s | pigz -p 20 > %s" % (direction, readnames, nfile))

        # assemble them; mate file names are built explicitly rather than by
        # replacing "_1" anywhere in the path
        newfwd = rootdir + "reassembly/%s_1.fastq.gz" % (label)
        newrev = rootdir + "reassembly/%s_2.fastq.gz" % (label)
        assemble = "metaspades.py -1 %s -2 %s -o %s --threads 20 --memory 160" % (newfwd, newrev, output)

        print(genome)
        wrapper.write("\n".join(cmds) + "\n")
        wrapper.write(assemble + "\n")

In [60]:
# For every finished re-assembly, write a length-filtered, renamed copy of the
# scaffolds next to it and add three commands to reassembly/run_stats.sh:
# stats.sh on the new assembly, stats.sh on the original bin, and fastANI
# between the two.
with open(rootdir + "reassembly/run_stats.sh", "w") as wrapper:

    for assembly in glob.glob(rootdir + "reassembly/*/scaffolds.fasta"):

        # the assembly directory is named after the bin
        asm_dir = os.path.dirname(assembly)
        name = os.path.basename(asm_dir)
        # <dir>/<name>; file names are built from this prefix instead of
        # string-replacing pieces of the full path
        prefix = os.path.join(asm_dir, name)

        #reformat bin assembly: keep scaffolds >= 1500 bp, numbered by their
        #position in the original file
        newfile = prefix + ".fna"
        with open(assembly) as src, open(newfile, "w") as out:
            for i, (_, seq) in enumerate(sfp(src)):
                if len(seq) >= 1500:
                    out.write(">%s\n%s\n" % (name + "_scaffold_" + str(i), seq))

        # run stats on them
        new = "stats.sh in=%s out=%s format=3" % (newfile, prefix + ".new.results")
        gpath = rootdir + "recover/genomes/" + name + ".fna"
        old = "stats.sh in=%s out=%s format=3" % (gpath, prefix + ".old.results")
        ani = "fastANI -q %s -r %s -o %s" % (newfile, gpath, prefix + ".fastani")
        wrapper.write(new + "\n" + old + "\n" + ani + "\n")

In [80]:
# Compare each re-assembly with its original bin and flag it as improved when
# it has fewer scaffolds, a larger scaf_L50 and a total length within 10%.
results = defaultdict(list)

for gdir in glob.glob(rootdir + "reassembly/*/"):

    if not glob.glob(gdir + "*results"):
        continue

    new_hits = glob.glob(gdir + "*.new.*")
    old_hits = glob.glob(gdir + "*.old.*")
    ani_hits = glob.glob(gdir + "*fastani")
    if not (new_hits and old_hits and ani_hits):
        # stats only partly finished for this bin
        print("incomplete stats, skipping:", gdir)
        continue

    new = pd.read_csv(new_hits[0], sep="\t")
    old = pd.read_csv(old_hits[0], sep="\t")

    # an empty fastANI output cannot be parsed; record ANI as missing
    if os.path.getsize(ani_hits[0]) == 0:
        fastani = float("nan")
    else:
        fastani = pd.read_csv(ani_hits[0], sep="\t", header=None)[2].iloc[0]

    # first row of each stats table, column by column
    old_cnum, new_cnum = old["n_scaffolds"].iloc[0], new["n_scaffolds"].iloc[0]
    old_len, new_len = old["scaf_bp"].iloc[0], new["scaf_bp"].iloc[0]
    # NOTE(review): stored under "*_n50" but read from scaf_L50; presumably
    # stats.sh reports the length statistic in that column - confirm
    old_n50, new_n50 = old["scaf_L50"].iloc[0], new["scaf_L50"].iloc[0]
    # percent change in total length relative to the original bin
    delta_len = (new_len - old_len) / old_len * 100

    improved = bool(new_cnum < old_cnum and new_n50 > old_n50 and abs(delta_len) <= 10)

    results["genome"].append(gdir.split("/")[-2])
    results["fastani"].append(fastani)
    results["old_cnum"].append(old_cnum)
    results["new_cnum"].append(new_cnum)
    results["old_len"].append(old_len)
    results["new_len"].append(new_len)
    results["delta_len"].append(delta_len)
    results["old_n50"].append(old_n50)
    results["new_n50"].append(new_n50)
    results["improved"].append(improved)

resultsdf = pd.DataFrame(results)
resultsdf.to_csv(rootdir + "reassembly.tsv", sep="\t", index=False)
resultsdf["improved"].value_counts()

improved
True     45
False    13
Name: count, dtype: int64

### export

In [81]:
# Final location of the exported genomes.
cmdir(os.path.join(rootdir, "reassembly/genomes"))

In [84]:
# Export one genome per evaluated bin: the re-assembly when it was flagged as
# improved, otherwise the original recovered bin.
for genome_name, improved in zip(resultsdf["genome"], resultsdf["improved"]):

    if improved:
        gpath = rootdir + "reassembly/%s/%s.fna" % (genome_name, genome_name)
    else:
        gpath = rootdir + "recover/genomes/" + genome_name + ".fna"

    # both handles are closed when the block exits
    with open(gpath) as src, \
         open(rootdir + "reassembly/genomes/" + genome_name + ".fna", "w") as out:
        for header, seq in sfp(src):
            out.write(">%s\n%s\n" % (header, seq))

In [87]:
# if not able to be re-assembled
for genome in glob.glob(rootdir + "recover/genomes/*fna"):
    if not os.path.isfile(genome.replace("recover", "reassembly")):
        with open(genome.replace("recover", "reassembly"), "w") as out:
            for record in sfp(open(genome)):
                out.write(">%s\n%s\n" %(record[0], record[1]))