In [8]:
from collections import defaultdict
from Bio.SeqIO.FastaIO import SimpleFastaParser as sfp
from Bio import SeqIO, SearchIO, Entrez
import seaborn as sns
import pandas as pd
import os, glob, math, re, gzip, skbio, time
import matplotlib.pyplot as plt
import warnings
from Bio import Entrez
warnings.filterwarnings('ignore')

In [9]:
def cmdir(path):
    """Create the directory `path` if it does not already exist.

    Missing parent directories are created as well, and an existing
    directory is left untouched, so the call is safe to repeat.
    """
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it has no
    # check-then-create race and does not fail when a parent is missing.
    os.makedirs(path, exist_ok=True)

def scaffold(gene):
    """Return the scaffold ID of a gene ID shaped like `<scaffold>_<digits>`.

    The literal string "None" yields None silently. Any other value that
    does not end in `_<digits>` (or is not a string) is printed and yields
    None.
    """
    if gene == "None":
        return None
    # Handle only the expected failures (non-string input, no match); a bare
    # `except:` would also swallow KeyboardInterrupt and genuine bugs.
    try:
        match = re.search(r"(.+?)_[0-9]+$", gene)
    except TypeError:
        match = None
    if match is None:
        print(gene)
        return None
    return match.group(1)

def sbatch(name, cpus, cmd):
    """Build an `sbatch --wrap` command line for a shell command.

    name -- job name passed to `-J`
    cpus -- value passed to `-c`; the `--mem` request is `cpus * 8` G
    cmd  -- shell command placed inside the single-quoted `--wrap` argument

    Returns the command line as a string; nothing is submitted here.
    """
    # A single quote inside cmd would terminate the --wrap argument early.
    # Close the quote, emit a double-quoted ', and reopen the quote instead.
    # Commands without single quotes produce exactly the same string as before.
    quoted_cmd = "'" + cmd.replace("'", "'\"'\"'") + "'"
    return "sbatch -J %s -p serc -t 1- -c %d --mem %dG --wrap %s" % (
        name, cpus, cpus * 8, quoted_cmd)

In [10]:
# Root of the project tree. The paths below are built from it by plain string
# concatenation, so the trailing slash is required.
rootdir = "/scratch/users/ajaffe/photoeco/"

# Directory that receives the outputs of the curation steps.
curate_dir = rootdir + "curate"
cmdir(curate_dir)

### read in data

In [11]:
# Load the genome table. genome_name mixes numeric-looking IDs (e.g. 2626541517)
# with run-style names; pin it to str so the later substring tests
# ("RR" in genome_name) and merges on genome_name never see integers.
genomes = pd.read_csv(rootdir + "genome_table.tsv", sep="\t", dtype={"genome_name": str})
genomes.head()

Unnamed: 0,genome_name,lineage,checkm_completeness,checkm_redundancy,cluster95,cluster99
0,2626541517,AMZ IB,56.79,6.11,5_1,5_5
1,2626541518,AMZ IB,60.34,6.9,5_1,5_11
2,2626541532,AMZ IB,65.62,2.72,5_1,5_15
3,2626541536,AMZ IB,50.14,5.1,5_1,5_21
4,2818991511,AMZ II,74.64,0.27,4_1,4_4


### predict genes and run clustering

In [14]:
# Write one prodigal call per reassembled genome into a shell script.
# The script is only written here, not executed.
with open(rootdir + "reassembly/genomes/prodigal.sh", "w") as prodigal:

    # "*.fna" (with the dot) guarantees every match really ends in ".fna", so
    # the protein path below can never equal the input path. sorted() keeps
    # the script content reproducible between runs.
    for genome in sorted(glob.glob(rootdir + "reassembly/genomes/*.fna")):

        # Swap only the final extension; str.replace would also rewrite a
        # ".fna" occurring earlier in the path.
        proteins = os.path.splitext(genome)[0] + ".genes.faa"
        prodigal.write("prodigal -i %s -a %s -p single\n" % (genome, proteins))

In [21]:
# Maps each ORF ID (FASTA header up to the first " # ") to the genome it came
# from. It is filled here and read again by the analysis cell.
orf2bin = {}

# For every lineage: pool the proteins of its genomes into one FASTA file and
# append the mmseqs commands that cluster them to a shell script. The context
# managers make sure every handle is closed even if a FASTA file is missing.
with open(rootdir + "curate/cluster.sh", "w") as wrapper:

    for lineage in ["AMZ IA", "AMZ IB", "AMZ II", "AMZ III"]:

        # "AMZ IA" -> ".../curate/amzia"
        workdir = rootdir + "curate/" + lineage.lower().replace(" ", "")
        cmdir(workdir)

        table = genomes[genomes["lineage"] == lineage]

        # write the proteins
        with open(workdir + "/lineage.faa", "w") as out:
            for genome_name in table["genome_name"]:
                # genome names containing "RR" are read from the reassembly directory
                if "RR" in genome_name:
                    faapath = rootdir + "reassembly/genomes/%s.genes.faa" % (genome_name)
                else:
                    faapath = rootdir + "genomes/filtered/%s.genes.faa" % (genome_name)
                with open(faapath) as faa:
                    for header, sequence in sfp(faa):
                        # keep only the ORF ID, dropping the " # "-separated fields
                        orf = header.split(" # ")[0]
                        out.write(">%s\n%s\n" % (orf, sequence))
                        orf2bin[orf] = genome_name

        # cluster the proteins
        makedb = "mmseqs createdb %s %s" % (workdir + "/lineage.faa", workdir + "/lineage.db")
        cluster = "mmseqs cluster --cov-mode 0 --threads 4 %s %s %s" % (
            workdir + "/lineage.db", workdir + "/lineage.cluster", workdir + "/tmp")
        process = "mmseqs createtsv %s %s %s %s" % (
            workdir + "/lineage.db", workdir + "/lineage.db",
            workdir + "/lineage.cluster", workdir + "/lineage.cluster.tsv")
        wrapper.write("\n".join([makedb, cluster, process]) + "\n")

 ### analyze

In [22]:
# Parse each lineage's mmseqs cluster table, annotate it, and stack the
# per-lineage tables into one frame.
results = []

for tsv_path in glob.glob(rootdir + "curate/amz*/lineage.cluster.tsv"):

    clusters = pd.read_csv(tsv_path, sep="\t", header=None)
    clusters.columns = ["centroid", "gene"]

    # scaffold = gene ID without its last "_"-separated field
    # (empty string when the gene ID has no underscore)
    clusters["scaffold"] = clusters["gene"].apply(lambda gene: gene.rpartition("_")[0])
    clusters["bin"] = clusters["gene"].map(orf2bin)

    # the lineage label is the name of the directory holding the table
    clusters["lineage"] = tsv_path.split("/")[-2]

    # number the clusters of this lineage in order of first appearance
    cluster_ids = {
        centroid: number
        for number, centroid in enumerate(clusters["centroid"].unique())
    }
    clusters["cluster"] = clusters["centroid"].map(cluster_ids)

    results.append(clusters)

all_results = pd.concat(results)

In [23]:
# For every protein cluster, count the distinct bins (genomes) it occurs in ...
pfams = all_results.groupby(["lineage", "cluster"], as_index=False).agg({"bin": "nunique"})

# ... and the number of genomes per lineage. Lineage labels in all_results are
# directory names such as "amzia", so derive the same spelling from the genome
# table ("AMZ IA" -> "amzia").
genomes["lineage_brief"] = genomes["lineage"].str.lower().str.replace(" ", "", regex=False)
totals = (
    genomes.groupby("lineage_brief", as_index=False)
    .agg({"genome_name": "nunique"})
    .rename(columns={"genome_name": "total"})
)

pfams = (
    pfams.merge(totals, how="left", left_on="lineage", right_on="lineage_brief")
    .drop("lineage_brief", axis=1)
)
# percentage of the lineage's genomes that carry the cluster (vectorized)
pfams["perc_total"] = pfams["bin"] / pfams["total"] * 100

# Dropping a perc_total left over from an earlier run of this cell and naming
# the join keys explicitly keeps the cell idempotent.
all_results = all_results.drop(columns="perc_total", errors="ignore").merge(
    pfams[["lineage", "cluster", "perc_total"]], how="left", on=["lineage", "cluster"]
)
all_results.head(2)

Unnamed: 0,centroid,gene,scaffold,bin,lineage,cluster,perc_total
0,ERR599070.100.50_contig_114_3,ERR599070.100.50_contig_114_3,ERR599070.100.50_contig_114,ERR599070.100.50,amzia,0,12.5
1,ERR599070.100.50_contig_114_3,JASLWC010000035.1_14,JASLWC010000035.1,GCA_030741055.1_ASM3074105v1_genomic,amzia,0,12.5


### trim

In [31]:
# A protein cluster counts as "rare" when it occurs in fewer than
# RARE_CLUSTER_PERC percent of the lineage's genomes. A scaffold is flagged for
# trimming when at least TRIM_SCAFFOLD_PERC percent of its distinct clusters
# are rare.
RARE_CLUSTER_PERC = 10
TRIM_SCAFFOLD_PERC = 50

trim_info = defaultdict(list)

# One pass over the (lineage, scaffold) groups instead of re-filtering the
# whole table for every scaffold; sort=False keeps first-appearance order.
# The loop variable is deliberately not named `scaffold`: that would overwrite
# the scaffold() helper defined near the top of the notebook.
for (lineage, scaffold_id), group in all_results.groupby(["lineage", "scaffold"], sort=False):

    # count each cluster once per scaffold
    subtable = group.drop_duplicates("cluster")
    rare = int((subtable["perc_total"] < RARE_CLUSTER_PERC).sum())
    rare_perc = rare / len(subtable) * 100

    trim_info["lineage"].append(lineage)
    trim_info["bin"].append(subtable["bin"].iloc[0])
    trim_info["scaffold"].append(scaffold_id)
    trim_info["num_pfams"].append(len(subtable))
    trim_info["num_rare"].append(rare)
    trim_info["perc_rare"].append(rare_perc)
    trim_info["trim"].append(rare_perc >= TRIM_SCAFFOLD_PERC)

trimdf = pd.DataFrame(trim_info)
trimdf.head()

Unnamed: 0,lineage,bin,scaffold,num_pfams,num_rare,perc_rare,trim
0,amzia,ERR599070.100.50,ERR599070.100.50_contig_114,4,2,50.0,True
1,amzia,GCA_030741055.1_ASM3074105v1_genomic,JASLWC010000035.1,14,1,7.142857,False
2,amzia,ERR598987.100.72,ERR598987.100.72_scaffold_149,6,2,33.333333,False
3,amzia,ERR599070.100.50,ERR599070.100.50_contig_192,5,1,20.0,False
4,amzia,ERR599070.100.50,ERR599070.100.50_contig_196,23,2,8.695652,False


### export

In [25]:
# Output directory for the trimmed genome FASTA files.
trimmed_dir = rootdir + "curate/trimmed"
cmdir(trimmed_dir)

In [44]:
# Rewrite each newly resolved genome without the scaffolds flagged for trimming.
# The loop variable is `bin_name` rather than `bin`, which would shadow the
# builtin for the rest of the notebook. dropna() skips rows whose gene had no
# entry in orf2bin (NaN cannot be tested with `in`).
for bin_name in trimdf["bin"].dropna().unique():

    # only do it for newly resolved
    if "RR" not in bin_name:
        continue

    # set -> constant-time membership test per scaffold
    totrim = set(trimdf.loc[(trimdf["bin"] == bin_name) & trimdf["trim"], "scaffold"])

    gpath = rootdir + "reassembly/genomes/%s.fna" % (bin_name)

    # The input is opened first so a missing genome does not leave an empty
    # output file behind; both handles are closed by the context manager.
    with open(gpath) as fna, open(rootdir + "curate/trimmed/" + bin_name + ".fna", "w") as out:
        for header, sequence in sfp(fna):
            # scaffold ID = header up to the first space
            scaf = header.split(" ")[0]
            if scaf not in totrim:
                out.write(">%s\n%s\n" % (scaf, sequence))

### compare

In [45]:
# Output directory for the CheckM run on the trimmed genomes.
quality_dir = rootdir + "curate/quality"
cmdir(quality_dir)

In [47]:
 # construct checkm calls
# Step 1: CheckM lineage workflow; arguments are the directory of trimmed
# genomes and the output directory.
lineage_wf_args = (rootdir + "curate/trimmed", rootdir + "curate/quality")
call = "checkm lineage_wf -t 20 -x .fna --pplacer_threads 20 %s %s" % lineage_wf_args

# Step 2: CheckM qa; arguments are the output table, the marker file and the
# directory produced by step 1.
qa_args = (
    rootdir + "curate/quality/output_table.txt",
    rootdir + "curate/quality/lineage.ms",
    rootdir + "curate/quality/",
)
call2 = "checkm qa -t 20 -o 1 -f %s --tab_table %s %s" % qa_args

# Only the qa call is echoed; `call` is built but neither printed nor run here.
print(call2)

checkm qa -t 20 -o 1 -f /scratch/users/ajaffe/photoeco/curate/quality/output_table.txt --tab_table /scratch/users/ajaffe/photoeco/curate/quality/lineage.ms /scratch/users/ajaffe/photoeco/curate/quality/


In [48]:
# Load the CheckM table written for the trimmed genomes, keep the three
# columns of interest and give them "new_"-prefixed names.
checkm_df = (
    pd.read_csv(rootdir + "curate/quality/output_table.txt", sep="\t")
    .reset_index()
    .loc[:, ["Bin Id", "Completeness", "Contamination"]]
    .rename(columns={
        "Bin Id": "genome_name",
        "Completeness": "new_completeness",
        "Contamination": "new_redundancy",
    })
)
checkm_df.head(2)

Unnamed: 0,genome_name,new_completeness,new_redundancy
0,ERR598946.10.10,59.98,0.54
1,ERR598946.100.35,83.7,1.45


In [49]:
# Attach the CheckM values from the genome table (before trimming) and compute
# the change. Selecting the three "new" columns first and naming the join key
# keeps this cell idempotent: re-running it does not create _x/_y columns.
checkm_df = checkm_df[["genome_name", "new_completeness", "new_redundancy"]].merge(
    genomes[["genome_name", "checkm_completeness", "checkm_redundancy"]],
    how="left", on="genome_name",
)
# negative delta = the value dropped after trimming
checkm_df["delta_completeness"] = checkm_df["new_completeness"] - checkm_df["checkm_completeness"]
checkm_df["delta_redundancy"] = checkm_df["new_redundancy"] - checkm_df["checkm_redundancy"]

# the ten genomes with the lowest completeness after trimming
checkm_df.sort_values("new_completeness", ascending=True).head(10)

Unnamed: 0,genome_name,new_completeness,new_redundancy,checkm_completeness,checkm_redundancy,delta_completeness,delta_redundancy
23,SRR11923210.100.2,37.9,0.32,56.65,1.13,-18.75,-0.81
33,SRR13782137.100.27,39.65,1.38,61.07,5.09,-21.42,-3.71
34,SRR16028415.100.26,45.97,2.7,61.92,7.05,-15.95,-4.35
15,ERR868457.100.2,50.56,2.85,63.06,4.44,-12.5,-1.59
50,SRR25584961.100.55,52.43,0.74,69.37,1.69,-16.94,-0.95
32,SRR13782129.100.144,55.54,0.95,55.54,0.95,0.0,0.0
29,SRR11923215.100.61,59.65,0.82,67.62,1.36,-7.97,-0.54
0,ERR598946.10.10,59.98,0.54,76.74,3.14,-16.76,-2.6
53,SRR4465025.100.54,61.73,3.08,62.27,3.35,-0.54,-0.27
18,SRR11787837.100.56,65.11,1.24,80.65,3.68,-15.54,-2.44


In [53]:
# Mean change in completeness (new minus original) across the genomes in checkm_df.
checkm_df.loc[:, "delta_completeness"].mean()

-3.434912280701755

In [54]:
# Mean change in redundancy (new minus original) across the genomes in checkm_df.
checkm_df.loc[:, "delta_redundancy"].mean()

-0.7759649122807019