In [1]:
# imports
import os
import re
import json

import pandas as pd
from Bio.Seq import Seq

from eagle.lib.seqs import SeqsDict

  config = yaml.load(string)


In [2]:
# constants
# Root directory for all inputs and outputs of this notebook.
WORK_DIR = "bacteria"

# Every path below is derived from WORK_DIR, so relocating the working
# directory only requires changing the single constant above.

# DEG essential genes: nucleotide sequences and the matching annotation table.
essential_nucl_fasta = os.path.join(WORK_DIR, "deg_essential", "degseq-p.dat")
essential_annot = os.path.join(WORK_DIR, "deg_essential", "degannotation-p.dat")

# DEG non-essential genes: nucleotide sequences and the matching annotation table.
nonessential_nucl_fasta = os.path.join(WORK_DIR, "deg_nonessential", "degseq-np.dat")
nonessential_annot = os.path.join(WORK_DIR, "deg_nonessential", "degannotation-np.dat")

# Tab-separated table read by the main cell (columns "chr_id" and "fna_path").
chr_id_path = os.path.join(WORK_DIR, "chr_id.txt")
# Destination of the per-organism summary table.
summary_path = os.path.join(WORK_DIR, "summary_table.txt")

In [3]:
# lib
def get_gtfs(chr_dict, deg_annot_path, deg_seqs_fasta, deg_essential):
    """Locate DEG genes on the loaded chromosomes and write one table per organism.

    Each annotation row is turned into a GTF-like record by
    ``prepare_deg_annot_line``; the records are grouped by ``org_DEG_ID`` and the
    located genes of every group are written to
    ``<WORK_DIR>/<org_DEG_ID>_essential.gtf`` or ``..._nonessential.gtf``.

    :param chr_dict: mapping handed to ``prepare_deg_annot_line`` (chromosome id
        -> sequences of that chromosome)
    :param deg_annot_path: path of the tab-separated DEG annotation table
    :param deg_seqs_fasta: path of the DEG gene sequences, loaded with ``SeqsDict``
    :param deg_essential: truthy -> outputs are tagged "essential",
        otherwise "nonessential"
    :return: ``pandas.DataFrame`` with one row per organism holding
        ``org_DEG_ID``, ``chr_id``, ``org_name``, the number of located genes and
        the path of the organism's table
    """
    gtf_columns = ["seqid", "source", "type", "start", "end", "score", "strand", "frame", "attribute"]
    label = "essential" if deg_essential else "nonessential"

    annot_df = pd.read_csv(deg_annot_path, sep="\t")
    deg_seqs = SeqsDict.load_from_file(deg_seqs_fasta, low_memory=True)

    located_rows = annot_df.apply(prepare_deg_annot_line, axis=1, args=(chr_dict, deg_seqs))
    located_df = pd.DataFrame(list(located_rows))

    records = []
    for org_id, org_df in located_df.groupby("org_DEG_ID"):
        # Organism-level fields are taken from the first row of the group.
        record = org_df.iloc[0][["org_DEG_ID", "chr_id", "org_name"]].to_dict()
        # Genes that were not found keep seqid=None and are left out of the table.
        org_gtf = org_df.loc[org_df["seqid"].notna(), gtf_columns]

        out_path = os.path.join(WORK_DIR, org_id + "_" + label + ".gtf")
        # The path is recorded even when no file is written for an empty table.
        record.update({label + "_genes": org_gtf.shape[0], label + "_genes_gtf": out_path})
        records.append(record)

        if not org_gtf.empty:
            # quotechar="'" keeps the double quotes of the JSON attribute unescaped.
            org_gtf.to_csv(out_path, sep="\t", index=False, quotechar="'")

        print(record)
    return pd.DataFrame(records)


def prepare_deg_annot_line(row, chr_dict, ess_seqs_dict):
    """Build a GTF-like record for one DEG annotation row.

    The gene sequence (``ess_seqs_dict[row["#DEG_AC"]]``) is searched on the
    chromosome ``chr_dict[row["#Refseq"]]``, first as given and then as its
    reverse complement.

    :param row: annotation row; read keys are ``#DEG_ORG``, ``#Refseq``,
        ``#Organism``, ``#Gene_Name`` and ``#DEG_AC``
    :param chr_dict: chromosome id -> sequences searched by ``search_in_chr``
    :param ess_seqs_dict: DEG accession -> gene sequence
    :return: dict with organism fields plus the GTF fields; ``seqid`` and
        ``strand`` stay ``None`` (and ``start``/``end`` stay 0) when the
        chromosome is unknown or the gene is not found
    """
    record = {
        "org_DEG_ID": row["#DEG_ORG"],
        "chr_id": row["#Refseq"],
        "org_name": row["#Organism"],
        # gtf fields
        "seqid": None,
        "source": "DEG",
        "type": "gene",
        "start": 0,
        "end": 0,
        # NOTE(review): GTF convention for a missing score is "." - "-" kept as-is.
        "score": "-",
        "strand": None,
        "frame": ".",
        "attribute": json.dumps({"gene_name": row["#Gene_Name"]}),
    }

    refseq_id = row["#Refseq"]
    if refseq_id not in chr_dict:
        return record

    chr_seqs = chr_dict[refseq_id]
    gene_seq = ess_seqs_dict[row["#DEG_AC"]]

    strand = "+"
    hit = search_in_chr(gene_seq, chr_seqs)
    if hit is None:
        # Not found on the forward strand: retry with the reverse complement.
        strand = "-"
        hit = search_in_chr(str(Seq(gene_seq).reverse_complement()), chr_seqs)
    if hit is None:
        return record

    seq_name, zero_based_start, end = hit
    record["seqid"] = seq_name
    record["start"] = zero_based_start + 1  # 0-based match offset -> 1-based start
    record["end"] = end
    record["strand"] = strand
    return record


def search_in_chr(seq, chr_seq_dict):
    """Find the first exact, case-insensitive occurrence of ``seq`` on a chromosome.

    The query is matched literally (plain substring search), so characters
    that are special in regular expressions cannot alter the match or raise.

    :param seq: sequence to look for
    :param chr_seq_dict: mapping of sequence name -> sequence string; it is
        iterated and indexed by name
    :return: ``(name, start, end)`` for the first sequence containing ``seq``,
        where ``name`` is the first whitespace-separated token of the sequence
        name and ``start``/``end`` are the 0-based, end-exclusive offsets of
        the match; ``None`` when no sequence contains ``seq``
    """
    query = seq.lower()  # loop-invariant: lowercase the query only once
    for seq_name in chr_seq_dict:
        start = chr_seq_dict[seq_name].lower().find(query)
        if start >= 0:
            return seq_name.split()[0], start, start + len(query)
    return None


In [4]:
# main
# Load the sequences of every chromosome listed in the chr_id table.
chr_id_df = pd.read_csv(chr_id_path, sep="\t", index_col="chr_id")
chr_dict = dict()
for chr_id in chr_id_df.index:
    chr_dict[chr_id] = SeqsDict.load_from_file(chr_id_df.loc[chr_id, "fna_path"], low_memory=True)

essential_summary_df = get_gtfs(
    chr_dict=chr_dict,
    deg_annot_path=essential_annot,
    deg_seqs_fasta=essential_nucl_fasta,
    deg_essential=True,
)
print("got essential summary")
# deg_essential must be False here so the files and summary columns are tagged
# "nonessential" instead of colliding with the essential ones.
nonessential_summary_df = get_gtfs(
    chr_dict=chr_dict,
    deg_annot_path=nonessential_annot,
    deg_seqs_fasta=nonessential_nucl_fasta,
    deg_essential=False,
)
print("got nonessential summary")
# Presumably non-essential organism ids carry a "DNEG" prefix; rewrite it so
# both summaries share the same org_DEG_ID key.
nonessential_summary_df["org_DEG_ID"] = nonessential_summary_df["org_DEG_ID"].apply(
    lambda org_deg_id: org_deg_id.replace("DNEG", "DEG")
)

# Both summaries carry chr_id and org_name. The essential ones keep their plain
# names (an empty suffix) so that the chr_id column still exists for the next
# merge; the non-essential copies get a "_nonessential" suffix.
summary_df = essential_summary_df.merge(
    nonessential_summary_df,
    on="org_DEG_ID",
    suffixes=("", "_nonessential"),
)
# chr_id is the index of chr_id_df, hence left_on / right_index.
summary_df = summary_df.merge(chr_id_df, left_on="chr_id", right_index=True)
summary_df.to_csv(summary_path)

{'org_name': 'Bacillus subtilis 168', 'essential_genes': 0, 'essential_genes_gtf': 'bacteria/DEG1001_essential.gtf', 'org_DEG_ID': 'DEG1001', 'chr_id': 'NC_000964'}
{'org_name': 'Staphylococcus aureus N315', 'essential_genes': 0, 'essential_genes_gtf': 'bacteria/DEG1002_essential.gtf', 'org_DEG_ID': 'DEG1002', 'chr_id': 'NC_002745'}
{'org_name': 'Vibrio cholerae N16961', 'essential_genes': 0, 'essential_genes_gtf': 'bacteria/DEG1003_essential.gtf', 'org_DEG_ID': 'DEG1003', 'chr_id': 'NC_002505'}
{'org_name': 'Haemophilus influenzae Rd KW20', 'essential_genes': 0, 'essential_genes_gtf': 'bacteria/DEG1005_essential.gtf', 'org_DEG_ID': 'DEG1005', 'chr_id': 'NC_000907'}
{'org_name': 'Mycoplasma genitalium G37', 'essential_genes': 0, 'essential_genes_gtf': 'bacteria/DEG1006_essential.gtf', 'org_DEG_ID': 'DEG1006', 'chr_id': 'NC_000908'}
{'org_name': 'Streptococcus pneumoniae', 'essential_genes': 0, 'essential_genes_gtf': 'bacteria/DEG1007_essential.gtf', 'org_DEG_ID': 'DEG1007', 'chr_id': '

  if self.run_code(code, result):


KeyboardInterrupt: 

In [21]:
# Column names of the essential-gene annotation table. The frame is loaded in
# this cell so it does not depend on a variable left over in the kernel.
# NOTE(review): assumes ess_annot_df was the essential annotation table read
# the same way get_gtfs reads it - confirm.
ess_annot_df = pd.read_csv(essential_annot, sep="\t")
ess_annot_df.columns

Index([u'#DEG_ORG', u'#DEG_AC', u'#Gene_Name', u'#Gene_Ref', u'#COG',
       u'#Class', u'#Function', u'#Organism', u'#Refseq', u'#Condition', u'#-',
       u'#GO', u'#UNIPROT_AC'],
      dtype='object')