In [50]:
import pandas as pd
import numpy as np
import re

In [51]:
# Load the genome identifiers: every whitespace-separated token found in
# genomes.txt becomes one entry of the `genomes` list.
with open("genomes.txt") as handle:
    genomes = handle.read().split()

In [56]:
def clean_pfam_id(pfam_annot):
    """Extract the bare identifier from a bracketed annotation string.

    Everything up to and including the last ``[`` is dropped (the match is
    greedy), then every ``]`` is removed, so ``"Some domain [PF00175.24]"``
    becomes ``"PF00175.24"``.  A string without brackets is returned as is.

    Parameters
    ----------
    pfam_annot : str
        One annotation entry.

    Returns
    -------
    str
        The text that was enclosed in the final pair of brackets.
    """
    # Raw string literals: "\[" and "\]" in a plain literal are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    without_prefix = re.sub(r".*\[", "", pfam_annot)
    return re.sub(r"\]", "", without_prefix)

def get_new_annots(df):
    """Build one consolidated annotation per row of an annotation table.

    The value chosen for a row depends on its ``rank``:

    * ``"C"``: the row's ``ko_id`` value, unchanged.
    * ``"D"``: the identifier(s) extracted from ``pfam_hits`` with
      ``clean_pfam_id``; a list when the field holds several ``;``-separated
      entries, a single string otherwise, and NaN when the field is missing.
    * any other rank: NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``rank`` column; ``ko_id`` is read only for rank "C" rows and
        ``pfam_hits`` only for rank "D" rows.

    Returns
    -------
    list
        One entry per row of ``df``, in row order.
    """
    new_annots = []
    for _, row in df.iterrows():
        rank = row["rank"]

        new_annot = np.nan
        if rank == "C":
            new_annot = row["ko_id"]
        elif rank == "D":
            pfam_hits = row["pfam_hits"]
            # A missing field must stay NaN; calling str() on it would yield
            # the literal text "nan" and masquerade as a real annotation.
            if not pd.isna(pfam_hits):
                pfam_annot = str(pfam_hits)
                if ";" in pfam_annot:
                    new_annot = [clean_pfam_id(i) for i in pfam_annot.split(";")]
                else:
                    new_annot = clean_pfam_id(pfam_annot)
        new_annots.append(new_annot)
    return new_annots

In [89]:
# For every genome, count how many annotated genes match each marker below.
# `fin` ends up with one row per genome (in `genomes` order) and one count per
# entry of `genes` (in insertion order).

def _count_matches(annotations, target):
    """Count the annotations that match one marker definition.

    Parameters
    ----------
    annotations : dict
        Maps a gene identifier to its consolidated annotation: a string, a
        list of strings, or NaN.
    target : str or list of str
        A string must equal the annotation exactly.  A list matches every
        list annotation holding the same set of identifiers, in any order.

    Returns
    -------
    int
        Number of matching entries in ``annotations``.

    Raises
    ------
    TypeError
        If ``target`` is neither a string nor a list; silently reusing a
        previous count would corrupt the result table.
    """
    if isinstance(target, list):
        wanted = set(target)
        # Only list annotations can carry several identifiers; a plain string
        # must not be compared as a set of its characters.
        return sum(
            1 for annot in annotations.values()
            if isinstance(annot, list) and set(annot) == wanted
        )
    if isinstance(target, str):
        return sum(1 for annot in annotations.values() if annot == target)
    raise TypeError(f"unsupported marker definition: {target!r}")


fin = []

# methane oxidation
# Each value is compared against the `new_id` annotation: a single identifier
# or, for multi-domain hits, the full list of identifiers.
genes = {
    "mmoX"       : "K16157",
    "mmoY"       : "K16158",
    "mmoB"       : "K16160",
    "mmoZ"       : "K16159",
    "mmoZ PFAM"  : "PF02964.19",
    "mmoD"       : "K16162",
    "mmoC"       : "K16161",
    "mmoC PFAM"  : ["PF00175.24", "PF00970.27", "PF00111.30"]
}

# Alternative marker sets kept for reference:
# genes = ["K10944", "K10945", "K10946"] # pMMO-AMO
# genes = ["K16157", "K16158", "K16159", "K16160", "K16161", "K16162", "K10944", "K10945", "K10946"] # ox
# genes = ["K14028", "K16254", "K16255", "K14029", "K16256", "K16257", "K16258", "K16259", "K16260", "K23995"] # mdh, xa

for genome in genomes:
    df = pd.read_csv(f"input_folder/{genome}/annotations.tsv", sep = "\t", index_col = 0)
    # sort_index, unlike .loc with a sorted label list, does not multiply rows
    # when a label repeats; mergesort keeps repeated labels in file order.
    df = df.sort_index(kind = "mergesort")
    df["new_id"] = get_new_annots(df)
    d = df["new_id"].to_dict()

    glist = [_count_matches(d, gene) for gene in genes.values()]
    fin.append(glist)

In [90]:
# Raw hit counts: one inner list per genome (in `genomes` order), holding one
# count per entry of `genes`.
fin

[[0, 0, 0, 0, 0, 0, 0, 2],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 2],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 

In [91]:
# String form of every key of `genes`, in insertion order.
genes2 = list(map(str, genes))

In [97]:
# Summary table: genomes as rows, one column per marker label.
fin2 = pd.DataFrame(fin, index = genomes, columns = list(genes.keys()))

# Base labels ("mmoZ", ...) that also have a companion "<label> PFAM" column.
pfam_annots = [label.split()[0] for label in fin2.columns if "PFAM" in label]

for base in pfam_annots:
    pfam_col = f"{base} PFAM"
    # Keep the larger of the two counts under the base label, then discard
    # the companion PFAM column.
    fin2[base] = fin2[[base, pfam_col]].max(axis = "columns")
    fin2 = fin2.drop(columns = pfam_col)

# Show absent markers as "X" instead of a zero count.
fin2 = fin2.replace(0, "X")
fin2.head()

Unnamed: 0,mmoX,mmoY,mmoB,mmoZ,mmoD,mmoC
GCA_001898965.1,X,X,X,X,X,2
GCA_002412985.1,X,X,X,X,X,X
GCA_002862095.1,X,X,X,X,X,1
GCA_002890675.1,X,X,X,X,X,X
GCA_002929055.1,X,X,X,X,X,X


In [98]:
# Copy the summary table to the system clipboard.
fin2.to_clipboard()

In [96]:
# Spot check on a single genome: print every gene whose annotation is exactly
# the three-identifier combination below (in any order).
genome = "GCF_900104955.1"
df = pd.read_csv(f"input_folder/{genome}/annotations.tsv", sep = "\t", index_col = 0)
# sort_index, unlike .loc with a sorted label list, does not multiply rows
# when a label repeats; mergesort keeps repeated labels in file order.
df = df.sort_index(kind = "mergesort")
df["new_id"] = get_new_annots(df)
d = df["new_id"].to_dict()

target = {"PF00175.24", "PF00970.27", "PF00111.30"}
for gene_id, annot in d.items():
    # Only list annotations can hold several identifiers; a plain string must
    # not be compared as a set of its characters.
    if isinstance(annot, list) and set(annot) == target:
        print(gene_id, annot)

GCF_90010495_01080 ['PF00175.24', 'PF00111.30', 'PF00970.27']
GCF_90010495_12235 ['PF00970.27', 'PF00175.24', 'PF00111.30']
GCF_90010495_21000 ['PF00970.27', 'PF00175.24', 'PF00111.30']
GCF_90010495_25505 ['PF00970.27', 'PF00175.24', 'PF00111.30']
