In [1]:
import pandas as pd
import numpy as np
import glob

In [79]:
def clean_column_ids(df, col):
    """Return column ``col`` of ``df`` with every ID cut down to its second ``|`` field.

    Each value is split on ``"|"`` and the piece at position 1 is kept, e.g.
    ``"sp|P12345|NAME"`` becomes ``"P12345"``. ``df`` is not modified; the
    result is a new Series aligned with ``df``'s index.

    Raises:
        IndexError: if a value contains no ``"|"``.
    """
    def _second_pipe_field(raw_id):
        # Position 1 is the field between the first and second "|".
        return raw_id.split("|")[1]

    return df[col].map(_second_pipe_field)

def parse_fasta(protein_fasta):
    """Split the text of one FASTA record into its header line and its sequence.

    Args:
        protein_fasta: Text of a single record: the header line followed by
            zero or more sequence lines. Nothing is stripped from the start of
            the header, so a leading ``>`` (if present) is kept.

    Returns:
        tuple[str, str]: ``(header, sequence)``. ``header`` is the first line
        (without a trailing carriage return); ``sequence`` is every following
        line concatenated with all whitespace removed.
    """
    # partition() cuts at the first newline only, so the text is scanned once
    # and a record with no newline simply yields an empty sequence.
    header, _, body = protein_fasta.partition("\n")
    # split() with no argument discards every run of whitespace, so "\r" left
    # over from CRLF input and stray spaces never end up inside the sequence
    # (which would otherwise inflate any length computed from it).
    sequence = "".join(body.split())
    return (header.rstrip("\r"), sequence)

In [81]:
def read_fasta(path):
    """Read a FASTA file into a DataFrame with one row per record.

    Records are taken to start at each line beginning with ``>``; any text
    before the first such line is discarded. For every record the header is
    reduced to its second ``|``-separated field, which becomes the row label.

    Args:
        path: Location of the FASTA file to read.

    Returns:
        pandas.DataFrame: indexed by the extracted ID (index left unnamed),
        with columns ``"str"`` (the sequence) and ``"seqlen"`` (its length).

    Raises:
        IndexError: if a record header contains no ``"|"``.
    """
    with open(path, "r") as handle:
        # Prefixing a newline lets the very first record be cut on "\n>" the
        # same way as all later ones; element 0 is whatever preceded it.
        records = ("\n" + handle.read()).split("\n>")[1:]

    rows = []
    for record in records:
        header, residues = parse_fasta(record)
        accession = header.split("|")[1]
        rows.append((accession, residues, len(residues)))

    table = pd.DataFrame(rows, columns=["seq", "str", "seqlen"])
    table = table.set_index("seq")
    # Drop the index name so the frame shows no "seq" label above the IDs.
    return table.rename_axis(index=None)

def read_family(path, drop_duplicates=True):
    """Load a whitespace-delimited family-hit table, indexed by sequence ID.

    The file is read without a header row; anything from ``#`` to the end of a
    line is ignored. Fields 0, 2, 4, 5, 6 and 17 are kept and named ``family``,
    ``seq``, ``f_eval``, ``f_score``, ``f_bias`` and ``n_dom``. The ``seq``
    values are reduced to their second ``|``-separated field and used as index.
    NOTE(review): the input is presumably hmmscan tabular output; confirm that
    field 17 really is the value meant by ``n_dom``.

    Args:
        path: Location of the table to read.
        drop_duplicates: When true, keep only the row with the highest
            ``f_score`` for each sequence ID.

    Returns:
        pandas.DataFrame: indexed by ``seq`` with columns ``family``,
        ``f_eval``, ``f_score``, ``f_bias`` and ``n_dom``. A file with no data
        rows yields an empty frame with that same index name and columns.
    """
    columns = ["family", "seq", "f_eval", "f_score", "f_bias", "n_dom"]
    try:
        hits = pd.read_csv(path, delimiter=r"\s+", comment="#",
                           usecols=[0, 2, 4, 5, 6, 17], header=None)
    except pd.errors.EmptyDataError:
        # Index the empty frame by "seq" too, so callers get the same layout
        # (5 columns, "seq" index) whether or not the file contained any hits.
        return pd.DataFrame(columns=columns).set_index("seq")

    hits.columns = columns
    hits["seq"] = clean_column_ids(hits, "seq")
    if drop_duplicates:
        # Sorting best-score-first makes keep="first" retain the top hit per ID.
        hits = (hits.sort_values(by="f_score", ascending=False)
                    .drop_duplicates(subset=["seq"], keep="first"))
    return hits.set_index("seq")

In [76]:
def check_cluster(df):
    """Map every index label of ``df`` to the list of its ``family`` values.

    Args:
        df: DataFrame with a ``family`` column; the index may contain repeated
            labels (one row per hit).

    Returns:
        dict: ``{label: [family, ...]}``. Labels appear in order of first
        occurrence and each list keeps the row order of ``df``. A label with a
        single row still maps to a one-element list.
    """
    domains = {}
    # One pass over (label, family) pairs instead of a .loc lookup per label.
    # This also avoids branching on the value's type, so a single-row label
    # whose family is not a str (e.g. parsed as a number or NaN) is handled
    # like any other. tolist() yields plain Python scalars.
    for seq_id, family in zip(df.index, df["family"].tolist()):
        domains.setdefault(seq_id, []).append(family)
    return domains

def find_multidomain_clusters(path):
    """Parse every family table matching the glob pattern ``path``.

    Matching files are visited in sorted order and each is read with
    ``read_family(..., drop_duplicates=False)``.

    NOTE(review): the parsed tables are not inspected and nothing is ever
    added to the result, so this currently always returns an empty list. The
    selection logic looks unfinished; confirm the intended criterion.

    Args:
        path: Glob pattern selecting the files to read.

    Returns:
        list: always empty in the current implementation.
    """
    md_clusters = []
    for fam_file in sorted(glob.glob(path)):
        # Each file is still parsed, so a malformed one raises here.
        read_family(fam_file, drop_duplicates=False)
    return md_clusters

# md_clusters = find_multidomain_clusters("./../out/soybean.30.multi-step/hmmscan/cluster_*.out")
# len(md_clusters)

In [92]:
# Load every hit of cluster 4 (duplicates kept, so a sequence ID may repeat in the index).
df = read_family("./../out/soybean.30.multi-step/hmmscan/cluster_4.out", drop_duplicates=False)
# NOTE(review): the mapping returned here is neither stored nor displayed;
# only the last expression of the cell (df.shape) is shown as output.
check_cluster(df)
# (rows, columns) of the loaded table.
df.shape

(59, 5)