In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import glob
import numpy as np
import time
from sklearn import preprocessing
import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
warnings.filterwarnings("ignore")
# Module-level MinMaxScaler instance; load_all() refits it on each score column
# when called with normalise=True.
scaler = preprocessing.MinMaxScaler()


In [2]:
def time_ms():
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    seconds_now = time.time()
    return round(seconds_now * 1000)

def clean_column_ids(df, col):
    """Return column ``col`` of ``df`` with every value cut down to its second "|" field.

    Each value is split on "|" and the element at position 1 is kept, so
    "a|b|c" becomes "b". ``df`` itself is not modified; a new Series with the
    same index is returned. A value without a "|" raises ``IndexError``.
    """
    def second_field(raw_id):
        return raw_id.split("|")[1]

    return df[col].map(second_field)

def parse_fasta(protein_fasta):
    """Split one FASTA record into its header line and its sequence.

    The first line of ``protein_fasta`` is returned unchanged as the
    identifier; every following line is concatenated (newlines removed) to
    form the sequence.

    Returns a ``(header, sequence)`` tuple of strings.
    """
    header, *body_lines = protein_fasta.split("\n")
    return (header, ''.join(body_lines))

In [3]:
def read_score(path):
    """Load a whitespace-delimited hit table into a DataFrame indexed by sequence id.

    Fields 0, 2, 4, 5 and 6 of every non-comment line are read and named
    ``sequence``, ``reference``, ``eval``, ``score`` and ``bias``. Rows are
    ordered by descending ``score``; ``sequence`` is reduced to its second
    "|"-separated field and becomes the index.

    A file holding no data rows (for example only "#" comment lines) yields an
    empty frame with the same columns and index name instead of raising
    ``pandas.errors.EmptyDataError``, matching how ``read_family`` treats
    result files without hits.

    Parameters
    ----------
    path : str
        Path of the result file (load_all passes hmmsearch / phmmer outputs).

    Returns
    -------
    pandas.DataFrame
        Columns ``reference``, ``eval``, ``score``, ``bias``; index ``sequence``.
    """
    columns = ["sequence", "reference", "eval", "score", "bias"]
    try:
        df = pd.read_csv(path, delimiter=r"\s+", comment="#", usecols=[0, 2, 4, 5, 6], header=None)
    except pd.errors.EmptyDataError:
        # No data rows at all: hand back the regular schema with zero rows.
        return pd.DataFrame(columns=columns).set_index("sequence")
    df.columns = columns
    # sort_values returns a new frame, so the cleaned ids below are written
    # to our own copy rather than to a view of the parsed data.
    df = df.sort_values(by=["score"], ascending=False)
    df["sequence"] = clean_column_ids(df, "sequence")
    return df.set_index("sequence")
def read_family(path):
    """Load a whitespace-delimited family hit table into a DataFrame indexed by ``seq``.

    Fields 0, 2, 4, 5, 6 and 17 of every non-comment line are read and named
    ``family``, ``seq``, ``f_eval``, ``f_score``, ``f_bias`` and ``n_dom``.
    ``seq`` is reduced to its second "|"-separated field and becomes the
    index. Rows sharing the same ``seq`` are all kept.

    A file without data rows yields an empty frame with exactly the same
    layout as a populated one (``seq`` as index, the other five names as
    columns), so callers can join or concatenate both cases uniformly.

    Parameters
    ----------
    path : str
        Path of the result file.

    Returns
    -------
    pandas.DataFrame
        Columns ``family``, ``f_eval``, ``f_score``, ``f_bias``, ``n_dom``;
        index ``seq``.
    """
    columns = ["family", "seq", "f_eval", "f_score", "f_bias", "n_dom"]
    try:
        df = pd.read_csv(path, delimiter=r"\s+", comment="#", usecols=[0, 2, 4, 5, 6, 17], header=None)
    except pd.errors.EmptyDataError:
        # Same schema as the populated case: "seq" is the index, not a column.
        return pd.DataFrame(columns=columns).set_index("seq")
    df.columns = columns
    df["seq"] = clean_column_ids(df, "seq")
    return df.set_index("seq")
def read_len(path):
    """Read a FASTA file and tabulate each record's sequence and its length.

    The file content is cut into records at every line starting with ">".
    For each record the header's second "|"-separated field is used as the
    row label, and the concatenated sequence lines plus their length are
    stored in the ``str`` and ``seqlen`` columns.

    Returns a DataFrame with columns ``str`` and ``seqlen`` whose index holds
    the ids and carries no name.
    """
    with open(path, "r") as handle:
        # The leading "\n" lets the first record be split off like all others.
        records = ("\n" + handle.read()).split("\n>")[1:]

    rows = []
    for record in records:
        header, residues = parse_fasta(record)
        accession = header.split("|")[1]
        rows.append((accession, residues, len(residues)))

    table = pd.DataFrame(rows, columns=["seq", "str", "seqlen"])
    return table.set_index("seq").rename_axis(index=None)
    

In [44]:
def load_all(path, normalise=False):
    """Load and combine the hmmsearch and phmmer score tables of a result set.

    ``path`` is a glob pattern containing the placeholder ``$$$``, which is
    replaced by ``hmmsearch`` and by ``phmmer`` to locate the two groups of
    result files. Files are paired by file name, so a result present in only
    one of the two directories is skipped (with a warning) instead of shifting
    every following pair out of alignment.

    For each pair both tables are read with ``read_score``, their
    ``eval`` / ``score`` / ``bias`` columns are prefixed with ``hmms_`` or
    ``phmm_``, and they are joined column-wise on the sequence index. The
    phmmer ``reference`` column is dropped so only one ``reference`` remains.

    Parameters
    ----------
    path : str
        Glob pattern with a ``$$$`` placeholder, e.g. ``"out/run/$$$/*.out"``.
    normalise : bool, default False
        When true, ``phmm_score`` and ``hmms_score`` are rescaled per file
        pair with the module-level ``scaler``.

    Returns
    -------
    pandas.DataFrame
        All per-reference tables stacked row-wise, in file-name order.

    Raises
    ------
    ValueError
        If no file name is present in both the hmmsearch and phmmer globs.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    def files_by_name(pattern):
        # Map bare file name -> full path for everything the pattern matches.
        return {os.path.basename(found): found for found in glob.glob(pattern)}

    hmms_files = files_by_name(path.replace("$$$", "hmmsearch"))
    phmm_files = files_by_name(path.replace("$$$", "phmmer"))

    paired_names = sorted(hmms_files.keys() & phmm_files.keys())
    unpaired_names = sorted(hmms_files.keys() ^ phmm_files.keys())
    if unpaired_names:
        warnings.warn(
            f"skipping {len(unpaired_names)} result file(s) without a "
            f"hmmsearch/phmmer counterpart: {unpaired_names}"
        )
    if not paired_names:
        raise ValueError(f"no hmmsearch/phmmer result pairs matched pattern {path!r}")

    dfs = []
    for name in paired_names:
        hmms_df = read_score(hmms_files[name]).rename(
            columns={"score": "hmms_score", "eval": "hmms_eval", "bias": "hmms_bias"}
        )
        phmm_df = (
            read_score(phmm_files[name])
            .rename(columns={"score": "phmm_score", "eval": "phmm_eval", "bias": "phmm_bias"})
            .drop(columns=["reference"])
        )
        df = pd.concat([hmms_df, phmm_df], axis=1)
        if normalise:
            for score_col in ("phmm_score", "hmms_score"):
                # fit_transform needs a 2-D input; flatten the result back to
                # one dimension before storing it as a column.
                scaled = scaler.fit_transform(df[score_col].values.reshape(-1, 1))
                df[score_col] = scaled.ravel()
        dfs.append(df)
    return pd.concat(dfs, axis=0)


In [45]:
# Load the combined score tables of the four result sets, in this order:
# plain, ".id", ".hl" and ".s".
df, ide, hl, sim = (
    load_all(pattern)
    for pattern in (
        "./../out/uniprot_sprot.90.multi-step/$$$/*.out",
        "./../out/uniprot_sprot.90.id.multi-step/$$$/*.out",
        "./../out/uniprot_sprot.90.hl.multi-step/$$$/*.out",
        "./../out/uniprot_sprot.90.s.multi-step/$$$/*.out",
    )
)

In [57]:
# hmmsearch scores for A1E9T8 from the ".hl" result set, ordered by sequence id.
d1 = (
    read_score("./../out/uniprot_sprot.90.hl.multi-step/hmmsearch/A1E9T8.out")
    .rename(columns={"score": "hmms_score", "eval": "hmms_eval", "bias": "hmms_bias"})
    .sort_index()
)

In [59]:
# phmmer scores for A1E9T8 from the ".hl" result set, ordered by sequence id.
d2 = (
    read_score("./../out/uniprot_sprot.90.hl.multi-step/phmmer/A1E9T8.out")
    .rename(columns={"score": "phmm_score", "eval": "phmm_eval", "bias": "phmm_bias"})
    .sort_index()
)


FileNotFoundError: [Errno 2] No such file or directory: './../out/uniprot_sprot.90.hl.multi-step/phmmer/A1E9S5.out'

In [56]:
# Side-by-side view of the hmmsearch and phmmer scores per sequence.
# d2's own "reference" column is dropped (as load_all does for the phmmer
# table) so the result has a single, unambiguous "reference" column.
pd.concat([d1, d2.drop(columns=["reference"], errors="ignore")], axis=1)

Unnamed: 0_level_0,reference,hmms_eval,hmms_score,hmms_bias,phmm_eval,phmm_score,phmm_bias
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A0A336,A1E9S5,5.500000e-119,385.9,1.9,1.900000e-106,345.1,1.1
A0ZZ36,A1E9S5,2.100000e-120,390.5,2.8,5.500000e-109,353.4,1.4
A1E9J2,A1E9S5,5.400000e-118,382.6,2.3,1.500000e-113,368.3,1.4
A1E9S5,A1E9S5,4.600000e-116,376.4,2.6,2.000000e-119,387.4,1.4
A1EA09,A1E9S5,8.400000e-118,382.0,2.3,2.000000e-114,371.1,1.3
...,...,...,...,...,...,...,...
Q7YJX1,A1E9S5,2.100000e-118,384.0,2.9,1.900000e-108,351.6,1.3
Q8S8X3,A1E9S5,4.700000e-120,389.4,2.6,3.800000e-108,350.7,1.4
Q9BBT4,A1E9S5,1.200000e-119,388.0,2.2,1.900000e-109,355.0,1.2
Q9M3M5,A1E9S5,4.200000e-115,373.2,2.4,5.500000e-104,337.2,1.5
