In [81]:
# from pandas.api.types import CategoricalDtype

# amino_acid_type = CategoricalDtype(
#     categories=[
#         "A",
#         "C",
#         "D",
#         "E",
#         "F",
#         "G",
#         "H",
#         "I",
#         "K",
#         "L",
#         "M",
#         "N",
#         "P",
#         "Q",
#         "R",
#         "S",
#         "T",
#         "V",
#         "W",
#         "Y",
#     ],
#     ordered=False,
# )


In [158]:
from pathlib import Path

import pandas as pd
# The 20 amino-acid one-letter codes in alphabetical order; a residue's
# position in this list is its integer code.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

# Reverse lookup: one-letter code -> integer code.
aa_encode_dict = dict(zip(amino_acids, range(len(amino_acids))))


def aa_encode(aa):
    """Return the integer code of the one-letter amino acid ``aa``.

    Raises ``KeyError`` when ``aa`` is not one of the letters in ``amino_acids``.
    """
    return aa_encode_dict[aa]


def aa_decode(num):
    """Return the one-letter amino acid stored at index ``num`` of ``amino_acids``."""
    return amino_acids[num]
def read_pssm(path):
    """Parse an ASCII PSSM file into a DataFrame of per-position scores.

    The first two lines of the file are skipped and the third line is read as
    the column header (its first 20 whitespace-separated tokens). Every
    following line, up to the first blank line, is parsed as one sequence
    position: token 1 is the residue letter and tokens 2-21 are its 20 scores.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the PSSM file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed position and one column per header token, plus a
        ``seq`` column holding the residue encoded with ``aa_encode``. The
        score columns are cast to ``int`` when every score is a whole number.

    Raises
    ------
    StopIteration
        If the file has fewer than two lines (e.g. it is empty).
    ValueError
        If a row names a residue that is not in the header, has fewer than
        20 scores, or contains a score that cannot be parsed as a float.
    """
    # Path() makes `.name` available for plain-string paths too; using
    # `path.name` directly crashed with AttributeError whenever a str path
    # hit one of the error branches below.
    file_name = Path(path).name
    incomplete_message = (
        f"incomplete PSSM file: {file_name}. Delete from folder {file_name} and recompute"
    )

    residues = []
    pssm_scores = []
    with open(path) as pssm_file:
        # Deliberately next(): a file with fewer than two lines raises
        # StopIteration, which the bulk-import loop catches to report
        # empty files.
        next(pssm_file)
        next(pssm_file)

        header = pssm_file.readline().split()[:20]
        for line in pssm_file:
            values = line.split()
            if not values:  # blank (or whitespace-only) line ends the score table
                break
            if len(values) < 2:  # row truncated before the residue letter
                raise ValueError(incomplete_message)

            amino_acid = values[1]
            if amino_acid not in header:
                raise ValueError(
                    f"unexpected amino acid in pssm file {file_name}: {amino_acid}"
                )

            scores = [float(score) for score in values[2:22]]
            if len(scores) != 20:
                raise ValueError(incomplete_message)

            residues.append(amino_acid)
            pssm_scores.append(scores)

    df_pssm = pd.DataFrame(data=pssm_scores, columns=header)
    # Store whole-number matrices as integers.
    if (df_pssm % 1 == 0).all().all():
        df_pssm = df_pssm.astype(int)
    return df_pssm.assign(seq=pd.Series(residues).apply(aa_encode))

In [159]:
# Parse a single PSSM file and show the resulting DataFrame as the cell output.
pssm = read_pssm('/home/ad/blast_db_test/blast/pssm_uniref50_1it/A0A059J0G5.pssm')
pssm

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,seq
0,-1,-2,-3,-4,-1,-2,-3,-3,-3,2,...,-2,4,0,-3,-2,-1,-2,-1,2,10
1,4,-2,-2,-2,-1,-1,-1,0,-2,-2,...,-1,-1,-3,4,1,0,-3,-2,-1,0
2,2,-1,2,0,-2,-1,-1,-1,-1,-3,...,-1,-2,-3,3,3,0,-3,-2,-2,15
3,-1,0,0,0,-3,5,3,-2,0,-3,...,1,-1,-3,-1,0,1,-3,-2,-2,13
4,-1,-2,-1,-2,-2,-1,-1,-2,-2,-2,...,-1,-2,-3,7,0,3,-4,-3,-2,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,-1,-2,-2,-2,-1,-2,-2,-3,-3,3,...,-2,1,0,-2,-2,0,-2,-1,3,7
1563,0,-1,-1,-1,-2,2,-1,4,-1,-1,...,-1,-1,-3,-2,-1,-1,-3,-2,0,5
1564,-1,4,0,-1,-3,3,1,-2,0,-3,...,4,-1,-3,-1,0,-1,-3,-2,-2,8
1565,-1,-2,-2,-3,-1,-2,-2,-3,-3,3,...,-2,1,-1,-2,-1,2,-3,-1,3,7


In [160]:
import sqlite3 as sl

In [161]:
# TODO rewrite PSSM class with tempfiles

In [167]:
from pathlib import Path

# pssm_folder = "../data/intermediate/blast"
pssm_folder = "/home/ad/blast_db_test/blast"

# Bulk import: every *.pssm file below
# <pssm_folder>/pssm_<blast_db>_<n_iterations>it is parsed and written to its
# own table named <file name up to the first dot>_<blast_db>_<n_iterations>it.
#
# sqlite3's connection context manager only commits / rolls back the
# transaction; it does not close the connection. Close it explicitly so the
# database handle is not leaked once the import is done.
con = sl.connect("/home/ad/blast_db_test/pssm.db")
try:
    with con:
        for blast_db in ["uniref50", "uniref90"]:
            print(blast_db)
            for n_iterations in [1, 3]:
                print(n_iterations)
                path = Path(f"{pssm_folder}/pssm_{blast_db}_{n_iterations}it")
                for pssm_file_path in path.rglob("*.pssm"):
                    try:
                        pssm = read_pssm(pssm_file_path)
                        table_name = f"{pssm_file_path.name.split('.')[0]}_{blast_db}_{n_iterations}it"
                        pssm.to_sql(table_name, con, if_exists="replace")
                    except StopIteration:
                        # read_pssm skips header lines with next(); running
                        # out of lines there surfaces as StopIteration.
                        print(
                            f"StopIteration occurred for file {str(pssm_file_path)} in database {blast_db} with {n_iterations} iterations. File might be empty"
                        )
                    except ValueError as e:
                        # Malformed file: report it and carry on with the rest.
                        print(e)
                        print(f"file:{str(pssm_file_path)}")
finally:
    con.close()


uniref50
1


In [166]:
# Read stored PSSM tables back and decode the integer `seq` column to letters.
# Only the table of the last entry in `accessions` is kept in `pssm_prot`.
accessions = ["Q39253_uniref50_1it"]

# The sqlite3 connection context manager does not close the connection, so
# it is closed explicitly here instead of being left open.
con = sl.connect('/home/ad/blast_db_test/pssm.db')
try:
    for accession in accessions:
        # Table names cannot be bound as SQL parameters; double-quote the
        # identifier so names with unusual characters still resolve.
        pssm_prot = pd.read_sql_query(
            f'select * from "{accession}"', con, index_col="index"
        )
finally:
    con.close()

pssm_prot["seq"] = pssm_prot["seq"].apply(aa_decode)
pssm_prot

Unnamed: 0_level_0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,seq
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1,-2,-3,-4,-2,-1,-2,-3,-2,1,...,-2,7,0,-3,-2,-1,-2,-1,1,M
1,5,-2,-2,-2,-1,-1,-1,1,-2,-2,...,-1,-2,-3,-1,2,0,-3,-2,-1,A
2,-1,-3,-1,-2,-3,-3,-3,6,-3,-4,...,-3,-3,-4,-3,1,-2,-4,-4,-4,G
3,0,0,1,-2,-2,-2,-2,0,-2,3,...,-1,-1,-2,-2,2,2,-3,-2,1,I
4,-1,-3,-3,-3,-2,-2,-3,-2,-4,2,...,-3,-1,-2,1,0,2,-4,-3,4,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,2,-2,-1,-1,-2,-1,0,5,-2,-3,...,-1,-2,-3,-2,0,-1,-3,-3,-2,G
459,-1,-3,-3,-4,-1,-3,-3,-4,-4,3,...,-3,1,-1,-3,-2,0,-3,-1,4,V
460,1,-3,-3,-3,-2,-2,-3,-2,-2,0,...,-3,0,4,-3,-1,-2,6,2,-1,F
461,1,-1,0,-1,-1,0,0,-1,-1,-3,...,0,-2,-3,4,4,1,-3,-2,-2,S


In [149]:
# Check of the whole-number test used in read_pssm: once a single score is
# made fractional, the expression must evaluate to False.
# The residue column is named "seq" (not "sequence"); it is dropped so only
# score columns remain. The scores are cast to float first so that writing
# -1.1 does not depend on pandas silently upcasting an integer column.
pssm = pssm_prot.drop(columns="seq").astype(float)
pssm.loc[0, "A"] = -1.1
(pssm % 1 == 0).all().all()

False

In [71]:
# Close the sqlite connection currently bound to `con`.
# NOTE(review): `con` is bound by the `with sl.connect(...) as con` cells;
# presumably this manual close is here because that `with` block does not
# close the connection itself — confirm against the sqlite3 documentation.
con.close()