In [81]:
# from pandas.api.types import CategoricalDtype

# amino_acid_type = CategoricalDtype(
#     categories=[
#         "A",
#         "C",
#         "D",
#         "E",
#         "F",
#         "G",
#         "H",
#         "I",
#         "K",
#         "L",
#         "M",
#         "N",
#         "P",
#         "Q",
#         "R",
#         "S",
#         "T",
#         "V",
#         "W",
#         "Y",
#     ],
#     ordered=False,
# )


In [88]:
# The 20 standard amino acids as one-letter codes, in alphabetical order.
# The position of a letter in this list is its integer code.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

# Reverse lookup: one-letter code -> integer code (index into `amino_acids`).
aa_encode_dict = {aa: num for num, aa in enumerate(amino_acids)}


def aa_encode(aa):
    """Return the integer code of the one-letter amino acid ``aa``.

    Raises:
        KeyError: if ``aa`` is not one of the letters in ``amino_acids``.
    """
    return aa_encode_dict[aa]


def aa_decode(num):
    """Return the one-letter amino acid stored at index ``num`` of ``amino_acids``.

    Plain list indexing is used, so negative indices count from the end.

    Raises:
        IndexError: if ``num`` is outside the list.
    """
    return amino_acids[num]

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19}

In [82]:
import pandas as pd
def read_pssm(path):
    """Parse a PSSM text file into a DataFrame.

    The first two lines of the file are skipped. The first 20 whitespace-separated
    tokens of the third line become the score column names. Every following line
    is read as one row: its second token is the residue and tokens 3-22 are the
    20 scores. Reading stops at the first blank line.

    Args:
        path: location of the file, as a ``str`` or a ``pathlib.Path``.

    Returns:
        A DataFrame with one row per residue: 20 float score columns named after
        the header tokens, plus a ``sequence`` column holding the residue letter.

    Raises:
        StopIteration: if the file has fewer than two lines (e.g. it is empty).
        ValueError: if a row has fewer than 22 tokens, if its residue is not one
            of the header tokens, or if a score is not a number.
    """
    # Local import: the notebook imports pathlib only in a later cell.
    from pathlib import Path

    # Accept plain strings too; the error messages below need .name / .parent.
    path = Path(path)
    residues = []
    pssm_scores = []
    with open(path) as pssm_file:
        # Skip the two preamble lines. next() raises StopIteration on an empty
        # file; the bulk-import loop relies on that to report empty files.
        next(pssm_file)
        next(pssm_file)

        header = pssm_file.readline().split()[:20]
        for line in pssm_file:
            values = line.split()
            if not values:  # blank line: end of the matrix, before overall scores
                break

            # Position + residue + 20 scores are required on every row.
            if len(values) < 22:
                raise ValueError(
                    f"incomplete PSSM file: {path.name}. Delete from folder {path.parent} and recompute"
                )

            amino_acid = values[1]
            if amino_acid not in header:
                raise ValueError(
                    f"unexpected amino acid in pssm file {path.name}: {amino_acid}"
                )

            residues.append(amino_acid)
            pssm_scores.append([float(score) for score in values[2:22]])

    df_pssm = pd.DataFrame(data=pssm_scores, columns=header, dtype=float).assign(
        sequence=pd.Series(residues)
    )
    return df_pssm

In [83]:
# Parse a single PSSM file and display its `sequence` column as a quick check.
# NOTE(review): the recorded output reports dtype `category`, but read_pssm as
# defined in this notebook builds the column from a plain list of strings —
# the output looks stale; re-run to confirm.
read_pssm('/home/ad/blast_db_test/blast/pssm_uniref50_1it/A0A059J0G5.pssm').sequence

0       M
1       A
2       S
3       Q
4       P
       ..
1562    I
1563    G
1564    K
1565    I
1566    Y
Name: sequence, Length: 1567, dtype: category
Categories (20, object): ['A', 'C', 'D', 'E', ..., 'T', 'V', 'W', 'Y']

In [84]:
import sqlite3 as sl

In [None]:
# TODO rewrite PSSM class with tempfiles
# TODO StopIteration error? (raised by next() in read_pssm when a .pssm file is empty)
# TODO does first db file have less than 369Mb, now that I am using categoricals? No.
# TODO is sequence column still a categorical, even when reading from sql?
# TODO encode amino acids as numbers and back
# TODO store pssm as int if possible
# TODO two pssm files were empty: Q7RTT9 50_3, Q92887 90_1

In [86]:
from contextlib import closing
from pathlib import Path

# pssm_folder = "../data/intermediate/blast"
pssm_folder = "/home/ad/blast_db_test/blast"

# One SQLite database per (BLAST database, iteration count) combination.
# Inside each database every protein gets its own table, named after the part
# of the .pssm file name before the first dot.
for blast_db in ["uniref50", "uniref90"]:
    for n_iterations in [1, 3]:
        path = Path(f"{pssm_folder}/pssm_{blast_db}_{n_iterations}it")
        # A sqlite3 connection used as a context manager only commits or rolls
        # back; it does not close. closing() makes sure the connection is
        # released before the next database is opened, while the inner `con`
        # context still wraps the writes in a transaction.
        with closing(
            sl.connect(f"/home/ad/blast_db_test/pssm_{blast_db}_{n_iterations}it.db")
        ) as con, con:
            for pssm_file_path in path.rglob("*.pssm"):
                try:
                    pssm = read_pssm(pssm_file_path)
                except StopIteration:
                    # read_pssm runs out of lines while skipping the preamble.
                    print(
                        f"StopIteration occurred for file {pssm_file_path} in database {blast_db} with {n_iterations} iterations. File might be empty"
                    )
                else:
                    # ValueError from read_pssm (malformed file) is deliberately
                    # not caught here and still aborts the run.
                    pssm.to_sql(
                        pssm_file_path.name.split(".")[0], con, if_exists="replace"
                    )


StopIteration occurred for file /home/ad/blast_db_test/blast/pssm_uniref50_3it/Q7RTT9.pssm in database uniref50 with 3 iterations
StopIteration occurred for file /home/ad/blast_db_test/blast/pssm_uniref90_1it/Q92887.pssm in database uniref90 with 1 iterations
StopIteration occurred for file /home/ad/blast_db_test/blast/pssm_uniref90_1it/Q46892.pssm in database uniref90 with 1 iterations


ValueError: incomplete PSSM file: Q08967.pssm. Delete from folder Q08967.pssm and recompute

In [91]:
from contextlib import closing

accessions = ["Q39253"]
# closing() releases the connection when the block ends; a bare
# `with sl.connect(...)` would only commit/roll back and leave it open.
with closing(sl.connect('/home/ad/blast_db_test/pssm_uniref50_1it.db')) as con:
    for accession in accessions:
        # Table names cannot be bound as SQL parameters, so quote the name as
        # an identifier (doubling any embedded quotes) instead of pasting it raw.
        table_name = accession.replace('"', '""')
        # Each iteration rebinds pssm_prot, so only the last accession's table
        # is kept after the loop.
        pssm_prot = pd.read_sql_query(
            f'select * from "{table_name}"', con, index_col="index"
        )
pssm_prot

Unnamed: 0_level_0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,sequence
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.0,-2.0,-3.0,-4.0,-2.0,-1.0,-2.0,-3.0,-2.0,1.0,...,-2.0,7.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,M
1,5.0,-2.0,-2.0,-2.0,-1.0,-1.0,-1.0,1.0,-2.0,-2.0,...,-1.0,-2.0,-3.0,-1.0,2.0,0.0,-3.0,-2.0,-1.0,A
2,-1.0,-3.0,-1.0,-2.0,-3.0,-3.0,-3.0,6.0,-3.0,-4.0,...,-3.0,-3.0,-4.0,-3.0,1.0,-2.0,-4.0,-4.0,-4.0,G
3,0.0,0.0,1.0,-2.0,-2.0,-2.0,-2.0,0.0,-2.0,3.0,...,-1.0,-1.0,-2.0,-2.0,2.0,2.0,-3.0,-2.0,1.0,I
4,-1.0,-3.0,-3.0,-3.0,-2.0,-2.0,-3.0,-2.0,-4.0,2.0,...,-3.0,-1.0,-2.0,1.0,0.0,2.0,-4.0,-3.0,4.0,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,2.0,-2.0,-1.0,-1.0,-2.0,-1.0,0.0,5.0,-2.0,-3.0,...,-1.0,-2.0,-3.0,-2.0,0.0,-1.0,-3.0,-3.0,-2.0,G
459,-1.0,-3.0,-3.0,-4.0,-1.0,-3.0,-3.0,-4.0,-4.0,3.0,...,-3.0,1.0,-1.0,-3.0,-2.0,0.0,-3.0,-1.0,4.0,V
460,1.0,-3.0,-3.0,-3.0,-2.0,-2.0,-3.0,-2.0,-2.0,0.0,...,-3.0,0.0,4.0,-3.0,-1.0,-2.0,6.0,2.0,-1.0,F
461,1.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,-3.0,...,0.0,-2.0,-3.0,4.0,4.0,1.0,-3.0,-2.0,-2.0,S


In [71]:
# Close the connection bound to `con` by an earlier cell.
# NOTE(review): this cell depends on `con` left over in the kernel, and its
# execution count (71) predates the cells above — it will fail on a fresh
# kernel if run before a cell that defines `con`.
con.close()