In [171]:
import pandas as pd
# One-letter codes of the 20 amino acids, in alphabetical order.
# The position of a letter in this list doubles as its integer code.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

# Lookup table: one-letter code -> integer code.
aa_encode_dict = dict(zip(amino_acids, range(len(amino_acids))))


def aa_encode(aa):
    """Return the integer code for the one-letter amino acid ``aa``.

    Raises ``KeyError`` when ``aa`` is not in ``aa_encode_dict``.
    """
    return aa_encode_dict[aa]


def aa_decode(num):
    """Return the one-letter amino acid stored at index ``num`` of ``amino_acids``."""
    return amino_acids[num]
def read_pssm(path):
    """Parse a PSSM text file into a DataFrame of per-position scores.

    The first two lines of the file are skipped. The first 20 tokens of the
    third line become the score column names. Every following line, up to the
    first blank line, is read as one sequence position: token 1 is the residue
    and tokens 2-21 are its 20 scores.
    NOTE(review): this matches the ASCII PSSM layout written by PSI-BLAST,
    which is presumably the producer of these files -- confirm.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the ``.pssm`` file.

    Returns
    -------
    pandas.DataFrame
        One row per sequence position. The 20 score columns are named after
        the file header and are cast to ``int`` when every score is a whole
        number. The extra ``seq`` column holds the residue of each position
        encoded with the module-level ``aa_encode``.

    Raises
    ------
    StopIteration
        If the file has fewer than two lines (e.g. it is empty).
    ValueError
        If the header or a row is truncated, a score is not numeric, or a
        residue is not one of the amino acids named in the file header.
    """
    from pathlib import Path  # local import keeps the function self-contained

    # Accept plain strings as well as Path objects; the error messages below
    # need ``.name`` / ``.parent``.
    pssm_path = Path(path)

    def incomplete_file_error():
        # A partially written file is what is left behind when the program is
        # interrupted while it is writing.
        return ValueError(
            f"incomplete PSSM file: {pssm_path.name}. "
            f"Delete from folder {pssm_path.parent} and recompute"
        )

    residues = []
    pssm_scores = []
    with open(pssm_path) as pssm_file:
        # Skip the two leading lines; StopIteration escapes on purpose so that
        # callers can detect empty files.
        next(pssm_file)
        next(pssm_file)

        # Named ``header`` so the module-level ``amino_acids`` list is not shadowed.
        header = pssm_file.readline().split()[:20]
        if len(header) != 20:
            raise incomplete_file_error()

        for line in pssm_file:
            values = line.split()
            if not values:  # blank line: end of the matrix, before overall scores
                break

            # Position, residue and 20 scores are required; checking up front
            # turns a truncated row into ValueError instead of IndexError.
            if len(values) < 22:
                raise incomplete_file_error()

            amino_acid = values[1]
            if amino_acid not in header:
                raise ValueError(
                    f"unexpected amino acid in pssm file {pssm_path.name}: {amino_acid}"
                )

            residues.append(amino_acid)
            pssm_scores.append([float(score) for score in values[2:22]])

    df_pssm = pd.DataFrame(data=pssm_scores, columns=header)
    # Store whole-number matrices as integers.
    if (df_pssm % 1 == 0).all(axis=None):
        df_pssm = df_pssm.astype(int)
    df_pssm = df_pssm.assign(seq=pd.Series(residues).apply(aa_encode))
    return df_pssm

In [173]:
import sqlite3 as sl

In [174]:
# TODO rewrite PSSM class with tempfiles
# TODO compress pssm db with xz
# TODO method that creates features
# TODO write to temp file, then rename? That should eliminate problems with incomplete pssm files.

In [175]:
from contextlib import closing
from pathlib import Path

# pssm_folder = "../data/intermediate/blast"
pssm_folder = "/home/ad/blast_db_test/blast"
pssm_db_path = "/home/ad/blast_db_test/pssm.db"

# Load every PSSM file of every (database, iteration count) combination into
# its own SQLite table named "<file stem>_<blast_db>_<n_iterations>it".
# sqlite3's own context manager (the second `con`) only commits or rolls back
# the transaction; closing() is what releases the connection on exit.
with closing(sl.connect(pssm_db_path)) as con, con:
    for blast_db in ["uniref50", "uniref90"]:
        print(blast_db)
        for n_iterations in [1, 3]:
            print(n_iterations)
            path = Path(f"{pssm_folder}/pssm_{blast_db}_{n_iterations}it")
            for pssm_file_path in path.rglob("*.pssm"):
                try:
                    pssm = read_pssm(pssm_file_path)
                    table_name = f"{pssm_file_path.name.split('.')[0]}_{blast_db}_{n_iterations}it"
                    pssm.to_sql(table_name, con, if_exists="replace")
                except StopIteration:
                    # read_pssm skips two header lines with next(); an empty
                    # file therefore surfaces as StopIteration.
                    print(
                        f"StopIteration occurred for file {pssm_file_path} in database {blast_db} with {n_iterations} iterations. File might be empty"
                    )
                except ValueError as e:
                    # Malformed or partially written file: report it and move on.
                    print(e)
                    print(f"file:{pssm_file_path}")


uniref50
1
3
uniref90
1
StopIteration occurred for file /home/ad/blast_db_test/blast/pssm_uniref90_1it/Q46892.pssm in database uniref90 with 1 iterations. File might be empty
3
incomplete PSSM file: Q08967.pssm. Delete from folder Q08967.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/Q08967.pssm
incomplete PSSM file: Q9LEQ3.pssm. Delete from folder Q9LEQ3.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/Q9LEQ3.pssm
incomplete PSSM file: P07658.pssm. Delete from folder P07658.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/P07658.pssm
incomplete PSSM file: P18775.pssm. Delete from folder P18775.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/P18775.pssm
incomplete PSSM file: P17064.pssm. Delete from folder P17064.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/P17064.pssm
incomplete PSSM file: P0ABN9.pssm. Delete from folder P0ABN9.pssm and recompute
file:/home/ad/blast_db_

In [220]:
from sqlite3 import DatabaseError, OperationalError

# def get_pssm(accession, database, n_iterations, con):
#     pssm_prot = pd.read_sql_query(f"select * from {accession}_", con, index_col="index")
#     pssm_prot.seq = pssm_prot.seq.apply(aa_decode)
#     return pssm_prot

# Example table names in the "<accession>_<blast_db>_<n_iterations>it" form
# used when the tables were written. The "_ERROR" entry is presumably a
# deliberately missing table that exercises the not-found branch -- confirm.
accessions = ["Q39253_uniref50_1it","Q39253_uniref50_1it_ERROR"]
def get_pssms(accessions: list, db_path="/home/ad/blast_db_test/pssm.db"):
    """Yield the stored PSSM of every requested table, in request order.

    Parameters
    ----------
    accessions : list
        Table names to look up in the SQLite database.
    db_path : str or os.PathLike, optional
        Location of the SQLite database file. Defaults to the path that was
        previously hard-coded in this function.

    Yields
    ------
    pandas.DataFrame or str
        For an existing table: its contents indexed by the ``index`` column,
        with the ``seq`` column decoded through the module-level
        ``aa_decode``. For a missing table: the string ``"Null"``.
    """
    from contextlib import closing  # local import keeps the function self-contained

    # `with connection:` alone would only commit/roll back; closing() makes
    # sure the connection is released once the generator finishes or is closed.
    with closing(sl.connect(db_path)) as con:
        # TODO SQL query that returns multiple dataframes?
        for accession in accessions:
            # https://stackoverflow.com/questions/1601151/how-do-i-check-in-sqlite-whether-a-table-exists
            # The name is bound as a parameter so that quotes inside it cannot
            # change the meaning of the query.
            table_exists = con.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
                (accession,),
            ).fetchone() is not None

            if not table_exists:
                yield "Null"
                # TODO create and return PSSM
                continue

            # Table names cannot be bound as parameters; quote the identifier
            # and double any embedded quote characters instead.
            quoted_table = '"' + accession.replace('"', '""') + '"'
            pssm_prot = pd.read_sql_query(
                f"select * from {quoted_table}", con, index_col="index"
            )
            pssm_prot.seq = pssm_prot.seq.apply(aa_decode)
            yield pssm_prot

# get_pssms is a generator: list() runs every lookup and shows all results.
list(get_pssms(accessions=accessions))

[       A  R  N  D  C  Q  E  G  H  I  ...  K  M  F  P  S  T  W  Y  V  seq
 index                                ...                                
 0     -1 -2 -3 -4 -2 -1 -2 -3 -2  1  ... -2  7  0 -3 -2 -1 -2 -1  1    M
 1      5 -2 -2 -2 -1 -1 -1  1 -2 -2  ... -1 -2 -3 -1  2  0 -3 -2 -1    A
 2     -1 -3 -1 -2 -3 -3 -3  6 -3 -4  ... -3 -3 -4 -3  1 -2 -4 -4 -4    G
 3      0  0  1 -2 -2 -2 -2  0 -2  3  ... -1 -1 -2 -2  2  2 -3 -2  1    I
 4     -1 -3 -3 -3 -2 -2 -3 -2 -4  2  ... -3 -1 -2  1  0  2 -4 -3  4    V
 ...   .. .. .. .. .. .. .. .. .. ..  ... .. .. .. .. .. .. .. .. ..  ...
 458    2 -2 -1 -1 -2 -1  0  5 -2 -3  ... -1 -2 -3 -2  0 -1 -3 -3 -2    G
 459   -1 -3 -3 -4 -1 -3 -3 -4 -4  3  ... -3  1 -1 -3 -2  0 -3 -1  4    V
 460    1 -3 -3 -3 -2 -2 -3 -2 -2  0  ... -3  0  4 -3 -1 -2  6  2 -1    F
 461    1 -1  0 -1 -1  0  0 -1 -1 -3  ...  0 -2 -3  4  4  1 -3 -2 -2    S
 462    2 -1  0 -1 -1  0  0  0 -1 -2  ...  0 -1 -3 -1  4  1 -3 -2 -1    S
 
 [463 rows x 21 columns],
 'Null']

In [170]:
# Scratch check of the "are all scores whole numbers" test.
# NOTE(review): `pssm_prot` is not assigned at module level in any visible
# cell, so this relies on kernel state from an earlier run -- confirm.
pssm_prot
pssm = pssm_prot.drop(columns="seq")
# A float that is still a whole number should keep the check True.
pssm.loc[0, "A"] = -1.0
pssm.mod(1).eq(0).all(axis=None)

True

In [71]:
# Explicitly close the module-level `con` handle: leaving a
# `with sqlite3.connect(...)` block does not close the connection.
# NOTE(review): this cell's execution count predates the cells above --
# confirm which connection `con` refers to on a fresh kernel.
con.close()