In [None]:
import sqlite3 as sl
import pandas as pd
from pathlib import Path
import subprocess
from subpred.fasta import write_fasta

In [171]:
amino_acids = [
    "A",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "K",
    "L",
    "M",
    "N",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "V",
    "W",
    "Y",
]
aa_encode_dict = {aa: num for num, aa in enumerate(amino_acids)}
aa_encode = lambda aa: aa_encode_dict[aa]
aa_decode = lambda num: amino_acids[num]
def read_pssm(path):
    header = ""
    sequence=""
    pssm_scores = []
    with open(path) as pssm_file:
        next(pssm_file)
        next(pssm_file)

        amino_acids = pssm_file.readline().strip().split()[:20]
        header = amino_acids
        for line in pssm_file:
            if line == "\n":  # end of file, before overall scores
                break

            values = line.strip().split()
            amino_acid = values[1]
            if amino_acid not in amino_acids:
                raise ValueError(
                    f"unexpected amino acid in pssm file {path.name}: {amino_acid}"
                )

            scores = [float(score) for score in values[2:22]]
            # scores = [int(score) if float.is_integer(score) else score for score in scores]

            if len(scores) != 20:
                # This happens when the program is interupted while it is writing.
                raise ValueError(
                    f"incomplete PSSM file: {path.name}. Delete from folder {path.name} and recompute"
                )

            sequence += amino_acid
            pssm_scores.append(scores)
    df_pssm = pd.DataFrame(data=pssm_scores, columns=header)
    if (df_pssm % 1 == 0).all(axis=None):
        df_pssm = df_pssm.astype(int)
    df_pssm = df_pssm.assign(seq=pd.Series(list(sequence)).apply(aa_encode))
    return df_pssm

In [174]:
# TODO rewrite PSSM class with tempfiles
# TODO compress pssm db with xz
# TODO method that creates features
# TODO write to temp file, then rename? That should eliminate problems with incomplete pssm files.

In [None]:
def add_pssm_sql(pssm_file_path:str, table_name:str, con:sl.Connection):
    try:
        pssm = read_pssm(pssm_file_path)
        # table_name = f"{pssm_file_path.name.split('.')[0]}_{blast_db}_{n_iterations}it"
        pssm.to_sql(
            table_name, con, if_exists="replace"
        )
    except StopIteration:
        print(
            f"StopIteration occurred for file {str(pssm_file_path)}. File might be empty"
        )
    except ValueError as e:
        print(e)
        print(f"ValueError occured for file:{str(pssm_file_path)}. File might be incomplete.")

In [175]:

# pssm_folder = "../data/intermediate/blast"
pssm_folder = "/home/ad/blast_db_test/blast"

with sl.connect(
    f"/home/ad/blast_db_test/pssm.db"
) as con:
    for blast_db in ["uniref50", "uniref90"]:
        print(blast_db)
        for n_iterations in [1, 3]:
            print(n_iterations)
            path = Path(f"{pssm_folder}/pssm_{blast_db}_{n_iterations}it")
            for pssm_file_path in path.rglob("*.pssm"):
                try:
                    pssm = read_pssm(pssm_file_path)
                    table_name = f"{pssm_file_path.name.split('.')[0]}_{blast_db}_{n_iterations}it"
                    pssm.to_sql(
                        table_name, con, if_exists="replace"
                    )
                except StopIteration:
                    print(
                        f"StopIteration occurred for file {str(pssm_file_path)} in database {blast_db} with {n_iterations} iterations. File might be empty"
                    )
                except ValueError as e:
                    print(e)
                    print(f"file:{str(pssm_file_path)}")


uniref50
1
3
uniref90
1
StopIteration occurred for file /home/ad/blast_db_test/blast/pssm_uniref90_1it/Q46892.pssm in database uniref90 with 1 iterations. File might be empty
3
incomplete PSSM file: Q08967.pssm. Delete from folder Q08967.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/Q08967.pssm
incomplete PSSM file: Q9LEQ3.pssm. Delete from folder Q9LEQ3.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/Q9LEQ3.pssm
incomplete PSSM file: P07658.pssm. Delete from folder P07658.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/P07658.pssm
incomplete PSSM file: P18775.pssm. Delete from folder P18775.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/P18775.pssm
incomplete PSSM file: P17064.pssm. Delete from folder P17064.pssm and recompute
file:/home/ad/blast_db_test/blast/pssm_uniref90_3it/P17064.pssm
incomplete PSSM file: P0ABN9.pssm. Delete from folder P0ABN9.pssm and recompute
file:/home/ad/blast_db_

In [None]:



def create_pssm_file(
    accession: str,
    sequence: str,
    psiblast_location: str,
    fasta_file_name: str,
    pssm_file_name: str,
    blastdb_location: str,
    iterations: int,
    evalue: float = 0.002,
    threads: int = 1,
) -> None:
    write_fasta(
        fasta_file_name=fasta_file_name, fasta_data=[(">" + accession, sequence)]
    )

    log_file_name = f"{pssm_file_name}.log"
    subprocess.run(
        "{} -query {} -db {} -num_iterations {} -inclusion_ethresh {} -num_threads {} -save_pssm_after_last_round\
             -out_ascii_pssm {} -out {} -comp_based_stats {}".format(
            psiblast_location,
            fasta_file_name,
            blastdb_location,
            iterations,
            evalue,
            threads,
            pssm_file_name,
            log_file_name,
            2
            if iterations == 1
            else 1,  # default is 2, but not supported when matrix is PSSM instead of BLOSUM
        ),
        check=True,
        shell=True  # False if platform.system() == "Windows" else True,
        # stdout=subprocess.DEVNULL,
        # stderr=subprocess.DEVNULL,
    )


In [3]:
def get_pssms(table_names:list, sql_db_path:str = '/home/ad/blast_db_test/pssm.db'):
    with sl.connect(sql_db_path) as con:
        for table_name in table_names:
            # https://stackoverflow.com/questions/1601151/how-do-i-check-in-sqlite-whether-a-table-exists
            # TODO better solution?
            res = con.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';")
            fetch = res.fetchone()
            if fetch is not None:
                print("Protein was found in SQL database! Reading...")
                pssm_prot = pd.read_sql_query(f"select * from {table_name}", con, index_col="index")
                pssm_prot.seq = pssm_prot.seq.apply(aa_decode)
                yield pssm_prot
            else:
                print("Protein was not found in SQL database. Calling psiblast...")


                yield "Null"
                # TODO create and return PSSM

accessions = ["Q39253_uniref50_1it","Q39253_uniref50_1it_ERROR"]
list(get_pssms(accessions))

NameError: name 'sl' is not defined

In [170]:
pssm_prot
pssm = pssm_prot.drop("seq", axis=1)
pssm.loc[0,"A"] = -1.0
(pssm % 1 == 0).all(axis=None)

True

In [71]:
con.close()