In [16]:
import os
import sqlite3 as sl
import subprocess
from pathlib import Path

import pandas as pd

from subpred.fasta import write_fasta
from subpred.pssm import __process_pssm_file, calculate_pssm_feature

In [2]:
# One-letter codes of the 20 standard amino acids in alphabetical order.
# A residue's position in this list is its integer code.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
aa_encode_dict = dict(zip(amino_acids, range(len(amino_acids))))


def aa_encode(aa):
    """Return the integer code of a one-letter amino acid (KeyError if unknown)."""
    return aa_encode_dict[aa]


def aa_decode(num):
    """Return the one-letter amino acid stored at position ``num``."""
    return amino_acids[num]
def read_pssm(path):
    """Parse an ASCII PSSM file into a DataFrame.

    The first two lines of the file are skipped, the third line supplies the
    20 score column names, and every following line up to the first blank
    line is read as ``<position> <residue> <20 scores> ...``.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the PSSM file.

    Returns
    -------
    pandas.DataFrame
        One row per residue. The 20 score columns are named after the file's
        header; a final ``seq`` column holds the residue encoded with
        ``aa_encode``. Scores are cast to ``int`` when all of them are whole
        numbers.

    Raises
    ------
    StopIteration
        If the file has fewer than two lines (e.g. it is empty).
    ValueError
        If the header or a data line is truncated, a score is not numeric,
        or a residue is not one of the header's amino acids.
    """
    # Accept plain strings too: the error messages below need .name/.parent.
    path = Path(path)
    incomplete_message = (
        f"incomplete PSSM file: {path.name}. Delete from folder {path.parent} and recompute"
    )

    residues = []
    pssm_scores = []
    with open(path) as pssm_file:
        # Skip the two lines that precede the column header.
        next(pssm_file)
        next(pssm_file)

        header = pssm_file.readline().split()[:20]
        if len(header) != 20:
            # The file ends before (or inside) the header line.
            raise ValueError(incomplete_message)

        for line in pssm_file:
            values = line.split()
            if not values:  # blank line: end of the matrix, before overall scores
                break

            if len(values) < 22:
                # This happens when the program is interrupted while it is writing.
                raise ValueError(incomplete_message)

            amino_acid = values[1]
            if amino_acid not in header:
                raise ValueError(
                    f"unexpected amino acid in pssm file {path.name}: {amino_acid}"
                )

            residues.append(amino_acid)
            pssm_scores.append([float(score) for score in values[2:22]])

    df_pssm = pd.DataFrame(data=pssm_scores, columns=header)
    # Store whole-number matrices as integers.
    if (df_pssm % 1 == 0).all(axis=None):
        df_pssm = df_pssm.astype(int)
    return df_pssm.assign(seq=pd.Series(residues).apply(aa_encode))

In [3]:
# TODO rewrite PSSM class with tempfiles
# TODO compress pssm db with xz
# TODO method that creates features
# TODO write to temp file, then rename? That should eliminate problems with incomplete pssm files.

In [4]:
def add_pssm_sql(pssm_file_path:str, table_name:str, con:sl.Connection):
    """Read one PSSM file and store it as a table in an SQLite database.

    Parameters
    ----------
    pssm_file_path : str or pathlib.Path
        PSSM file to read with ``read_pssm``.
    table_name : str
        Name of the table to write; an existing table of that name is replaced.
    con : sqlite3.Connection
        Open connection to the target database.

    Empty or incomplete files are reported on stdout and skipped rather than
    raised, so a batch import can continue with the next file.
    """
    # The signature advertises a string, but read_pssm uses Path attributes
    # when it builds its error messages, so always hand it a Path.
    pssm_file_path = Path(pssm_file_path)
    try:
        pssm = read_pssm(pssm_file_path)
        pssm.to_sql(table_name, con, if_exists="replace")
    except StopIteration:
        # read_pssm skips header lines with next(); this means the file ended early.
        print(
            f"StopIteration occurred for file {str(pssm_file_path)}. File might be empty"
        )
    except ValueError as e:
        print(e)
        print(f"ValueError occured for file:{str(pssm_file_path)}. File might be incomplete.")

In [5]:

# pssm_folder = "../data/intermediate/blast"
pssm_folder = "/home/ad/blast_db_test/blast"

# Import every *.pssm file below pssm_folder into the SQLite database, one
# table per protein / BLAST database / iteration count.
# NOTE(review): both paths are absolute and machine-specific.
con = sl.connect("/home/ad/blast_db_test/pssm.db")
try:
    # "with con" only wraps a transaction (commit on success, rollback on
    # error); it does not close the connection, hence the finally below.
    with con:
        for blast_db in ["uniref50", "uniref90"]:
            print(blast_db)
            for n_iterations in [1, 3]:
                print(n_iterations)
                path = Path(f"{pssm_folder}/pssm_{blast_db}_{n_iterations}it")
                for pssm_file_path in path.rglob("*.pssm"):
                    table_name = f"{pssm_file_path.name.split('.')[0]}_{blast_db}_{n_iterations}it"
                    # add_pssm_sql reports empty/incomplete files and carries on.
                    add_pssm_sql(pssm_file_path, table_name, con)
finally:
    con.close()


uniref50
1


KeyboardInterrupt: 

In [6]:



def create_pssm_file(
    accession: str,
    sequence: str,
    psiblast_location: str,
    fasta_file_name: str,
    pssm_file_name: str,
    blastdb_location: str,
    iterations: int,
    evalue: float = 0.002,
    threads: int = 1,
) -> None:
    """Write a one-sequence FASTA file and run psiblast on it to produce an ASCII PSSM.

    Parameters
    ----------
    accession : str
        Identifier written (prefixed with ``>``) as the FASTA header.
    sequence : str
        Protein sequence written to the FASTA file.
    psiblast_location : str
        psiblast executable (name or path); a leading ``~`` is expanded.
    fasta_file_name : str
        Path of the FASTA file to create and use as the query.
    pssm_file_name : str
        Path of the PSSM file to create; the psiblast report is written to
        ``<pssm_file_name>.log``.
    blastdb_location : str
        BLAST database passed to ``-db``; a leading ``~`` is expanded.
    iterations : int
        Value for ``-num_iterations``.
    evalue : float
        Value for ``-inclusion_ethresh``.
    threads : int
        Value for ``-num_threads``.

    Raises
    ------
    subprocess.CalledProcessError
        If psiblast exits with a non-zero status.
    """
    write_fasta(
        fasta_file_name=fasta_file_name, fasta_data=[(">" + accession, sequence)]
    )

    log_file_name = f"{pssm_file_name}.log"
    # default is 2, but not supported when matrix is PSSM instead of BLOSUM
    comp_based_stats = 2 if iterations == 1 else 1

    # An argument list without a shell keeps paths containing spaces or shell
    # metacharacters intact and works the same on every platform. The shell
    # used to expand "~" for us, so do that explicitly for the two locations.
    command = [
        os.path.expanduser(str(psiblast_location)),
        "-query",
        str(fasta_file_name),
        "-db",
        os.path.expanduser(str(blastdb_location)),
        "-num_iterations",
        str(iterations),
        "-inclusion_ethresh",
        str(evalue),
        "-num_threads",
        str(threads),
        "-save_pssm_after_last_round",
        "-out_ascii_pssm",
        str(pssm_file_name),
        "-out",
        str(log_file_name),
        "-comp_based_stats",
        str(comp_based_stats),
    ]
    subprocess.run(command, check=True)


In [7]:
def get_pssms(table_names:list, sql_db_path:str = '/home/ad/blast_db_test/pssm.db'):
    """Yield the stored PSSM for each table name, in the order given.

    Parameters
    ----------
    table_names : list
        Names of the tables to look up in the database.
    sql_db_path : str
        Path of the SQLite database file.

    Yields
    ------
    pandas.DataFrame or str
        For a table that exists: its contents indexed by the ``index``
        column, with the ``seq`` column decoded via ``aa_decode``.
        For a table that does not exist: the placeholder string ``"Null"``.
    """
    con = sl.connect(sql_db_path)
    try:
        for table_name in table_names:
            # https://stackoverflow.com/questions/1601151/how-do-i-check-in-sqlite-whether-a-table-exists
            # The name is bound as a parameter so quotes in it cannot alter the query.
            found = con.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
                (table_name,),
            ).fetchone()
            if found is None:
                print("Protein was not found in SQL database. Calling psiblast...")

                # TODO sequences
                yield "Null"
                # TODO create and return PSSM
                continue

            print("Protein was found in SQL database! Reading...")
            # Identifiers cannot be bound as parameters, so quote the table
            # name instead; this also makes names containing "-" readable.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            pssm_prot = pd.read_sql_query(
                f"select * from {quoted_name}", con, index_col="index"
            )
            pssm_prot["seq"] = pssm_prot["seq"].apply(aa_decode)
            yield pssm_prot
    finally:
        # sqlite3's context manager does not close the connection, so close
        # it here, including when the generator is abandoned early.
        con.close()
                # TODO create and return PSSM

# Quick check of get_pssms with two table names; the second carries an
# "_ERROR" suffix to exercise the not-found branch.
accessions = [
    "Q39253_uniref50_1it",
    "Q39253_uniref50_1it_ERROR",
]
[*get_pssms(table_names=accessions)]

Protein was found in SQL database! Reading...
Protein was not found in SQL database. Calling psiblast...


[       A  R  N  D  C  Q  E  G  H  I  ...  K  M  F  P  S  T  W  Y  V  seq
 index                                ...                                
 0     -1 -2 -3 -4 -2 -1 -2 -3 -2  1  ... -2  7  0 -3 -2 -1 -2 -1  1    M
 1      5 -2 -2 -2 -1 -1 -1  1 -2 -2  ... -1 -2 -3 -1  2  0 -3 -2 -1    A
 2     -1 -3 -1 -2 -3 -3 -3  6 -3 -4  ... -3 -3 -4 -3  1 -2 -4 -4 -4    G
 3      0  0  1 -2 -2 -2 -2  0 -2  3  ... -1 -1 -2 -2  2  2 -3 -2  1    I
 4     -1 -3 -3 -3 -2 -2 -3 -2 -4  2  ... -3 -1 -2  1  0  2 -4 -3  4    V
 ...   .. .. .. .. .. .. .. .. .. ..  ... .. .. .. .. .. .. .. .. ..  ...
 458    2 -2 -1 -1 -2 -1  0  5 -2 -3  ... -1 -2 -3 -2  0 -1 -3 -3 -2    G
 459   -1 -3 -3 -4 -1 -3 -3 -4 -4  3  ... -3  1 -1 -3 -2  0 -3 -1  4    V
 460    1 -3 -3 -3 -2 -2 -3 -2 -2  0  ... -3  0  4 -3 -1 -2  6  2 -1    F
 461    1 -1  0 -1 -1  0  0 -1 -1 -3  ...  0 -2 -3  4  4  1 -3 -2 -2    S
 462    2 -1  0 -1 -1  0  0  0 -1 -2  ...  0 -1 -3 -1  4  1 -3 -2 -1    S
 
 [463 rows x 21 columns],
 'Null']

TODO how long does it take?

In [9]:
from subpred.dataset import create_dataset, SUBSTRATE_KEYWORDS

# Options passed to create_dataset. Options that were tried but are currently
# disabled: keywords_classes, keywords_classes_all (SUBSTRATE_KEYWORDS),
# keywords_filter, outliers, tax_ids_filter ([3702, 9606, 83333, 559292]),
# sequence_clustering (70), gene_names_only, force_update,
# remove_sequence_fragments.
dataset_options = dict(
    input_file="../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv",
    multi_substrate="keep",
    verbose=True,
    evidence_code=2,
    invalid_amino_acids="remove_amino_acids",
    tcdb_substrates_file="../data/raw/tcdb/tcdb_substrates.tsv",
    swissprot_only=False,
)
df_uniprot = create_dataset(**dataset_options)

print(df_uniprot.shape)
df_uniprot.head()


Reading text file...
Did not find pickle, creating new version...
(1021957, 16)


Unnamed: 0_level_0,gene_names,protein_names,reviewed,protein_existence,sequence,organism_id,go_ids,keyword_ids,keywords,tcdb_id,interpro,tcdb_class,tcdb_substrates,keywords_substrates,keywords_transport_related,keywords_location
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A0A0C5B5G6,MT-RNR1,Mitochondrial-derived peptide MOTS-c (Mitochon...,reviewed,Evidence at protein level,MRWQEMGYIFYPRKLR,9606,GO:0001649; GO:0003677; GO:0005615; GO:0005634...,KW-0238; KW-0496; KW-0539; KW-0892; KW-1185; K...,DNA-binding;Mitochondrion;Nucleus;Osteogenesis...,,,0.0,,,,
A0A1B0GTW7,CIROP LMLN2,Ciliated left-right organizer metallopeptidase...,reviewed,Evidence at protein level,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,9606,GO:0004222; GO:0005737; GO:0006508; GO:0007155...,KW-0025; KW-0225; KW-0325; KW-1056; KW-0378; K...,Alternative splicing;Disease variant;Glycoprot...,,IPR001577;,0.0,,,,
A0JNW5,BLTP3B KIAA0701 SHIP164 UHRF1BP1L,Bridge-like lipid transfer protein family memb...,reviewed,Evidence at protein level,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606,GO:0005769; GO:0005829; GO:0034498; GO:0042803...,KW-0025; KW-0175; KW-0963; KW-0967; KW-0597; K...,Alternative splicing;Coiled coil;Cytoplasm;End...,,IPR026728;IPR026854;,0.0,,,,
A0JP26,POTEB3,POTE ankyrin domain family member B3,reviewed,Evidence at protein level,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606,,KW-0025; KW-0040; KW-0175; KW-1185; KW-0677,Alternative splicing;ANK repeat;Coiled coil;Re...,,IPR002110;IPR036770;IPR039497;,0.0,,,,
A0PK11,CLRN2,Clarin-2,reviewed,Evidence at protein level,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606,GO:0007605; GO:0032421; GO:0060088; GO:0060171...,KW-1003; KW-0966; KW-0209; KW-0225; KW-0325; K...,Cell membrane;Cell projection;Deafness;Disease...,9.A.46.1.2,IPR026748;,9.A,,,,


In [None]:
# Display the dataset frame returned by create_dataset.
df_uniprot

In [14]:
# Look up the stored PSSM of every protein in the dataset. Table names follow
# the pattern "<accession>_<blast database>_<iterations>it".
sequences = df_uniprot["sequence"]
blast_database = "uniref50"
iterations = 1

table_names = [
    f"{accession}_{blast_database}_{iterations}it"
    for accession in sequences.index
]

pssms = [*get_pssms(table_names=table_names)]

Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was found in SQL database! Reading...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein was not found in SQL database. Calling psiblast...
Protein wa

KeyboardInterrupt: 

In [None]:
# Scratch check for the whole-number test used in read_pssm: a float that has
# no fractional part (-1.0) must still count as an integer score.
# pssm_prot is loaded explicitly here; it is otherwise only a local variable
# inside get_pssms, so this cell failed with NameError on a fresh kernel.
pssm_prot = list(get_pssms(["Q39253_uniref50_1it"]))[0]
assert isinstance(pssm_prot, pd.DataFrame), "table Q39253_uniref50_1it not found in the PSSM database"
pssm = pssm_prot.drop(columns="seq")
pssm.loc[0, "A"] = -1.0
(pssm % 1 == 0).all(axis=None)

True

In [None]:
# Release the notebook-level SQLite connection `con`. Calling close() on an
# sqlite3 connection that is already closed is harmless.
con.close()