In [228]:
import pickle
import re

import networkx as nx
import numpy as np
import obonet
import pandas as pd

from subpred.fasta import read_fasta
from subpred.util import save_data

In [229]:
# Folder that receives the processed datasets written by this notebook.
DATASET_FOLDER = "data/datasets/"
# Folder that holds the downloaded raw input files.
RAW_DATA_FOLDER = "data/raw/"

## Uniprot

In [230]:
# Column name -> dtype for the UniProt export. The key order is the column order of the
# file; the first column ("Uniprot") becomes the index of the frame.
dtypes = {
    "Uniprot": "string",
    "gene_names": "string",
    "protein_names": "string",
    "reviewed": "category",
    "protein_existence": "category",
    "sequence": "string",
    "organism_id": "int",
    "go_ids": "string",
    "keyword_ids": "string",
    "keywords": "string",
    "tcdb_ids": "string",
    "interpro_ids": "string",
}
# Dicts keep insertion order, so the column names can be taken straight from the keys.
names = list(dtypes)

uniprot_tsv_path = RAW_DATA_FOLDER + "uniprot/uniprot_evidence1-2_nofragments.tsv"
# The first line of the file is skipped and replaced by the names above.
df = pd.read_table(
    uniprot_tsv_path,
    skiprows=1,
    header=None,
    names=names,
    dtype=dtypes,
    index_col=0,
)

In [231]:
# Turn the "reviewed" column into a boolean flag: the comparison is True where the raw
# value equals "reviewed".
# NOTE(review): this cell overwrites its own input, so it is not idempotent. Re-running it
# compares booleans against "reviewed" (presumably all False) - run it once per load.
df.reviewed = df.reviewed.transform(lambda x: x == "reviewed").astype("bool")

In [232]:
# Numeric code for each protein-existence label present in the export. Any other label
# maps to a missing value, which the integer cast below cannot represent.
existence_level_codes = {
    "Evidence at transcript level": 2,
    "Evidence at protein level": 1,
}
existence_codes = df.protein_existence.map(existence_level_codes)
df.protein_existence = existence_codes.astype("int")

In [233]:

# sequences_nonstandard_aa_count = (
#     df[~df.sequence.str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))]
#     .sequence.drop_duplicates()
#     .shape[0]
# )
# sequences_count = df.sequence.drop_duplicates().shape[0]
# print(
#     f"proteins with non-standard amino acids: {sequences_nonstandard_aa_count} out of {sequences_count} ({sequences_nonstandard_aa_count / sequences_count * 100:.2f}%)"
# )

In [234]:
# Store the sequences that contain anything besides the 20 standard amino acid letters
# (indexed by accession, one row per distinct sequence) for later statistics.
# This has to run before the non-standard letters are stripped from df.sequence.
only_standard_aa = df.sequence.str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))
nonstandard_sequences = df[~only_standard_aa].sequence
df_sequences_nonstandard = nonstandard_sequences.to_frame().drop_duplicates()
save_data(
    df_sequences_nonstandard,
    "proteins_nonstandard_aminoacids",
    folder_path=DATASET_FOLDER,
)


In [235]:
# Delete every run of characters that is not one of the 20 standard amino acid letters
# (all features are based on 20 AAs).
nonstandard_aa_run = re.compile("[^ACDEFGHIKLMNPQRSTVWY]+")
df.sequence = df.sequence.str.replace(nonstandard_aa_run, "", regex=True)

In [236]:
# The annotation columns are exported as separate datasets below, so the main
# "uniprot" dataset keeps only the remaining per-protein columns.
annotation_columns = ["go_ids", "keyword_ids", "keywords", "tcdb_ids", "interpro_ids"]
df_sequences = df.drop(columns=annotation_columns)
save_data(df_sequences, "uniprot", DATASET_FOLDER)

In [237]:
# Long-format keyword table: one row per distinct (accession, keyword) pair.
# The keyword string is split on ";" and each entry is stripped of surrounding whitespace.
keyword_per_protein = df.keywords.dropna().str.split(";").explode().str.strip()
keyword_frame = keyword_per_protein.astype("category").rename("keyword").to_frame()
df_keywords = (
    keyword_frame.reset_index()  # accession index becomes a regular column
    .drop_duplicates()
    .reset_index(drop=True)
)
save_data(df_keywords, "keywords", DATASET_FOLDER)

In [238]:
# Long-format InterPro table: one row per distinct (accession, InterPro id) pair.
# Trailing ";" characters are removed first so the split produces no empty last entry.
interpro_lists = df.interpro_ids.dropna().str.rstrip(";").str.split(";")
interpro_frame = interpro_lists.explode().rename("interpro_id").to_frame()
df_interpro = (
    interpro_frame.reset_index()  # accession index becomes a regular column
    .drop_duplicates()
    .reset_index(drop=True)
)
save_data(df_interpro, "interpro", DATASET_FOLDER)

In [239]:
# One row per distinct (accession, TCDB id) pair, with the accession as a regular column.
tcdb_pairs = (
    df.tcdb_ids.dropna()
    .str.rstrip(";")  # avoid an empty last entry after splitting
    .str.split(";")
    .explode()
    .rename("tcdb_id")
    .reset_index()
    .drop_duplicates()
)
# melt moves the accession from the "Uniprot" column into a generic "value" column;
# the saved dataset therefore has the columns "tcdb_id" and "value".
tcdb_long = tcdb_pairs.melt(id_vars=["tcdb_id"], value_vars=["Uniprot"])
df_tcdb_uniprot = (
    tcdb_long[["tcdb_id", "value"]].drop_duplicates().reset_index(drop=True)
)
save_data(df_tcdb_uniprot, "tcdb", DATASET_FOLDER)

## 3Di

In [240]:
def process_fasta(fasta_data_3di: list, lowercase: bool = True):
    """Yield ``(accession, fold_number, sequence)`` for every usable 3Di FASTA record.

    The first character of each header is dropped and the rest is split on "-". The
    second field is taken as the UniProt accession, the third field without its first
    character as the fragment number (presumably headers look like
    ``>AF-Q8WZ42-F1-...`` - confirm against the FASTA file).

    A record is reported with ``print`` and skipped when

    * the header has fewer than three "-"-separated fields,
    * the accession does not match the UniProt accession pattern,
    * the fragment number is not an integer, or
    * the sequence contains a character other than the 20 standard letters or X.

    Args:
        fasta_data_3di: iterable of ``(header, sequence)`` pairs.
        lowercase: if True, yield the sequence in lowercase (compatibility with prostt5).

    Yields:
        Tuples ``(accession: str, fold_number: int, sequence: str)``.
    """
    accession_pattern = re.compile(
        "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
    )
    not_in_alphabet_pattern = re.compile("[^ACDEFGHIKLMNPQRSTVWYX]+")
    for header, sequence in fasta_data_3di:
        header_values = header[1:].split("-")

        # Validate before indexing/parsing: a malformed header must be skipped like any
        # other invalid record instead of aborting the whole run with an IndexError.
        if len(header_values) < 3 or not accession_pattern.fullmatch(header_values[1]):
            print(
                "found invalid header in 3di fasta file, unable to read accession: "
                + header
            )
            continue
        accession = header_values[1]

        # number of the split, if more than 2700AA:
        try:
            fold_number = int(header_values[2][1:])
        except ValueError:
            print(
                "found invalid header in 3di fasta file, unable to read fold number: "
                + header
            )
            continue

        if not_in_alphabet_pattern.search(sequence):
            print("Invalid characer found in 3di sequence " + sequence)
            continue

        # The conditional only applies to the sequence element of the tuple.
        yield accession, fold_number, (sequence.lower() if lowercase else sequence)


def merge_subsequences(
    df_uniprot_3Di: pd.DataFrame, get_stats: bool = True, overlap_len: int = 1200
):
    # In the case of proteins longer than 2700 amino acids (aa), AlphaFold provides 1400aa long, overlapping fragments.
    # For example, Titin has predicted fragment structures named as Q8WZ42-F1 (residues 1–1400), Q8WZ42-F2 (residues 201–1600), etc.
    # These fragments are currently only available for the human proteome in these proteome archive files, not on the website.
    # All structures seem to have overlap of 1200, might change in future versions
    df_uniprot_3Di_copy = df_uniprot_3Di.copy(deep=True)
    # proteins that were split into more than one structure
    split_proteins = df_uniprot_3Di_copy[
        df_uniprot_3Di_copy.fold_number == 2
    ].Uniprot.unique()

    records_merged_sequences = list()
    records_merging_stats = list()

    for split_protein in split_proteins:
        df_uniprot_3Di_split_protein = df_uniprot_3Di_copy[
            df_uniprot_3Di_copy.Uniprot == split_protein
        ][["fold_number", "sequence3Di"]].sort_values("fold_number")
        subsequences_list = list(
            df_uniprot_3Di_split_protein.sequence3Di.values.ravel()
        )
        sequence_merged = subsequences_list[0]
        sequence_identity_scores = list()
        for subsequence in subsequences_list[1:]:
            matching_pos = np.equal(
                list(subsequence[:overlap_len]), list(sequence_merged[-overlap_len:])
            )
            overlap_seqence_identity = sum(matching_pos) / len(matching_pos)
            sequence_identity_scores.append(overlap_seqence_identity)
            additional_positions = subsequence[overlap_len : len(subsequence)]
            sequence_merged += additional_positions

        records_merging_stats.append(
            [
                split_protein,
                sequence_identity_scores,
            ]
        )

        records_merged_sequences.append([split_protein, sequence_merged])

    df_merged = pd.DataFrame.from_records(
        records_merged_sequences, columns=["Uniprot", "sequence3Di"]
    )
    df_uniprot_3Di_copy = df_uniprot_3Di_copy[
        ~df_uniprot_3Di_copy.Uniprot.isin(df_merged.Uniprot)
    ]
    # any unmerged remain?
    assert (df_uniprot_3Di_copy.fold_number == 1).all()
    df_uniprot_3Di_copy = df_uniprot_3Di_copy.drop("fold_number", axis=1)

    df_uniprot_3Di_copy = pd.concat(
        [df_merged, df_uniprot_3Di_copy], ignore_index=True
    ).set_index("Uniprot")
    df_uniprot_3Di_merging_stats = pd.DataFrame.from_records(
        records_merging_stats,
        columns=[
            "Uniprot",
            "merging_overlaps_seq_ident",
        ],
        index="Uniprot",
    )
    if get_stats:
        return df_uniprot_3Di_copy, df_uniprot_3Di_merging_stats
    else:
        return df_uniprot_3Di_copy


def preprocess_3di_fasta(
    fasta_path: str, accessions_whitelist: list, aa_sequences=None
):
    """Read a 3Di FASTA file and return one merged 3Di sequence per whitelisted protein.

    Steps: parse the FASTA records with ``process_fasta``, keep the accessions in
    ``accessions_whitelist``, merge fragments with ``merge_subsequences`` and flag for
    every protein whether the 3Di sequence is as long as its amino acid sequence.

    Args:
        fasta_path: path handed to ``read_fasta``.
        accessions_whitelist: accessions to keep (those in the sequence dataset).
        aa_sequences: optional Series of amino acid sequences indexed by accession,
            used for the length comparison. When omitted, the notebook-level
            ``df.sequence`` is used, which is what this function always read before;
            pass it explicitly to avoid depending on notebook state.

    Returns:
        ``(df_merged, df_merging_stats)``: the merged frame indexed by "Uniprot" with
        the columns "sequence3Di" and "len_matches_aa_sequence", and the overlap
        statistics returned by ``merge_subsequences``.

    Raises:
        AssertionError: if an accession occurs more than once in the merged frame.
    """
    if aa_sequences is None:
        # Backward-compatible fallback to the global UniProt frame of the notebook.
        aa_sequences = df.sequence

    # Original note: only contains four model organisms for now.
    fasta_data_3di = read_fasta(fasta_path)

    df_uniprot_3Di = pd.DataFrame.from_records(
        process_fasta(fasta_data_3di=fasta_data_3di),
        columns=["Uniprot", "fold_number", "sequence3Di"],
    ).sort_values(["Uniprot", "fold_number"])

    # Only keep those that are in our sequence dataset!
    df_uniprot_3Di = df_uniprot_3Di[df_uniprot_3Di.Uniprot.isin(accessions_whitelist)]

    # also returns stats df with sequence identity between merged structures
    df_uniprot_3Di_merged, df_uniprot_3Di_merging_stats = merge_subsequences(
        df_uniprot_3Di=df_uniprot_3Di
    )

    # Compare lengths position-wise via plain arrays; both are ordered like the merged index.
    aa_lengths = aa_sequences.loc[df_uniprot_3Di_merged.index].str.len().values
    lengths_3di = df_uniprot_3Di_merged.sequence3Di.str.len().values
    df_uniprot_3Di_merged = df_uniprot_3Di_merged.assign(
        len_matches_aa_sequence=aa_lengths == lengths_3di
    )

    # `not` instead of `~`: `~` is only a logical negation for NumPy booleans, on a
    # plain Python bool it yields a truthy integer and the assertion could never fail.
    assert not df_uniprot_3Di_merged.index.duplicated().any()
    return df_uniprot_3Di_merged, df_uniprot_3Di_merging_stats

In [241]:
# Build the 3Di dataset for all proteins of the UniProt frame and save it.
fasta_3di_path = RAW_DATA_FOLDER + "alphafolddb/3Di_sequences.fasta"
df_uniprot_3Di, df_uniprot_3Di_merging_stats = preprocess_3di_fasta(
    fasta_3di_path, accessions_whitelist=df.index.tolist()
)

# Each entry is the list of overlap identities of one merged protein.
overlap_identities = df_uniprot_3Di_merging_stats.merging_overlaps_seq_ident
# Median over proteins of the per-protein median overlap identity.
median_merging_overlap = overlap_identities.apply(np.median).median()
# Per-protein mean overlap identity (a Series, not aggregated further).
mean_merging_overlap = overlap_identities.apply(np.mean)
# TODO mean merging overlap as field
save_data(df_uniprot_3Di, "3Di_alphafold4", folder_path=DATASET_FOLDER)
df_uniprot_3Di

Unnamed: 0_level_0,sequence3Di,len_matches_aa_sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1
A2VEC9,dddpvvvvpppppdapaakdkdkdkdkdwafpfdkdkdkdflvcvv...,True
A4UGR9,dddddddddddddddddddddddddddddddddddddddddddpvv...,True
O14686,dddadddddddddddddddddddddddddddddddddddddddddd...,True
O15018,dfpdlvclvlvlvllvvvlvvvvvvddpdplnvlsvllsvvsnvvs...,True
O15050,dddppppddqllvqlvvlqvqlvvcvvvvnlvsslvslvvscvspv...,True
...,...,...
W5XKT8,ddddddddddddpppppppvqlpllqllvlqddpvlqlvllclqqn...,True
W6CW81,dvvvvvvvlvlvvlvpddpvvlvvvlvvccvppvddsvvsvvspdd...,True
X5JA13,ddpdddddddddddpppdqpaddpvcvpddhdpvvvvcvlcpvldp...,True
X5JB51,ddddpdddddddddddddpqlddpvcvpddhdpvvsvcslccvlcv...,True


## GOA

In [13]:
# Accessions of the proteins in the UniProt frame; a set keeps the membership test cheap.
uniprot_accessions = set(df.index.unique())

# The filtered chunks are collected in a list and concatenated once after the loop.
# Growing a DataFrame with pd.concat inside the loop copies all previous rows for
# every chunk, and it starts from an empty frame whose handling in concat is
# version-dependent (NOTE(review): confirm dtypes if the pandas version changes).
filtered_chunks = []

print("Filtering GO annotations...", end=" ")
with pd.read_table(
    RAW_DATA_FOLDER + "goa/goa_uniprot_all_ebi_filtered.tsv.gz",
    header=None,
    names=["Uniprot", "qualifier", "go_id", "evidence_code", "aspect", "date"],
    dtype={
        "Uniprot": "string",
        "qualifier": "category",
        "go_id": "string",
        "evidence_code": "category",
        "aspect": "category",
        "date": "int",
    },
    # The file is read in chunks of one million rows.
    chunksize=10**6,
) as reader:
    for df_chunk in reader:
        # Keep annotations of known proteins; without the date column, rows that only
        # differed in their date collapse into one.
        df_chunk_filtered = (
            df_chunk[df_chunk.Uniprot.isin(uniprot_accessions)]
            .drop("date", axis=1)
            .drop_duplicates()
        )
        filtered_chunks.append(df_chunk_filtered)

# pd.concat raises on an empty list, so an input without chunks yields an empty frame,
# as it did before.
if filtered_chunks:
    df_go_ebi = pd.concat(filtered_chunks, axis=0).reset_index(drop=True)
else:
    df_go_ebi = pd.DataFrame()
print("done")

Filtering GO annotations... done


In [14]:
# Duplicates were only removed within each chunk while reading; this pass drops rows
# that are repeated across different chunks.
df_go_ebi = df_go_ebi.drop_duplicates()

In [15]:
# Replace the index left over from de-duplication with consecutive integers before saving.
# Author's observation: using numbers starting with 0 as index halves the file size!
df_go_ebi = df_go_ebi.reset_index(drop=True)

save_data(df_go_ebi, "goa", DATASET_FOLDER)
# Previous way of saving, kept for reference:
# df_go_ebi.to_pickle(DATASET_FOLDER+"goa.pickle")

## GO

In [18]:
# Parse the Gene Ontology OBO file into a graph object and store it with the other
# datasets. `obonet` is imported in the import cell at the top of the notebook, so this
# cell no longer carries its own imports (the repeated `import pickle` was redundant).
graph_go = obonet.read_obo(RAW_DATA_FOLDER + "ontologies/go.obo")

save_data(graph_go, "go_obo", folder_path=DATASET_FOLDER)

AlphafoldDB Download