In [2]:
import pandas as pd
import pickle
import networkx as nx
from subpred.util import save_data
from subpred.fasta import read_fasta
import re


In [3]:
# Folder the processed datasets are written to via save_data.
DATASET_FOLDER = "data/datasets/"
# Folder the raw input files are read from.
RAW_DATA_FOLDER = "data/raw/"

## Uniprot

In [4]:
# Schema of the UniProt export. The keys are assigned to the file's columns
# positionally (in insertion order); the values are the dtypes pandas parses
# each column as.
dtypes = {
    "Uniprot": "string",
    "gene_names": "string",
    "protein_names": "string",
    "reviewed": "category",
    "protein_existence": "category",
    "sequence": "string",
    "organism_id": "int",
    "go_ids": "string",
    "keyword_ids": "string",
    "keywords": "string",
    "tcdb_ids": "string",
    "interpro_ids": "string",
}
# Column names, in file order, taken straight from the schema above.
names = list(dtypes)

df = pd.read_table(
    RAW_DATA_FOLDER + "uniprot/uniprot_evidence1-2_nofragments.tsv",
    names=names,
    dtype=dtypes,
    header=None,
    skiprows=1,  # skip the first line of the file; `names` supplies the column names
    index_col=0,  # the first column ("Uniprot") becomes the index
)

In [5]:
# Convert the textual review status into a boolean flag: True where the value
# is exactly "reviewed", False otherwise (missing values also become False).
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-boolean column against the string "reviewed" and
# silently set every row to False.
if not pd.api.types.is_bool_dtype(df["reviewed"]):
    df["reviewed"] = (df["reviewed"] == "reviewed").astype("bool")

In [6]:
# Replace the evidence description by its integer level
# (1 = protein level, 2 = transcript level).
existence_levels = {"Evidence at transcript level": 2, "Evidence at protein level": 1}
df["protein_existence"] = df["protein_existence"].map(existence_levels).astype("int")

In [7]:

# sequences_nonstandard_aa_count = (
#     df[~df.sequence.str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))]
#     .sequence.drop_duplicates()
#     .shape[0]
# )
# sequences_count = df.sequence.drop_duplicates().shape[0]
# print(
#     f"proteins with non-standard amino acids: {sequences_nonstandard_aa_count} out of {sequences_count} ({sequences_nonstandard_aa_count / sequences_count * 100:.2f}%)"
# )

In [8]:
# Keep a record (accession -> sequence) of the proteins whose sequence is not
# made up exclusively of the 20 standard amino acid letters; used for later stats.
df_sequences_nonstandard = df.loc[
    ~df["sequence"].str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+")),
    ["sequence"],
].drop_duplicates()
save_data(
    df_sequences_nonstandard,
    "proteins_nonstandard_aminoacids",
    folder_path=DATASET_FOLDER,
)


In [9]:
# All features are based on the 20 standard amino acids, so every other
# character is deleted from the sequences.
df["sequence"] = df["sequence"].str.replace(
    pat="[^ACDEFGHIKLMNPQRSTVWY]+", repl="", regex=True
)

In [10]:
# Drop the annotation columns so that only the per-protein fields remain,
# then store the result as the "uniprot" dataset.
annotation_columns = ["go_ids", "keyword_ids", "keywords", "tcdb_ids", "interpro_ids"]
df_sequences = df.drop(columns=annotation_columns)
save_data(df_sequences, "uniprot", DATASET_FOLDER)

In [11]:
# Long-format keyword table: one row per unique (Uniprot, keyword) pair.
# The ";"-separated keyword string of each protein is split, exploded into
# one row per keyword and stripped of surrounding whitespace.
df_keywords = (
    df["keywords"]
    .dropna()
    .str.split(";")
    .explode()
    .str.strip()
    .rename("keyword")
    .astype("category")
    .to_frame()
    .reset_index()  # moves the "Uniprot" index into a regular column
    .drop_duplicates(ignore_index=True)
)
save_data(df_keywords, "keywords", DATASET_FOLDER)

In [12]:
# Long-format InterPro table: one row per unique (Uniprot, interpro_id) pair.
# Trailing ";" characters are removed first so that splitting does not
# produce an empty last entry.
df_interpro = (
    df["interpro_ids"]
    .dropna()
    .str.rstrip(";")
    .str.split(";")
    .explode()
    .rename("interpro_id")
    .to_frame()
    .reset_index()  # moves the "Uniprot" index into a regular column
    .drop_duplicates(ignore_index=True)
)
save_data(df_interpro, "interpro", DATASET_FOLDER)

In [13]:
# Long-format TCDB table: one row per unique (tcdb_id, accession) pair.
# Because of the melt step the accession column is called "value" in the
# stored frame rather than "Uniprot".
df_tcdb_uniprot = (
    df["tcdb_ids"]
    .dropna()
    .str.rstrip(";")  # avoid an empty last entry after splitting
    .str.split(";")
    .explode()
    .rename("tcdb_id")
    .reset_index()
    .drop_duplicates()
    .melt(id_vars=["tcdb_id"], value_vars=["Uniprot"])
    .loc[:, ["tcdb_id", "value"]]
    .drop_duplicates(ignore_index=True)
)
save_data(df_tcdb_uniprot, "tcdb", DATASET_FOLDER)

## 3Di

In [None]:
# Read the 3Di sequence FASTA file from the alphafolddb raw-data folder.
# NOTE(review): read_fasta presumably returns (header, sequence) pairs, which
# is how process_fasta consumes the result — confirm against subpred.fasta.
fasta_data_3di = read_fasta(RAW_DATA_FOLDER + "alphafolddb/3Di_sequences.fasta")  # only contains four model organisms for now

# Regular expression an accession must match completely to be accepted
# (UniProt accession number format).
accession_pattern = re.compile(
    "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
)
# Matches any run of characters outside the accepted upper-case alphabet
# (the 20 standard amino acid letters plus X).
not_in_alphabet_pattern = re.compile("[^ACDEFGHIKLMNPQRSTVWYX]+")

def process_fasta(fasta_data_3di:list, lowercase:bool=True):
    """Yield ``(accession, fold_number, sequence)`` for every valid FASTA record.

    Args:
        fasta_data_3di: Iterable of ``(header, sequence)`` pairs. After dropping
            its first character, the header is split on "-"; the second field
            is taken as the accession and the third as the fold number.
        lowercase: If True (default) the sequence is yielded in lower case
            (compatibility with prostt5), otherwise unchanged.

    Yields:
        Tuples ``(accession, fold_number, sequence)``.

    Records are reported via ``print`` and skipped when the header has fewer
    than three "-"-separated fields, when the accession does not match
    ``accession_pattern``, or when the sequence contains a character matched
    by ``not_in_alphabet_pattern``.
    """
    for header, sequence in fasta_data_3di:
        # Split once and check the field count first, so that a malformed
        # header is reported and skipped instead of raising IndexError.
        header_fields = header[1:].split("-")
        if len(header_fields) < 3 or not accession_pattern.fullmatch(header_fields[1]):
            print("found invalid header in 3di fasta file, unable to read accession: " + header)
            continue
        accession, fold_number = header_fields[1], header_fields[2]

        if not_in_alphabet_pattern.search(sequence):
            print("Invalid characer found in 3di sequence "+sequence)
            continue

        # compatibility with prostt5
        yield accession, fold_number, (sequence.lower() if lowercase else sequence)

# AlphaFold splits proteins longer than 2700 amino acids (aa) into overlapping
# fragments of 1400 aa each. Titin, for example, is provided as Q8WZ42-F1
# (residues 1–1400), Q8WZ42-F2 (residues 201–1600), and so on.
# Currently these fragments are only available for the human proteome in the
# proteome archive files, not on the website.

# One row per FASTA record that passed validation in process_fasta.
df_uniprot_3Di = pd.DataFrame.from_records(
    process_fasta(fasta_data_3di=fasta_data_3di),
    columns=["Uniprot", "fold_number", "sequence3Di"],
)  # , index="Uniprot"
df_uniprot_3Di

# TODO overlap of 200: consecutive fragments share residues, which has to be
# resolved before the fragments of one protein can simply be joined, e.g. via
# df_uniprot_3Di.groupby("Uniprot")["sequence3Di"].apply("".join)

Unnamed: 0,Uniprot,fold_number,sequence3Di
0,A0A023PYF4,F1,ddddddddddddddwdfdddddddddddddddddddddddddddpp...
1,A0A023PZB3,F1,dpvvvvvvvvvvvvvvvvvvvvvvppdppdppdpppppppppdppd...
2,A0A023PZE8,F1,dppqqvvvlvvvvvvvvvvvpdqpwawewewewddpdpfkikiktw...
3,A0A023PZF2,F1,dppvvvvvvvvvvvvvvlcvlcvvlvvnvvlvvlvvvcvvvlvvlv...
4,A0A023PZG4,F1,ddddpppdcpppppppppvvlvvvqvvppdpvnvvvllvvllvcvv...
...,...,...,...
61222,W5XKT8,F1,ddddddddddddpppppppvqlpllqllvlqddpvlqlvllclqqn...
61223,W6CW81,F1,dvvvvvvvlvlvvlvpddpvvlvvvlvvccvppvddsvvsvvspdd...
61224,X5JA13,F1,ddpdddddddddddpppdqpaddpvcvpddhdpvvvvcvlcpvldp...
61225,X5JB51,F1,ddddpdddddddddddddpqlddpvcvpddhdpvvsvcslccvlcv...


## GOA

In [13]:
# Accessions present in the UniProt table; a set gives fast membership tests.
uniprot_accessions = set(df.index.unique())

print("Filtering GO annotations...", end=" ")
# The annotation file is read in chunks of one million rows so that only one
# chunk has to be held in memory at a time. Filtered chunks are collected in a
# list and concatenated once at the end; concatenating inside the loop would
# copy the accumulated frame again for every chunk.
filtered_chunks = []
with pd.read_table(
    RAW_DATA_FOLDER+"goa/goa_uniprot_all_ebi_filtered.tsv.gz",
    header=None,
    names=["Uniprot", "qualifier", "go_id", "evidence_code", "aspect", "date"],
    dtype={
        "Uniprot": "string",
        "qualifier": "category",
        "go_id": "string",
        "evidence_code": "category",
        "aspect": "category",
        "date": "int",
    },
    chunksize=10**6,
    # parse_dates=["date"],
) as reader:
    for df_chunk in reader:
        # Keep only annotations of known proteins; the date column is not
        # needed, and dropping it can turn rows into duplicates.
        filtered_chunks.append(
            df_chunk[df_chunk.Uniprot.isin(uniprot_accessions)]
            .drop("date", axis=1)
            .drop_duplicates()
        )

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file yielded no chunks.
df_go_ebi = (
    pd.concat(filtered_chunks, axis=0, ignore_index=True)
    if filtered_chunks
    else pd.DataFrame()
)
print("done")

Filtering GO annotations... done


In [14]:
# Duplicates were only removed within each chunk while reading the file;
# this removes rows that repeat across different chunks.
df_go_ebi = df_go_ebi.drop_duplicates()

In [15]:
# Using numbers starting with 0 as index halves the file size!
# Dropping duplicates leaves gaps in the index, so the rows are renumbered
# from 0 before saving.
df_go_ebi = df_go_ebi.reset_index(drop=True)

# Store the filtered annotations as the "goa" dataset.
save_data(df_go_ebi, "goa", DATASET_FOLDER)
# df_go_ebi.to_pickle(DATASET_FOLDER+"goa.pickle")

## GO

In [18]:
import obonet
import pickle
# Parse the GO ontology file (OBO format) with obonet.
graph_go = obonet.read_obo(RAW_DATA_FOLDER+"ontologies/go.obo")

# Store the parsed ontology as the "go_obo" dataset.
save_data(graph_go, "go_obo", folder_path=DATASET_FOLDER)

AlphafoldDB Download