In [1]:
import pandas as pd
import pickle
import networkx as nx

In [2]:
# Folder prefix for the processed datasets written by the cells below.
# Keep the trailing slash: output paths are built by plain string concatenation.
DATASET_FOLDER = "data/datasets/"

UniProt

In [3]:
# Schema of the UniProt TSV export, in file order: column name -> pandas dtype.
dtypes = {
    "Uniprot": "string",
    "gene_names": "string",
    "protein_names": "string",
    "reviewed": "category",
    "protein_existence": "category",
    "sequence": "string",
    "organism_id": "int",
    "go_ids": "string",
    "keyword_ids": "string",
    "keywords": "string",
    "tcdb_ids": "string",
    "interpro_ids": "string",
}
# The column names are exactly the schema keys (dicts preserve insertion order).
names = list(dtypes)

# The first line of the file is skipped (presumably its own header row) and
# replaced by `names`; the first column ("Uniprot") becomes the index.
df = pd.read_table(
    "data/raw/uniprot/uniprot_evidence1-2_nofragments.tsv",
    names=names,
    header=None,
    skiprows=1,
    index_col=0,
    dtype=dtypes,
)

In [4]:
# Turn the review-status label into a boolean flag (True where the label is
# exactly "reviewed").
# The dtype guard makes the cell safe to re-run: comparing the already
# converted booleans with "reviewed" would silently set every row to False.
if not pd.api.types.is_bool_dtype(df["reviewed"]):
    df["reviewed"] = (df["reviewed"] == "reviewed").astype("bool")

In [5]:
# Encode the protein-existence label as its numeric evidence level.
# Any label missing from the mapping becomes NaN, which makes the integer
# cast below fail instead of passing through unnoticed.
existence_level_by_label = {
    "Evidence at protein level": 1,
    "Evidence at transcript level": 2,
}
df["protein_existence"] = (
    df["protein_existence"].map(existence_level_by_label).astype("int")
)

In [6]:
import re

# Count the distinct sequences that contain at least one character outside the
# 20 standard amino-acid letters, and the total number of distinct sequences.
# The pattern is passed as a plain string: that is the type `str.fullmatch`
# documents, and it does not depend on the string storage backend in use.
is_standard_sequence = df["sequence"].str.fullmatch("[ACDEFGHIKLMNPQRSTVWY]+")
sequences_nonstandard_aa_count = (
    df[~is_standard_sequence]["sequence"].drop_duplicates().shape[0]
)
sequences_count = df["sequence"].drop_duplicates().shape[0]
# Uncomment to report the share of affected sequences:
# print(
#     f"proteins with non-standard amino acids: {sequences_nonstandard_aa_count} out of {sequences_count} ({sequences_nonstandard_aa_count / sequences_count * 100:.2f}%)"
# )

In [7]:
# Save accession and sequence (tab-separated, no header row) of the proteins
# whose sequence contains non-standard amino-acid letters, for later stats.
# Duplicates are dropped on the sequence column only, so each distinct
# sequence is written once, with the first accession that carries it.
# The pattern is a plain string, the type `str.fullmatch` documents.
nonstandard_sequences = df[
    ~df["sequence"].str.fullmatch("[ACDEFGHIKLMNPQRSTVWY]+")
]["sequence"].to_frame()
nonstandard_sequences.drop_duplicates().to_csv(
    DATASET_FOLDER + "proteins_nonstandard_aminoacids.txt", header=False, sep="\t"
)

In [8]:
# Strip every run of characters that are not one of the 20 standard
# amino-acid letters from the sequences (overwrites `df.sequence`).
df["sequence"] = df["sequence"].str.replace(
    "[^ACDEFGHIKLMNPQRSTVWY]+", "", regex=True
)

In [9]:
# Drop the annotation columns and pickle the remaining per-protein columns.
annotation_columns = ["go_ids", "keyword_ids", "keywords", "tcdb_ids", "interpro_ids"]
df_sequences = df.drop(columns=annotation_columns)
df_sequences.to_pickle(f"{DATASET_FOLDER}uniprot.pickle")

In [10]:
# Long-format table with one row per distinct (Uniprot, keyword) pair.
# Keywords are ";"-separated in the source column; surrounding whitespace is
# trimmed after the split.
keyword_per_protein = df["keywords"].dropna().str.split(";").explode().str.strip()
df_keywords = (
    keyword_per_protein.astype("category")
    .rename("keyword")
    .reset_index()
    .drop_duplicates(ignore_index=True)
)
df_keywords.to_pickle(f"{DATASET_FOLDER}keywords.pickle")

In [11]:
# Long-format table with one row per distinct (Uniprot, interpro_id) pair.
# Trailing ";" characters are removed before splitting so the split does not
# yield an empty last element.
interpro_id_lists = df["interpro_ids"].dropna().str.rstrip(";").str.split(";")
df_interpro = (
    interpro_id_lists.explode()
    .rename("interpro_id")
    .reset_index()
    .drop_duplicates(ignore_index=True)
)
df_interpro.to_pickle(f"{DATASET_FOLDER}interpro.pickle")

In [12]:
# One row per distinct (Uniprot, tcdb_id) pair; trailing ";" characters are
# removed before splitting so no empty id is produced at the end.
tcdb_pairs = (
    df["tcdb_ids"]
    .dropna()
    .str.rstrip(";")
    .str.split(";")
    .explode()
    .rename("tcdb_id")
    .reset_index(drop=False)
    .drop_duplicates()
)
# melt() moves the accessions into a generic "value" column (next to a constant
# "variable" column); the pickled table therefore has columns tcdb_id / value.
df_tcdb_uniprot = pd.melt(tcdb_pairs, id_vars=["tcdb_id"], value_vars=["Uniprot"])
(
    df_tcdb_uniprot.loc[:, ["tcdb_id", "value"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_pickle(DATASET_FOLDER + "tcdb.pickle")
)

GOA

In [None]:
# Accessions present in the UniProt table; a set gives fast membership tests.
uniprot_accessions = set(df.index.unique())

goa_columns = ["Uniprot", "qualifier", "go_id", "evidence_code", "aspect", "date"]

# Stream the GOA file in chunks of one million rows and keep only annotations
# of known accessions. The date column is read but discarded.
# Filtered chunks are collected in a list and concatenated once: concatenating
# inside the loop re-copies all previously collected rows on every iteration.
filtered_chunks = []
with pd.read_table(
    "data/raw/goa/goa_uniprot_all_ebi_filtered.tsv.gz",
    header=None,
    names=goa_columns,
    dtype={
        "Uniprot": "string",
        "qualifier": "category",
        "go_id": "string",
        "evidence_code": "category",
        "aspect": "category",
        "date": "int",
    },
    chunksize=10**6,
) as reader:
    for df_chunk in reader:
        filtered_chunks.append(
            df_chunk[df_chunk.Uniprot.isin(uniprot_accessions)]
            .drop("date", axis=1)
            .drop_duplicates()
        )

if filtered_chunks:
    # De-duplicate again after concatenation: rows that differ only in the
    # dropped date column can fall into different chunks, and the per-chunk
    # pass above cannot see those.
    df_go_ebi = (
        pd.concat(filtered_chunks, axis=0)
        .drop_duplicates()
        .reset_index(drop=True)
    )
else:
    # No chunk was read: keep an empty table with the expected columns.
    df_go_ebi = pd.DataFrame(columns=[col for col in goa_columns if col != "date"])

In [None]:
# Display the filtered GOA annotation table for inspection.
df_go_ebi

In [None]:
# Re-number the rows from 0 before pickling; per the original author's note,
# a 0-based numeric index halves the size of the pickle file.
df_go_ebi = df_go_ebi.reset_index(drop=True)
df_go_ebi.to_pickle(f"{DATASET_FOLDER}goa.pickle")

GO

In [None]:
import obonet
import pickle
# Parse the Gene Ontology OBO file with obonet and cache the result as a pickle.
# The input path is relative to the same working directory as the other
# "data/raw/..." inputs and the "data/datasets/" outputs of this notebook;
# a "../data/..." prefix cannot resolve from that directory.
graph_go = obonet.read_obo("data/raw/ontologies/go.obo")

with open(DATASET_FOLDER + "go_obo.pickle", "wb") as pickle_file:
    pickle.dump(graph_go, pickle_file)

Sequence embeddings ProtT5

AlphaFold download