In [57]:
# !python3 ../subpred/uniprot_downloader.py "https://rest.uniprot.org/uniprotkb/search?compressed=false&fields=accession%2Cgene_names%2Cprotein_name%2Creviewed%2Cprotein_existence%2Csequence%2Corganism_id%2Cgo_id%2Ckeywordid%2Ckeyword%2Cxref_tcdb%2Cxref_interpro&format=tsv&query=%28%28fragment%3Afalse%29%20AND%20%28existence%3A1%29%20OR%20%28existence%3A2%29%29&size=500" "../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv"


In [58]:
import pandas as pd

names = [
    "Uniprot",
    "gene_names",
    "protein_names",
    "reviewed",
    "protein_existence",
    "sequence",
    "organism_id",
    "go_ids",
    "keyword_ids",
    "keywords",
    "tcdb_ids",
    "interpro_ids",
]
dtypes = {
    "Uniprot": "string",
    "gene_names": "string",
    "protein_names": "string",
    "reviewed": "category",
    "protein_existence": "category",
    "sequence": "string",
    "organism_id": "int",
    "go_ids": "string",
    "keyword_ids": "string",
    "keywords": "string",
    "tcdb_ids": "string",
    "interpro_ids": "string",
}

df = pd.read_table(
    "../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv",
    index_col=0,
    header=None,
    names=names,
    dtype=dtypes,
    skiprows=1,
)


## Cleanup

In [59]:
df.tcdb_ids = df.tcdb_ids.str.rstrip(";")


In [60]:
print(df.reviewed.value_counts())
df.reviewed = df.reviewed.transform(lambda x: x == "reviewed").astype("bool")
print(df.reviewed.value_counts())


unreviewed    1530743
reviewed       162964
Name: reviewed, dtype: int64
False    1530743
True      162964
Name: reviewed, dtype: int64


In [55]:
df.protein_existence.value_counts()


Evidence at transcript level    772752
Evidence at protein level       249205
Name: protein_existence, dtype: int64

### Filtering out proteins without gene names

Apparently mostly peptides and pollen, no transporters

In [49]:
print("before", len(df))
df = df[~df.gene_names.isnull()]
print("after", len(df))


before 1021957
after 1021957


### Parsing sequences

In [65]:
import re

print("proteins with non-standard amino acids:")
df[~df.sequence.str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))].shape[0]


proteins with non-standard amino acids:


45718

Removing non-standard amino acids from sequences:

In [66]:
df.sequence = df.sequence.str.replace(re.compile("[^ACDEFGHIKLMNPQRSTVWY]+"), "")


In [68]:
print("proteins with non-standard amino acids:")
df[~df.sequence.str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))].shape[0]


proteins with non-standard amino acids:


0

## Annotation datasets

### Keywords

In [91]:
df_keywords = (
    df.keywords.dropna()
    .str.split(";")
    .explode()
    .str.strip()
    .drop_duplicates()
    .astype("category")
    .rename("keyword")
    .to_frame()
)
df_keywords


Unnamed: 0_level_0,keyword
Uniprot,Unnamed: 1_level_1
A0A0C5B5G6,DNA-binding
A0A0C5B5G6,Mitochondrion
A0A0C5B5G6,Nucleus
A0A0C5B5G6,Osteogenesis
A0A0C5B5G6,Reference proteome
...,...
P15556,Inhibition of host DNA replication by virus
Q8JL78,Actin-dependent inwards viral transport
P26948,Capsule
A0A0A0MQU7,Proteomics identification


There are keyword ids as well, but their order does not match that of the keyword strings column:

In [70]:
df.keyword_ids.str.split(";").explode().str.strip()


Uniprot
A0A0C5B5G6    KW-0238
A0A0C5B5G6    KW-0496
A0A0C5B5G6    KW-0539
A0A0C5B5G6    KW-0892
A0A0C5B5G6    KW-1185
               ...   
X5MPI4        KW-0378
X5MPI4        KW-0645
X5MPI4        KW-0720
X5MPI4        KW-0732
X5MPI5        KW-0808
Name: keyword_ids, Length: 7456053, dtype: object

### Interpro Domains

In [92]:
df_interpro = (
    df.interpro_ids.dropna()
    .str.rstrip(";")
    .str.split(";")
    .explode()
    .drop_duplicates()
    .rename("interpro_id")
    .to_frame()
)
df_interpro

### TCDB substrates

In [None]:
!curl "https://tcdb.org/cgi-bin/substrates/getSubstrates.py" > data/raw/tcdb/tcdb_substrates.tsv

In [97]:
df_substrates = pd.read_table("../data/raw/tcdb/tcdb_substrates.tsv", header=None, names=["tcdb_id", "tcdb_substrates"])
df_substrates


# df.tcdb_ids

# add uniprot accessions
# df_substrates = df_substrates.merge(
#     df.tcdb_id.to_frame().reset_index(drop=False), how="left", on="tcdb_id"
# )
# df_substrates = df_substrates.drop("tcdb_id", axis=1)
# df_substrates = df_substrates[~df_substrates.Uniprot.isnull()]
# df_substrates = df_substrates.set_index("Uniprot", drop=True)

Unnamed: 0,tcdb_id,tcdb_substrates
0,2.A.52.2.2,CHEBI:23337;cobalt(2+)|CHEBI:25517;nickel(2+)
1,2.A.22.2.5,CHEBI:9175;sodium(1+)|CHEBI:8345;potassium(1+)
2,2.A.90.2.4,CHEBI:8816;all-trans-retinol
3,2.A.29.23.1,CHEBI:6635;magnesium(2+)|CHEBI:7793;phosphate(...
4,5.B.6.1.3,CHEBI:10545;electron
...,...,...
8436,2.A.30.5.5,CHEBI:32588;potassium chloride
8437,1.H.1.1.17,CHEBI:3473;cation
8438,8.A.139.2.2,CHEBI:7990;peptide
8439,1.B.6.2.13,CHEBI:25367;molecule


### GO

## Clustering

In [2]:
# from subpred.dataset import create_dataset
# df_uniprot = create_dataset(
#     input_file="../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv",
#     # keywords_classes = None,
#     # keywords_classes_all = SUBSTRATE_KEYWORDS,
#     # keywords_filter = None,
#     multi_substrate="keep",
#     # outliers=outliers,
#     verbose=True,
#     # tax_ids_filter=[3702, 9606, 83333, 559292],
#     evidence_code=2,
#     invalid_amino_acids="remove_amino_acids",
#     # gene_names_only = True,
#     # force_update=True,
#     # remove_sequence_fragments = True,
#     # force_update = True,
#     tcdb_substrates_file="../data/raw/tcdb/tcdb_substrates.tsv",
#     swissprot_only=False,
#     sequence_clustering=70,
# )
# TODO dtypes, take apart, mmseqs2

# TODO categories in dataset py, rework script there
#     - use read_pickle function.
#     - No more keywords


Reading text file...
Overwriting existing pickle...
cd-hit: clustered .......... sequences into finished clusters at threshold 70


## Removing unnecessary columns

Since they are in other datasets

In [79]:
# TODO
df.drop(
    ["go_ids", "keyword_ids", "keywords", "tcdb_ids", "interpro_ids"], axis=1
).to_pickle("")


Unnamed: 0_level_0,gene_names,protein_names,reviewed,protein_existence,sequence,organism_id
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A0A0C5B5G6,MT-RNR1,Mitochondrial-derived peptide MOTS-c (Mitochon...,True,Evidence at protein level,MRWQEMGYIFYPRKLR,9606
A0A1B0GTW7,CIROP LMLN2,Ciliated left-right organizer metallopeptidase...,True,Evidence at protein level,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,9606
A0JNW5,BLTP3B KIAA0701 SHIP164 UHRF1BP1L,Bridge-like lipid transfer protein family memb...,True,Evidence at protein level,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606
A0JP26,POTEB3,POTE ankyrin domain family member B3,True,Evidence at protein level,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606
A0PK11,CLRN2,Clarin-2,True,Evidence at protein level,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606
...,...,...,...,...,...,...
X5MPI1,,Trypsin 1C,False,Evidence at transcript level,MVLLLAVALCSASTYPLVRPIPAGRDLRSRLDGRIVGGSAVSISQY...,7004
X5MPI2,,Chymotrypsin 2,False,Evidence at transcript level,MQLAVFLFCLLGSALALPKARMWTRDESRIIGGSNADIADYPWQLS...,7004
X5MPI3,,Chymotrypsin 7,False,Evidence at transcript level,AGLLFTLAGCVWVRGSHAPLVHSGKWLGAAGGRIVGGRTPYSGSSV...,7004
X5MPI4,,Chymotrypsin 12,False,Evidence at transcript level,MMRQTVLVLALAACVFAAELPIRRVPHSGPQRKFGLKHGRIVGGSD...,7004
