In [57]:
# Disabled download step: fetches the UniProtKB TSV export that the loading cell reads from
# ../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv.
# NOTE(review): the URL-decoded query is
#   ((fragment:false) AND (existence:1) OR (existence:2))
# If AND binds tighter than OR, the fragment filter applies only to existence:1 entries and
# fragments with existence:2 are still downloaded — confirm against the UniProt query syntax.
# !python3 ../subpred/uniprot_downloader.py "https://rest.uniprot.org/uniprotkb/search?compressed=false&fields=accession%2Cgene_names%2Cprotein_name%2Creviewed%2Cprotein_existence%2Csequence%2Corganism_id%2Cgo_id%2Ckeywordid%2Ckeyword%2Cxref_tcdb%2Cxref_interpro&format=tsv&query=%28%28fragment%3Afalse%29%20AND%20%28existence%3A1%29%20OR%20%28existence%3A2%29%29&size=500" "../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv"

In [58]:
import pandas as pd

# Schema of the UniProt TSV export, in file order: column label -> pandas dtype.
dtypes = {
    "Uniprot": "string",
    "gene_names": "string",
    "protein_names": "string",
    "reviewed": "category",
    "protein_existence": "category",
    "sequence": "string",
    "organism_id": "int",
    "go_ids": "string",
    "keyword_ids": "string",
    "keywords": "string",
    "tcdb_ids": "string",
    "interpro_ids": "string",
}
# Column labels are the schema keys; dicts preserve insertion order, so this matches the file order.
names = list(dtypes)

# The file's own header row is skipped and replaced by `names`;
# the first column ("Uniprot") becomes the index.
df = pd.read_table(
    "../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv",
    names=names,
    dtype=dtypes,
    header=None,
    skiprows=1,
    index_col=0,
)


## Cleanup

In [59]:
# Strip trailing ";" characters from the TCDB identifier strings.
df["tcdb_ids"] = df["tcdb_ids"].str.rstrip(";")

In [60]:
# Turn the review-status labels ("reviewed" / "unreviewed") into a boolean flag,
# printing the value counts before and after the conversion.
print(df.reviewed.value_counts())
# Guard against re-running the cell: once the column is boolean, comparing it to the
# string "reviewed" again would silently set every row to False.
if not pd.api.types.is_bool_dtype(df.reviewed):
    # Vectorized comparison instead of a per-element Python lambda.
    df.reviewed = (df.reviewed == "reviewed").astype("bool")
print(df.reviewed.value_counts())

unreviewed    1530743
reviewed       162964
Name: reviewed, dtype: int64
False    1530743
True      162964
Name: reviewed, dtype: int64


In [55]:
# Tally the entries per protein-existence evidence level.
df["protein_existence"].value_counts()

Evidence at transcript level    772752
Evidence at protein level       249205
Name: protein_existence, dtype: int64

### Filtering out proteins without gene names

Proteins without gene names appear to be mostly peptides and pollen proteins, not transporters.

In [49]:
# Keep only the rows that have a gene name, reporting the row count before and after.
print("before", len(df))
has_gene_name = df["gene_names"].notnull()
df = df[has_gene_name]
print("after", len(df))

before 1021957
after 1021957


### Parsing sequences

In [65]:
import re

# Count the entries whose sequence is not made up solely of the 20 standard amino-acid letters.
print("proteins with non-standard amino acids:")
standard_only = df["sequence"].str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))
len(df[~standard_only])

proteins with non-standard amino acids:


45718

Removing non-standard amino acids from sequences:

In [66]:
# Delete every character that is not one of the 20 standard amino-acid letters.
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False and then
# rejects a compiled pattern with a ValueError.
df.sequence = df.sequence.str.replace(
    re.compile("[^ACDEFGHIKLMNPQRSTVWY]+"), "", regex=True
)

In [68]:
# Re-count the entries with non-standard letters after the cleanup step.
print("proteins with non-standard amino acids:")
standard_only = df["sequence"].str.fullmatch(re.compile("[ACDEFGHIKLMNPQRSTVWY]+"))
len(df[~standard_only])

proteins with non-standard amino acids:


0

## Annotation datasets

### Keywords

### Interpro Domains

### TCDB substrates

### GO

## Clustering

In [2]:
# NOTE(review): the whole create_dataset call below is commented out, so running this
# cell on a fresh kernel does not define `df_uniprot`; cells that read `df_uniprot`
# will raise NameError until the call is restored or replaced.
# from subpred.dataset import create_dataset
# df_uniprot = create_dataset(
#     input_file="../data/raw/uniprot/uniprot_2022_05_evidence1-2_nofragments.tsv",
#     # keywords_classes = None,
#     # keywords_classes_all = SUBSTRATE_KEYWORDS,
#     # keywords_filter = None,
#     multi_substrate="keep",
#     # outliers=outliers,
#     verbose=True,
#     # tax_ids_filter=[3702, 9606, 83333, 559292],
#     evidence_code=2,
#     invalid_amino_acids="remove_amino_acids",
#     # gene_names_only = True,
#     # force_update=True,
#     # remove_sequence_fragments = True,
#     # force_update = True,
#     tcdb_substrates_file="../data/raw/tcdb/tcdb_substrates.tsv",
#     swissprot_only=False,
#     sequence_clustering=70,
# )
# TODO dtypes, take apart, mmseqs2

# TODO categories in dataset py, rework script there
#     - use read_pickle function. 
#     - No more keywords

Reading text file...
Overwriting existing pickle...
cd-hit: clustered .......... sequences into finished clusters at threshold 70


In [6]:
# Show the column labels of df_uniprot.
# NOTE(review): df_uniprot is only assigned in a commented-out create_dataset call in this
# notebook — confirm it is defined before this cell runs.
df_uniprot.columns

Index(['gene_names', 'protein_names', 'reviewed', 'protein_existence',
       'sequence', 'organism_id', 'go_ids', 'keyword_ids', 'keywords',
       'tcdb_id', 'interpro', 'tcdb_class', 'tcdb_substrates',
       'keywords_substrates', 'keywords_transport_related',
       'keywords_location'],
      dtype='object')