In [11]:
from subpred.dataset import create_dataset

# Load the full protein table without any keyword / taxonomy / clustering
# filter, so the statistics below describe the whole input file.
#
# Optional arguments that are intentionally left disabled here:
#   keywords_substrate_filter=["Amino-acid transport", "Sugar transport"]
#   keywords_component_filter=["Transmembrane"]
#   keywords_transport_filter=["Transport"]
#   tax_ids_filter=[3702, 9606, 83333, 559292]
#   outliers=outliers
#   sequence_clustering=70
#   force_update=True
df = create_dataset(
    input_file="../data/raw/swissprot/uniprot_data_2022_04.tab.gz",
    multi_substrate="keep",
    verbose=True,
    evidence_code=2,
    invalid_amino_acids="remove_protein",
)
# Last expression of the cell: (rows, columns) of the loaded table.
df.shape

Found pickle, reading...


(144929, 15)

## Keywords 

### Substrates

In [26]:
# Keep only the rows whose substrate-keyword field is not the empty string.
has_substrate_keyword = df["keywords_substrates"].ne("")
df_substrates = df.loc[has_substrate_keyword]
print(df_substrates.shape)

# Frequency of each distinct (possibly multi-keyword) substrate annotation.
df_substrates["keywords_substrates"].value_counts()

(14296, 15)


Protein transport                                                 3278
Electron transport                                                2139
Ion transport                                                     1057
Oxygen transport                                                   889
Hydrogen ion transport;Ion transport                               740
                                                                  ... 
Ion transport;Neurotransmitter transport;Sodium transport            1
Ion transport;Phosphate transport;Zinc transport                     1
Cobalt transport;Ion transport;Iron transport;Zinc transport         1
Cobalt transport;Copper transport;Ion transport;Zinc transport       1
Amino-acid transport;Ion transport                                   1
Name: keywords_substrates, Length: 87, dtype: int64

In [23]:
# One row per (protein, substrate keyword): split the ";"-joined annotation
# and explode it. explode() repeats the original index label for every
# element, so the index still identifies the source row.
df_substrates_long = df_substrates.keywords_substrates.str.split(";").explode()

# How often each individual substrate keyword occurs.
display(df_substrates_long.value_counts())

# Series.groupby() needs `by` or `level`; calling it with no argument raises
# TypeError. Group on the preserved index instead and show how many rows
# carry 1, 2, 3, ... substrate keywords.
# NOTE(review): the originally intended aggregation is not visible in the
# notebook -- confirm this is the statistic that was wanted.
display(df_substrates_long.groupby(level=0).size().value_counts().sort_index())

Ion transport                         4487
Protein transport                     4114
Electron transport                    2189
Oxygen transport                       889
Hydrogen ion transport                 816
Lipid transport                        785
Sodium transport                       732
Potassium transport                    704
Sugar transport                        703
mRNA transport                         669
Translocation                          626
Amino-acid transport                   545
Calcium transport                      463
Chloride                               392
Iron transport                         293
Zinc transport                         191
Neurotransmitter transport             175
Phosphate transport                    143
Peptide transport                      119
Copper transport                        84
Ammonia transport                       79
Sodium/potassium transport              77
Anion exchange                          51
Sulfate tra

### Transmembrane transport

In [60]:
# Long-format keyword table: one row per (index label, keyword).
# The former index becomes a regular column via reset_index(), and the
# exploded "keywords" column is renamed to the singular "keyword".
df_keywords_long = (
    df["keywords"]
    .str.split(";")
    .explode()
    .str.strip()
    .reset_index(drop=False)
    .rename(columns={"keywords": "keyword"})
)

df_keywords_long

Unnamed: 0,Uniprot,keyword
0,A0A024SC78,3D-structure
1,A0A024SC78,Disulfide bond
2,A0A024SC78,Hydrolase
3,A0A024SC78,Secreted
4,A0A024SC78,Serine esterase
...,...,...
1205125,V5XVW4,Capsid protein
1205126,V5XVW4,Direct protein sequencing
1205127,V5XVW4,Virion
1205128,W5X2N3,


In [63]:
import numpy as np

# Row masks for the two keywords of interest.
is_transmembrane = df_keywords_long["keyword"].eq("Transmembrane")
is_transport = df_keywords_long["keyword"].eq("Transport")

# Accessions carrying each keyword.
prot_arr_tm = df_keywords_long.loc[is_transmembrane, "Uniprot"].values
prot_arr_tp = df_keywords_long.loc[is_transport, "Uniprot"].values

# Accessions carrying both keywords (intersect1d returns sorted unique values).
prot_arr_tm_tp = np.intersect1d(prot_arr_tm, prot_arr_tp)

prot_arr_tm_tp.size

9624

## Gene ontology (Uniprot column annotation)

Relevant terms:

- transmembrane transporter activity: https://www.ebi.ac.uk/QuickGO/term/GO:0022857
- membrane: https://www.ebi.ac.uk/QuickGO/term/GO:0016020
- plasma membrane: https://www.ebi.ac.uk/QuickGO/term/GO:0005886
- active transmembrane transporter activity
    - primary active transmembrane transporter activity
    - secondary active transmembrane transporter activity
    - etc.

TODO 

- Does the dataset contain all parent nodes of the annotated GO terms?
- Compute statistics on active transport etc.
- Pay attention to the type of relationship (is_a) and to evidence codes.
- Which GO terms are included in the UniProt column? Are they filtered?
- Get GO terms from GO directly, or from QuickGO (limited to 2 million annotations).

In [27]:
# GO identifiers that are treated as "transmembrane transporter activity".
# GO:0022857 is the term linked in the notes above; the other identifiers are
# presumably alternative/secondary IDs of that term -- TODO confirm in QuickGO.
transmembrane_transporter_activity = set(
    [
        "GO:0022857",
        "GO:0005386",
        "GO:0015563",
        "GO:0015646",
        "GO:0022891",
        "GO:0022892",
    ]
)


In [48]:
import pandas as pd
import re

# Matches the bracketed GO identifier, e.g. "[GO:0022857]"; group 1 is the
# bare ID. A raw string is used so the backslashes reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
go_id_pattern = re.compile(r"\[(GO:[0-9]{7})\]")

# Long-format GO table: one row per (index label, GO annotation).
# Rows without any GO annotation are dropped first.
df_go = df[df.go_terms.notna()]
df_go = df_go.go_terms.str.split(";").explode().str.strip().reset_index(drop=False)

# expand=False yields a Series (single capture group) instead of a
# one-column DataFrame.
df_go["go_id"] = df_go.go_terms.str.extract(go_id_pattern, expand=False)

# regex=True is required: pandas >= 2.0 defaults to literal replacement and
# raises ValueError when given a compiled pattern with regex=False.
df_go["go_term"] = df_go.go_terms.str.replace(go_id_pattern, "", regex=True).str.strip()

# The raw annotation string is no longer needed once split into id + name.
df_go = df_go.drop("go_terms", axis=1)
df_go = df_go.drop_duplicates()

In [52]:
# Rows whose GO term name is exactly "transmembrane transporter activity".
df_go.loc[df_go["go_term"].eq("transmembrane transporter activity")]

Unnamed: 0,Uniprot,go_id,go_term
627,A0A0B7P9G0,GO:0022857,transmembrane transporter activity
1746,A0A131MCZ8,GO:0022857,transmembrane transporter activity
4569,A1A5C7,GO:0022857,transmembrane transporter activity
5484,A1Z8N1,GO:0022857,transmembrane transporter activity
6841,A2AJN7,GO:0022857,transmembrane transporter activity
...,...,...,...
1394142,Q9XI74,GO:0022857,transmembrane transporter activity
1394689,Q9ZUS1,GO:0022857,transmembrane transporter activity
1394731,Q9ZVH5,GO:0022857,transmembrane transporter activity
1394779,S0ECK8,GO:0022857,transmembrane transporter activity


In [None]:
# Handle for the OBO PURL namespace.
# NOTE(review): `get_namespace` is not imported in any cell visible here and
# this cell has no execution count -- presumably it comes from an ontology
# library (e.g. owlready2); confirm and add the import before running.
obo = get_namespace("http://purl.obolibrary.org/obo/")