# Dataset comparison

Here, we will compare the new dataset to the one used in Manuscript 1, and figure out the impact of switching to GO terms for each of them.

### Imports

In [1]:
from subpred.util import load_df
import pandas as pd
import re

### New dataset

In [2]:
# Load the four tables that make up the new dataset through the project loader.
DATASET_FOLDER = "../data/datasets"
sequences_new, sequences70_new, keywords_new, go_new = (
    load_df(table_name)
    for table_name in ("uniprot", "uniprot70", "keywords", "go")
)

### Old dataset

#### Sequences

In [3]:
# Reading the old dataset from manuscript 1
data_old = pd.read_table(
    "../data/raw/uniprot/swissprot_data_2021_04_manuscript1.tsv.gz", low_memory=False
)

# Applying the same steps that were also applied to the new dataset.
# Keep only rows whose Fragment column is empty; afterwards the column carries
# no information and is dropped.
data_old = data_old[data_old.Fragment.isnull()].drop("Fragment", axis=1)

# Harmonise the column names of the raw export.
data_old = data_old.rename(
    columns={
        "Entry": "Uniprot",
        "Gene names": "gene_names",
        "Protein names": "protein_names",
        "Organism": "organism",
        "Organism ID": "organism_id",
        "Keyword ID": "keyword_ids",
        "Keywords": "keywords",
        "Gene ontology IDs": "go_ids",
        "Gene ontology (GO)": "go_terms",
        "Cross-reference (TCDB)": "tcdb_ids",
        "Protein existence": "protein_existence",
        "Sequence": "sequence",
    }
)
data_old = data_old.set_index("Uniprot", drop=True)

# Encode the two accepted evidence levels as numbers. Every other level is
# mapped to NaN by Series.map and is removed by the isin filter below.
data_old = data_old.assign(
    protein_existence=data_old.protein_existence.map(
        {"Evidence at protein level": 1, "Evidence at transcript level": 2}
    )
)
data_old = data_old[data_old.protein_existence.isin({1, 2})]

# Drop entries without gene names.
data_old = data_old[~data_old.gene_names.isnull()]

# Remove every character that is not one of the 20 letters in the character
# class from the sequences. regex=True is required: since pandas 2.0 the default
# is regex=False, and passing a compiled pattern then raises ValueError.
data_old = data_old.assign(
    sequence=data_old.sequence.str.replace(
        re.compile("[^ACDEFGHIKLMNPQRSTVWY]+"), "", regex=True
    )
)
data_old.head()

Unnamed: 0_level_0,gene_names,protein_names,organism,organism_id,keyword_ids,keywords,go_ids,go_terms,tcdb_ids,protein_existence,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Q5SW45,Mks1,Meckel syndrome type 1 protein homolog,Mus musculus (Mouse),10090,KW-0206; KW-0963; KW-0966; KW-0969; KW-0970; K...,Cell projection;Cilium;Cilium biogenesis/degra...,GO:0001843; GO:0003271; GO:0005737; GO:0005813...,centriole [GO:0005814]; centrosome [GO:0005813...,,1.0,MAEAVWSTDTGEAVYRSRDPVRNLRLRVHLQRITSSNFLHYQPAAQ...
Q00266,MAT1A AMS1 MATA1,S-adenosylmethionine synthase isoform type-1 (...,Homo sapiens (Human),9606,KW-0002; KW-0067; KW-0225; KW-0460; KW-0479; K...,3D-structure;ATP-binding;Disease variant;Disul...,GO:0000096; GO:0001887; GO:0004478; GO:0005524...,cytosol [GO:0005829]; ATP binding [GO:0005524]...,,1.0,MNGPVDGLCDHSLSEGVFMFTSESVGEGHPDKICDQISDAVLDAHL...
P26696,mapk1 mpk1,Mitogen-activated protein kinase 1 (MAP kinase...,Xenopus laevis (African clawed frog),8355,KW-0053; KW-0067; KW-0131; KW-0206; KW-0418; K...,ATP-binding;Apoptosis;Cell cycle;Cytoplasm;Cyt...,GO:0004674; GO:0004707; GO:0005524; GO:0005634...,cytoplasm [GO:0005737]; microtubule organizing...,,1.0,MAAAGAASNPGGGPEMVRGQAFDVGPRYINLAYIGEGAYGMVCSAH...
Q8NB16,MLKL,Mixed lineage kinase domain-like protein (hMLKL),Homo sapiens (Human),9606,KW-0002; KW-0025; KW-0067; KW-0175; KW-0472; K...,3D-structure;ATP-binding;Alternative splicing;...,GO:0004672; GO:0004706; GO:0005524; GO:0005634...,cell junction [GO:0030054]; cytoplasm [GO:0005...,1.A.105.1.1;,1.0,MENLKHIITLGQVIHKRCEEMKYCKKQCRRLGHRVLGLIKPLEMLQ...
P18298,Mat2a Ams2,S-adenosylmethionine synthase isoform type-2 (...,Rattus norvegicus (Rat),10116,KW-0007; KW-0067; KW-0460; KW-0479; KW-0547; K...,ATP-binding;Acetylation;Isopeptide bond;Magnes...,GO:0004478; GO:0005524; GO:0005829; GO:0006556...,cytosol [GO:0005829]; methionine adenosyltrans...,,1.0,MNGQLNGFHEAFIEEGTFLFTSESVGEGHPDKICDQINDAVLDAHL...


In [None]:
# Clustering
# cd_hit is a project helper. Presumably it returns the index labels (Uniprot
# accessions) of the cluster representatives at a 70% identity threshold, since
# its result is passed straight to .loc below — TODO confirm against subpred.cdhit.
from subpred.cdhit import cd_hit
data_old_cluster_repr_70 = cd_hit(data_old.sequence, identity_threshold=70)
# Keep only the rows of the old dataset selected by the clustering result.
data_old_70 = data_old.loc[data_old_cluster_repr_70]

#### Annotations

In [15]:
# Number of distinct raw keyword strings. Each string is still the unsplit,
# ";"-separated annotation of one entry, so this counts keyword combinations,
# not individual keywords.
data_old["keywords"].unique().shape

(70452,)

In [21]:
# One row per (entry, keyword) pair: split the ";"-separated annotation string,
# explode the resulting lists and strip surrounding whitespace.
(
    data_old["keywords"]
    .dropna()
    .str.split(";")
    .explode()
    .str.strip()
)

Uniprot
Q5SW45                  Cell projection
Q5SW45                           Cilium
Q5SW45    Cilium biogenesis/degradation
Q5SW45                        Cytoplasm
Q5SW45                     Cytoskeleton
                      ...              
P50402                          Nucleus
P50402                   Phosphoprotein
P50402               Reference proteome
P50402                    Transmembrane
P50402              Transmembrane helix
Name: keywords, Length: 1173407, dtype: object

In [26]:
# Keywords of the old dataset in long format: one row per unique
# (Uniprot, keyword) pair, with the keyword stored as a categorical column.
keywords_old = (
    data_old["keywords"]
    .dropna()
    .str.split(";")
    .explode()
    .str.strip()
    .astype("category")
    .to_frame(name="keyword")
    .reset_index()
    .drop_duplicates()
)
keywords_old

Unnamed: 0,Uniprot,keyword
0,Q5SW45,Cell projection
1,Q5SW45,Cilium
2,Q5SW45,Cilium biogenesis/degradation
3,Q5SW45,Cytoplasm
4,Q5SW45,Cytoskeleton
...,...,...
1173402,P50402,Nucleus
1173403,P50402,Phosphoprotein
1173404,P50402,Reference proteome
1173405,P50402,Transmembrane


In [32]:
# Load the "go_complete" table through the project loader.
go_full = load_df("go_complete")

In [38]:
# Show the table; pandas renders only a truncated preview of the rows.
go_full

Unnamed: 0,Uniprot,qualifier,go_id,evidence_code,aspect,date
0,A0A000,enables,GO:0003824,IEA,F,20230125
1,A0A000,enables,GO:0003870,IEA,F,20230125
2,A0A000,enables,GO:0030170,IEA,F,20230125
3,A0A000,involved_in,GO:0009058,IEA,P,20230125
4,A0A000,involved_in,GO:0033014,IEA,P,20230125
...,...,...,...,...,...,...
710667173,Z9JZ82,involved_in,GO:0046677,IEA,P,20230125
710667174,Z9JZ82,involved_in,GO:0055085,IEA,P,20230125
710667175,Z9JZ82,located_in,GO:0005886,IEA,C,20230125
710667176,Z9JZ82,located_in,GO:0016020,IEA,C,20230125


In [36]:
# GO annotations for the old dataset: restrict the complete GO table to the
# accessions present in data_old. The annotations therefore come from
# go_complete, not from the go_ids column of the old export.
go_old = go_full.loc[go_full["Uniprot"].isin(data_old.index)]

## Comparison between unclustered datasets