# Creating matrices for pairs of labels

## Which matrices?

- Sequence-based ML model F1 scores
- Sequence similarity scores
- Annotation similarity scores
- Annotation overlap

## For which dataset?

- Yeast first (organism ID 559292), because these results are needed for a conference

In [1]:
import re

import networkx as nx
import pandas as pd

from subpred.chemical_similarity import get_pairwise_tanimoto
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset
from subpred.util import load_df


In [2]:
# Fetch the three tables returned by the transmembrane-transporter dataset
# loader for organism ID 559292: sequences, GO annotations, GO-ChEBI mapping.
df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    datasets_path="../data/datasets/",
    organism_ids={559292},
    swissprot_only=False,
    max_sequence_evidence_code=1,
    exclude_iea_go_terms=False,
)

# Render all three tables, in the same order as they were returned.
display(df_sequences, df_uniprot_goa, df_go_chebi)


Unnamed: 0_level_0,sequence,reviewed,protein_existence,organism_id,protein_names
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P00401,MVQRWLYSTNAKDIAVLYFMLAIFSGMAGTAMSLIIRLELAAPGSQ...,True,1,559292,Cytochrome c oxidase subunit 1 (EC 7.1.1.9) (C...
P00830,MVLPRLYTATSRAAFKAAKQSAPLLSTSWKRCMASAAQSTPITGKV...,True,1,559292,"ATP synthase subunit beta, mitochondrial (EC 7..."
P04817,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...,True,1,559292,Arginine permease CAN1 (Canavanine resistance ...
P07213,MKSFITRNKTAILATVAATGTAIGAYYYYNQLQQQQQRGKKNTINK...,True,1,559292,Mitochondrial import receptor subunit TOM70 (7...
P14906,MPTNYEYDEASETWPSFILTGLLMVVGPMTLLQIYQIFFGANAEDG...,True,1,559292,Protein translocation protein SEC63 (Protein N...
...,...,...,...,...,...
P39542,MFQQLSASIRHNAHIIFLCISWYFISSLASQVTKQVLTVCPLPLFL...,True,1,559292,Uncharacterized transporter YJL193W
Q05497,MAGILSKTLSEVHPSLRTNGMGIGNTHRRISLGFLPPNKKNPLVRK...,True,1,559292,Uncharacterized transporter YDR338C
P38318,MEPKRKSGSLAKHDLPQFYLLIMLYLAQGIPVGLAFGTVPFLLKSL...,True,1,559292,Uncharacterized membrane protein YBR220C
P46996,MSNEDETTRLMSSDEMDYLLETAGINALEEIISQNDSTGINLDTNE...,True,1,559292,Uncharacterized membrane protein YJL163C


Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:1901702,salt transmembrane transporter activity
1,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015216,purine nucleotide transmembrane transporter ac...
2,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015932,nucleobase-containing compound transmembrane t...
3,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:1901505,carbohydrate derivative transmembrane transpor...
4,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0005347,ATP transmembrane transporter activity
...,...,...,...,...,...,...,...,...
7732,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0015399,primary active transmembrane transporter activity
7733,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0015318,inorganic molecular entity transmembrane trans...
7734,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0022857,transmembrane transporter activity
7735,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0022890,inorganic cation transmembrane transporter act...


Unnamed: 0,go_id,go_term,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,L-ornithine transmembrane transporter activity,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,S-adenosyl-L-methionine transmembrane transpor...,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,sulfur amino acid transmembrane transporter ac...,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000100,S-methylmethionine transmembrane transporter a...,CHEBI:58252,S-methyl-L-methionine zwitterion,has_primary_input
4,GO:0000102,L-methionine secondary active transmembrane tr...,CHEBI:57844,L-methionine zwitterion,has_primary_input
...,...,...,...,...,...
379,GO:1901682,sulfur compound transmembrane transporter acti...,CHEBI:26835,sulfur molecular entity,has_primary_input
380,GO:1902557,5'-adenylyl sulfate transmembrane transporter ...,CHEBI:58243,5'-adenylyl sulfate(2-),has_primary_input
381,GO:1903089,5-amino-1-ribofuranosylimidazole-4-carboxamide...,CHEBI:28498,acadesine,has_primary_input
382,GO:1903425,fluoride transmembrane transporter activity,CHEBI:17051,fluoride,has_primary_input


## Adjacency matrix

In [3]:
def get_adjacency_matrix(graph, labels:list, edges_filter:set = {"is_a"}):
    """Return the adjacency matrix between ``labels`` as a labelled DataFrame.

    Only edges whose key is contained in ``edges_filter`` and whose two
    endpoints are both in ``labels`` contribute to the matrix.

    Parameters
    ----------
    graph
        A networkx multigraph whose edge keys name the relation type
        (the edge filter is called with ``(source, target, key)``).
    labels : list
        Node identifiers to use as rows and columns, in this order.
        Must not contain duplicates.
    edges_filter : set
        Edge keys to keep. The default is never mutated here.

    Returns
    -------
    pd.DataFrame
        Square frame indexed and columned by ``labels``; the values are
        those computed by ``nx.adjacency_matrix`` on the filtered graph.
        A label that is present in ``graph`` but has no matching edge gets
        an all-zero row and column.

    Raises
    ------
    AssertionError
        If ``labels`` contains duplicate elements.
    """
    assert len(labels) == len(set(labels)), "labels should only contain unique elements"

    # A filtered view keeps every requested node, including those without any
    # edge of the requested type. Selecting by edges first (edge_subgraph)
    # would drop such nodes, so they would be missing from the nodelist graph.
    label_set = set(labels)
    subgraph = nx.subgraph_view(
        graph,
        filter_node=lambda node: node in label_set,
        filter_edge=lambda source, target, key: key in edges_filter,
    )

    # scipy sparse matrix, rows/columns ordered like `labels`
    adjacency_sparse = nx.adjacency_matrix(G=subgraph, nodelist=labels)

    return pd.DataFrame(adjacency_sparse.toarray(), columns=labels, index=labels)

### GO adjacency

In [4]:
# GO graph plus every GO id found in the ancestor column; sorting fixes the
# row/column order of the resulting matrix.
graph_go = load_df("go_obo")
go_ids = sorted(df_uniprot_goa["go_id_ancestor"].unique())
df_adj_matrix_go2 = get_adjacency_matrix(
    graph=graph_go,
    labels=go_ids,
    edges_filter={"is_a"},
)

### ChEBI adjacency

In [5]:
# ChEBI graph plus the ChEBI ids linked to a GO term via "has_primary_input";
# sorting fixes the row/column order of the resulting matrix.
graph_chebi = load_df("chebi_obo")
chebi_id_primary = sorted(
    df_go_chebi.loc[
        df_go_chebi["chebi_go_relation"] == "has_primary_input", "chebi_id"
    ].unique()
)
# get_adjacency_matrix only builds views of the graph and never modifies it,
# so the graph is passed directly (as in the GO cell) instead of copying it.
df_adj_matrix_chebi2 = get_adjacency_matrix(
    graph_chebi, labels=chebi_id_primary, edges_filter={"is_a"}
)

## Chemical similarity

https://rdkit.org/docs/source/rdkit.Chem.html#module-rdkit.Chem

https://rdkit.org/docs/source/rdkit.Chem.Fingerprints.html

https://rdkit.org/docs/source/rdkit.Chem.Fingerprints.MolSimilarity.html

https://rdkit.org/docs/source/rdkit.Chem.Fingerprints.SimilarityScreener.html

In [6]:
# Pairwise similarity matrix, as computed by get_pairwise_tanimoto, over every
# distinct ChEBI id in the GO-ChEBI mapping table.
df_tanimoto = get_pairwise_tanimoto(df_go_chebi["chebi_id"].unique())
df_tanimoto



chebi_id,CHEBI:46911,CHEBI:59789,CHEBI:58252,CHEBI:57844,CHEBI:45725,CHEBI:17359,CHEBI:77847,CHEBI:57966,CHEBI:15377,CHEBI:15378,...,CHEBI:33118,CHEBI:58702,CHEBI:58937,CHEBI:18212,CHEBI:49552,CHEBI:16347,CHEBI:15792,CHEBI:58243,CHEBI:28498,CHEBI:17051
chebi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEBI:46911,1.000000,0.222222,0.500000,0.500000,0.222222,0.083333,0.125000,0.360000,0.0,0.0,...,0.043478,0.147059,0.059701,0.083333,0.0,0.212121,0.250000,0.057971,0.096154,0.0
CHEBI:59789,0.222222,1.000000,0.316667,0.234375,0.029412,0.032787,0.064935,0.109375,0.0,0.0,...,0.016667,0.070423,0.148936,0.050000,0.0,0.132353,0.098361,0.486486,0.362319,0.0
CHEBI:58252,0.500000,0.316667,1.000000,0.517241,0.060606,0.076923,0.119048,0.241379,0.0,0.0,...,0.040000,0.138889,0.089552,0.076923,0.0,0.235294,0.230769,0.056338,0.092593,0.0
CHEBI:57844,0.500000,0.234375,0.517241,1.000000,0.060606,0.076923,0.119048,0.241379,0.0,0.0,...,0.040000,0.138889,0.073529,0.076923,0.0,0.235294,0.230769,0.071429,0.092593,0.0
CHEBI:45725,0.222222,0.029412,0.060606,0.060606,1.000000,0.000000,0.027027,0.173913,0.0,0.0,...,0.000000,0.000000,0.015873,0.000000,0.0,0.031250,0.045455,0.015385,0.020408,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CHEBI:16347,0.212121,0.132353,0.235294,0.235294,0.031250,0.083333,0.184211,0.259259,0.0,0.0,...,0.043478,0.147059,0.059701,0.083333,0.0,1.000000,0.304348,0.073529,0.096154,0.0
CHEBI:15792,0.250000,0.098361,0.230769,0.230769,0.045455,0.142857,0.129032,0.411765,0.0,0.0,...,0.076923,0.208333,0.051724,0.142857,0.0,0.304348,1.000000,0.067797,0.119048,0.0
CHEBI:58243,0.057971,0.486486,0.056338,0.071429,0.015385,0.035088,0.147059,0.063492,0.0,0.0,...,0.017857,0.107692,0.195402,0.035088,0.0,0.073529,0.067797,1.000000,0.363636,0.0
CHEBI:28498,0.096154,0.362319,0.092593,0.092593,0.020408,0.048780,0.087719,0.108696,0.0,0.0,...,0.025000,0.076923,0.128205,0.048780,0.0,0.096154,0.119048,0.363636,1.000000,0.0


## Pairwise ML models

- Find label combinations with enough samples