# Creating matrices for pairs of labels

## Which ones?

- Sequence-based ML model F1 scores
- Sequence similarity scores
- Annotation similarity scores
- Annotation overlap

## For which dataset?

- Yeast first, because it is needed for the conference

In [1]:
from subpred.util import load_df
import networkx as nx
import re
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset


In [2]:
# Load the three tables of the transporter dataset for organism 559292
# (the organism_id that also appears in the sequence table shown below).
df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    datasets_path="../data/datasets/",
    organism_ids={559292},
    # Sequence-level filters
    swissprot_only=False,
    max_sequence_evidence_code=1,
    # Annotation-level filter
    exclude_iea_go_terms=False,
)

# Render the sequence, GO annotation and GO-ChEBI tables one after another.
display(df_sequences, df_uniprot_goa, df_go_chebi)


Unnamed: 0_level_0,sequence,reviewed,protein_existence,organism_id,protein_names
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P00401,MVQRWLYSTNAKDIAVLYFMLAIFSGMAGTAMSLIIRLELAAPGSQ...,True,1,559292,Cytochrome c oxidase subunit 1 (EC 7.1.1.9) (C...
P00830,MVLPRLYTATSRAAFKAAKQSAPLLSTSWKRCMASAAQSTPITGKV...,True,1,559292,"ATP synthase subunit beta, mitochondrial (EC 7..."
P04817,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...,True,1,559292,Arginine permease CAN1 (Canavanine resistance ...
P07213,MKSFITRNKTAILATVAATGTAIGAYYYYNQLQQQQQRGKKNTINK...,True,1,559292,Mitochondrial import receptor subunit TOM70 (7...
P14906,MPTNYEYDEASETWPSFILTGLLMVVGPMTLLQIYQIFFGANAEDG...,True,1,559292,Protein translocation protein SEC63 (Protein N...
...,...,...,...,...,...
P39542,MFQQLSASIRHNAHIIFLCISWYFISSLASQVTKQVLTVCPLPLFL...,True,1,559292,Uncharacterized transporter YJL193W
Q05497,MAGILSKTLSEVHPSLRTNGMGIGNTHRRISLGFLPPNKKNPLVRK...,True,1,559292,Uncharacterized transporter YDR338C
P38318,MEPKRKSGSLAKHDLPQFYLLIMLYLAQGIPVGLAFGTVPFLLKSL...,True,1,559292,Uncharacterized membrane protein YBR220C
P46996,MSNEDETTRLMSSDEMDYLLETAGINALEEIISQNDSTGINLDTNE...,True,1,559292,Uncharacterized membrane protein YJL163C


Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015932,nucleobase-containing compound transmembrane t...
1,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015216,purine nucleotide transmembrane transporter ac...
2,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015215,nucleotide transmembrane transporter activity
3,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:1901702,salt transmembrane transporter activity
4,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015605,organophosphate ester transmembrane transporte...
...,...,...,...,...,...,...,...,...
7732,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0022804,active transmembrane transporter activity
7733,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0015078,proton transmembrane transporter activity
7734,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0022857,transmembrane transporter activity
7735,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0015399,primary active transmembrane transporter activity


Unnamed: 0,go_id,go_term,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,L-ornithine transmembrane transporter activity,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,S-adenosyl-L-methionine transmembrane transpor...,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,sulfur amino acid transmembrane transporter ac...,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000100,S-methylmethionine transmembrane transporter a...,CHEBI:58252,S-methyl-L-methionine zwitterion,has_primary_input
4,GO:0000102,L-methionine secondary active transmembrane tr...,CHEBI:57844,L-methionine zwitterion,has_primary_input
...,...,...,...,...,...
379,GO:1901682,sulfur compound transmembrane transporter acti...,CHEBI:26835,sulfur molecular entity,has_primary_input
380,GO:1902557,5'-adenylyl sulfate transmembrane transporter ...,CHEBI:58243,5'-adenylyl sulfate(2-),has_primary_input
381,GO:1903089,5-amino-1-ribofuranosylimidazole-4-carboxamide...,CHEBI:28498,acadesine,has_primary_input
382,GO:1903425,fluoride transmembrane transporter activity,CHEBI:17051,fluoride,has_primary_input


## Adjacency matrix

In [3]:
def get_adjacency_matrix(graph, labels:list, edges_filter:set = {"is_a"}):
    """Build a labelled adjacency matrix for ``labels`` from selected edge types.

    Only edges whose key is contained in ``edges_filter`` and whose two end
    points are both in ``labels`` contribute to the matrix. Labels that have no
    such edge are kept and simply get an all-zero row and column.

    Parameters
    ----------
    graph : networkx multigraph
        Graph whose edge keys name the relation type, e.g. the graphs this
        notebook loads with ``load_df("go_obo")`` / ``load_df("chebi_obo")``.
        The graph is only read, never modified.
    labels : list
        Unique node identifiers; they define the order of rows and columns.
    edges_filter : set
        Edge keys to keep. The default set is only read, never mutated.

    Returns
    -------
    pd.DataFrame
        Square frame with ``labels`` as index and columns, filled with the
        values of ``nx.adjacency_matrix`` for the filtered graph.

    Raises
    ------
    AssertionError
        If ``labels`` contains duplicates.
    """
    # Validate before doing any graph work.
    assert len(labels) == len(set(labels)), "labels should only contain unique elements"

    label_set = set(labels)

    # A filtered view keeps every requested node, even one without any edge of
    # the selected types. Building the subgraph from the edges instead would
    # drop such nodes, and nx.adjacency_matrix can then reject `nodelist`.
    subgraph = nx.subgraph_view(
        graph,
        filter_node=lambda node: node in label_set,
        filter_edge=lambda source, target, key: key in edges_filter,
    )

    # scipy sparse matrix, rows/columns ordered like `labels`
    adjacency_sparse = nx.adjacency_matrix(G=subgraph, nodelist=labels)

    return pd.DataFrame(adjacency_sparse.toarray(), columns=labels, index=labels)

### GO adjacency

In [4]:
# GO graph and every distinct ancestor term from the annotation table.
# Sorting fixes the row/column order of the resulting matrix.
graph_go = load_df("go_obo")
go_ids = sorted(df_uniprot_goa["go_id_ancestor"].unique())
df_adj_matrix_go2 = get_adjacency_matrix(
    graph=graph_go,
    labels=go_ids,
    edges_filter={"is_a"},
)

### ChEBI adjacency

In [5]:
graph_chebi = load_df("chebi_obo")

# Labels: distinct ChEBI terms linked to a GO term via "has_primary_input",
# sorted so that the row/column order of the matrix is fixed.
chebi_id_primary = sorted(
    df_go_chebi[df_go_chebi.chebi_go_relation == "has_primary_input"].chebi_id.unique()
)

# get_adjacency_matrix only reads the graph, so it is passed without a
# defensive copy, the same way the GO graph is passed.
df_adj_matrix_chebi2 = get_adjacency_matrix(
    graph_chebi, labels=chebi_id_primary, edges_filter={"is_a"}
)

## Chemical similarity

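Here, we are using Morgan fingerprints. Other fingerprint methods might work better, so the cell below also computes the pairwise similarities with atom-pair, torsion and MACCS fingerprints for comparison: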


In [6]:
from subpred.chemical_similarity import get_pairwise_similarity

# Fingerprint types to compare, in the order their matrices are shown.
methods = ["morgan", "atompairs", "torsions", "maccs"]

for method in methods:
    print(method)
    # df_tanimoto is reassigned on every pass, so after the loop it holds
    # only the matrix of the last entry in `methods`.
    df_tanimoto = get_pairwise_similarity(
        df_go_chebi["chebi_id"].unique(),
        fingerprint_method=method,
    )
    display(df_tanimoto)


morgan




chebi_id,CHEBI:46911,CHEBI:59789,CHEBI:58252,CHEBI:57844,CHEBI:45725,CHEBI:17359,CHEBI:77847,CHEBI:57966,CHEBI:15377,CHEBI:15378,...,CHEBI:33118,CHEBI:58702,CHEBI:58937,CHEBI:18212,CHEBI:49552,CHEBI:16347,CHEBI:15792,CHEBI:58243,CHEBI:28498,CHEBI:17051
chebi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEBI:46911,1.000000,0.218750,0.500000,0.500000,0.222222,0.083333,0.125000,0.360000,0.0,0.0,...,0.043478,0.147059,0.058824,0.083333,0.0,0.212121,0.250000,0.057143,0.074074,0.0
CHEBI:59789,0.218750,1.000000,0.311475,0.230769,0.028986,0.032258,0.064103,0.107692,0.0,0.0,...,0.016393,0.069444,0.134021,0.049180,0.0,0.130435,0.096774,0.493333,0.352113,0.0
CHEBI:58252,0.500000,0.311475,1.000000,0.517241,0.060606,0.076923,0.119048,0.241379,0.0,0.0,...,0.040000,0.138889,0.088235,0.076923,0.0,0.235294,0.230769,0.055556,0.071429,0.0
CHEBI:57844,0.500000,0.230769,0.517241,1.000000,0.060606,0.076923,0.119048,0.241379,0.0,0.0,...,0.040000,0.138889,0.072464,0.076923,0.0,0.235294,0.230769,0.055556,0.071429,0.0
CHEBI:45725,0.222222,0.028986,0.060606,0.060606,1.000000,0.000000,0.027027,0.173913,0.0,0.0,...,0.000000,0.000000,0.015625,0.000000,0.0,0.031250,0.045455,0.015152,0.020000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CHEBI:16347,0.212121,0.130435,0.235294,0.235294,0.031250,0.083333,0.184211,0.259259,0.0,0.0,...,0.043478,0.147059,0.058824,0.083333,0.0,1.000000,0.304348,0.072464,0.074074,0.0
CHEBI:15792,0.250000,0.096774,0.230769,0.230769,0.045455,0.142857,0.129032,0.411765,0.0,0.0,...,0.076923,0.208333,0.050847,0.142857,0.0,0.304348,1.000000,0.066667,0.090909,0.0
CHEBI:58243,0.057143,0.493333,0.055556,0.055556,0.015152,0.034483,0.144928,0.062500,0.0,0.0,...,0.017544,0.106061,0.177778,0.034483,0.0,0.072464,0.066667,1.000000,0.352941,0.0
CHEBI:28498,0.074074,0.352113,0.071429,0.071429,0.020000,0.023256,0.067797,0.083333,0.0,0.0,...,0.024390,0.055556,0.097561,0.023256,0.0,0.074074,0.090909,0.352941,1.000000,0.0


atompairs




chebi_id,CHEBI:46911,CHEBI:59789,CHEBI:58252,CHEBI:57844,CHEBI:45725,CHEBI:17359,CHEBI:77847,CHEBI:57966,CHEBI:15377,CHEBI:15378,...,CHEBI:33118,CHEBI:58702,CHEBI:58937,CHEBI:18212,CHEBI:49552,CHEBI:16347,CHEBI:15792,CHEBI:58243,CHEBI:28498,CHEBI:17051
chebi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEBI:46911,1.000000,0.090323,0.372881,0.411765,0.104167,0.024390,0.139241,0.243902,0.0,0.0,...,0.000000,0.082192,0.057692,0.024390,0.0,0.140845,0.096154,0.052469,0.100000,0.0
CHEBI:59789,0.090323,1.000000,0.137705,0.090323,0.056818,0.006536,0.075529,0.032573,0.0,0.0,...,0.006579,0.061538,0.260042,0.006536,0.0,0.080997,0.015723,0.424883,0.343465,0.0
CHEBI:58252,0.372881,0.137705,1.000000,0.528302,0.045455,0.020000,0.087912,0.153846,0.0,0.0,...,0.000000,0.086420,0.079618,0.020000,0.0,0.232877,0.064516,0.060606,0.063218,0.0
CHEBI:57844,0.411765,0.090323,0.528302,1.000000,0.060000,0.024390,0.097561,0.186047,0.0,0.0,...,0.000000,0.082192,0.057692,0.024390,0.0,0.173913,0.075472,0.042813,0.053892,0.0
CHEBI:45725,0.104167,0.056818,0.045455,0.060000,1.000000,0.000000,0.059829,0.062500,0.0,0.0,...,0.000000,0.008929,0.055072,0.027027,0.0,0.036036,0.022472,0.027397,0.039604,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CHEBI:16347,0.140845,0.080997,0.232877,0.173913,0.036036,0.020000,0.087912,0.132075,0.0,0.0,...,0.000000,0.060241,0.052795,0.020000,0.0,1.000000,0.178571,0.054217,0.057143,0.0
CHEBI:15792,0.096154,0.015723,0.064516,0.075472,0.022472,0.080000,0.041667,0.241379,0.0,0.0,...,0.000000,0.122807,0.032787,0.080000,0.0,0.178571,1.000000,0.021944,0.025478,0.0
CHEBI:58243,0.052469,0.424883,0.060606,0.042813,0.027397,0.016340,0.128931,0.025641,0.0,0.0,...,0.000000,0.054545,0.364465,0.006472,0.0,0.054217,0.021944,1.000000,0.348485,0.0
CHEBI:28498,0.100000,0.343465,0.063218,0.053892,0.039604,0.000000,0.071823,0.040268,0.0,0.0,...,0.000000,0.057803,0.169811,0.000000,0.0,0.057143,0.025478,0.348485,1.000000,0.0


torsions




chebi_id,CHEBI:46911,CHEBI:59789,CHEBI:58252,CHEBI:57844,CHEBI:45725,CHEBI:17359,CHEBI:77847,CHEBI:57966,CHEBI:15377,CHEBI:15378,...,CHEBI:33118,CHEBI:58702,CHEBI:58937,CHEBI:18212,CHEBI:49552,CHEBI:16347,CHEBI:15792,CHEBI:58243,CHEBI:28498,CHEBI:17051
chebi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEBI:46911,1.000000,0.127273,0.545455,0.600000,0.0625,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.016949,0.022222,0.0
CHEBI:59789,0.127273,1.000000,0.145455,0.107143,0.0000,0.0,0.015625,0.000000,0.0,0.0,...,0.0,0.0,0.021277,0.0,0.0,0.033333,0.0,0.630769,0.508197,0.0
CHEBI:58252,0.545455,0.145455,1.000000,0.545455,0.0000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
CHEBI:57844,0.600000,0.107143,0.545455,1.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.020408,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
CHEBI:45725,0.062500,0.000000,0.000000,0.000000,1.0000,0.0,0.000000,0.090909,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CHEBI:16347,0.000000,0.033333,0.000000,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.000000,0.0,0.016949,0.022222,0.0
CHEBI:15792,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.022222,0.0,0.0,0.000000,1.0,0.000000,0.000000,0.0
CHEBI:58243,0.016949,0.630769,0.000000,0.000000,0.0000,0.0,0.067797,0.000000,0.0,0.0,...,0.0,0.0,0.056180,0.0,0.0,0.016949,0.0,1.000000,0.551724,0.0
CHEBI:28498,0.022222,0.508197,0.000000,0.000000,0.0000,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.025641,0.0,0.0,0.022222,0.0,0.551724,1.000000,0.0


maccs




chebi_id,CHEBI:46911,CHEBI:59789,CHEBI:58252,CHEBI:57844,CHEBI:45725,CHEBI:17359,CHEBI:77847,CHEBI:57966,CHEBI:15377,CHEBI:15378,...,CHEBI:33118,CHEBI:58702,CHEBI:58937,CHEBI:18212,CHEBI:49552,CHEBI:16347,CHEBI:15792,CHEBI:58243,CHEBI:28498,CHEBI:17051
chebi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEBI:46911,1.000000,0.287671,0.545455,0.620690,0.533333,0.076923,0.232558,0.680000,0.041667,0.043478,...,0.085714,0.150000,0.214286,0.093750,0.040000,0.317073,0.250000,0.180723,0.258065,0.040000
CHEBI:59789,0.287671,1.000000,0.356164,0.338028,0.253333,0.084337,0.231707,0.232877,0.028169,0.014085,...,0.075000,0.132530,0.494382,0.064103,0.013699,0.307692,0.152778,0.659091,0.680000,0.013699
CHEBI:58252,0.545455,0.356164,1.000000,0.857143,0.307692,0.146341,0.183673,0.468750,0.034483,0.035714,...,0.075000,0.159091,0.250000,0.111111,0.033333,0.475000,0.212121,0.170455,0.185714,0.033333
CHEBI:57844,0.620690,0.338028,0.857143,1.000000,0.342857,0.102564,0.173913,0.535714,0.040000,0.041667,...,0.054054,0.146341,0.228571,0.090909,0.038462,0.410256,0.241379,0.151163,0.179104,0.038462
CHEBI:45725,0.533333,0.253333,0.307692,0.342857,1.000000,0.024390,0.177778,0.400000,0.000000,0.043478,...,0.027027,0.022222,0.214286,0.029412,0.040000,0.173913,0.060606,0.139535,0.200000,0.040000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CHEBI:16347,0.317073,0.307692,0.475000,0.410256,0.173913,0.111111,0.297872,0.351351,0.064516,0.032258,...,0.121951,0.227273,0.256757,0.131579,0.030303,1.000000,0.303030,0.204545,0.246377,0.030303
CHEBI:15792,0.250000,0.152778,0.212121,0.241379,0.060606,0.148148,0.272727,0.347826,0.076923,0.083333,...,0.125000,0.346154,0.156250,0.200000,0.071429,0.303030,1.000000,0.129870,0.155172,0.071429
CHEBI:58243,0.180723,0.659091,0.170455,0.151163,0.139535,0.253333,0.329114,0.132530,0.026667,0.013333,...,0.139241,0.225000,0.556818,0.129870,0.012987,0.204545,0.129870,1.000000,0.645570,0.012987
CHEBI:28498,0.258065,0.680000,0.185714,0.179104,0.200000,0.057143,0.268657,0.193548,0.036364,0.000000,...,0.093750,0.130435,0.444444,0.063492,0.000000,0.246377,0.155172,0.645570,1.000000,0.000000


## Pairwise ML models

- Find label combinations with enough samples