# Creating matrices for pairs of labels

## Which ones?

- Sequence-based ML model F1 scores
- Sequence similarity scores
- Annotation similarity scores
- Annotation overlap

## For which dataset?

- Yeast first, because it is needed for the conference

In [1]:
from subpred.util import load_df
from subpred.go_annotations import get_go_subgraph
import networkx as nx
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset


In [2]:
# Load the transmembrane transporter dataset for organism 559292 (the yeast
# dataset this notebook starts with). The loader returns three frames:
# sequences, UniProt GO annotations, and GO-to-ChEBI mappings.
dataset_options = dict(
    organism_ids={559292},
    swissprot_only=False,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=False,
    max_sequence_evidence_code=1,
)
df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    **dataset_options
)
# Show the three frames one after another
display(df_sequences, df_uniprot_goa, df_go_chebi)


Unnamed: 0_level_0,sequence,reviewed,protein_existence,organism_id,protein_names
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P00401,MVQRWLYSTNAKDIAVLYFMLAIFSGMAGTAMSLIIRLELAAPGSQ...,True,1,559292,Cytochrome c oxidase subunit 1 (EC 7.1.1.9) (C...
P00830,MVLPRLYTATSRAAFKAAKQSAPLLSTSWKRCMASAAQSTPITGKV...,True,1,559292,"ATP synthase subunit beta, mitochondrial (EC 7..."
P04817,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...,True,1,559292,Arginine permease CAN1 (Canavanine resistance ...
P07213,MKSFITRNKTAILATVAATGTAIGAYYYYNQLQQQQQRGKKNTINK...,True,1,559292,Mitochondrial import receptor subunit TOM70 (7...
P14906,MPTNYEYDEASETWPSFILTGLLMVVGPMTLLQIYQIFFGANAEDG...,True,1,559292,Protein translocation protein SEC63 (Protein N...
...,...,...,...,...,...
P39542,MFQQLSASIRHNAHIIFLCISWYFISSLASQVTKQVLTVCPLPLFL...,True,1,559292,Uncharacterized transporter YJL193W
Q05497,MAGILSKTLSEVHPSLRTNGMGIGNTHRRISLGFLPPNKKNPLVRK...,True,1,559292,Uncharacterized transporter YDR338C
P38318,MEPKRKSGSLAKHDLPQFYLLIMLYLAQGIPVGLAFGTVPFLLKSL...,True,1,559292,Uncharacterized membrane protein YBR220C
P46996,MSNEDETTRLMSSDEMDYLLETAGINALEEIISQNDSTGINLDTNE...,True,1,559292,Uncharacterized membrane protein YJL163C


Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:1901505,carbohydrate derivative transmembrane transpor...
1,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0005346,purine ribonucleotide transmembrane transporte...
2,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015932,nucleobase-containing compound transmembrane t...
3,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0000295,adenine nucleotide transmembrane transporter a...
4,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015215,nucleotide transmembrane transporter activity
...,...,...,...,...,...,...,...,...
7732,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0015399,primary active transmembrane transporter activity
7733,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0004129,cytochrome-c oxidase activity
7734,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0008324,monoatomic cation transmembrane transporter ac...
7735,Q9ZZX1,enables,GO:0004129,cytochrome-c oxidase activity,IEA,F,GO:0015075,monoatomic ion transmembrane transporter activity


Unnamed: 0,go_id,go_term,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,L-ornithine transmembrane transporter activity,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,S-adenosyl-L-methionine transmembrane transpor...,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,sulfur amino acid transmembrane transporter ac...,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000100,S-methylmethionine transmembrane transporter a...,CHEBI:58252,S-methyl-L-methionine zwitterion,has_primary_input
4,GO:0000102,L-methionine secondary active transmembrane tr...,CHEBI:57844,L-methionine zwitterion,has_primary_input
...,...,...,...,...,...
379,GO:1901682,sulfur compound transmembrane transporter acti...,CHEBI:26835,sulfur molecular entity,has_primary_input
380,GO:1902557,5'-adenylyl sulfate transmembrane transporter ...,CHEBI:58243,5'-adenylyl sulfate(2-),has_primary_input
381,GO:1903089,5-amino-1-ribofuranosylimidazole-4-carboxamide...,CHEBI:28498,acadesine,has_primary_input
382,GO:1903425,fluoride transmembrane transporter activity,CHEBI:17051,fluoride,has_primary_input


## Adjacency matrix

In [3]:
def get_adjacency_matrix(graph, labels: list, edges_filter: set = None):
    """Build a labelled adjacency matrix for a selection of graph nodes.

    Only edges whose key is contained in ``edges_filter`` and whose two end
    points are both in ``labels`` contribute to the matrix.

    Parameters
    ----------
    graph
        A networkx multigraph; edge keys are compared against
        ``edges_filter``.
    labels : list
        Unique node identifiers. They define the order of both the rows and
        the columns of the result.
    edges_filter : set, optional
        Edge keys to keep. Defaults to ``{"is_a"}``.

    Returns
    -------
    pandas.DataFrame
        Square frame with ``labels`` as index and columns, holding the values
        produced by ``nx.adjacency_matrix`` for the filtered graph.

    Raises
    ------
    AssertionError
        If ``labels`` contains duplicates.

    Notes
    -----
    Labels that are nodes of ``graph`` but have no edge matching
    ``edges_filter`` are kept and get an all-zero row and column. Labels that
    are not nodes of ``graph`` at all are still left to
    ``nx.adjacency_matrix`` to reject.
    """
    # Create the default inside the call instead of sharing one mutable set
    # object between all calls.
    if edges_filter is None:
        edges_filter = {"is_a"}

    assert len(labels) == len(set(labels)), "labels should only contain unique elements"

    label_set = set(labels)
    # A filtered view keeps every selected node, even one without any matching
    # edge. Filtering by edges first (edge_subgraph) would drop such nodes and
    # make the nodelist below refer to nodes missing from the subgraph.
    subgraph = nx.subgraph_view(
        graph,
        filter_node=lambda node: node in label_set,
        filter_edge=lambda source, target, key: key in edges_filter,
    )

    # scipy sparse matrix, rows/columns ordered like `labels`
    sparse_adjacency = nx.adjacency_matrix(G=subgraph, nodelist=labels)

    return pd.DataFrame(sparse_adjacency.todense(), columns=labels, index=labels)

### GO adjacency

In [4]:
# GO graph, loaded under the key "go_obo"
graph_go = load_df("go_obo")
# Every distinct term of the ancestor column, in sorted order
go_ids = sorted(df_uniprot_goa["go_id_ancestor"].unique())
df_adj_matrix_go2 = get_adjacency_matrix(
    graph=graph_go,
    labels=go_ids,
    edges_filter={"is_a"},
)

### ChEBI adjacency

In [5]:
# ChEBI graph, loaded under the key "chebi_obo"
graph_chebi = load_df("chebi_obo")
# Distinct ChEBI ids whose relation to a GO term is "has_primary_input", sorted
is_primary_input = df_go_chebi.chebi_go_relation == "has_primary_input"
chebi_id_primary = sorted(df_go_chebi.loc[is_primary_input, "chebi_id"].unique())
# The graph is passed directly, as in the GO cell: get_adjacency_matrix only
# reads from it, so copying the whole ChEBI graph first is wasted time and memory.
df_adj_matrix_chebi2 = get_adjacency_matrix(
    graph_chebi, labels=chebi_id_primary, edges_filter={"is_a"}
)

## Chemical similarity

https://rdkit.org/docs/source/rdkit.Chem.html#module-rdkit.Chem

https://rdkit.org/docs/source/rdkit.Chem.Fingerprints.html

https://rdkit.org/docs/source/rdkit.Chem.Fingerprints.MolSimilarity.html
https://rdkit.org/docs/source/rdkit.Chem.Fingerprints.SimilarityScreener.html

In [3]:
import pandas as pd

Unnamed: 0,Marvin 01211310252D,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,22 24 0 0 0 0,999,V2000,,,,,,,,,,
1,-2.8644 -0.2905,0.0000 C,0 0,0,0,0,0,0,0,0,0,0,0
2,-2.8656 -1.1176,0.0000 C,0 0,0,0,0,0,0,0,0,0,0,0
3,-2.1509 -1.5304,0.0000 C,0 0,0,0,0,0,0,0,0,0,0,0
4,-2.1527 0.1221,0.0000 C,0 0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3711670,> <ChEBI Name>,,,,,,,,,,,,
3711671,travoprost,,,,,,,,,,,,
3711672,> <Star>,,,,,,,,,,,,
3711673,3,,,,,,,,,,,,


In [13]:
# Read the whole SDF file into memory, one list entry per line
# (each entry keeps its trailing newline).
file_name = "../../test_files/ChEBI_complete.sdf"
with open(file_name, "r") as sdf_handle:
    lines = list(sdf_handle)

In [30]:
# (ChEBI ID, SMILES) tuples, in the order the records appear in `lines`
chebi_ids = []
# TODO $$$$ as molecule delimiter
# TODO warnings

chebi_id = smiles = None
# Walk over each line together with its successor: the value of a data item
# is read from the line directly after its "> <...>" header.
for header, value in zip(lines, lines[1:]):
    if header.startswith("> <ChEBI ID>"):
        # A new record starts: store the previous one only if it had both an
        # ID and a SMILES string, then start over with the new ID.
        if smiles:
            if chebi_id:
                chebi_ids.append((chebi_id, smiles))
            smiles = None
        chebi_id = value.strip()
    elif chebi_id and header.startswith("> <SMILES>"):
        # SMILES is only accepted once an ID has been seen for this record
        smiles = value.strip()

# The last record is not followed by another ID header, so store it here
if chebi_id and smiles:
    chebi_ids.append((chebi_id, smiles))
    chebi_id = smiles = None
chebi_ids[-1]

('CHEBI:746859',
 'CC(C)OC(=O)CCC\\C=C/C[C@H]1[C@@H](O)C[C@@H](O)[C@@H]1\\C=C\\[C@@H](O)COc1cccc(c1)C(F)(F)F')

In [1]:
from rdkit import Chem
# Path of the SDF file handed to RDKit below
file_name = "../../test_files/ChEBI_lite_3star.sdf"

# Supplier object over the SDF file; the loop below iterates over it
suppl = Chem.SDMolSupplier(file_name)

# NOTE(review): `count` is never read in this cell — presumably meant to cap
# the loop at the first 100 entries; confirm or remove.
count = 100

# Print the running index of every entry the supplier yields; `mol` is unused
for c, mol in enumerate(suppl):
    print(c)


In [6]:
graph_chebi = load_df("chebi_obo")
# Union of the attribute names found on any node of the ChEBI graph
data_points = set()
for _node, node_attributes in graph_chebi.nodes(data=True):
    data_points.update(node_attributes.keys())

data_points


{'alt_id',
 'def',
 'is_a',
 'name',
 'property_value',
 'relationship',
 'subset',
 'synonym',
 'xref'}

In [7]:
# The "xref" attribute of every node that has a non-empty one
list(filter(None, (xrefs for _node, xrefs in graph_chebi.nodes(data="xref"))))


[['CAS:22325-47-9'],
 ['KEGG:C00462'],
 ['Gmelin:14905',
  'CAS:16984-48-8',
  'CAS:16984-48-8',
  'KEGG:C00742',
  'PDBeChem:F'],
 ['Wikipedia:Sodium_Fluoride',
  'CAS:7681-49-4',
  'KEGG:D00943',
  'KEGG:C08142',
  'CAS:7681-49-4'],
 ['KEGG:D01502', 'CAS:34445-07-3', 'CAS:34445-07-3'],
 ['CAS:7775-41-9', 'Gmelin:122625', 'CAS:7775-41-9'],
 ['PMID:21142041',
  'Beilstein:3570522',
  'PMID:22229781',
  'Reaxys:3570522',
  'PMID:15074950',
  'CAS:429-41-4',
  'PMID:21517057'],
 ['PMID:21141831',
  'PMID:17963376',
  'PMID:1551879',
  'PMID:16223284',
  'PMID:2808407',
  'PMID:11897645',
  'CAS:7787-49-7',
  'PMID:16771420',
  'Reaxys:14667390',
  'PMID:8800204',
  'PMID:16771419',
  'PMID:17979355',
  'MetaCyc:CPD0-1230',
  'PMID:20572721',
  'PMID:19561071',
  'Gmelin:95146',
  'PMID:18651739',
  'CAS:7787-49-7',
  'PMID:20529842'],
 ['CAS:12125-01-8', 'Reaxys:13192588', 'Wikipedia:Ammonium_fluoride'],
 ['PMID:21412559',
  'Beilstein:3902818',
  'Reaxys:3902818',
  'Wikipedia:Potassium

In [14]:
# For every node with a non-empty "property_value" attribute, pair the node
# with those of its property values that start with the SMILES property URL.
smiles_prefix = "http://purl.obolibrary.org/obo/chebi/smiles"
[
    (node, [value for value in property_values if value.startswith(smiles_prefix)])
    for node, property_values in graph_chebi.nodes(data="property_value")
    if property_values
]


[('CHEBI:33429',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[*-]" xsd:string']),
 ('CHEBI:30151',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[Al-]" xsd:string']),
 ('CHEBI:16042',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[*-]" xsd:string']),
 ('CHEBI:17051',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[F-]" xsd:string']),
 ('CHEBI:28741',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[F-].[Na+]" xsd:string']),
 ('CHEBI:32129',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[F-].[H][N]([H])([H])[Ag+][N]([H])([H])[H]" xsd:string']),
 ('CHEBI:30340',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[F-].[Ag+]" xsd:string']),
 ('CHEBI:51990',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[F-].CCCC[N+](CCCC)(CCCC)CCCC" xsd:string']),
 ('CHEBI:49499',
  ['http://purl.obolibrary.org/obo/chebi/smiles "F[Be]F" xsd:string']),
 ('CHEBI:66871',
  ['http://purl.obolibrary.org/obo/chebi/smiles "[F-].[H][N+]([H])([H])[H]" xsd:string']),
 ('CHEBI:66872',
  ['http://pur

## Pairwise ML models

- find label combinations with enough samples