In [1]:
from subpred.util import load_df
import networkx as nx
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset
from subpred.go_annotations import get_go_annotations_subset, EVIDENCE_CODE_TO_DESCRIPTION


columns

accession, sequence, reviewed, protein existence, organism id, protein name, go transport annotations, go membrane annotations (descencants of membrane or plasma membrane?), evidence codes, qualifiers, chebi ids, chebi_relations

one with iea, one without

evidence code map to simpler words

In [2]:
ORGANISM_IDS = {559292}
SWISSPROT_ONLY = False
MAX_SEQUENCE_EVIDENCE_CODE = 2
EXCLUDE_IEA_GO_TERMS = False
filename_excel = 'sc_plasma_membrane_transporters_predicted.xlsx'

In [4]:
df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids=ORGANISM_IDS,
    swissprot_only=SWISSPROT_ONLY,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=EXCLUDE_IEA_GO_TERMS,
    max_sequence_evidence_code=MAX_SEQUENCE_EVIDENCE_CODE,
)
# display(df_sequences)
# display(df_uniprot_goa)
# display(df_go_chebi)

In [7]:
df_uniprot_goa[df_uniprot_goa.Uniprot == "P32487"]

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
2220,P32487,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0008514,organic anion transmembrane transporter activity
2221,P32487,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0005342,organic acid transmembrane transporter activity
2222,P32487,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0046943,carboxylic acid transmembrane transporter acti...
2223,P32487,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0015171,amino acid transmembrane transporter activity
2224,P32487,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0022857,transmembrane transporter activity
2225,P32487,enables,GO:0015174,basic amino acid transmembrane transporter act...,IDA,F,GO:0046943,carboxylic acid transmembrane transporter acti...
2226,P32487,enables,GO:0015174,basic amino acid transmembrane transporter act...,IDA,F,GO:0008514,organic anion transmembrane transporter activity
2227,P32487,enables,GO:0015174,basic amino acid transmembrane transporter act...,IDA,F,GO:0005342,organic acid transmembrane transporter activity
2228,P32487,enables,GO:0015174,basic amino acid transmembrane transporter act...,IDA,F,GO:0015174,basic amino acid transmembrane transporter act...
2229,P32487,enables,GO:0015174,basic amino acid transmembrane transporter act...,IDA,F,GO:0015171,amino acid transmembrane transporter activity


In [17]:
df_uniprot_goa_anatomical_entity = get_go_annotations_subset(
    datasets_path="../data/datasets/",
    root_go_term="cellular anatomical entity",
    inner_go_relations={"is_a"},
    namespaces_keep={"cellular_component"},
    proteins_subset=set(df_sequences.index),
    go_protein_qualifiers_filter_set={"located_in"}, 
    annotations_evidence_codes_remove={"IEA"} if EXCLUDE_IEA_GO_TERMS else None,
)

In [19]:
df_uniprot_goa_anatomical_entity[df_uniprot_goa_anatomical_entity.go_term_ancestor == "eisosome"]

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
355,P04817,located_in,GO:0032126,eisosome,IDA,C,GO:0032126,eisosome
439,P05030,located_in,GO:0032126,eisosome,IDA,C,GO:0032126,eisosome
2231,P32487,located_in,GO:0032126,eisosome,IDA,C,GO:0032126,eisosome


In [13]:
df_uniprot_goa_membrane = get_go_annotations_subset(
    datasets_path="../data/datasets/",
    root_go_term="membrane",
    inner_go_relations={"is_a"},
    namespaces_keep={"cellular_component"},
    proteins_subset=set(df_sequences.index),
    go_protein_qualifiers_filter_set={"located_in"}, 
    annotations_evidence_codes_remove={"IEA"} if EXCLUDE_IEA_GO_TERMS else None,
)

In [10]:
df_uniprot_goa_membrane[df_uniprot_goa_membrane.Uniprot == "P32487"]

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
1395,P32487,located_in,GO:0005739,mitochondrion,HDA,C,GO:0043231,intracellular membrane-bounded organelle
1396,P32487,located_in,GO:0005739,mitochondrion,HDA,C,GO:0043227,membrane-bounded organelle
1397,P32487,located_in,GO:0005739,mitochondrion,HDA,C,GO:0005739,mitochondrion
1398,P32487,located_in,GO:0016020,membrane,IEA,C,GO:0016020,membrane


In [124]:
pm_proteins = df_uniprot_goa_membrane[
    df_uniprot_goa_membrane.go_term_ancestor == "plasma membrane"
].Uniprot.unique()
df_sequences_pm = df_sequences[df_sequences.index.isin(pm_proteins)]

df_sequences_pm = df_sequences_pm.assign(
    protein_existence_evidence_at=df_sequences_pm.protein_existence.map(
        {1: "protein level", 2: "transcript level"}
    )
)  # = df_sequences_pm[["reviewed", "protein_existence", "organism_id","protein_names", "sequence"]]

df_sequences_pm = df_sequences_pm.drop("protein_existence", axis=1)

df_sequences_pm = df_sequences_pm.rename(columns={"reviewed": "swissprot_reviewed"})

df_sequences_pm = df_sequences_pm[["swissprot_reviewed", "protein_existence_evidence_at", "organism_id","protein_names", "sequence"]]

df_sequences_pm

Unnamed: 0_level_0,swissprot_reviewed,protein_existence_evidence_at,organism_id,protein_names,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P04817,True,protein level,559292,Arginine permease CAN1 (Canavanine resistance ...,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...
P17064,True,protein level,559292,Purine-cytosine permease FCY2 (PCP FCY2) (Cyto...,MLEEGNNVYEIQDLEKRSPVIGSSLENEKKVAASETFTATSEDDQQ...
P23900,True,protein level,559292,Glycerol uptake/efflux facilitator protein,MSNPQKALNDFLSSESVHTHDSSRKQSNKQSSDEGRSSSQPSHHHS...
P30605,True,protein level,559292,Myo-inositol transporter 1,MGIHIPYLTSKTSQSNVGDAVGNADSVEFNSEHDSPSKRGKITLES...
P32465,True,protein level,559292,Low-affinity glucose transporter HXT1,MNSTPDLISPQKSNSSNSYELESGRSKAMNTPEGKNESFHDNLSES...
...,...,...,...,...,...
P33335,True,protein level,559292,Protein SGE1 (10-N-nonyl acridine orange resis...,MKSTLSLTLCVISLLLTLFLAALDIVIVVTLYDTIGIKFHDFGNIG...
P53241,True,protein level,559292,Vitamin H transporter (H(+)/biotin symporter),MTISNKSWRSYFPHLRKLPEDDQYLYSDDTNSSIIAEEELHHSVDK...
Q06686,True,protein level,559292,Copper transport protein CTR3 (Copper transpor...,MNMGGSSSTAAKKATCKISMLWNWYTIDTCFIARSWRNDTKGKFAG...
Q12412,True,protein level,559292,Protein PNS1 (pH nine-sensitive protein 1),MPLNEKYERPPQPPPAYDPNHRPPSSSENSAAANVNDGQTPYHFRQ...


In [125]:
df_uniprot_goa_pm = df_uniprot_goa[df_uniprot_goa.Uniprot.isin(pm_proteins)].reset_index(drop=True)
df_uniprot_goa_pm

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0008514,organic anion transmembrane transporter activity
1,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0005342,organic acid transmembrane transporter activity
2,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0015171,amino acid transmembrane transporter activity
3,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0022857,transmembrane transporter activity
4,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0046943,carboxylic acid transmembrane transporter acti...
...,...,...,...,...,...,...,...,...
3149,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IEA,F,GO:0051139,metal cation:proton antiporter activity
3150,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IEA,F,GO:0015385,sodium:proton antiporter activity
3151,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IEA,F,GO:0022857,transmembrane transporter activity
3152,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IEA,F,GO:1901702,salt transmembrane transporter activity


In [126]:
go_to_primary_inputs = (
    df_go_chebi[df_go_chebi.chebi_go_relation == "has_primary_input"]
    .drop(["go_term", "chebi_id"], axis=1)
    .groupby("go_id")
    .apply(lambda x: str(x.chebi_term.tolist()))
)
go_to_participants = (
    df_go_chebi[df_go_chebi.chebi_go_relation == "has_participant"]
    .drop(["go_term", "chebi_id"], axis=1)
    .groupby("go_id")
    .apply(lambda x: str(x.chebi_term.tolist()))
)

In [127]:
df_uniprot_goa_pm = df_uniprot_goa_pm.merge(
    go_to_primary_inputs.rename("chebi_terms_primary_substrates"),
    how="left",
    left_on="go_id_ancestor",
    right_index=True,
)
df_uniprot_goa_pm = df_uniprot_goa_pm.merge(
    go_to_participants.rename("chebi_terms_participating_molecules"),
    how="left",
    left_on="go_id_ancestor",
    right_index=True, 
)

In [128]:
df_uniprot_goa_pm = df_uniprot_goa_pm.assign(
    go_evidence_type=df_uniprot_goa_pm.evidence_code.map(EVIDENCE_CODE_TO_DESCRIPTION)
).drop(
    ["go_id", "go_term", "evidence_code", "aspect"], axis=1
).rename(columns={"go_id_ancestor": "go_id", "go_term_ancestor": "go_term"})  # TODO also membranes? TODO remove original go term?
df_uniprot_goa_pm

Unnamed: 0,Uniprot,qualifier,go_id,go_term,chebi_terms_primary_substrates,chebi_terms_participating_molecules,go_evidence_type
0,P04817,enables,GO:0008514,organic anion transmembrane transporter activity,['organic anion'],,phylogenetically_inferred
1,P04817,enables,GO:0005342,organic acid transmembrane transporter activity,['organic acid'],,phylogenetically_inferred
2,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,['amino acid'],,phylogenetically_inferred
3,P04817,enables,GO:0022857,transmembrane transporter activity,,,phylogenetically_inferred
4,P04817,enables,GO:0046943,carboxylic acid transmembrane transporter acti...,['carboxylic acid anion'],,phylogenetically_inferred
...,...,...,...,...,...,...,...
3149,Q99271,enables,GO:0051139,metal cation:proton antiporter activity,"['proton', 'metal cation']",,electronic_annotation
3150,Q99271,enables,GO:0015385,sodium:proton antiporter activity,"['proton', 'sodium(1+)']","['hydron', 'sodium(1+)']",electronic_annotation
3151,Q99271,enables,GO:0022857,transmembrane transporter activity,,,electronic_annotation
3152,Q99271,enables,GO:1901702,salt transmembrane transporter activity,,,electronic_annotation


In [129]:
df_uniprot_goa_pm = df_uniprot_goa_pm[
    [
        "Uniprot",
        "qualifier",
        "go_id",
        "go_term",
        "go_evidence_type",
        "chebi_terms_primary_substrates",
        "chebi_terms_participating_molecules",
    ]
]
df_uniprot_goa_pm = df_uniprot_goa_pm.drop_duplicates()

df_uniprot_goa_pm

Unnamed: 0,Uniprot,qualifier,go_id,go_term,go_evidence_type,chebi_terms_primary_substrates,chebi_terms_participating_molecules
0,P04817,enables,GO:0008514,organic anion transmembrane transporter activity,phylogenetically_inferred,['organic anion'],
1,P04817,enables,GO:0005342,organic acid transmembrane transporter activity,phylogenetically_inferred,['organic acid'],
2,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,phylogenetically_inferred,['amino acid'],
3,P04817,enables,GO:0022857,transmembrane transporter activity,phylogenetically_inferred,,
4,P04817,enables,GO:0046943,carboxylic acid transmembrane transporter acti...,phylogenetically_inferred,['carboxylic acid anion'],
...,...,...,...,...,...,...,...
3143,Q99271,enables,GO:0015081,sodium ion transmembrane transporter activity,electronic_annotation,['sodium(1+)'],['sodium(1+)']
3144,Q99271,enables,GO:0140828,metal cation:monoatomic cation antiporter acti...,electronic_annotation,,
3146,Q99271,enables,GO:0022853,active monoatomic ion transmembrane transporte...,electronic_annotation,['monoatomic ion'],
3149,Q99271,enables,GO:0051139,metal cation:proton antiporter activity,electronic_annotation,"['proton', 'metal cation']",


## Now, the same thing for membrane type

In [130]:
df_uniprot_goa_membrane_pm = (
    df_uniprot_goa_membrane[df_uniprot_goa_membrane.Uniprot.isin(pm_proteins)]
    .reset_index(drop=True)
    .drop_duplicates()
)
df_uniprot_goa_membrane_pm[
    "evicence_type"
] = df_uniprot_goa_membrane_pm.evidence_code.map(EVIDENCE_CODE_TO_DESCRIPTION)
df_uniprot_goa_membrane_pm = df_uniprot_goa_membrane_pm.drop(
    ["go_id", "go_term", "evidence_code", "aspect"], axis=1
)[
    ["Uniprot", "qualifier", "go_id_ancestor", "go_term_ancestor", "evicence_type"]
].rename(
    columns={"go_id_ancestor": "go_id", "go_term_ancestor": "go_term"}
).drop_duplicates().reset_index(drop=True)

df_uniprot_goa_membrane_pm

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evicence_type
0,P04817,located_in,GO:0043227,membrane-bounded organelle,experimental_evidence_high_throughput
1,P04817,located_in,GO:0005739,mitochondrion,experimental_evidence_high_throughput
2,P04817,located_in,GO:0043231,intracellular membrane-bounded organelle,experimental_evidence_high_throughput
3,P04817,located_in,GO:0031410,cytoplasmic vesicle,electronic_annotation
4,P04817,located_in,GO:0043231,intracellular membrane-bounded organelle,electronic_annotation
...,...,...,...,...,...
1016,Q99271,located_in,GO:0016020,membrane,experimental_evidence
1017,Q99271,located_in,GO:0005886,plasma membrane,electronic_annotation
1018,Q99271,located_in,GO:0016020,membrane,electronic_annotation
1019,Q99271,located_in,GO:0045121,membrane raft,experimental_evidence


In [131]:
df_uniprot_goa_membrane_pm
df_uniprot_goa_pm
df_sequences_pm

Unnamed: 0_level_0,swissprot_reviewed,protein_existence_evidence_at,organism_id,protein_names,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P04817,True,protein level,559292,Arginine permease CAN1 (Canavanine resistance ...,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...
P17064,True,protein level,559292,Purine-cytosine permease FCY2 (PCP FCY2) (Cyto...,MLEEGNNVYEIQDLEKRSPVIGSSLENEKKVAASETFTATSEDDQQ...
P23900,True,protein level,559292,Glycerol uptake/efflux facilitator protein,MSNPQKALNDFLSSESVHTHDSSRKQSNKQSSDEGRSSSQPSHHHS...
P30605,True,protein level,559292,Myo-inositol transporter 1,MGIHIPYLTSKTSQSNVGDAVGNADSVEFNSEHDSPSKRGKITLES...
P32465,True,protein level,559292,Low-affinity glucose transporter HXT1,MNSTPDLISPQKSNSSNSYELESGRSKAMNTPEGKNESFHDNLSES...
...,...,...,...,...,...
P33335,True,protein level,559292,Protein SGE1 (10-N-nonyl acridine orange resis...,MKSTLSLTLCVISLLLTLFLAALDIVIVVTLYDTIGIKFHDFGNIG...
P53241,True,protein level,559292,Vitamin H transporter (H(+)/biotin symporter),MTISNKSWRSYFPHLRKLPEDDQYLYSDDTNSSIIAEEELHHSVDK...
Q06686,True,protein level,559292,Copper transport protein CTR3 (Copper transpor...,MNMGGSSSTAAKKATCKISMLWNWYTIDTCFIARSWRNDTKGKFAG...
Q12412,True,protein level,559292,Protein PNS1 (pH nine-sensitive protein 1),MPLNEKYERPPQPPPAYDPNHRPPSSSENSAAANVNDGQTPYHFRQ...


In [132]:
with pd.ExcelWriter(filename_excel) as writer:
    df_sequences_pm.to_excel(writer, sheet_name="proteins")
    df_uniprot_goa_pm.to_excel(writer, sheet_name="transport_annotations")
    df_uniprot_goa_membrane_pm.to_excel(writer, sheet_name="membrane_annotations")