Styles and layouts were manually adjusted in Cytoscape. The 60%-100% SSN and the 85%-100% SSN were saved as PNGs, corresponding to taxonomy at the class and phylum levels, respectively.

In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
# All data paths below are built relative to the kernel's working directory,
# so the notebook must be started from the repository root for them to resolve.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

d:\Python\aox\enzyme-mining-aox


In [2]:
# Root folders: all inputs/outputs live under <cwd>/data/aox, and the
# sequence-similarity-network (SSN) tables under its graph/acc subfolder.
DATADIR = join(CURRENT_DIR, "data", "aox")
SSN_DIR = join(DATADIR, "graph", "acc")

# Central registry of every file this notebook reads or writes, keyed by a short name.
filenames = dict(
    # --- inputs ---
    # sequences reported in UniProt
    uniprot_sequence=join(DATADIR, "raw", "uniprot_sequence.tsv"),
    # sequences reported to be active (the "referenced" sequences)
    sequence_fasta=join(DATADIR, "raw", "sequence.fasta"),
    # experimental results
    experiment_result=join(DATADIR, "result", "experiment_result.tsv"),

    # --- graph tables ---
    node=join(SSN_DIR, "nodes.tsv"),
    edge=join(SSN_DIR, "edges.tsv"),

    # --- outputs written by this notebook ---
    nodes_annotate=join(SSN_DIR, "nodes_annotate.tsv"),
    node_new=join(SSN_DIR, "node_new.tsv"),
    edge_new=join(SSN_DIR, "edge_new.tsv"),
)

In [3]:
# Nodes reported to be active: take the 'Entry' accession of every UniProt row
# whose 'Reviewed' column equals 'reviewed'.
uniprot_sequence = pd.read_csv(filenames['uniprot_sequence'], sep='\t')
reviewed_active_acc = uniprot_sequence.loc[
    uniprot_sequence['Reviewed'].eq('reviewed'), 'Entry'
].tolist()
print(reviewed_active_acc)

['P04842', 'F2QY27', 'F2R038', 'P04841', 'Q00922', 'C4R702']


In [4]:
# Disambiguation table: NCBI protein accession -> UniProt accession.
# Used throughout the notebook so that the same protein is always referred to
# by a single (UniProt) identifier.
ncbi_uniprot_map = {
    'EAA66666.1': 'Q5BFW3',
    'AAR89538.1': 'Q9C1S3',
    'AAV66468.1': 'Q5S057',
    'AAV66467.2': 'Q5S058',
    'AAF82788.1': 'Q9P304',
    'XP_001838223.2': 'A8P219',
    # original author's note on this entry: A8DPS4
    # (presumably an alternative accession -- TODO confirm)
    'XP_007868339.1': 'S7RFS3',
    'CAM84030.1': 'A6PZG7',
    'CAM84031.1': 'A6PZG8',
    'CAM84032.1': 'A6PZG9',
    'AAB57849.1': 'F2QY27',
    'AAB57850.1': 'F2R038',
    'AFO55203.1': 'I7CMK2',
    'AHC95541.1': 'A0A067YIW8',
    'AAL56054.1': 'Q8X1N1',
    'UXW61383.1': 'A0A977TIR6',
    'AAF02495.1': 'Q9UVU1',
    'AAF02494.1': 'Q9UVU2',
    'AAQ99151.1': 'Q6TAW1',
    'AAV66465.1': 'Q5S060',
    'AAV66466.1': 'Q5S059',
    'CDG66232.1': 'T2M2J4',
}

In [5]:
from Bio import SeqIO

# Read the reference FASTA into [record id, sequence string] pairs.
seqs = [
    [record.id, str(record.seq)]
    for record in SeqIO.parse(filenames['sequence_fasta'], "fasta")
]
ncbi_sequences = pd.DataFrame(seqs, columns=['ncbi id', 'sequence'])

# First as NCBI ids, exactly as they appear in the FASTA headers ...
referenced_active_acc = ncbi_sequences['ncbi id'].to_list()
print(referenced_active_acc)  # 21 ids in the recorded run

# ... then translated to UniProt accessions. The lookup is strict on purpose:
# an id missing from ncbi_uniprot_map raises KeyError instead of passing through.
referenced_active_acc = list(map(ncbi_uniprot_map.__getitem__, referenced_active_acc))
print(referenced_active_acc)

['EAA66666.1', 'AAR89538.1', 'AAV66468.1', 'AAV66467.2', 'AAF82788.1', 'XP_001838223.2', 'XP_007868339.1', 'CAM84030.1', 'CAM84031.1', 'CAM84032.1', 'AAB57849.1', 'AAB57850.1', 'AFO55203.1', 'AHC95541.1', 'AAL56054.1', 'UXW61383.1', 'AAF02495.1', 'AAF02494.1', 'AAQ99151.1', 'AAV66465.1', 'AAV66466.1']
['Q5BFW3', 'Q9C1S3', 'Q5S057', 'Q5S058', 'Q9P304', 'A8P219', 'S7RFS3', 'A6PZG7', 'A6PZG8', 'A6PZG9', 'F2QY27', 'F2R038', 'I7CMK2', 'A0A067YIW8', 'Q8X1N1', 'A0A977TIR6', 'Q9UVU1', 'Q9UVU2', 'Q6TAW1', 'Q5S060', 'Q5S059']


In [6]:
# Experimental results: every row is a sequence that was selected for testing;
# its identifier is read from the 'name' column.
experiment_result = pd.read_csv(filenames['experiment_result'], sep='\t')
selected_acc = experiment_result.loc[:, 'name'].tolist()

In [7]:
# Number of distinct experiment identifiers, before and after mapping NCBI ids
# to UniProt accessions. Equal counts mean the mapping merges no two entries
# (the recorded run printed 43 for both).
print(
    len(set(selected_acc)),
    len({ncbi_uniprot_map.get(acc, acc) for acc in selected_acc}),
    sep='\n',
)

43
43


In [8]:
# Canonical accession per experiment row: NCBI ids are translated to UniProt,
# anything not in the map is kept as-is.
experiment_result['r_name'] = experiment_result['name'].map(
    lambda acc: ncbi_uniprot_map.get(acc, acc)
)

# All tested sequences.
selected_acc = experiment_result['r_name'].to_list()

# Sequences with a measured activity above zero.
valid_active_acc = experiment_result.loc[
    experiment_result['activity(mU/mg)'].gt(0), 'r_name'
].to_list()

# Sequences flagged inclusion == 1 (labelled "unsolute" downstream).
unsolute_acc = experiment_result.loc[
    experiment_result['inclusion'].eq(1), 'r_name'
].to_list()

# Sequences with inclusion == 0 and activity below 0.1 mU/mg.
# NOTE(review): an activity in (0, 0.1) lands in both valid_active_acc and
# solute_nonactive_acc -- confirm which threshold is intended.
solute_nonactive_acc = experiment_result.loc[
    experiment_result['inclusion'].eq(0) & experiment_result['activity(mU/mg)'].lt(0.1),
    'r_name',
].to_list()

In [9]:
# Build the node annotation table: one row per accession that is reviewed in
# UniProt, referenced as active, or selected for the experiments.
#
#   selected: 1 if the accession was tested experimentally, else 0
#   active:   3 = unsolute, 2 = solute but not active,
#             1 = everything else (valid or reported to be active)
#
# The union is iterated in sorted order: iterating a bare set of strings has a
# run-dependent order, which made the row order (and the written index) of
# nodes_annotate.tsv change between kernel restarts. key=str keeps the sort
# from failing should a non-string value (e.g. NaN) slip into the lists.
accs = []
for acc in sorted(set(reviewed_active_acc + referenced_active_acc + selected_acc), key=str):
    label_selected = 1 if acc in selected_acc else 0
    if acc in unsolute_acc:
        label_active = 3 # unsolute
    elif acc in solute_nonactive_acc:
        label_active = 2 # solute but not active
    else:
        label_active = 1 # valid or reported to be active
    accs.append([acc, label_selected, label_active])
df_acc = pd.DataFrame(accs, columns=['accession', 'selected', 'active'])
df_acc.to_csv(filenames['nodes_annotate'], sep='\t')

In [10]:
# Node table: rewrite NCBI ids as UniProt accessions, then keep only the first
# row for each accession so that every protein is a single node.
nodes = (
    pd.read_csv(filenames['node'], sep='\t', index_col=0)
    .assign(accession=lambda df: df['accession'].map(lambda acc: ncbi_uniprot_map.get(acc, acc)))
    .drop_duplicates(subset=['accession'])
)
nodes

Unnamed: 0,accession,class,phylum,active_sequence,active_tax,score
0,A0A010RMG6,Sordariomycetes,Ascomycota,False,False,0.000250
1,A0A017SR18,Eurotiomycetes,Ascomycota,False,False,0.000967
2,A0A060SVX6,Agaricomycetes,Basidiomycota,False,False,0.000816
3,A0A066X5A7,Sordariomycetes,Ascomycota,False,False,0.000250
4,A0A067N510,Agaricomycetes,Basidiomycota,False,False,0.000219
...,...,...,...,...,...,...
352,W9W4X4,Eurotiomycetes,Ascomycota,False,False,0.000489
353,W9X977,Eurotiomycetes,Ascomycota,False,False,0.000489
354,W9Y7V1,Eurotiomycetes,Ascomycota,False,False,0.000489
355,W9YRP8,Eurotiomycetes,Ascomycota,False,False,0.000489


In [11]:
# Attach the 'selected' / 'active' labels to every node. Nodes without an
# annotation get 0.0 in both label columns (note: fillna is applied to the
# whole merged frame, so any other missing value is also written as 0.0).
new_nodes = pd.merge(nodes, df_acc, on='accession', how='left').fillna(0.0)

# Two nodes that are added by hand, unlabelled (selected = 0, active = 0).
# Column order: accession, class, phylum, active_sequence, active_tax, score,
# selected, active.
extra_nodes = pd.DataFrame([
        ['A0A428UCX7', 'Sordariomycetes', 'Ascomycota', False, False, 2.497227114343E-4, 0.0, 0.0 ],
        ['S8FNA5', 'Agaricomycetes', 'Basidiomycota', False, False, 8.159640056539E-4, 0.0, 0.0]
    ], columns=new_nodes.columns)

# Only append accessions that are not already present, so the one-row-per-
# accession invariant established above survives if nodes.tsv is regenerated
# with these entries included.
extra_nodes = extra_nodes[~extra_nodes['accession'].isin(new_nodes['accession'])]

new_nodes = pd.concat([new_nodes, extra_nodes]).reset_index(drop=True)
new_nodes.to_csv(filenames['node_new'], sep='\t')

In [12]:
# Edge table: rewrite both endpoints as UniProt accessions, then clean up.
edges = pd.read_csv(filenames['edge'], sep='\t', index_col=0)
edges['query'] = edges['query'].apply(lambda x: ncbi_uniprot_map.get(x,x))
edges['target'] = edges['target'].apply(lambda x: ncbi_uniprot_map.get(x,x))

# Drop self-loops (both endpoints are the same accession after mapping).
edges = edges[edges['query'] != edges['target']]

# Treat edges as undirected: (a, b) and (b, a) are the same edge, keep the
# first occurrence. The key is the sorted endpoint *tuple* rather than the two
# strings concatenated -- accessions have different lengths, so a separator-less
# concatenation can map two different pairs to the same key. Filtering with a
# boolean mask also avoids assigning a helper column onto a filtered slice.
pair_keys = pd.Series(
    [tuple(sorted((q, t))) for q, t in zip(edges['query'], edges['target'])],
    dtype=object,
)
edges = edges[~pair_keys.duplicated().to_numpy()]
edges.to_csv(filenames['edge_new'], sep='\t')