# Proyecto Redes - Constructor de red
## Idea
La idea es que, a partir del archivo de interacciones, el archivo de nodos de ligandos y el archivo de nodos de blancos, se pueda construir una red de consulta completamente unificada, la cual se guarda como archivo GraphML (.graphml) y como archivo binario de graph-tool (.gt).
## Input
Archivo de interacciones
Archivo de nodos - ligandos  
Archivo de nodos - blancos  
## Output
Archivo de red de consulta

In [1]:
# Input configuration: modify these paths to use other input files.
# NOTE(review): all paths are absolute and machine-specific; they must be edited
# before the notebook can run on another machine.
# Tab-delimited interactions table (read with a header row further below).
interactions_file = '/home/carlos/Dropbox/2018/Data/CC&D mk. 2/chembl23_GS3_v2.mphase_gt_0.txt.co'
# Tab-delimited, header-less node tables; read below as (SMILES, ChEMBL_ID) and (Sequence, ChEMBL_ID).
ligands_file = '/home/carlos/Dropbox/2018/Data/CC&D mk. 2/chembl23_GS3_v2.mphase_gt_0.txt.co.ul'
targets_file = '/home/carlos/Dropbox/2018/Data/CC&D mk. 2/chembl23_GS3_v2.mphase_gt_0.txt.co.ut'
# Similarity matrices, later parsed by LoadSimMat2DataFrame (whitespace-separated values, one matrix row per line).
# The commented-out assignment below is a previous ligand matrix path kept for reference.
#ligands_simmat = '/home/carlos/Dropbox/2018/Data/CC&D mk. 2/chembl23_GS3_v2.mphase_gt_0.txt.co.ul.tc'
ligands_simmat = '/home/carlos/Dropbox/chembl23_GS3_v2.mphase_gt_0.fix.txt.smi.fpt.bin.tanmat.csv'
targets_simmat = '/home/carlos/Dropbox/2018/Data/CC&D mk. 2/chembl23_GS3_v2.mphase_gt_0.txt.co.ut.id'

In [2]:
import pandas as pd

# Load the three tab-delimited input files into DataFrames.
# The interactions file supplies its own header row; the two node files have no header,
# so their column names are given explicitly. index_col = False keeps the default integer index.
df_interactions = pd.read_csv(interactions_file, delimiter = '\t', index_col = False)
df_ligands = pd.read_csv(ligands_file, delimiter = '\t', header = None, names = ['SMILES', 'ChEMBL_ID'], index_col = False)
df_targets = pd.read_csv(targets_file, delimiter = '\t', header = None, names = ['Sequence', 'ChEMBL_ID'], index_col = False)

# Preview each table, separated by a 64-dash rule.
print('First 5 entries of interactions file:\n')
print(df_interactions.head())
print('\n' + '-'*64 + '\n\nFirst 5 entries of ligands node file:\n')
print(df_ligands.head())
print('\n' + '-'*64 + '\n\nFirst 5 entries of targets node file:\n')
print(df_targets.head())
print('\n' + '-'*64 + '\n\nData summary:\n')
# Number of distinct values per column of the interactions table.
print(df_interactions.nunique())

First 5 entries of interactions file:

          Target         Ligand T_Accession  \
0  CHEMBL1075104  CHEMBL1287853      Q5S007   
1  CHEMBL1075104  CHEMBL1289926      Q5S007   
2  CHEMBL1075104  CHEMBL1721885      Q5S007   
3  CHEMBL1075104  CHEMBL1789941      Q5S007   
4  CHEMBL1075104  CHEMBL1908397      Q5S007   

                                              T_Name  \
0  Leucine-rich repeat serine/threonine-protein k...   
1  Leucine-rich repeat serine/threonine-protein k...   
2  Leucine-rich repeat serine/threonine-protein k...   
3  Leucine-rich repeat serine/threonine-protein k...   
4  Leucine-rich repeat serine/threonine-protein k...   

                                    T_Pfam  min(ACT)  avg(ACT)  max(ACT)  \
0  PF00069,PF08477,PF12799,PF13855,PF16095     870.0     935.0    1000.0   
1  PF00069,PF08477,PF12799,PF13855,PF16095     920.0     955.0     990.0   
2  PF00069,PF08477,PF12799,PF13855,PF16095      70.0      71.0      72.0   
3  PF00069,PF08477,PF12799,PF13855,PF

In [3]:
from scipy.stats import stats

# Load similarity matrices to memory
def LoadSimMat2DataFrame(simmat, named_df):
    ''' Read a similarity matrix text file into an edge-list DataFrame.

    Each line of ``simmat`` is one matrix row of whitespace-separated values.
    Row and column positions are translated to identifiers by looking up the
    'ChEMBL_ID' column of ``named_df`` at the matching index label. Diagonal
    entries (self loops) are skipped.

    Returns a DataFrame with the columns 'Source', 'Target' and 'Similarity'.
    '''

    edges = {'Source': [], 'Target': [], 'Similarity': []}

    with open(simmat, 'r') as handle:
        for row, text in enumerate(handle):
            # str.split() without arguments never yields empty tokens,
            # so only the diagonal needs to be filtered out.
            for col, token in enumerate(text.split()):
                if row == col:
                    continue
                edges['Source'].append(named_df.at[row, 'ChEMBL_ID'])
                edges['Target'].append(named_df.at[col, 'ChEMBL_ID'])
                edges['Similarity'].append(float(token))

    return pd.DataFrame(edges)

# Other dataframe functions
def df_SetFold(df):
    ''' Add a 'Fold' column holding the index label modulo 10 (its last digit for non-negative integers).

    The DataFrame is modified in place; nothing is returned.
    '''

    df['Fold'] = df.index % 10

def df_NormalizeSimilarity(df):
    ''' Rescale the 'Similarity' column in place so that its maximum becomes 1.

    The previous implementation multiplied by 100 / max and then divided by
    100 again, so the effective range was always relative to 1, not 100 as
    its docstring claimed. Dividing by the maximum directly is equivalent up
    to floating-point rounding.

    The column is left untouched when there is nothing to rescale: an empty
    or all-NaN column, or a maximum of zero (which previously raised
    TypeError / ZeroDivisionError). Nothing is returned.
    '''

    # Series.max() already yields a scalar; wrapping a one-element Series in
    # float() is deprecated in recent pandas releases.
    peak = df['Similarity'].max()
    if pd.isna(peak) or peak == 0:
        return

    df['Similarity'] = df['Similarity'] / peak
    
def df_GetDistanceAndZScores(df):
    ''' Add 'Distance' (1 - Similarity) and the z-scores of both measures.

    Appends the columns 'Distance', 'Z_Similarity' and 'Z_Distance' to the
    DataFrame in place, in that order. Nothing is returned.
    '''

    similarity = df['Similarity']
    distance = 1 - similarity

    df['Distance'] = distance
    df['Z_Similarity'] = stats.zscore(similarity)
    df['Z_Distance'] = stats.zscore(distance)
    
def df_Transform2Positives(df, columns):
    ''' Shift each given column so that its minimum becomes zero.

    For every column whose minimum is negative, that minimum is subtracted
    from all entries (in place), making every value non-negative. Columns
    without negative values are left unchanged and a notice is printed.
    An all-NaN column is treated as having no negative values instead of
    raising. Nothing is returned.
    '''

    for column in list(columns):
        # Series.min() already yields a scalar; wrapping a one-element
        # Series in float() is deprecated in recent pandas releases.
        lowest = df[column].min()
        if lowest < 0:
            df[column] = df[column] - lowest
        else:
            print('DataFrame column does not comply with requirements (No negative values found)')

def df_Convert2Numeric(df, columns):
    ''' Cast every given column to float, replacing it in the DataFrame in place. '''

    for name in tuple(columns):
        as_float = df[name].astype('float')
        df[name] = as_float

def _build_similarity_table(simmat, nodes_df):
    ''' Run the shared similarity pipeline for one node table.

    Tags ``nodes_df`` with its 'Fold' column (in place), loads the matrix
    file ``simmat`` as an edge list, rescales the similarity, derives the
    distance and z-score columns, shifts the z-scores to be non-negative
    and casts all measure columns to float. Returns the edge-list DataFrame.
    '''
    df_SetFold(nodes_df)

    table = LoadSimMat2DataFrame(simmat, nodes_df)

    df_NormalizeSimilarity(table)
    df_GetDistanceAndZScores(table)
    df_Transform2Positives(table, ['Z_Similarity', 'Z_Distance'])
    df_Convert2Numeric(table, ['Similarity', 'Distance', 'Z_Similarity', 'Z_Distance'])

    return table

# Ligand-ligand similarities
sim_ligands = _build_similarity_table(ligands_simmat, df_ligands)

print('First 5 entries of processed Ligand similarity matrix:\n')
print(sim_ligands.head())

print('\n'+'-'*64+'\n')

# Target-target similarities
sim_targets = _build_similarity_table(targets_simmat, df_targets)

print('First 5 entries of processed Target similarity matrix:\n')
print(sim_targets.head())

print('\n'+'-'*64+'\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
print('Technical information for processed Ligand similarity matrix:\n')
sim_ligands.info(memory_usage = 'deep')

print('\n'+'-'*64+'\n')

print('Technical information for processed Target similarity matrix:\n')
sim_targets.info(memory_usage = 'deep')

First 5 entries of processed Ligand similarity matrix:

        Source      Target  Similarity  Distance  Z_Similarity  Z_Distance
0   CHEMBL1002  CHEMBL1000    0.264706  0.735294      3.119216    8.664482
1  CHEMBL10041  CHEMBL1000    0.294643  0.705357      3.471984    8.311714
2  CHEMBL10041  CHEMBL1002    0.402299  0.597701      4.740570    7.043128
3   CHEMBL1006  CHEMBL1000    0.083333  0.916667      0.981974   10.801724
4   CHEMBL1006  CHEMBL1002    0.119048  0.880952      1.402826   10.380872

----------------------------------------------------------------

First 5 entries of processed Target similarity matrix:

          Source         Target  Similarity  Distance  Z_Similarity  \
0  CHEMBL1075132  CHEMBL1075104    0.009525  0.990475      0.172216   
1  CHEMBL1075133  CHEMBL1075104    0.048693  0.951307      0.880368   
2  CHEMBL1075133  CHEMBL1075132    0.006654  0.993346      0.120297   
3  CHEMBL1075144  CHEMBL1075104    0.000348  0.999652      0.006292   
4  CHEMBL1075144

In [4]:
import networkx as nx

# Create graph objects in networkX
def _first_attr_names(data_view):
    ''' Return the comma-separated attribute names of the first entry of a ``data=True`` view. '''
    # The attribute dict is the last element of both (node, data) and
    # (u, v, data) tuples. Only the first entry is fetched, so the whole
    # view (hundreds of thousands of edges) is not copied into a list.
    return ', '.join(next(iter(data_view))[-1].keys())

# Ligand-ligand similarity graph: every edge carries all similarity columns plus Type = 'LL'.
Ligands_Graph = nx.from_pandas_edgelist(sim_ligands, 'Source', 'Target', edge_attr = True)
Ligands_Graph = nx.Graph(Ligands_Graph, name = 'Ligands')
nx.set_node_attributes(Ligands_Graph, 'Ligand', 'Type')
nx.set_edge_attributes(Ligands_Graph, 'LL', 'Type')

# Target-target similarity graph: same layout with Type = 'TT'.
Targets_Graph = nx.from_pandas_edgelist(sim_targets, 'Source', 'Target', edge_attr = True)
Targets_Graph = nx.Graph(Targets_Graph, name = 'Targets')
nx.set_node_attributes(Targets_Graph, 'Target', 'Type')
nx.set_edge_attributes(Targets_Graph, 'TT', 'Type')

# Ligand-target interaction graph: edges get Type = 'LT' and a constant 100 for
# every measure; nodes get the 'Fold' taken from the ligand/target node tables.
Interactions_Graph = nx.from_pandas_edgelist(df_interactions, 'Ligand', 'Target')
Interactions_Graph = nx.Graph(Interactions_Graph, name = 'Interactions')
nx.set_edge_attributes(Interactions_Graph, 'LT', 'Type')
nx.set_edge_attributes(Interactions_Graph, 100, 'Similarity')
nx.set_edge_attributes(Interactions_Graph, 100, 'Distance')
nx.set_edge_attributes(Interactions_Graph, 100, 'Z_Similarity')
nx.set_edge_attributes(Interactions_Graph, 100, 'Z_Distance')
nx.set_node_attributes(Interactions_Graph, dict(zip(df_ligands.ChEMBL_ID, df_ligands.Fold)), 'Fold')
nx.set_node_attributes(Interactions_Graph, dict(zip(df_targets.ChEMBL_ID, df_targets.Fold)), 'Fold')

# NOTE(review): nx.info() was removed in NetworkX 3.0; on newer versions
# print(graph) gives a comparable summary -- confirm against the installed version.
print('Information for Ligands similarity graph:\n')
print(nx.info(Ligands_Graph))
print('Edge attributes:', _first_attr_names(Ligands_Graph.edges(data=True)))
print('Node attributes:', _first_attr_names(Ligands_Graph.nodes(data=True)))

print('\n'+'-'*64+'\n')

print('Information for Targets similarity graph:\n')
print(nx.info(Targets_Graph))
print('Edge attributes:', _first_attr_names(Targets_Graph.edges(data=True)))
print('Node attributes:', _first_attr_names(Targets_Graph.nodes(data=True)))

print('\n'+'-'*64+'\n')

print('Information for DTI (Interactions) graph:\n')
print(nx.info(Interactions_Graph))
# Report the interaction graph's own edge attributes (this line previously
# inspected Targets_Graph by mistake).
print('Edge attributes:', _first_attr_names(Interactions_Graph.edges(data=True)))

Information for Ligands similarity graph:

Name: Ligands
Type: Graph
Number of nodes: 1232
Number of edges: 758296
Average degree: 1231.0000
Edge attributes: Similarity, Distance, Z_Similarity, Z_Distance, Type
Node attributes: Type

----------------------------------------------------------------

Information for Targets similarity graph:

Name: Targets
Type: Graph
Number of nodes: 897
Number of edges: 401856
Average degree: 896.0000
Edge attributes: Similarity, Distance, Z_Similarity, Z_Distance, Type
Node attributes: Type

----------------------------------------------------------------

Information for DTI (Interactions) graph:

Name: Interactions
Type: Graph
Number of nodes: 2129
Number of edges: 9641
Average degree:   9.0568
Edge attributes: Similarity, Distance, Z_Similarity, Z_Distance, Type


In [5]:
# Merge the ligand, target and interaction graphs into one master graph and save it as GraphML.
Master_Graph = nx.compose_all([Ligands_Graph, Targets_Graph, Interactions_Graph])
# Re-wrap the composed graph to give it a name.
Master_Graph = nx.Graph(Master_Graph, name = 'Complete graph')

print('Information for Master graph:\n')
print(nx.info(Master_Graph))
# Attribute names are read from the first edge / first node only.
print('Edge attributes:', ', '.join(list((list(Master_Graph.edges(data=True))[0][2]).keys())))
print('Node attributes:', ', '.join(list((list(Master_Graph.nodes(data=True))[0][1]).keys())))

# The output file is written next to the interactions input file, with '.graphml' appended to its path.
nx.write_graphml(Master_Graph, interactions_file + '.graphml')
print('\nSaved graph as .graphml file.')

Information for Master graph:

Name: Complete graph
Type: Graph
Number of nodes: 2129
Number of edges: 1169793
Average degree: 1098.9131
Edge attributes: Similarity, Distance, Z_Similarity, Z_Distance, Type
Node attributes: Type, Fold

Saved graph as .graphml file.


In [6]:
import graph_tool.all as gt

# Convert the master graph from NetworkX to graph-tool using Kuan Butts' code and save it as a
# binary graph file (.gt). The .graphml copy of the same graph is written in the previous cell.
#
# Link to blog post: http://kuanbutts.com/2018/08/17/peartree-to-graph-tool/
# TODO: Ask the author for the details needed to acknowledge this code properly

def get_prop_type(value, key=None):
    """
    Performs typing and value conversion for the graph_tool PropertyMap class.
    If a key is provided, it also ensures the key is in a format that can be
    used with the PropertyMap. Returns a tuple, (type name, value, key)

    Mapping applied to ``value``:
      * bool                      -> 'bool'   (value unchanged)
      * any other real number     -> 'float'  (value converted with float());
        this covers int and float as well as NumPy integer/floating scalars,
        which are not subclasses of int/float and used to fall through to
        'string'
      * bytes                     -> 'string' (value decoded)
      * dict                      -> 'object' (value unchanged)
      * anything else             -> 'string' (value converted with str())
    """
    # Imported locally so the function does not depend on a module-level import.
    import numbers

    # Ensure that key is returned as a str type
    if isinstance(key, bytes):
        key = key.decode()

    # Deal with the value; bool must be tested first because it is also an int
    if isinstance(value, bool):
        tname = 'bool'

    elif isinstance(value, numbers.Real):
        tname = 'float'
        value = float(value)

    elif isinstance(value, bytes):
        tname = 'string'
        value = value.decode()

    elif isinstance(value, dict):
        tname = 'object'

    else:
        tname = 'string'
        value = str(value)

    return tname, value, key

def nx2gt(nxG):
    """
    Converts a networkx graph to a graph-tool graph.

    Graph, node and edge attributes become internal graph-tool property maps
    whose value type is chosen by get_prop_type from the first value seen for
    each attribute name. The original NetworkX node identifier is stored as a
    string in the vertex property 'ID'.

    The input graph is not modified. Returns the new graph-tool Graph.
    """
    # Phase 0: Create a directed or undirected graph-tool Graph
    gtG = gt.Graph(directed=nxG.is_directed())

    # Add the Graph properties as "internal properties"
    for key, value in nxG.graph.items():
        # Convert the value and key into a type for graph-tool
        tname, value, key = get_prop_type(value, key)

        prop = gtG.new_graph_property(tname) # Create the PropertyMap
        gtG.graph_properties[key] = prop     # Set the PropertyMap
        gtG.graph_properties[key] = value    # Set the actual value

    # Phase 1: Add the vertex and edge property maps
    # Go through all nodes and edges and add seen properties
    # Add the node properties first
    nprops = set() # cache keys to only add properties once
    for node, data in nxG.nodes(data=True):

        # Go through all the properties if not seen and add them.
        for key, val in data.items():
            if key in nprops: continue # Skip properties already added

            # Convert the value and key into a type for graph-tool
            tname, _, key  = get_prop_type(val, key)

            prop = gtG.new_vertex_property(tname) # Create the PropertyMap
            gtG.vertex_properties[key] = prop     # Set the PropertyMap

            # Add the key to the already seen properties
            nprops.add(key)

    # Also add the node id: in NetworkX a node can be any hashable type, but
    # in graph-tool node are defined as indices. So we capture any strings
    # in a special PropertyMap called 'ID' -- modify as needed!
    gtG.vertex_properties['ID'] = gtG.new_vertex_property('string')

    # Add the edge properties second
    eprops = set() # cache keys to only add properties once
    for src, dst, data in nxG.edges(data=True):

        # Go through all the edge properties if not seen and add them.
        for key, val in data.items():
            if key in eprops: continue # Skip properties already added

            # Convert the value and key into a type for graph-tool
            tname, _, key = get_prop_type(val, key)

            prop = gtG.new_edge_property(tname) # Create the PropertyMap
            gtG.edge_properties[key] = prop     # Set the PropertyMap

            # Add the key to the already seen properties
            eprops.add(key)

    # Phase 2: Actually add all the nodes and vertices with their properties
    # Add the nodes
    vertices = {} # vertex mapping for tracking edges later
    for node, data in nxG.nodes(data=True):

        # Create the vertex and annotate for our edges later
        v = gtG.add_vertex(n=1)
        vertices[node] = v

        # Set the vertex properties, not forgetting the id property.
        # Work on a shallow copy so that the 'ID' entry is not written back
        # into the NetworkX graph's own attribute dict.
        vprops = dict(data)
        vprops['ID'] = str(node)
        for key, value in vprops.items():
            tname, value, key = get_prop_type(value, key)
            gtG.vp[key][v] = value # vp is short for vertex_properties

    # Add the edges
    for src, dst, data in nxG.edges(data=True):

        # Look up the vertex structs from our vertices mapping and add edge.
        e = gtG.add_edge(vertices[src], vertices[dst])

        # Add the edge properties, converting key and value the same way as
        # for vertices so both match the property maps created in Phase 1
        for key, value in data.items():
            _, value, key = get_prop_type(value, key)
            gtG.ep[key][e] = value # ep is short for edge_properties

    # Done, finally!
    return gtG

# Convert the merged NetworkX graph into a graph-tool graph.
# NOTE(review): the recorded run of this cell stopped with ModuleNotFoundError because
# graph_tool was not installed in the kernel's environment; install it before running.
Master_Graph_gt = nx2gt(Master_Graph)

print('Information for Master graph as Graph-Tool object:\n')
print(Master_Graph_gt)

# The binary file is written next to the interactions input file, with '.gt' appended to its path.
Master_Graph_gt.save(interactions_file+'.gt')
print('Saved graph object to binary file format.')

ModuleNotFoundError: No module named 'graph_tool'