In [8]:
import numpy as np
import pandas as pd
import networkx as nx
import os

In [2]:
#gets distance matrix and returns similarity matrix

def dist_to_similarity_matrix(matrix_df):
    matrix_np = matrix_df.to_numpy()
    max_df = matrix_np.max()
    sim_np = max_df - matrix_np

    matrix_df.index.name = None
    index_m = matrix_df.index

    sim_df = pd.DataFrame(data=sim_np, index=index_m, columns=index_m)
    
    sort_matrix(sim_df)
    
    return sim_df

In [69]:
#convert tee edit distance matrix to similarity matrix for better comparison
tree_edit = pd.read_csv (r'treeEditDistanceResult.CSV', index_col=0)

df_tree_edit_similarity = dist_to_similarity_matrix(tree_edit)
df_tree_edit_similarity
df_tree_edit_similarity.to_csv('df_tree_edit_similarity.csv')

In [3]:
#fixing symmatry of duos approx similarity matrix
def fix_symmetry_of_similarity(df):
    for ind in df.index:
        col= df[ind]
        row = df.loc[ind]
        for i in range(len(df.index)):
            col_val = col[i]
            row_val = row[i]
            if row_val > col_val:
                df[ind][i] = row_val
            elif col_val > row_val:
                df.loc[ind][i] = col_val

    return df

In [4]:
#sort matrix by row and col indices for easier comparison
def sort_matrix(df):
    df.sort_index(key=lambda x: (x.to_series().str[3:].astype(int)), axis = 0, inplace=True)
    df.sort_index(key=lambda x: (x.to_series().str[3:].astype(int)), axis = 1, inplace=True)

In [49]:
#fix duos approx similarity matrix
df_matrix_approx = pd.read_csv (r'df_duos_approx_entire.csv', index_col=0)

fixed_df_matrix_approx = fix_symmetry_of_similarity(df_matrix_approx)
sort_matrix(fixed_df_matrix_approx)
fixed_df_matrix_approx.to_csv('fixed_df_matrix_approx_entire.csv')

In [50]:
#fix duos gurobi similarity matrix
fixed_df_matrix_gurobi = pd.read_csv (r'df_duos_gurobi_entire.csv', index_col=0)

sort_matrix(fixed_df_matrix_gurobi)
fixed_df_matrix_gurobi.to_csv('fixed_df_matrix_gurobi_entire.csv')

In [5]:
#helper functions for computing functional similarity matrix based on GO terms

import itertools

def compute_jaccard_matrix(df):
    # Iterate through columns and compute jaccard index

    sim_df = pd.DataFrame(columns=df.columns, index=df.columns)
    for col_pair in itertools.combinations(df.columns, 2):
        u1= col_pair[0]
        u2 = col_pair[1]
        sim_df.loc[col_pair] = compute_jaccard(set(df[u1].dropna()), set(df[u2].dropna()))
    
    for i in sim_df.index:
        sim_df[i].loc[i] = 1.0
    
    return sim_df


def compute_jaccard(user1_vals, user2_vals):
    intersection = user1_vals.intersection(user2_vals)
    union = user1_vals.union(user2_vals)
    if float(len(union)) == 0 or len(user1_vals) == 0 or len(user2_vals) == 0:
        return 0.0
    jaccard = len(intersection)/float(len(union))
    return jaccard

In [6]:
#main function to compute functional similarity matrix based on GO terms
def compute_functional_matrix(excel_df):
    
    #get all GO data
    GO_data = excel_df.loc[:,"Gene Ontology": "Unnamed: 22"]

    functional_sim_GO = compute_jaccard_matrix(GO_data.T)

    #set row and col indices for easier understanding
    PKB = excel_df['PKBno.']
    functional_sim_GO.set_index(PKB, inplace=True)
    functional_sim_GO.columns = PKB

    #mirror matrix on diagonal for easier comparison
    zeros_functional_sim_GO = functional_sim_GO.fillna(0)
    np_functional_sim_GO = zeros_functional_sim_GO.to_numpy()
    np_functional_sim_GO = np_functional_sim_GO + np_functional_sim_GO.T - np.diag(np.diag(np_functional_sim_GO))

    #make dataframe from functional matrix
    res = pd.DataFrame(data=np_functional_sim_GO, index=functional_sim_GO.index, columns=functional_sim_GO.index)

    sort_matrix(res)

    return res


In [13]:
#compute functional similarity matrix based on GO terms in our data
excel_df = pd.read_excel (r'data_wang_jcb2020/data_wang_jcb2020/PKBdatasetGeneOntologyInformationCollection.xlsx')
functional_matrix = compute_functional_matrix(excel_df)
functional_matrix.to_csv('functional_similarity_GO.csv')

In [69]:
def sim_to_distance_matrix(matrix_df):

    matrix_np = matrix_df.to_numpy()
    max_df = matrix_np.max()
    print(max_df)
    dist_np = max_df - matrix_np

    matrix_df.index.name = None
    index_m = matrix_df.index

    dist_df = pd.DataFrame(data=dist_np, index=index_m, columns=index_m)
    
    sort_matrix(dist_df)
    
    return dist_df

In [13]:
#fix duos gurobi similarity matrix
gurobi_df_dist = pd.read_csv (r'fixed_df_matrix_gurobi_entire.csv', index_col=0)
gurobi_df_dist.head()


Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,61.0,28.0,24.0,24.0,28.0,23.0,24.0,27.0,26.0,25.0,...,30.0,32.0,40.0,38.0,38.0,30.0,22.0,24.0,18.0,33.0
PKB6,28.0,51.0,37.0,33.0,51.0,27.0,35.0,44.0,35.0,34.0,...,23.0,26.0,32.0,28.0,32.0,22.0,20.0,22.0,17.0,33.0
PKB7,24.0,37.0,51.0,30.0,37.0,32.0,38.0,39.0,30.0,33.0,...,24.0,26.0,31.0,31.0,33.0,24.0,18.0,21.0,17.0,29.0
PKB8,24.0,33.0,30.0,49.0,33.0,37.0,31.0,30.0,35.0,41.0,...,22.0,25.0,29.0,29.0,30.0,23.0,19.0,24.0,18.0,28.0
PKB9,28.0,51.0,37.0,33.0,51.0,27.0,35.0,44.0,35.0,34.0,...,23.0,26.0,32.0,28.0,32.0,22.0,20.0,22.0,17.0,33.0


In [15]:
gurobi_df_dist = sim_to_distance_matrix(gurobi_df_dist)
gurobi_df_dist.to_csv('gurobi_df_dist.csv')
gurobi_df_dist.head()

179.0


Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,-118.0,-151.0,-155.0,-155.0,-151.0,-156.0,-155.0,-152.0,-153.0,-154.0,...,-149.0,-147.0,-139.0,-141.0,-141.0,-149.0,-157.0,-155.0,-161.0,-146.0
PKB6,-151.0,-128.0,-142.0,-146.0,-128.0,-152.0,-144.0,-135.0,-144.0,-145.0,...,-156.0,-153.0,-147.0,-151.0,-147.0,-157.0,-159.0,-157.0,-162.0,-146.0
PKB7,-155.0,-142.0,-128.0,-149.0,-142.0,-147.0,-141.0,-140.0,-149.0,-146.0,...,-155.0,-153.0,-148.0,-148.0,-146.0,-155.0,-161.0,-158.0,-162.0,-150.0
PKB8,-155.0,-146.0,-149.0,-130.0,-146.0,-142.0,-148.0,-149.0,-144.0,-138.0,...,-157.0,-154.0,-150.0,-150.0,-149.0,-156.0,-160.0,-155.0,-161.0,-151.0
PKB9,-151.0,-128.0,-142.0,-146.0,-128.0,-152.0,-144.0,-135.0,-144.0,-145.0,...,-156.0,-153.0,-147.0,-151.0,-147.0,-157.0,-159.0,-157.0,-162.0,-146.0


61

In [48]:
in_path="/home/jana/Documents/BIONETs/Code/tree_match_approx_validator/data_test/"

def normalize_matrix(matrix_df, in_path):
    '''function to normalize similarity matrix by dividing every entry by the maximum number of edges of the two compared trees
        input: matrix_df :  similarity matrix as dataframe
                in_path:    path of the data for the computation of this matrix (trees in gml format)'''

    matrix_copy = matrix_df.copy(deep=True)
    

    for i in range(len(matrix_copy.index)):
        for j in range(len(matrix_copy.index)):
            idx_i = matrix_copy.index[i]
            idx_j = matrix_copy.index[j]

            in_path_i = in_path + idx_i + "/1.txt"
            in_path_j = in_path + idx_j + "/1.txt"

            edge_count_i = nx.read_gml(in_path_i, label='id').number_of_edges()
            edge_count_j = nx.read_gml(in_path_j, label='id').number_of_edges()

            new_val = matrix_copy.loc[idx_i, idx_j] / max(edge_count_i,edge_count_j)

            matrix_copy.loc[idx_i, idx_j] = new_val
            matrix_copy.loc[idx_j, idx_i] = new_val
    
    return matrix_copy

In [67]:
gurobi = pd.read_csv (r'fixed_df_matrix_gurobi_entire.csv', index_col=0)
gurobi.head()

Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,61.0,28.0,24.0,24.0,28.0,23.0,24.0,27.0,26.0,25.0,...,30.0,32.0,40.0,38.0,38.0,30.0,22.0,24.0,18.0,33.0
PKB6,28.0,51.0,37.0,33.0,51.0,27.0,35.0,44.0,35.0,34.0,...,23.0,26.0,32.0,28.0,32.0,22.0,20.0,22.0,17.0,33.0
PKB7,24.0,37.0,51.0,30.0,37.0,32.0,38.0,39.0,30.0,33.0,...,24.0,26.0,31.0,31.0,33.0,24.0,18.0,21.0,17.0,29.0
PKB8,24.0,33.0,30.0,49.0,33.0,37.0,31.0,30.0,35.0,41.0,...,22.0,25.0,29.0,29.0,30.0,23.0,19.0,24.0,18.0,28.0
PKB9,28.0,51.0,37.0,33.0,51.0,27.0,35.0,44.0,35.0,34.0,...,23.0,26.0,32.0,28.0,32.0,22.0,20.0,22.0,17.0,33.0


In [63]:
in_path="/home/jana/Documents/BIONETs/Code/tree_match_approx_validator/data_test/"

new_gurobi= normalize_matrix(gurobi, in_path)
new_gurobi.head()

Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,1.0,0.007525,0.00645,0.00645,0.007525,0.006181,0.00645,0.007256,0.006987,0.006719,...,0.008062,0.0086,0.00483,0.004691,0.001484,0.008062,0.005912,0.00645,0.004837,0.004567
PKB6,0.007525,1.0,0.014225,0.012687,0.019608,0.010381,0.013456,0.016917,0.013456,0.013072,...,0.007334,0.008002,0.003864,0.003457,0.00125,0.008458,0.007689,0.008458,0.006536,0.004567
PKB7,0.00645,0.014225,1.0,0.011534,0.014225,0.012303,0.01461,0.014994,0.011534,0.012687,...,0.007653,0.008002,0.003744,0.003827,0.001289,0.009227,0.00692,0.008074,0.006536,0.004014
PKB8,0.00645,0.012687,0.011534,1.0,0.012687,0.01541,0.011918,0.011534,0.014577,0.0164,...,0.007015,0.007695,0.003502,0.00358,0.001172,0.009579,0.007913,0.009996,0.007497,0.003875
PKB9,0.007525,0.019608,0.014225,0.012687,1.0,0.010381,0.013456,0.016917,0.013456,0.013072,...,0.007334,0.008002,0.003864,0.003457,0.00125,0.008458,0.007689,0.008458,0.006536,0.004567


In [65]:
new_gurobi.to_csv('normalized_gurobi.csv')

In [68]:
new_gurobi.head()

Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,1.0,0.007525,0.00645,0.00645,0.007525,0.006181,0.00645,0.007256,0.006987,0.006719,...,0.008062,0.0086,0.00483,0.004691,0.001484,0.008062,0.005912,0.00645,0.004837,0.004567
PKB6,0.007525,1.0,0.014225,0.012687,0.019608,0.010381,0.013456,0.016917,0.013456,0.013072,...,0.007334,0.008002,0.003864,0.003457,0.00125,0.008458,0.007689,0.008458,0.006536,0.004567
PKB7,0.00645,0.014225,1.0,0.011534,0.014225,0.012303,0.01461,0.014994,0.011534,0.012687,...,0.007653,0.008002,0.003744,0.003827,0.001289,0.009227,0.00692,0.008074,0.006536,0.004014
PKB8,0.00645,0.012687,0.011534,1.0,0.012687,0.01541,0.011918,0.011534,0.014577,0.0164,...,0.007015,0.007695,0.003502,0.00358,0.001172,0.009579,0.007913,0.009996,0.007497,0.003875
PKB9,0.007525,0.019608,0.014225,0.012687,1.0,0.010381,0.013456,0.016917,0.013456,0.013072,...,0.007334,0.008002,0.003864,0.003457,0.00125,0.008458,0.007689,0.008458,0.006536,0.004567


In [71]:
gurobi_df_dist = sim_to_distance_matrix(new_gurobi)
gurobi_df_dist.head()
gurobi_df_dist.to_csv('normalized_gurobi_dist.csv')

1.0


In [75]:
approx = pd.read_csv (r'normalized_approx.csv', index_col=0)
#approx.head()
new_approx = sim_to_distance_matrix(approx)
new_approx.head()
new_approx.to_csv('normalized_apporx_dist.csv')

1.0


In [76]:
df_tree_edit_similarity = pd.read_csv (r'df_tree_edit_similarity.csv', index_col=0)
df_tree_edit_similarity.head()

Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,191,145,141,142,145,140,147,147,142,142,...,134,135,116,133,53,157,134,132,142,115
PKB6,145,191,178,172,191,168,179,187,176,175,...,137,135,111,117,52,146,145,140,151,122
PKB7,141,178,191,172,178,175,180,180,170,175,...,136,135,120,123,55,150,141,140,150,118
PKB8,142,172,172,191,172,180,173,171,181,186,...,138,136,108,120,51,144,144,145,148,119
PKB9,145,191,178,172,191,168,179,187,176,175,...,137,135,111,117,52,146,145,140,151,122


In [77]:
norm_tree = normalize_matrix(df_tree_edit_similarity, in_path)

In [78]:
norm_tree.head()

Unnamed: 0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
PKB4,3.131148,0.038968,0.037893,0.038162,0.038968,0.037624,0.039506,0.039506,0.038162,0.038162,...,0.036012,0.036281,0.014008,0.01642,0.00207,0.042193,0.036012,0.035474,0.038162,0.015917
PKB6,0.038968,3.745098,0.068435,0.066128,0.073433,0.064591,0.06882,0.071895,0.067666,0.067282,...,0.043686,0.041551,0.013404,0.014444,0.002031,0.056132,0.055748,0.053825,0.058055,0.016886
PKB7,0.037893,0.068435,3.745098,0.066128,0.068435,0.067282,0.069204,0.069204,0.065359,0.067282,...,0.043367,0.041551,0.014491,0.015185,0.002148,0.05767,0.05421,0.053825,0.05767,0.016332
PKB8,0.038162,0.066128,0.066128,3.897959,0.066128,0.074969,0.066513,0.065744,0.075385,0.0744,...,0.044005,0.041859,0.013042,0.014815,0.001992,0.059975,0.059975,0.060392,0.061641,0.016471
PKB9,0.038968,0.073433,0.068435,0.066128,3.745098,0.064591,0.06882,0.071895,0.067666,0.067282,...,0.043686,0.041551,0.013404,0.014444,0.002031,0.056132,0.055748,0.053825,0.058055,0.016886


In [83]:
df_tree_edit_dist= pd.read_csv (r'treeEditDistanceResult.CSV', index_col=0)
tree_dist = sort_matrix(df_tree_edit_dist)
df_tree_edit_dist.head()

Unnamed: 0_level_0,PKB4,PKB6,PKB7,PKB8,PKB9,PKB10,PKB11,PKB13,PKB14,PKB15,...,PKB206,PKB207,PKB218,PKB233,PKB236,PKB240,PKB241,PKB242,PKB247,PKB248
dist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PKB4,0,46,50,49,46,51,44,44,49,49,...,57,56,75,58,138,34,57,59,49,76
PKB6,46,0,13,19,0,23,12,4,15,16,...,54,56,80,74,139,45,46,51,40,69
PKB7,50,13,0,19,13,16,11,11,21,16,...,55,56,71,68,136,41,50,51,41,73
PKB8,49,19,19,0,19,11,18,20,10,5,...,53,55,83,71,140,47,47,46,43,72
PKB9,46,0,13,19,0,23,12,4,15,16,...,54,56,80,74,139,45,46,51,40,69


In [85]:
norm_tree = normalize_matrix(df_tree_edit_dist, in_path)

In [90]:
norm_tree.max()

PKB4      0.016931
PKB6      0.021915
PKB7      0.021530
PKB8      0.021658
PKB9      0.021915
            ...   
PKB240    0.022299
PKB241    0.024102
PKB242    0.023540
PKB247    0.028320
PKB248    0.012595
Length: 72, dtype: float64