In [8]:
"""
Created on Apr 2022

@authors: jkiederle
"""

import numpy as np
import pandas as pd

In [None]:
#sort matrix by row and col indices for easier comparison
def sort_matrix(df):
    """Sort a labelled matrix in place by the numeric part of its labels.

    Each row and column label is expected to be a string whose characters
    from position 3 onward form an integer; that integer is the sort key, so
    labels are ordered numerically rather than lexicographically.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix to reorder. It is modified in place.

    Returns
    -------
    None
    """
    def _numeric_suffix(labels):
        # Drop the three-character prefix and compare the remainder as int.
        return labels.to_series().str[3:].astype(int)

    # Rows first (axis 0), then columns (axis 1).
    for axis in (0, 1):
        df.sort_index(key=_numeric_suffix, axis=axis, inplace=True)

In [2]:
#gets distance matrix and returns similarity matrix

def dist_to_similarity_matrix(matrix_df):
    """Convert a distance matrix into a similarity matrix.

    Every entry is replaced by ``max - entry``, where ``max`` is the largest
    value found anywhere in the matrix, so the largest distance maps to a
    similarity of 0.

    Parameters
    ----------
    matrix_df : pandas.DataFrame
        Distance matrix. Its row labels are reused as the column labels of
        the result, so it is assumed to be square with rows and columns in
        the same order -- TODO confirm against callers. The labels must be
        sortable by ``sort_matrix``.

    Returns
    -------
    pandas.DataFrame
        A new similarity matrix with an unnamed index, ordered by
        ``sort_matrix``. The input frame is not modified.
    """
    distances = matrix_df.to_numpy()
    similarities = distances.max() - distances

    # rename() returns a new Index, so the caller's frame keeps its index
    # name; assigning to ``matrix_df.index.name`` would mutate the argument.
    labels = matrix_df.index.rename(None)

    sim_df = pd.DataFrame(data=similarities, index=labels, columns=labels)

    sort_matrix(sim_df)

    return sim_df

In [None]:
def sim_to_distance_matrix(matrix_df):
    """Convert a similarity matrix into a distance matrix.

    Every entry is replaced by ``max - entry``, where ``max`` is the largest
    value found anywhere in the matrix, so the largest similarity maps to a
    distance of 0.

    Parameters
    ----------
    matrix_df : pandas.DataFrame
        Similarity matrix. Its row labels are reused as the column labels of
        the result, so it is assumed to be square with rows and columns in
        the same order -- TODO confirm against callers. The labels must be
        sortable by ``sort_matrix``.

    Returns
    -------
    pandas.DataFrame
        A new distance matrix with an unnamed index, ordered by
        ``sort_matrix``. The input frame is not modified.
    """
    similarities = matrix_df.to_numpy()
    distances = similarities.max() - similarities

    # rename() returns a new Index, so the caller's frame keeps its index
    # name; assigning to ``matrix_df.index.name`` would mutate the argument.
    labels = matrix_df.index.rename(None)

    dist_df = pd.DataFrame(data=distances, index=labels, columns=labels)

    sort_matrix(dist_df)

    return dist_df

In [5]:
#helper functions for computing functional similarity matrix based on GO terms

import itertools

def compute_jaccard_matrix(df):
    """Compute pairwise Jaccard similarity between the columns of ``df``.

    Each column is treated as a set of its non-missing values. For every
    unordered pair of columns the Jaccard index is stored at
    ``[first, second]``, where ``first`` precedes ``second`` in column order,
    so only the upper triangle is filled. The diagonal is set to 1.0 and the
    lower triangle is left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per item to compare; column labels are assumed unique.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``df.columns``.
    """
    labels = df.columns
    sim_df = pd.DataFrame(columns=labels, index=labels)

    # Build each column's value set once instead of once per pair.
    value_sets = {label: set(df[label].dropna()) for label in labels}

    for u1, u2 in itertools.combinations(labels, 2):
        sim_df.loc[u1, u2] = compute_jaccard(value_sets[u1], value_sets[u2])

    # A single .loc call writes into sim_df itself. The chained form
    # ``sim_df[i].loc[i] = ...`` assigns to a temporary column object and is
    # not written back when pandas Copy-on-Write is active.
    for label in labels:
        sim_df.loc[label, label] = 1.0

    return sim_df


def compute_jaccard(user1_vals, user2_vals):
    """Return the Jaccard index of two sets.

    The index is ``|A intersect B| / |A union B|``. When either set is empty
    (which also covers an empty union) the result is defined as 0.0.

    Parameters
    ----------
    user1_vals, user2_vals : set
        The two collections to compare.

    Returns
    -------
    float
        Value between 0.0 and 1.0.
    """
    shared = user1_vals.intersection(user2_vals)
    combined = user1_vals.union(user2_vals)

    # Nothing to compare unless the union and both inputs are non-empty.
    comparable = len(combined) and len(user1_vals) and len(user2_vals)
    if not comparable:
        return 0.0

    return len(shared) / float(len(combined))

In [6]:
#main function to compute functional similarity matrix based on GO terms
def compute_functional_matrix(excel_df, go_first_col="Gene Ontology",
                              go_last_col="Unnamed: 22", id_col="PKBno."):
    """Build a symmetric functional similarity matrix from GO annotations.

    Each row of ``excel_df`` is one entry; the columns from ``go_first_col``
    to ``go_last_col`` hold its GO terms. Similarity between two entries is
    the Jaccard index of their GO term sets.

    Parameters
    ----------
    excel_df : pandas.DataFrame
        Table with the GO term columns and an identifier column.
    go_first_col, go_last_col : str, optional
        First and last column label of the GO term block (both inclusive).
        The defaults match the original hard-coded spreadsheet layout.
    id_col : str, optional
        Column whose values label the rows and columns of the result. The
        values must be sortable by ``sort_matrix``.

    Returns
    -------
    pandas.DataFrame
        Square, symmetric float matrix labelled by ``id_col`` values and
        ordered by ``sort_matrix``.
    """
    # Label-based slice: both end columns are included.
    GO_data = excel_df.loc[:, go_first_col:go_last_col]

    # Transpose so every entry becomes a column, as compute_jaccard_matrix
    # compares columns pairwise.
    functional_sim_GO = compute_jaccard_matrix(GO_data.T)

    # Label rows and columns with the identifiers for easier reading.
    PKB = excel_df[id_col]
    functional_sim_GO = functional_sim_GO.set_index(PKB)
    functional_sim_GO.columns = PKB

    # Cast to float before filling, so the arithmetic below runs on a
    # numeric array and does not depend on object-dtype downcasting in fillna.
    upper = functional_sim_GO.astype(float).fillna(0.0).to_numpy()

    # Mirror across the diagonal; subtract the diagonal once because the sum
    # of the matrix and its transpose counts it twice.
    mirrored = upper + upper.T - np.diag(np.diag(upper))

    res = pd.DataFrame(data=mirrored, index=functional_sim_GO.index,
                       columns=functional_sim_GO.index)

    sort_matrix(res)

    return res


In [13]:
# Compute the GO-term functional similarity matrix for our data and save it as CSV.
excel_df = pd.read_excel(
    io=r'data_wang_jcb2020/data_wang_jcb2020/PKBdatasetGeneOntologyInformationCollection.xlsx'
)
functional_matrix = compute_functional_matrix(excel_df)
functional_matrix.to_csv(path_or_buf='functional_similarity_GO.csv')