In [1]:
import pandas as pd
import numpy as np
import pickle
import random
from sklearn.neighbors import KDTree
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [2]:
local_stringdb = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/data/string/lfs-stringdb/'
# load local STRING database and names
stringdf = pd.read_csv(local_stringdb+'9606.protein.info.v12.0.txt', sep='\t', header=0, usecols=['#string_protein_id', 'preferred_name'])
stringdf['preferred_name'] = stringdf['preferred_name'].str.upper()
stringId2name = stringdf.set_index('#string_protein_id')['preferred_name'].to_dict()
name2stringId = stringdf.set_index('preferred_name')['#string_protein_id'].to_dict()
stringdf = pd.read_csv(local_stringdb+'9606.protein.aliases.v12.0.txt', sep='\t', header=0, usecols=['#string_protein_id', 'alias']).drop_duplicates(['alias'], keep='first')
stringdf['alias'] = stringdf['alias'].str.upper()
aliases2stringId = stringdf.set_index('alias')['#string_protein_id'].to_dict()

def get_df(dataset):

    files_path = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/results/models/Dataset-'+dataset+'--RE-True--W-True--Epochs-15000--D-4--RH-25--LR-0.1--LP-False--CUDA-True/'
    path1 = files_path + 'latent.pkl'
    path2 = files_path + 'RE.pkl'

    with open(path1, 'rb') as file:
        latent = pickle.load(file)
    latent = np.array(latent)
    with open(path2, 'rb') as file:
        re = pickle.load(file)
    re = np.array(re)

    convertpath = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/datasets/'+dataset+'/ppi_index.pkl'
    with open(convertpath, 'rb') as file:
        ppi_index = pickle.load(file)
    data = np.concatenate((latent, re[:, np.newaxis]), axis=1)
    df_latent = pd.DataFrame()
    for i, d in enumerate(range(data.shape[1])):
        if i == len(range(data.shape[1]))-1:
            col_name = 're'
        else:
            col_name = str(d+1)+'d'
        df_latent[col_name] = data.T[d]
    df_latent['node'] = df_latent.index
    inv_dict = {v: k for k, v in ppi_index.items()}
    df_latent = df_latent.add_prefix(dataset+'_')
    df_latent['gene'] = df_latent[dataset+'_node'].map(inv_dict)
    df = df_latent.loc[:, ~df_latent.columns.str.endswith('node')]
    return df

In [3]:
ppidf = get_df('ppi')
scdf = get_df('sc')
stdf = get_df('st')
df = pd.merge(ppidf,scdf,on='gene')
df = pd.merge(df,stdf,on='gene')
print(len(ppidf),len(scdf),len(stdf),len(df))

18767 16773 17121 15298


In [4]:
root = '/novo/omdb/pds02/PDS2843/data/sprint_tid_ascvd/gzn/thesis/HBDM/data/disease/'
disease_source = 'DIS_CAD'
diseasedf = pd.read_csv(root+disease_source+'.tsv',sep='\t')
pos_genes = diseasedf['Gene']
pos_genes = pos_genes.map(aliases2stringId)
pos_genes =  pos_genes.map(stringId2name)
print(len(diseasedf),len(set(pos_genes)))
df['label'] = df['gene'].apply(lambda x: 1 if x in pos_genes.to_list() else -1)

1708 1520


In [None]:
X = df[['ppi_1d', 'ppi_2d', 'ppi_3d', 'ppi_4d', 'ppi_re','sc_1d',
       'sc_2d', 'sc_3d', 'sc_4d', 'sc_re', 'st_1d', 'st_2d', 'st_3d', 'st_4d',
       'st_re']].to_numpy()
y = df['label'].to_numpy()

In [11]:
pos = df[df['label']==1].index

In [17]:
test_index = random.sample(pos.tolist(), 300)
train_index = set(pos.tolist()) - set(test_index)

In [19]:
ppi = df[['ppi_1d', 'ppi_2d', 'ppi_3d', 'ppi_4d', 'ppi_re']].to_numpy()
train_nodes = [ppi[i] for i in train_index]
test_nodes = [ppi[i] for i in test_index]

In [31]:
my_array = np.array([-2.11718631, 1.04805505, 3.69179893, -2.45958257, 0.39355791])

new_array = my_array.reshape(1, -1)

In [52]:


def get_neighbors(ppi,train_index):
    k = 5
    start = []
    dist = []
    neighbor = []

    kdtree = KDTree(ppi[[col for col in ppi.columns]].to_numpy(), leaf_size=20)
    for i in train_index:
        given_point = ppi.iloc[i][[col for col in ppi.columns]].to_numpy().reshape(1, -1)
        # Perform a k-NN search to find the k+1 nearest neighbors
        distances, indices = kdtree.query(given_point, k=k)
        start += (k-1)*[i]
        dist += distances.reshape(-1).tolist()[1:]
        neighbor += indices.reshape(-1).tolist()[1:]
    neighbor_df = pd.DataFrame({'start': start, 'neighbor': neighbor, 'distance': dist})
    return neighbor_df

In [53]:
ppinei = get_neighbors(df[['ppi_1d', 'ppi_2d', 'ppi_3d', 'ppi_4d', 'ppi_re']],train_index)
scnei = get_neighbors(df[['sc_1d','sc_2d', 'sc_3d', 'sc_4d', 'sc_re']],train_index)
stnei = get_neighbors(df[['st_1d', 'st_2d', 'st_3d', 'st_4d','st_re']],train_index)

In [55]:
print(len(set(ppipre.keys())&set(test_index)))
print(len(set(scpre.keys())&set(test_index)))
print(len(set(stpre.keys())&set(test_index)))

115
77
74


In [58]:
allpredict = pd.concat([ppinei, scnei, stnei], axis=0)

In [62]:
duplicate_rows = allpredict[allpredict.duplicated()]

In [73]:
top100 = allpredict['neighbor'].value_counts().head(100).index.to_list()
print(len(set(top100)&set(test_index)))

In [75]:
top100 = allpredict['neighbor'].value_counts().head(100).index.to_list()
print(len(set(top100)&set(test_index)))

10


In [77]:
top100 = ppinei['neighbor'].value_counts().head(100).index.to_list()
print(len(set(top100)&set(test_index)))

12


In [79]:
top100 = scnei['neighbor'].value_counts().head(100).index.to_list()
print(len(set(top100)&set(test_index)))

1


In [80]:
top100 = stnei['neighbor'].value_counts().head(100).index.to_list()
print(len(set(top100)&set(test_index)))

1


In [65]:
print(len(set(allpredict['neighbor'].to_list())&set(test_index)))

198
