In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt

import pickle

In [2]:
data_path = 'data/'

In [3]:
class WalkBiased(object):
    
    def __init__(self, data_path, merge_file, node_file, relation_file, node_to_class_file, d_bias,  all_proteins = False):
        self.zero_dist_len = 0
        self.__data_path = data_path
        
        self.__merge_file = merge_file
        self.__node_file = node_file
        self.__relation_file = relation_file   
        self.__node_to_class_file = node_to_class_file
        self.__d_bias = d_bias
        
        self.__edge_col_names  = ['h','t','r'] 
        self.__merge_col_names = ['h','t','r', 'h_id', 't_id', 'r_id']
        self.__other_col_names = ['id', 'name']
        self.__merge_col_id =    ['h_id', 't_id', 'r_id']
        
        self.__all_proteins = all_proteins
        
        self.__read_files()
        self.__get_node_groupby()
        self.__load_node_to_class()
        self.__set_biased()
        self.__prepare_dicts()
        
        self.__validate_all_dicts()
        
    def __load_node_to_class(self):
        with open(data_path + self.__node_to_class_file + '.pkl', 'rb') as f:
            self.__d_node_to_class = pickle.load(f)
        print('__load_node_to_class| __d_node_to_class: {}'.format(len(self.__d_node_to_class)))
    
    def __sets_to_lists(self, d, biased = False):
        r = dict()
        c = 0
        for p in d:
            
            try:
                lst = list(d[p])
                if biased == False:
                    
                    prob = self.__get_dist_point(lst)
                else:
                    prob = self.__get_dist(lst)
                r[p] = lst
                r[p].append(prob)
            except:
                c+=1
                continue
        print('{} out of {}'.format(c, len(d)))
        return r
        
    def __change_dict_value_type(self):
        print('__change_dict_value_type| started.')
        self.__pro_other  = self.__sets_to_lists(self.__pro_other, True)
        self.__other_pro  = self.__sets_to_lists(self.__other_pro)
        self.__pro_other2 = self.__sets_to_lists(self.__pro_other2, True)
        self.__other_pro2 = self.__sets_to_lists(self.__other_pro2)
        self.__pro_path   = self.__sets_to_lists(self.__pro_path)
        self.__path_pro   = self.__sets_to_lists(self.__path_pro)
        print('__change_dict_value_type| done.')
        print()
        
    def __prepare_dicts(self):
        self.__prepare_dict_pro_path()
        self.__prepare_dict_pro_x()
        self.__change_dict_value_type()
        print('__prepare_dicts| self.zero_dist_len: {}'.format(self.zero_dist_len))
        
    def __get_data_ids_by_relation(self, list_relations, isin = True):
        if isin == False:
                return self.__df_merge[~self.__df_merge['r'].isin(list_relations)][self.__merge_col_id]
        return self.__df_merge[self.__df_merge['r'].isin(list_relations)][self.__merge_col_id]
     
    def __prepare_dict_pro_x(self):
        mcols = self.__merge_col_id
        
        df_others_short = self.__get_data_ids_by_relation(['hasPathway'], isin = False)
        print('__prepare_dict_pro_x| df_other: {}'.format(df_others_short.shape))
        
        print('__prepare_dict_pro_x| processing the other to protein (other_pro) ...')
        
        if self.__all_proteins == False:
            lst_pro_path_Proteins = list(self.__pro_path.keys())
            print('__prepare_dict_pro_x| lst_pro_path_Proteins: {}'.format(len(lst_pro_path_Proteins)))
        else:
            print('__prepare_dict_pro_x| ALL PROTEINS MODE:')
            lst_pro_path_Proteins = list(self.__df_merge['h_id'].unique())
            print('__prepare_dict_pro_x| list all proteins: {}'.format(len(lst_pro_path_Proteins)))
    
        df_others_short_selected_proteins = df_others_short[df_others_short[mcols[0]].isin(lst_pro_path_Proteins)]
        print('__prepare_dict_pro_x| df_others_short_selected: {}'.format(df_others_short_selected_proteins.shape))
        
        other_pro = dict()

        for tup in df_others_short_selected_proteins.itertuples():
            if tup[2] not in other_pro:
                other_pro[tup[2]] = set()
            other_pro[tup[2]].add((tup[1],tup[3]))

        self.__other_pro = other_pro
        print('__prepare_dict_pro_x| other_pro: {}'.format(len(other_pro))) 
        
        print('__prepare_dict_pro_x| processing the protein to other (pro_other) ...')
        
        lst_other_pro_Others = list(other_pro.keys())
        print('__prepare_dict_pro_x| lst_other_pro_Others: {}'.format(len(lst_other_pro_Others)))
        
        df_others_short_selected_others = df_others_short[df_others_short[mcols[1]].isin(lst_other_pro_Others)]
        print('__prepare_dict_pro_x| df_others_short_selected_others: {}'.format(df_others_short_selected_others.shape))
        
        pro_other = dict()

        for tup in df_others_short_selected_others.itertuples():

            if tup[1] not in pro_other:
                pro_other[tup[1]] = set()
            pro_other[tup[1]].add((tup[2],tup[3]))
        
        self.__pro_other = pro_other
        print('__prepare_dict_pro_x| pro_other: {}'.format(len(pro_other))) 
        
        print('__prepare_dict_pro_x| The Final dict: pro_other: {}, other_pro: {}'.format(len(pro_other), len(other_pro))) 
        
        print('__prepare_dict_pro_x| processing the protein to other 2 (pro_other2) ...')
        
        lst_path_pro_Proteins = [j for i in self.__path_pro for j in self.__path_pro[i]]
        print('__prepare_dict_pro_x| lst_path_pro_Proteins: {}'.format(len(lst_path_pro_Proteins)))
        
        df_shorts_short_seleected_proteins2 = df_others_short[df_others_short[mcols[0]].isin(lst_path_pro_Proteins)]
        print('__prepare_dict_pro_x| df_shorts_short_seleected_proteins2: {}'.format(df_shorts_short_seleected_proteins2.shape))
        
        pro_other2 = dict()

        for tup in df_shorts_short_seleected_proteins2.itertuples():

            if tup[1] not in pro_other2:
                pro_other2[tup[1]] = set()
            pro_other2[tup[1]].add((tup[2],tup[3]))

        self.__pro_other2 = pro_other2
        print('__prepare_dict_pro_x| pro_other2: {}'.format(len(pro_other2))) 
        
        print('__prepare_dict_pro_x| processing the other to protein 2 (other_pro2) ...')
        
        lst_pro_other2_Others = [j[0] for i in pro_other2 for j in pro_other2[i]]
        print('__prepare_dict_pro_x| lst_pro_other2_Others: {}'.format(len(lst_pro_other2_Others)))
        
        df_shorts_short_seleected_others2 = df_others_short[df_others_short['t_id'].isin(lst_pro_other2_Others)]
        print('__prepare_dict_pro_x| df_shorts_short_seleected_others2: {}'.format(df_shorts_short_seleected_others2.shape))
        
        other_pro2 = dict()
        for tup in df_shorts_short_seleected_others2.itertuples():

            if tup[2] not in other_pro2:
                other_pro2[tup[2]] = set()
            other_pro2[tup[2]].add((tup[1],tup[3]))

        self.__other_pro2 = other_pro2
        print('__prepare_dict_pro_x| other_pro2: {}'.format(len(other_pro2))) 
        
        print('__prepare_dict_pro_x| The Final dict: pro_other2: {}, other_pro2: {}'.format(len(pro_other2), len(other_pro2))) 

        print()
    
        
    def __prepare_dict_pro_path(self):
    
        df_hasPathway = self.__get_data_ids_by_relation(['hasPathway'])
        print('__prepare_dict_pro_path| df_hasPathway: {}'.format(df_hasPathway.shape))
        
        pro_path = dict()
        path_pro = dict()

        for tup in df_hasPathway.itertuples():
            if tup[1] not in pro_path:
                pro_path[tup[1]] = set()
            pro_path[tup[1]].add(tup[2])
            if tup[2] not in path_pro:
                path_pro[tup[2]] = set()
            path_pro[tup[2]].add(tup[1])

        self.__pro_path = pro_path
        self.__path_pro = path_pro
        
        print('__prepare_dict_pro_path| pro_path: {}, path_pro: {}'.format(len(pro_path), len(path_pro)))
        print()
    
    def __read_files(self):
        mcols = self.__merge_col_names
        
        self.__df_merge = pd.read_csv(self.__data_path + self.__merge_file + '.csv'
                                      , dtype={mcols[0]: object, mcols[1]: object, mcols[2]:object, mcols[3]:int
                                               , mcols[4]:int, mcols[5]:int})
        print('__read_processed_files| df_merge: {}'.format(self.__df_merge.shape))
        
        self.__df_nodes = pd.read_csv(self.__data_path + self.__node_file + '.csv')
        print('__read_processed_files| df_nodes: {}'.format(self.__df_nodes.shape))
        
        self.__df_relations = pd.read_csv(self.__data_path + self.__relation_file + '.csv')
        print('__read_processed_files| df_relations: {}'.format(self.__df_relations.shape))
        
        print()  
        
    def do_biased_walks(self, numwalks, walk_len, desc = '-rem-HL', seed = 5):
        walk_file_path = 'walks-pro-x-pro-path-{}-{}-graphpattern2vec{}'.format(numwalks, walk_len, desc)
        
        # v for protein, a for pathway, f for others

        error_count = 0
        dc = 0
        
        pro_other  = self.__pro_other
        other_pro  = self.__other_pro 
        pro_other2 = self.__pro_other2 
        other_pro2 = self.__other_pro2
        pro_path   = self.__pro_path
        path_pro   = self.__path_pro
        
        random.seed(seed)
        outfile = open(data_path + walk_file_path, 'w')
        
        count_ppi, count_non_ppi = 0, 0 
        
        for pr in tqdm(pro_other):
            
            for j in range(numwalks ): #wnum walks
                start = pr
                outline = ' v' + str(start)
                try:
                    i = 0
                    while i < walk_len:    
                        break_save = False
                        
                        others0 = pro_other[start]                        
                        
                        other0 = random.choices(others0[:-1], others0[-1])[0]
                        
                        num_try = len(others0) * 10
                        
                        while(other0[1] == 2 and num_try !=0):
                            outline += ' v' + str(other0[0] )
                            i += 1
                            if i >= walk_len:
                                break_save = True
                                break;
                            
                            
                            if other0[0] in pro_other.keys():
                                
                                lst = [i for i in list(pro_other[other0[0]][:-1])]    
                                
                                if len(lst) != 0 :
                                    
                                    prob = self.__get_dist(lst)                                                                                                        

                                    other0 = random.choices(lst_non_ppi, prob)[0]
                                    break
                                    
                                other0 = random.choices(others0[:-1], others0[-1])[0]
                                
                            else:
#                                 break_save = True
#                                 break
#                                other0 = random.choices(others0[:-1], others0[-1])[0]

                            num_try-=1   
                            
                         
                        if (num_try == 0):
                            break_save = False
                            break;
                        
                        if (break_save == True):
                            break_save = False
                            break;
                            
                            
                        outline += ' f' + str(other0[0] )
                        i += 1
                        if i >= walk_len:
                            break;
                            
                        
                        pros = other_pro[other0[0]]
                        start = random.choices(pros[:-1], pros[-1])[0][0]
                        outline += ' v' + str(start )
                        i += 1
                        if i >= walk_len:
                            break;
                        
                        paths0 = pro_path[start]
                        path0 = random.choices(paths0[:-1], paths0[-1])[0]
                        outline += ' a' + str(path0 )
                        i += 1
                        if i >= walk_len:
                            break;


                        pros1 = path_pro[path0]
                        pro1 = random.choices(pros1[:-1], pros1[-1])
                        outline += ' v' + str(pro1[0] )
                        i += 1
                        if i >= walk_len:
                            break;
                            
                        others1 = pro_other2[pro1[0]]
                        other1 = random.choices(others1[:-1], others1[-1])[0]

                        num_try = len(others1)
                        
                        while(other1[1] == 2 and num_try !=0):
                            outline += ' v' + str(other1[0] )
                            i += 1
                            if i >= walk_len:                            
                                break_save = True
                                break;
                            
                            if other1[0] in pro_other2.keys():
                                lst = [i for i in list(pro_other2[other1[0]][:-1])]
                                
                                if len(lst) != 0 :
                                                                        
                                    prob = self.__get_dist(lst)
                                    
                                    other1 = random.choices(lst_non_ppi, prob)[0]
                                    break
                                       
                                other1 = random.choices(others1[:-1], others1[-1])[0]
                                                                                                        
                            else:
                                break_save = True
                                break;
#                                 other1 = random.choices(others1[:-1], others1[-1])[0]

                            num_try-=1   
                                       
                        if (num_try == 0):
                            break_save = False
                            break;

                            
                        if (break_save == True):
                            break_save = False
                            break;
                        
                        outline += ' f' + str(other1[0] )
                        i += 1
                        if i >= walk_len:
                            break;
                            
                        pros2 = other_pro2[other1[0]]
                        start = random.choices(pros2[:-1], pros2[-1])[0][0]

                        outline += ' v' + str(start )
                        i += 1
                        if i >= walk_len:
                            break;
                            
                        dc+=1
                except Exception as e:
                    outfile.write(outline + "\n")
                    error_count += 1
                    continue

                outfile.write(outline + "\n")

        outfile.close()  
        print('do_walks| errors : %d out of %d'%(error_count, dc))
        print()
        
    def __get_node_groupby(self):
        df_m = self.__df_merge
        df_n = self.__df_nodes
        cols = self.__merge_col_names
#         ['h','t','r', 'h_id', 't_id', 'r_id']
        
        df_g_h_r = df_m.groupby([cols[3], cols[5]])[cols[4]].count().reset_index(name = cols[2] + '_count')
        df_g_t_r = df_m.groupby([cols[4], cols[5]])[cols[3]].count().reset_index(name = cols[1] + '_count')
        print('__get_node_groupby| df_g_h_r: {}, df_g_t_r: {}'.format(df_g_h_r.shape, df_g_t_r.shape))
        
        dict_t = dict()
        dict_w = dict()
        for i in df_g_h_r.itertuples():
            h_id  = i[1]
            r_id  = i[2]
            count = i[3]
            if h_id not in dict_w:
                dict_w[h_id] = dict()

            dict_w[h_id][r_id] = count
            
            if h_id not in dict_t:
                dict_t[h_id] = 0
            dict_t[h_id] += count

        print('__get_node_groupby| dict_w after first columns: {}'.format(len(dict_w)))
        print('__get_node_groupby| dict_t after first columns: {}'.format(len(dict_t)))

        for i in df_g_t_r.itertuples():
            t_id  = i[1]
            r_id  = i[2]
            count = i[3]
            if t_id not in dict_w:
                dict_w[t_id] = dict()

                if r_id not in dict_w[t_id]:
                    dict_w[t_id][r_id] = count
                else:
                    dict_w[t_id][r_id] += count
            if t_id not in dict_t:
                dict_t[t_id] = 0
            dict_t[t_id] += count

        print('__get_node_groupby| dict_w after second columns: {}'.format(len(dict_w)))
        print('__get_node_groupby| dict_t after second columns: {}'.format(len(dict_t)))
        self.__dict_w = dict_w
        self.__dict_t = dict_t
        print()
        
    def __set_biased(self):
        d_bias = self.__d_bias
        total_weights = 0
        for k in d_bias:
            total_weights += d_bias[k]

        d_por = { k: d_bias[k] / total_weights for k in d_bias }
        self.__d_por = d_por
        
    def __get_dist(self, lst):
        d_dist = self.__d_por
        
        if type(lst[0])== tuple:
            targets = list(list(zip(*lst))[0])
        else:
            targets = lst
            
        lst_weights = [ d_dist[self.__d_node_to_class[i]] if self.__d_node_to_class[i] in d_dist else 0  for i in targets]
        sum_weights = sum(lst_weights)
        
        if sum_weights == 0:
            self.zero_dist_len += 1
            raise Exception("")
        
        nei_prob = [ lst_weights[w]/sum_weights for w in range(len(lst_weights)) if w != len(lst_weights)-1]
        remained = 1 - sum(nei_prob)
        nei_prob.append(remained)
        
        return nei_prob
            
    # used to be __get_dist_v2
    def __get_dist_point(self, lst, lam=1):
        d_t = self.__dict_t
        if type(lst[0])== tuple:
            targets = list(list(zip(*lst))[0])
        else:
            targets = lst

        nei_degree =  [ d_t[i] for i in targets]
        sum_degree = sum(nei_degree)
        nei_point = [ sum_degree/p for p in nei_degree]
        sum_point = sum(nei_point)

        nei_prob = [ nei_point[w]/sum_point  for w in range(len(nei_point)) if w != len(nei_point)-1]
        remained = 1 - sum(nei_prob)
        nei_prob.append(remained)
        return nei_prob
        
    def __get_dist_v1(self, lst, lam=1):
        d_t = self.__dict_t
        if type(lst[0])== tuple:
            targets = list(list(zip(*lst))[0])
        else:
            targets = lst
            
        weights =  [ lam/d_t[i] for i in targets]
        s = np.sum(weights)
        prob = []
        for i in range(len(weights)):
            if i != len(weights)-1 :
                prob.append(weights[i]/s)
            else:
                prob.append(1.0 - np.sum(prob[:i]))
        return prob
    
    def __validate_all_dicts(self):
        df_n   = self.__df_nodes
        set_n  = set(df_n['id'])
        print('validate_all_dicts| set of nodes: {}'.format(len(set_n)))
        
        pro_other  = self.__pro_other
        other_pro  = self.__other_pro 
        pro_other2 = self.__pro_other2 
        other_pro2 = self.__other_pro2
        pro_path   = self.__pro_path
        path_pro   = self.__path_pro
        
        pro_other_ks  = list(pro_other.keys())
        other_pro_ks  = list(other_pro.keys())
        pro_other2_ks = list(pro_other2.keys())
        other_pro2_ks = list(other_pro2.keys())
        pro_path_ks   = list(pro_path.keys())
        path_pro_ks   = list(path_pro.keys())
        
        list_all_keys = pro_other_ks + other_pro_ks + pro_other2_ks + other_pro2_ks + pro_path_ks + path_pro_ks
        set_all_keys = set(list_all_keys)
        
        print('validate_all_dicts| list_all_keys: {}, set_all_keys: {}'.format(len(list_all_keys), len(set_all_keys)))
        
        set_rest = set_n - set_all_keys
        print('missing nodes from dicts: {}'.format(len(set_rest)))
        
        self.missing_nodes_from_dict = set_rest
        
        

    def get__df_merge(self):
        return self.__df_merge
    
    def get__df_nodes(self):
        return self.__df_nodes
    
    def get__df_relations(self):
        return self.__df_relations
    
    def get__dict_w(self):
        return self.__dict_w
    
    def get__dict_t(self):
        return self.__dict_t
    
    def get__pro_other(self):
        return self.__pro_other
    
    def get__pro_path (self):
        return self.__pro_path 

In [4]:
#          {'BP', 'CC', 'FD', 'MF', 'PTM', 'Pathway', 'Protein'}
# d_bias = { 'BP': 1, 'CC':1, 'MF': 1, 'Protein':1, 'PTM':1, 'FD':1}
# d_bias = { 'BP': 1, 'CC':1, 'MF': 2}
d_bias = { 'BP': 1, 'CC':1, 'MF': 1, 'Protein':1, 'PTM':1, 'FD':1}
w = WalkBiased(data_path, 'df_merge_cc_train', 'df_nodes_cc', 'df_relations', 'dict_node_to_class', d_bias)

__read_processed_files| df_merge: (2851116, 6)
__read_processed_files| df_nodes: (212557, 2)
__read_processed_files| df_relations: (7, 2)

__get_node_groupby| df_g_h_r: (466654, 3), df_g_t_r: (53540, 3)
__get_node_groupby| dict_w after first columns: 178320
__get_node_groupby| dict_t after first columns: 178320
__get_node_groupby| dict_w after second columns: 212557
__get_node_groupby| dict_t after second columns: 212557

__load_node_to_class| __d_node_to_class: 212557
__prepare_dict_pro_path| df_hasPathway: (135206, 3)
__prepare_dict_pro_path| pro_path: 56767, path_pro: 1584

__prepare_dict_pro_x| df_other: (2715910, 3)
__prepare_dict_pro_x| processing the other to protein (other_pro) ...
__prepare_dict_pro_x| lst_pro_path_Proteins: 56767
__prepare_dict_pro_x| df_others_short_selected: (1403451, 3)
__prepare_dict_pro_x| other_pro: 43232
__prepare_dict_pro_x| processing the protein to other (pro_other) ...
__prepare_dict_pro_x| lst_other_pro_Others: 43232
__prepare_dict_pro_x| df_other

In [5]:
w.do_biased_walks(40, 40, desc = '-biased-v4_ALL-vCH_br', seed = 5)

100%|██████████| 169660/169660 [1:02:12<00:00, 45.46it/s] 

do_walks| errors : 2163185 out of 31077897






In [None]:
#./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-100-40-graphpattern2vec-biased-v3-realBiased -output graphpattern2vec_bias/data/emb-pro-x-pro-path-100-40-graphpattern2vec-biased-v3-realBiased -pp 1 -size 128 -window 7 -negative 5 -threads 32
#./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_BP1-v2 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_BP1-v2 -pp 1 -size 128 -window 7 -negative 5 -threads 32
#./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_BP1-CC1-MF2-v2 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_BP1-CC1-MF2-v2 -pp 1 -size 128 -window 7 -negative 5 -threads 32

#./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_ALL1 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_ALL1 -pp 1 -size 128 -window 7 -negative 5 -threads 32
#./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_CC1 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_CC1 -pp 1 -size 128 -window 7 -negative 5 -threads 32

#./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_PTM1 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_PTM1 -pp 1 -size 128 -window 7 -negative 5 -threads 32
# ./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_MF1 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_MF1 -pp 1 -size 128 -window 7 -negative 5 -threads 32

# ./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_FD1 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_FD1 -pp 1 -size 128 -window 7 -negative 5 -threads 32

# ./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_Protein1 -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_Protein1 -pp 1 -size 128 -window 7 -negative 5 -threads 32

# ./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-10-20-graphpattern2vec-biased-v4_CH_ALL -output graphpattern2vec_bias/data/emb-pro-x-pro-path-10-20-graphpattern2vec-biased-v4_CH_ALL -pp 1 -size 128 -window 7 -negative 5 -threads 32

# ./metapath2vec -train graphpattern2vec_bias/data/walks-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_ALL-vCH -output graphpattern2vec_bias/data/emb-pro-x-pro-path-40-40-graphpattern2vec-biased-v4_ALL-vCH -pp 1 -size 128 -window 7 -negative 5 -threads 32

### Notes

 - BP1-CC1-MF2: 
   - __prepare_dicts| self.zero_dist_len: 204291
   - do_walks| errors : 225246 out of 39528297
 - BP1:
   - __prepare_dicts| self.zero_dist_len: 232622
   - do_walks| errors : 202709 out of 39479486
 - BP1-v2:
   - __prepare_dicts| self.zero_dist_len: 87807
   - do_walks| errors : 3840522 out of 4657056
 - BP1-CC1-MF2-v2: 
   - __prepare_dicts| self.zero_dist_len: 59476
   - do_walks| errors : 4242917 out of 9706899
 - CC1
   - __prepare_dicts| self.zero_dist_len: 157248
   - do_walks| errors : 1871624 out of 1494901
 - PTM
   - __prepare_dicts| self.zero_dist_len: 225492
   - do_walks| errors : 10864 out of 1732
 - MF:
   - __prepare_dicts| self.zero_dist_len: 122413
   - do_walks| errors : 2721979 out of 3571832
 - FD:
   - __prepare_dicts| self.zero_dist_len: 27422
   - do_walks| errors : 1404711 out of 29340687
 - Protein:
   - __prepare_dicts| self.zero_dist_len: 198779
   - 769080 out of 0
   
 - CH_ALL 20, 20:
   - __prepare_dicts| self.zero_dist_len: 0
   - 373602 out of 4233333
   
 - BP1-CC1-MF2-vCH:
   - __prepare_dicts| self.zero_dist_len: 59476
   - 4242917 out of 9706899
   
 - biased-v4_ALL-vCH:
   - __prepare_dicts| self.zero_dist_len: 0

- number of walks 150, len:40, seed = 2: 
  - 08:50 -- 10:13(50%) --> 11:41
- number of walks 100, len:80, seed = 2:
  - 13:40 -- 14:30(20%) --> 16:21(70%) -- 17:28
- number of walks 100, len:40, seed = 71
  - 09:30 --> 11:13
- number of walks 40, len:40, seed = 2
  - 15:12 --> 15:54
- number of walks:1, len: 10, seed = 20
  - 17s
- number of walks:1, len:5, seed = 20
  - 8s
  
- REAL Bised 100, 40, 5
  - started: 10:22 -- 20% 10:38 --

In [None]:
# ./metapath2vec -train graphpattern2vec_same/data/run-11/walks-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_cc_train_same_run11           -output graphpattern2vec_same/data/run-11/emb-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_cc_train_same_run11                      -pp 1 -size 128 -window 7 -negative 5 -threads 32

# ./metapath2vec -train graphpattern2vec_walk/data/walks-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_cc_train_while_loop_no_raise_all_pro -output graphpattern2vec_walk/data/emb-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_cc_train_while_loop_no_raise_all_pro            -pp 1 -size 128 -window 7 -negative 5 -threads 32


In [None]:
def set_biased(d_bias):

    total_weights = 0
    for k in d_bias:
        total_weights += d_bias[k]
        
    d_por = { k: d_bias[k] / total_weights for k in d_bias }
    print(d_por)
    
    
d_bias = { 'BP': 1, 'CC': 1, 'MF': 2}
set_biased(d_bias)

####  Walks for rewire graph (I don't do rewireing for this version. rewireing done in gp2v_rewire package)

In [None]:
# itr = 2
# w2 = WalkBiased(data_path, 'df_merge_rew_train-NoSplit_{}'.format(itr), 'df_nodes_cc', 'df_relations')

In [None]:
# w2.do_biased_walks(100, 40, desc = '-biased-v2-rewire-merge_rew_train_NoSplit_{}'.format(itr))

In [None]:
# PATHGO          : do_walks| errors : 2729941 out of 108559699
# PATHGO-bised-v2 : do_walks| errors :  906173 out of 116408720
# rewire:         : do_walks| errors : 1029512 out of 115184002


In [None]:

# ./metapath2vec -train graphpattern2vec_rewire/data/walks-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_rew_train_NoSplit_1 -output graphpattern2vec_rewire/data/emb-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_rew_train_NoSplit_1 -pp 1 -size 128 -window 7 -negative 5 -threads 32

In [None]:
# ./metapath2vec -train graphpattern2vec_rewire/data/walks-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_rew_train -output graphpattern2vec_rewire/data/emb-pro-x-pro-path-100-40-graphpattern2vec-biased-v2-rewire-merge_rew_train -pp 1 -size 128 -window 7 -negative 5 -threads 32

In [None]:
# shows the new Walk with possibility of v-v-v-f-v-a is working
# it seems that (6931825, 21213)  less than 0.1% of the time compare to all adjacent tokens
# if devided by 5 : 21213/1386365 1% of time

# check the previous walks...-fixed when ppi does not apeasr in the walks at all



# def get_dist(lst,dict_t, lam=0.5):
#     d_t = dict_t
    
#     if type(lst[0])== tuple:
#         targets = list(list(zip(*lst))[0])
#     else:
#         targets = lst

#     weights =  [ lam/d_t[i] for i in targets]
#     s = np.sum(weights)
#     prob = []
    
#     for i in range(len(weights)):
#         if i != len(weights)-1 :
#             prob.append(weights[i]/s)
#         else:
#             prob.append(1.0 - np.sum(prob[:i]))
            
#     return prob

# def dist(lst, dict_t):
#     sum_degree = 0
#     for i in dict_t:
#         sum_degree += dict_t[i]
#     print('sum degree: {}'.format(sum_degree))
    
#     for i in lst:
#         points = [ sum_degree/dict_t[i] for i in dict_t]
#     print('points: {}'.format(points))    
#     sum_point = sum(points)
#     print('sum_point: {}'.format(sum_point))    
    
#     nei_prob = [ points[w]/sum_point  for w in range(len(points)) if w != len(points)-1]
#     remained = 1 - sum(nei_prob)
#     nei_prob.append(remained)
#     if (sum(nei_prob) != 1):
#         print('error: {}'.format(nei_prob))
#     return nei_prob
# lst = ['a','b','c']
# dict_t = {'a':10, 'b':1, 'c':10000}
# for i in [0.005, 0.05, 0.5]:
#     print(get_dist(lst, dict_t, lam=i))
    
# # d = dist(lst, dict_t)
# # print(d)
# c = 0
# l = list()
# for line in tqdm(open('data/walks-pro-x-pro-path-100-40-graphpattern2vec-HL-Path-Biased-fixed' , 'r')):
#     w = line.strip().split(' ')
#     prev = ''
#     prev_item = ''
#     for i in w:
#         c+=1
#         if prev == i[0] == 'v':
#             l.append([i, prev_item])
#         prev = i[0]
#         prev_item = i
# c, len(l)

# shows the new Walk with possibility of v-v-v-f-v-a is working
# it seems that (6931825, 21213)  less than 0.1% of the time compare to all adjacent tokens
# if devided by 5 : 21213/1386365 1% of time

# check the previous walks...-fixed when ppi does not appeasr in the walks at all

In [None]:
import random
def Mafia_roles():
    p = ['Navid', 'Abbas', 'Mehdi', 'Sanaz hagh', 'Arash', 'Sanaz Sattar', 'Soroush', 'Elham', 'saber', 'Mona']
    r = ['Mafia', 'Mafia', 'God Father', 'Karagah', 'Doctor','shahrvand','shahrvand','shahrvand','shahrvand','shahrvand']
    if len(p) == len(r):
        sel_p = random.sample(p, len(p))
        sel_r = random.sample(r, len(r))
        final = list(zip (sel_p, sel_r))
        
        print(final)
    else:
        print('lists had different length')

Mafia_roles()

In [None]:
    def validate_all_dicts(self):
        df_n   = self.__df_nodes
        set_n  = set(df_n['id'])
        print('validate_all_dicts| set of nodes: {}'.format(len(set_n)))
        
        pro_other  = self.__pro_other
        other_pro  = self.__other_pro 
        pro_other2 = self.__pro_other2 
        other_pro2 = self.__other_pro2
        pro_path   = self.__pro_path
        path_pro   = self.__path_pro
        
        pro_other_ks  = list(pro_other.keys())
        other_pro_ks  = list(other_pro.keys())
        pro_other2_ks = list(pro_other2.keys())
        other_pro2_ks = list(other_pro2.keys())
        pro_path_ks   = list(pro_path.keys())
        path_pro_ks   = list(path_pro.keys())
        
        list_all_keys = pro_other_ks + other_pro_ks + pro_other2_ks + other_pro2_ks + pro_path_ks + path_pro_ks
        set_all_keys = set(list_all_keys)
        
        print('validate_all_dicts| list_all_keys: {}, set_all_keys: {}'.format(len(list_all_keys), len(set_all_keys)))
        
        set_rest = set_n - set_all_keys
        print('missing nodes from dicts: {}'.format(len(set_rest)))
        
        self.missing_nodes_from_dict = set_rest

In [None]:
with open('data/dict_node_to_class.pkl', 'rb') as f:
            d_node_to_class = pickle.load(f)
print('__load_node_to_class| __d_node_to_class: {}'.format(len(d_node_to_class)))

In [None]:
myset = set()
for i in d_node_to_class:
    myset.add(d_node_to_class[i])
myset