In [5]:
import os
import numpy as np
import gzip
import pickle
import time
import random

import torch
from torch_geometric.data import Data

import import_ipynb
from constants import *
from simulation import simul
from utils import *

from algorithms.centrality import outdegree, betweenness, pagerank
from algorithms.greedy import greedy_orig
from algorithms.BPM import BPM, greedy_BPM
from algorithms.KED import KED
from algorithms.MDS import MDS
from algorithms.RIS import RIS
from algorithms.random import random_edge
from algorithms.DiffIM import DiffIM, DiffIMp, DiffIMpp

In [6]:
def save_mask(save_dir, file_num, mask, seed_size, infl_orig, infl_masked, time_alg):
    """Write one instance's edge mask and influence figures to ``mask_{file_num}.txt``.

    File layout (space separated):
        line 1: header ``seed infl_orig infl_masked reduced``
        line 2: the values, with
                ``reduced = (infl_orig - infl_masked) / (infl_orig - seed_size)``
        line 3: header ``u v p time%``
        then one ``u v p t`` row per mask entry.

    Args:
        save_dir: directory to write into (must already exist).
        file_num: index used in the output file name.
        mask: sequence of ``(u, v, p)`` triples; ``u`` and ``v`` are written as ints.
        seed_size: number of seeds of the instance.
        infl_orig: influence measured before masking.
        infl_masked: influence measured after masking.
        time_alg: sequence indexed in parallel with ``mask``; must hold at least
            ``len(mask)`` entries (an ``IndexError`` is raised otherwise).
    """
    # Compute the ratio before touching the file so a failure cannot leave a
    # half-written mask file behind.
    try:
        reduced = (infl_orig-infl_masked)/(infl_orig-seed_size)
    except ZeroDivisionError:
        # Plain Python numbers raise when infl_orig == seed_size; the ratio is
        # undefined in that case, so record NaN instead of aborting the run.
        reduced = float('nan')

    with open(save_dir+f'/mask_{file_num}.txt','w') as f:
        f.write('seed infl_orig infl_masked reduced\n')
        f.write(f'{seed_size} {infl_orig} {infl_masked} {reduced}\n')
        f.write('u v p time%\n')
        for i, (u, v, p) in enumerate(mask):
            t = time_alg[i]
            f.write(f'{int(u)} {int(v)} {p} {t}\n')


def save_record(save_dir, record, time_pretrain):
    """Write the aggregate results of a run to ``summary.txt`` inside ``save_dir``.

    Args:
        save_dir: directory to write into (must already exist).
        record: 2-D numpy array with one row per instance and the columns
            ``(seed_size, infl_orig, infl_masked, time)``.
        time_pretrain: value written verbatim on the ``pretrain time`` line.

    The file holds a line of mean/std statistics, the pretrain time, a column
    header, and then one line per row of ``record``.
    """
    seed_col = record[:,0]
    orig_col = record[:,1]
    masked_col = record[:,2]
    time_col = record[:,3]
    # relative influence reduction of every instance
    reduced_col = (orig_col-masked_col)/(orig_col-seed_col)

    def _rows():
        # one formatted line per instance, produced lazily while writing
        for n_seed, before, after, elapsed in record:
            ratio = (before-after)/(before-n_seed)
            yield f'{n_seed} {before} {after} {ratio} {elapsed}\n'

    with open(save_dir+'/summary.txt','w') as out:
        out.write(f'reduced_avg: {np.mean(reduced_col)}, reduced_std: {np.std(reduced_col)}, time_avg: {np.mean(time_col)}, time_std: {np.std(time_col)}\n')
        out.write(f'pretrain time: {time_pretrain}\n')
        out.write('seed infl_orig infl_masked reduced time\n')
        out.writelines(_rows())


def pipeline(alg_name, dataset_name, del_edge_num, save=True, save_tag=None, data_num=None, adv_log_save=False, verbose=False, **alg_kwags):
    """Run one edge-removal algorithm over a dataset and measure the influence drop.

    For every ``(is_seed, probs)`` instance of the dataset the selected
    algorithm returns a mask of edges; those edges are zeroed in the graph and
    the influence is re-estimated with ``simul``. The original influence is
    taken as ``sum(probs)``.

    Args:
        alg_name: key of one of the three algorithm tables built below
            (seed-independent, per-instance, or DiffIM variants).
        dataset_name: file name under ``DATASET_DIR`` (gzip-compressed pickle).
            The text before the first ``'-'`` is the graph name given to
            ``txt2adj``.
        del_edge_num: edge budget passed to the algorithm; the string ``'all'``
            is replaced by the ``m`` value returned by ``txt2adj``.
        save: when true, per-instance masks and a summary are written under
            ``RESULT_DIR/<dataset_tag>/<del_edge_num>/<alg_name><save_tag>``.
        save_tag: optional suffix appended to the result directory name.
        data_num: number of leading instances to process; ``None`` means all.
        adv_log_save: when true (and ``save`` is true), a per-instance log file
            path inside ``<save_dir>/adv_log/`` is passed to the DiffIM
            variants as ``log_filename``. Without ``save`` no log path is
            produced.
        verbose: print one line per instance and a final average.
        **alg_kwags: forwarded unchanged to the algorithm.

    Returns:
        ``(avgreldec, record, time_pretrain)`` where ``record`` is an array
        with the columns ``(seed_size, infl_orig, infl_masked, time)`` and
        ``time_pretrain`` is always 0 in this function.

    Raises:
        ValueError: if ``alg_name`` is not in any of the algorithm tables.
    """
    # prepare saving dir
    save_dir = None
    adv_log_dir = None  # stays None unless both `save` and `adv_log_save` are set
    if save:
        # Fallback tag drops the last 7 characters of the file name
        # (presumably a '.pkl.gz'-style extension -- TODO confirm).
        dataset_tag = ABBR.get(dataset_name, dataset_name[:-7])
        save_dir = RESULT_DIR+dataset_tag+'/'+str(del_edge_num)+'/'+alg_name
        if save_tag: save_dir += save_tag
        os.makedirs(save_dir, exist_ok=True)

        if adv_log_save:
            adv_log_dir = save_dir+'/adv_log/'
            os.makedirs(adv_log_dir, exist_ok=True)

    torch.cuda.empty_cache()

    # algorithm list
    basic_ind2seed = {'BPM':BPM, 'KED':KED, 'random':random_edge, 'outdegree':outdegree, 'betweenness':betweenness, 'pagerank':pagerank}  # seed-independent
    basic_algs = {'MDS':MDS, 'greedy':greedy_orig, 'MBPM':greedy_BPM, 'RIS':RIS}
    pyg_algs = {'DiffIM':DiffIM, 'DiffIM+':DiffIMp, 'DiffIM++':DiffIMpp}

    # load algorithm
    is_once = False
    is_pyg = False
    time_pretrain = 0
    if alg_name in basic_algs:
        algorithm = basic_algs[alg_name]
    elif alg_name in basic_ind2seed:
        is_once = True
        algorithm = basic_ind2seed[alg_name]
    elif alg_name in pyg_algs:
        is_pyg = True
        algorithm = pyg_algs[alg_name]
    else:
        raise ValueError("not supported algorithm name")

    # load dataset
    graph_name = dataset_name.split('-')[0]
    _, m, adj_list = txt2adj(graph_name)
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted dataset files.
    with gzip.open(DATASET_DIR+dataset_name, 'rb') as f: rawdata = pickle.load(f)
    if data_num is None: data_num = len(rawdata)
    if del_edge_num=='all': del_edge_num = m

    if is_once:  # seed-independent algs run only once
        # The first instance's seeds are passed only to satisfy the call
        # signature; the resulting mask is reused for every instance below.
        is_seed, probs = rawdata[0]
        seed_idx = np.where(is_seed==1)[0]

        mask, time_alg = algorithm(adj_list, seed_idx, del_edge_num, **alg_kwags)
        adj_list_masked, time_alg = _apply_mask_and_expand_time(adj_list, mask, time_alg, del_edge_num)

    record = []
    if verbose: print('seed\tinfl_orig\tinfl_mask\treldec%\t\ttime')
    for file_num, (is_seed, probs) in enumerate(rawdata[:data_num]):
        seed_idx = np.where(is_seed==1)[0]

        # Guard on the directory rather than the flag so that
        # save=False + adv_log_save=True no longer hits an undefined name.
        adv_log_filename = adv_log_dir+f'{file_num}.txt' if adv_log_dir is not None else None

        if not is_once:
            # call algorithm
            if is_pyg: mask, time_alg = algorithm(adj_list, seed_idx, probs, del_edge_num, log_filename=adv_log_filename, **alg_kwags)
            else: mask, time_alg = algorithm(adj_list, seed_idx, del_edge_num, **alg_kwags)

            adj_list_masked, time_alg = _apply_mask_and_expand_time(adj_list, mask, time_alg, del_edge_num)

        # influence before/after apply mask
        infl_orig = sum(probs)
        infl_masked = sum(simul(adj_list_masked,seed_idx))
        if verbose: print(f'{len(seed_idx)}\t{infl_orig :.2f}\t\t{infl_masked :.2f}\t\t{(infl_orig-infl_masked)/(infl_orig-len(seed_idx)) :.4f}\t\t{time_alg[-1]:.4f}')
        record.append([len(seed_idx),infl_orig,infl_masked,time_alg[-1]])

        # save mask
        if save: save_mask(save_dir, file_num, mask, len(seed_idx), infl_orig, infl_masked, time_alg)

    record = np.array(record)
    avgtime = np.mean(record[:,3])
    avgreldec = np.mean((record[:,1]-record[:,2])/(record[:,1]-record[:,0]))
    if verbose: print(f'avgreldec%: {avgreldec:.4f}, avgtime: {avgtime:.4f}')

    # save record
    if save: save_record(save_dir, record, time_pretrain)
    return avgreldec, record, time_pretrain


def _apply_mask_and_expand_time(adj_list, mask, time_alg, del_edge_num):
    """Return ``(adj_list_masked, time_alg)`` for one algorithm result.

    ``time_alg`` is normalised to a list: a non-list value, or a one-element
    list, is repeated ``del_edge_num`` times. The masked graph is built by
    converting ``adj_list`` to a matrix, zeroing every ``(u, v)`` of ``mask``
    and converting back.
    """
    # adjust time_alg
    if not isinstance(time_alg,list): time_alg = [time_alg]*del_edge_num
    elif len(time_alg)==1: time_alg = time_alg*del_edge_num

    # apply mask to adj_list. (convert to adj_mat -> apply mask -> convert to adj_list)
    adj_mat_masked = list2mat(adj_list)
    for u,v,p in mask: adj_mat_masked[int(u)][int(v)]=0
    return mat2list(adj_mat_masked), time_alg

In [7]:
# For greedy-like algorithms: if results for budget k already exist, results for any budget < k can be generated quickly by reusing the first rows of the saved masks.
def make_subresult(alg_name, dataset_name, from_del_edge_num, to_del_edge_num, save_tag=None, data_num=None):
    """Derive results for a smaller budget from previously saved mask files.

    For each instance the first ``to_del_edge_num`` rows of the mask file saved
    under the ``from_del_edge_num`` result directory are re-applied to the
    graph, the masked influence is re-estimated with ``simul``, and a new mask
    file and summary are written under the ``to_del_edge_num`` directory. The
    original influence and the pretrain time are copied from the source files.

    Args:
        alg_name: algorithm name used in the result directory path.
        dataset_name: file name under ``DATASET_DIR`` (gzip-compressed pickle).
        from_del_edge_num: budget of the existing results (directory name part).
        to_del_edge_num: number of leading mask rows to keep; must not exceed
            the number of rows stored in the source mask files.
        save_tag: optional suffix appended to both result directory names.
        data_num: number of leading instances to process; ``None`` means all.
            Pass the same value that was used for the source run if that run
            covered only part of the dataset.

    Returns:
        ``(avgreldec, record, time_pretrain)``, or ``None`` (after printing a
        message) when the source directory does not exist.

    Raises:
        ValueError: if a source mask file has fewer than ``to_del_edge_num``
            well-formed rows.
    """
    # prepare saving dir
    # Fallback tag drops the last 7 characters of the file name
    # (presumably a '.pkl.gz'-style extension -- TODO confirm).
    dataset_tag = ABBR.get(dataset_name, dataset_name[:-7])
    from_save_dir = RESULT_DIR+dataset_tag+'/'+str(from_del_edge_num)+'/'+alg_name
    to_save_dir = RESULT_DIR+dataset_tag+'/'+str(to_del_edge_num)+'/'+alg_name
    if save_tag:
        from_save_dir += save_tag
        to_save_dir += save_tag
    if not os.path.exists(from_save_dir):
        print('no such directory')
        return
    os.makedirs(to_save_dir, exist_ok=True)

    # load dataset
    graph_name = dataset_name.split('-')[0]
    _, _, adj_list = txt2adj(graph_name)
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted dataset files.
    with gzip.open(DATASET_DIR+dataset_name, 'rb') as f: rawdata = pickle.load(f)

    record = []
    for file_num, (is_seed, probs) in enumerate(rawdata[:data_num]):
        mask = []
        time_alg = []
        mask_path = from_save_dir+f'/mask_{file_num}.txt'
        with open(mask_path,'r') as f:
            f.readline()  # header: seed infl_orig infl_masked reduced
            _, infl_orig, _, _ = map(float,f.readline().split())
            f.readline()  # header: u v p time%
            for i in range(to_del_edge_num):
                fields = f.readline().split()
                if len(fields) != 4:
                    # readline() yields '' at EOF, i.e. the source run stored fewer rows than requested
                    raise ValueError(f"{mask_path}: expected {to_del_edge_num} mask rows of 'u v p time', row {i} is missing or malformed")
                u,v,p,t = map(float,fields)
                mask.append((int(u),int(v),p))
                time_alg.append(t)
        seed_idx = np.where(is_seed==1)[0]

        # apply mask to adj_list. (convert to adj_mat -> apply mask -> convert to adj_list)
        adj_mat_masked = list2mat(adj_list)
        for u,v,p in mask: adj_mat_masked[u][v]=0
        adj_list_masked = mat2list(adj_mat_masked)

        # influence after apply mask
        infl_masked = sum(simul(adj_list_masked,seed_idx))
        record.append([len(seed_idx),infl_orig,infl_masked,time_alg[-1]])

        # save mask
        save_mask(to_save_dir, file_num, mask, len(seed_idx), infl_orig, infl_masked, time_alg)

    record = np.array(record)
    avgreldec = np.mean((record[:,1]-record[:,2])/(record[:,1]-record[:,0]))

    # the pretrain time is the last token on the second line of the source summary
    with open(from_save_dir+'/summary.txt','r') as f:
        f.readline()
        time_pretrain = float(f.readline().split()[-1])

    # save record
    save_record(to_save_dir, record, time_pretrain)
    return avgreldec, record, time_pretrain