In [None]:
# imports
import matplotlib.pyplot as plt
from random import uniform, seed
import numpy as np
import time
import networkx as nx
from tqdm import tqdm
import pickle
import random
import os
import collections
import ast

In [None]:
# constants
# Total seeding budget. It is the default value of the `budget` argument of
# greedy() and celf(), which subtract each selected node's cost from it.
budget = 40

In [None]:
# In: graph, the list of seed nodes, spread probability, number of Monte-Carlo simulations --> Out: mean spread (average number of activated nodes)

def IC(g,S,p=0.5,mc=1000):
    """Estimate the spread of seed list ``S`` with Monte-Carlo cascade simulations.

    Args:
        g: graph-like object; ``g[node]`` must give a mapping keyed by the
            neighbours of ``node``.
        S: list of seed nodes (copied, never modified).
        p: activation threshold. A neighbour becomes active when a uniform
            draw in [0, 1) plus that neighbour's weight is greater than ``p``.
        mc: number of simulations to run.

    Returns:
        Mean number of activated nodes (seeds included) over the ``mc`` runs.

    Reads the module-level ``node_weights_globaldict``: for each node, a list
    of weights with one entry per neighbour, in the same order as ``g[node]``.
    """
    global node_weights_globaldict

    run_sizes = []
    for run in range(mc):

        # Each run starts from a fresh copy of the seeds.
        activated = S[:]
        frontier = S[:]

        while frontier:
            candidates = []
            for src in frontier:
                neighbour_ids = list(dict(g[src]).keys())
                offsets = np.array(node_weights_globaldict[src])

                # Re-seeding per (node, run) makes the draws reproducible.
                np.random.seed(int(src) + run)
                draws = np.random.uniform(0, 1, len(g[src]))
                hit_mask = (draws + offsets) > p
                candidates.extend(np.extract(hit_mask, neighbour_ids))

            # Only nodes that were not active before form the next frontier.
            frontier = list(set(candidates) - set(activated))
            activated.extend(frontier)

        run_sizes.append(len(activated))

    return np.mean(run_sizes)

In [None]:
# In: graph, #nodes in the set of seed nodes, spread probability, Monte-Carlo simulations, cost dictionary and budget --> Out: set of seed nodes, reward and time

def greedy(g,k,p=0.1,mc=1000,node_cost_dict={}, budget=budget):
    """Cost-aware greedy seed selection.

    In each of at most ``k`` rounds, every node that is not yet selected and
    whose cost fits the remaining budget is scored as
    ``IC(g, S + [node], p, mc) / node_cost_dict[node]``; the best-scoring node
    is added to the seed list and its cost is deducted from the budget.

    Args:
        g: graph exposing ``g.nodes`` (and whatever ``IC`` needs).
        k: maximum number of seeds to select.
        p: threshold forwarded to ``IC``.
        mc: number of Monte-Carlo runs forwarded to ``IC``.
        node_cost_dict: cost of every node in ``g`` (read only).
        budget: total cost that may be spent on seeds.

    Returns:
        ``(S, spread, timelapse)``: the selected seeds, the winning
        cost-normalised score of each round, and the seconds elapsed since the
        start when each seed was chosen.
    """
    S, spread, timelapse, start_time = [], [], [], time.time()

    # Find k nodes with largest marginal gain
    for _ in tqdm(range(k)):
        node = None
        best_spread = 0

        # Score every affordable node that is not yet in the seed set
        for j in set(g.nodes)-set(S):
            cost = node_cost_dict[j]
            if cost <= budget:
                s = IC(g,S + [j],p,mc)/cost

                # Update the winning node and spread so far
                if s > best_spread:
                    best_spread, node = s, j

        if node is None:
            # Nothing affordable is left. S and budget cannot change any more,
            # so further rounds would only repeat the same fruitless scan.
            break

        # Add the selected node to the seed set and pay for it
        S.append(node)
        budget = budget - node_cost_dict[node]

        print('**GREEDY HERE** - picked up node:',node)

        # Add estimated spread and elapsed time
        spread.append(best_spread)
        timelapse.append(time.time() - start_time)

    return(S,spread,timelapse)

In [None]:
# In: graph, #nodes in the set of seed nodes (unused), random-walk depth d, number of Monte-Carlo simulations --> Out: dictionary mapping each node to its mean walk latency

def detection_time(g,k,d=10,mc=1000):
    """Mean latency of depth-``d`` random walks started from every node.

    For each node, ``mc`` random walks of ``d`` steps are simulated. Every
    step moves to a uniformly chosen neighbour and adds the latency of the
    traversed edge, looked up as ``edge_latency_dict[(from_node, to_node)]``.

    Args:
        g: graph exposing ``g.nodes``; ``g[node]`` must iterate over the
            neighbours of ``node``.
        k: unused; kept so the signature matches the other algorithms.
        d: number of steps per walk.
        mc: number of walks per start node (must be at least 1).

    Returns:
        dict mapping each start node to the mean total latency of its walks.

    Raises:
        ValueError: if ``mc`` is smaller than 1.

    Reads the module-level ``edge_latency_dict``.
    """
    global edge_latency_dict

    if mc < 1:
        raise ValueError('mc must be at least 1')

    nodes_latency_dict = dict()

    for root in tqdm(set(g.nodes)):
        # One list per start node, so the mean below covers all mc walks and
        # not just the last one.
        latency_list = list()

        for i in range(mc):
            # Reproducible walk for every (start node, run) pair.
            random.seed((int)(root+i))

            previous_node = root
            current_depth = 1
            latency = 0
            while current_depth<=d:
                node_neighbours = list(g[previous_node])
                next_node = random.choice(node_neighbours)

                latency += edge_latency_dict[(previous_node,next_node)]

                previous_node = next_node
                current_depth+=1

            latency_list.append(latency)

        nodes_latency_dict[root] = np.mean(latency_list)

    return(nodes_latency_dict)

In [None]:
# In: graph, #nodes in the set of seed nodes, spread probability, Monte-Carlo simulations, cost dictionary and budget --> Out: set of seed nodes, reward, time and number of spread evaluations per selected node

def celf(g,k,p=0.1,mc=1000,node_cost_dict={}, budget=budget):
    """Cost-aware CELF (lazy greedy) seed selection.

    Every node is first scored as ``IC(g, [node], p, mc) / cost``; the
    best affordable node becomes the first seed. Each further round
    re-evaluates only the head of the sorted candidate list
    (``IC(g, S + [node], p, mc) / cost`` minus the running ``spread`` total)
    until the head stays on top after re-sorting, then selects it.

    Args:
        g: graph exposing ``g.nodes`` (and whatever ``IC`` needs).
        k: number of seeds to aim for; the first seed is picked whenever an
            affordable node exists, then up to ``k - 1`` further rounds run.
        p: threshold forwarded to ``IC``.
        mc: number of Monte-Carlo runs forwarded to ``IC``.
        node_cost_dict: cost of every node in ``g`` (read only).
        budget: total cost that may be spent on seeds.

    Returns:
        ``(S, SPREAD, timelapse, LOOKUPS)``: selected seeds, running spread
        total after each selection, seconds elapsed at each selection, and the
        number of ``IC`` evaluations spent on each selection. All four lists
        are empty when no node fits the budget.
    """
    start_time = time.time()

    # Initial pass: cost-normalised spread of every single node
    initial_gains = [IC(g,[node],p,mc)/node_cost_dict[node] for node in g.nodes]
    Q = sorted(zip(g.nodes,initial_gains), key=lambda x: x[1],reverse=True)

    # The first seed has to respect the budget just like the later ones
    Q = [entry for entry in Q if node_cost_dict[entry[0]] <= budget]
    if not Q:
        return([],[],[],[])

    first_node, first_gain = Q[0]
    S, spread, SPREAD = [first_node], first_gain, [first_gain]
    Q, LOOKUPS, timelapse = Q[1:], [len(g.nodes)], [time.time()-start_time]
    # Pay for the node that was actually selected (Q no longer contains it)
    budget = budget - node_cost_dict[first_node]

    # --------------------
    # Find the next k-1 nodes using the list-sorting procedure
    # --------------------

    for _ in tqdm(range(k-1)):

        # Keep only the candidates that still fit the remaining budget
        Q = [entry for entry in Q if node_cost_dict[entry[0]] <= budget]
        if not Q:
            # Nothing affordable is left; later rounds could not add anything
            break

        check, node_lookup = False, 0
        while not check:

            # Count the number of times the spread is computed
            node_lookup += 1

            # Recalculate the gain of the current top node
            current = Q[0][0]
            current_gain = (IC(g,S+[current],p,mc)/node_cost_dict[current]) - spread
            Q[0] = (current,current_gain)

            # Re-sort the list
            Q = sorted(Q, key = lambda x: x[1], reverse = True)

            # Check if previous top node stayed on top after the sort
            check = (Q[0][0] == current)

        # Select the next node
        chosen, chosen_gain = Q[0]
        spread += chosen_gain
        S.append(chosen)
        budget = budget - node_cost_dict[chosen]
        SPREAD.append(spread)
        LOOKUPS.append(node_lookup)
        timelapse.append(time.time() - start_time)

        # Remove the selected node from the list
        Q = Q[1:]

    return(S,SPREAD,timelapse,LOOKUPS)

In [None]:
def getFile(myfile):
    """Locate a dataset file inside the ``datasets`` folder of the working directory.

    Args:
        myfile: substring that the file name must contain.

    Returns:
        Full path of the first file (in sorted name order) whose name contains
        both ``myfile`` and ``'data'``, or the string ``'Error'`` when no file
        matches.
    """
    # os.path.join keeps the path valid on every platform, not only Windows.
    datasets_dir = os.path.join(os.getcwd(), 'datasets')

    # os.listdir returns names in arbitrary order; sort for a reproducible pick.
    for fname in sorted(os.listdir(datasets_dir)):
        if myfile in fname and 'data' in fname:
            return os.path.join(datasets_dir, fname)
    return 'Error'

def readFile(filepath):
    """Read a UTF-8 text file.

    Args:
        filepath: path of the file to read.

    Returns:
        ``(lines, filepath)``: the file's lines (line endings kept) and the
        path that was read.
    """
    # The context manager closes the handle even if reading fails.
    with open(filepath, 'r', encoding='utf-8') as filehandler:
        Lines = filehandler.readlines()
    return Lines,filepath

In [None]:
# Name of the dataset to analyse; it also selects the parser used below.
target_file = 'haggle'

dataset_path = getFile(target_file)
if dataset_path == 'Error':
    # getFile reports "no match" with the string 'Error'; fail here with a
    # clear message instead of trying to open a file literally named 'Error'.
    raise FileNotFoundError(
        "no dataset file matching '{}' found in the 'datasets' folder".format(target_file)
    )
lines,filepath = readFile(dataset_path)

In [None]:
# Parse the raw dataset lines into:
#   edge_list        - one (node1, node2) tuple per line of the dataset
#   edge_weight_dict - weight per (node1, node2) tuple: the value read from the
#                      file for 'adolescent', the number of occurrences of the
#                      tuple in edge_list for every other dataset
edge_list = list()
edge_weight_dict = dict()

if target_file == 'adolescent':
    # space-separated "node1 node2 weight"; ids are shifted down by one
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        tokens = line.strip().split(' ')
        edge = (int(tokens[0])-1, int(tokens[1])-1)
        edge_list.append(edge)
        edge_weight_dict[edge] = float(tokens[2])

elif target_file == 'infectious':
    # space-separated "node1 node2 ..."; ids are shifted down by one
    for line in lines:
        if not line.strip():
            continue
        tokens = line.strip().split(' ')
        edge_list.append((int(tokens[0])-1, int(tokens[1])-1))

elif target_file == 'haggle':
    # "node1 node2<TAB>..." ; the second id is followed by tab-separated fields
    for line in lines:
        if not line.strip():
            continue
        tokens = line.strip().split(' ')
        edge_list.append((int(tokens[0])-1, int(tokens[1].split('\t')[0])-1))

elif target_file == 'malawi':
    # comma-separated; the first line is skipped (presumably a header row)
    # and the node ids are read from the 4th and 5th columns
    for line in lines[1:]:
        if not line.strip():
            continue
        tokens = line.strip().split(',')
        edge_list.append((int(tokens[3])-1, int(tokens[4])-1))

elif target_file == 'hospital':
    # raw ids are translated through a dictionary stored as a Python literal
    mapping_file = os.path.join(os.getcwd(), 'datasets', 'hospital_mapping.txt')
    with open(mapping_file) as f:
        hospital_dict = ast.literal_eval(f.read())

    # tab-separated; the ids are read from the 2nd and 3rd columns
    for line in lines:
        if not line.strip():
            continue
        tokens = line.strip().split('\t')
        edge_list.append((hospital_dict[int(tokens[1])], hospital_dict[int(tokens[2])]))

else:
    # Without a parser the graph would silently end up empty.
    raise ValueError("unsupported dataset '{}'".format(target_file))

if target_file != 'adolescent':
    edge_weight_dict = dict(collections.Counter(edge_list))

In [None]:
# construct the network
# nx.Graph is an undirected simple graph, so repeated or reversed node pairs
# in edge_list end up as a single edge.
g = nx.Graph()
g.add_edges_from(edge_list)

In [None]:
# Degree of every node (g is an undirected nx.Graph) and how often each
# degree value occurs.
degree_list = [node_degree for _, node_degree in g.degree()]
degree_freq_dict = collections.Counter(degree_list)

outdegree_values = list(degree_freq_dict)
outdegree_freq = list(degree_freq_dict.values())

# Bar chart of the degree distribution, frequencies on a logarithmic axis.
fig = plt.figure(figsize=(8, 5))
plt.bar(outdegree_values, outdegree_freq, color='blue')
plt.title("Degree Distribution")
plt.xlabel("Degree")
plt.ylabel("Frequency")
plt.yscale("log")
# plt.xscale("log")
plt.show()

In [None]:
# Basic size statistics of the network.
print('total number of nodes in the graph = ', len(g.nodes))
print('total number of edges in the graph = ', len(g.edges))

degree_centrality = nx.degree_centrality(g)

# Average degree centrality over all nodes (accumulated value by value).
centrality_total = 0
for centrality in degree_centrality.values():
    centrality_total += centrality
avg_degree_centrality = centrality_total / len(degree_centrality)

print('average degree centrality = ', avg_degree_centrality)
print('average degree = ', sum(degree_list) / len(degree_list))

# NOTE(review): nx.diameter is documented to raise on a disconnected graph;
# confirm every dataset used here yields a connected network.
print('diameter = ', nx.diameter(g))

# COSTS 
- DEFAULT: unit_cost_variable = True ==> costs = 1 for unit-cost algorithms
- unit_cost_variable = False ==> costs based on conditions for cost-sensitive algorithms

In [None]:
# Cost of selecting each node: 1 everywhere in unit-cost mode, otherwise one
# of nine cost levels chosen by the node's degree.
unit_cost_variable = True

if unit_cost_variable:
    node_cost_dict = {node: 1 for node in g.nodes}
else:
    node_cost_dict = dict()
    cost_list = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
    minimum_degree, maximum_degree = min(degree_list), max(degree_list)

    # Width of one degree bucket (rounded down); the last bucket is stretched
    # to the maximum degree so every node falls into some bucket.
    chunk = int((maximum_degree - minimum_degree) / len(cost_list))
    last_bucket = len(cost_list) - 1

    for node in g.nodes:
        node_degree = g.degree(node)
        for bucket, bucket_cost in enumerate(cost_list):
            lower = minimum_degree + bucket * chunk
            if bucket == last_bucket:
                upper = maximum_degree
            else:
                upper = minimum_degree + (bucket + 1) * chunk

            # Bucket bounds are inclusive on both ends; the first match wins.
            if lower <= node_degree <= upper:
                node_cost_dict[node] = bucket_cost
                break

# WEIGHTS
- DEFAULT: weights_off_variable = True ==> weights = [0, 0, 0, 0, 0] for non-weight-sensitive algorithms
- weights_off_variable = False ==> weights = [-0.2, -0.1, 0, 0.1, 0.2] for weight-sensitive algorithms

In [None]:
# Per-neighbour weight for every node, kept symmetric: a node that was already
# processed has fixed the weight of the shared edge, otherwise a new value is
# drawn from `weights`.
weights_off_variable = True
weights = [0, 0, 0, 0, 0] if weights_off_variable else [-0.2, -0.1, 0, 0.1, 0.2]

random.seed(11)
node_neighweights_globaldict = dict()
for current_node in g.nodes:
    neighbours_weight_dict = dict()

    for neighbour in g[current_node]:
        if neighbour in node_neighweights_globaldict:
            # Reuse the value stored from the neighbour's side of the edge.
            num = node_neighweights_globaldict[neighbour][current_node]
        else:
            num = random.choice(weights)
        neighbours_weight_dict[neighbour] = num

    node_neighweights_globaldict[current_node] = neighbours_weight_dict

In [None]:
# Weight table in the layout IC() reads: node_weights_globaldict[node] is a
# list with one weight per neighbour of `node`, in the same order as g[node].
node_weights_globaldict = {
    node: list(neighbour_weights.values())
    for node, neighbour_weights in node_neighweights_globaldict.items()
}

if target_file != 'infectious':
    # Use the dataset's edge weights instead of the random offsets.
    # edge_weight_dict is keyed by (node1, node2) tuples, so it cannot be
    # assigned directly: IC() indexes this table by node and would raise a
    # KeyError. Re-shape it into per-node lists aligned with g[node].
    # NOTE(review): an edge may be stored under either orientation; the
    # (node, neighbour) entry is preferred and the reversed one is the
    # fallback - confirm this is the intended weight. For the count-based
    # datasets every weight is >= 1, which always exceeds a threshold
    # p <= 1 in IC() - confirm that is intended as well.
    node_weights_globaldict = {
        node: [
            edge_weight_dict.get(
                (node, neighbour),
                edge_weight_dict.get((neighbour, node), 0),
            )
            for neighbour in g[node]
        ]
        for node in g.nodes
    }

# TIMESTAMPS
By default the objective function is the population affected; the code in this section changes the objective function to the detection time.

In [None]:
# Assign every edge a random integer latency between 1 and 20, stored under
# both orientations so a walk can look it up in either direction.
random.seed(11)
edge_latency_dict = dict()
latencies = [x+1 for x in range(20)]
for current_edge in g.edges:
    edge_latency_dict[current_edge] = random.choice(latencies)

    reverse_edge = (current_edge[1], current_edge[0])
    edge_latency_dict[reverse_edge] = edge_latency_dict[current_edge]

output = detection_time(g,10,d = 20, mc = 1000)

# Move the (up to) ten entries with the largest (mean latency, node) pair from
# `output` into `output_top_nodes`. Sorting once also copes with graphs that
# have fewer than ten nodes.
# NOTE(review): this keeps the nodes with the HIGHEST mean latency - confirm
# that highest (rather than lowest) is what the analysis wants.
output_top_nodes = dict()
for Valuemax, Keymax in sorted(zip(output.values(), output.keys()), reverse=True)[:10]:
    output_top_nodes[Keymax] = Valuemax
    output.pop(Keymax)

In [None]:
# Centrality and degree report for a fixed placement of five node ids.
betweenness_centrality = nx.betweenness_centrality(g)
outlist = [125,31,49,163,48]

# report statistics
print('Placement : {} & {} & {} & {} & {}'.format(*outlist[:5]))
print('Deg. centrl. : {:.5f} & {:.5f} & {:.5f} & {:.5f} & {:.5f}'.format(
    *[degree_centrality[placed] for placed in outlist[:5]]))
print('Betw. centrl. : {:.7f} & {:.7f} & {:.7f} & {:.7f} & {:.7f}'.format(
    *[betweenness_centrality[placed] for placed in outlist[:5]]))
print('Node degree : {} & {} & {} & {} & {}'.format(
    *[g.degree(placed) for placed in outlist[:5]]))

# RUN

In [None]:
# Run CELF for up to 5 seeds and show the selected nodes.
celf_output   = celf(g,5,p = 0.5,mc = 100, node_cost_dict=node_cost_dict, budget=budget)
print("celf output:   " + str(celf_output[0]))

# save results
output_dict = dict()
output_dict['celf_output'] = celf_output

# Build the path with os.path.join so it is valid on every platform, and make
# sure the target folder exists before writing into it.
output_dir = os.path.join(os.getcwd(), 'outputs')
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(
    output_dir,
    target_file + '_unitcosts='+str(unit_cost_variable) + '_weightsoff='+str(weights_off_variable)+'.pickle',
)
with open(output_file, 'wb') as handle:
    pickle.dump(output_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Centrality and degree report for the seeds chosen by CELF.
betweenness_centrality = nx.betweenness_centrality(g)
outlist = celf_output[0]

# report statistics
# CELF can return fewer seeds than requested once the budget is exhausted, so
# each row is joined from however many seeds there are instead of indexing
# five fixed positions.
print('Placement = ' + ' & '.join('{}'.format(node) for node in outlist))
print('Deg. centrl. = ' + ' & '.join('{:.5f}'.format(degree_centrality[node]) for node in outlist))
print('Betw. centrl. = ' + ' & '.join('{:.7f}'.format(betweenness_centrality[node]) for node in outlist))
print('Node degree = ' + ' & '.join('{}'.format(g.degree(node)) for node in outlist))

In [None]:
# Run the cost-aware greedy baseline for up to 10 seeds, with the same p and
# mc values as the CELF run, and show the selected nodes.
greedy_output = greedy(g,10,p = 0.5,mc = 100, node_cost_dict=node_cost_dict, budget=budget)
print("greedy output: " + str(greedy_output[0]))

# save results (disabled - the lines below are kept for reference only and
# are not executed)
# output_dict = dict()
# output_dict['greedy_output'] = greedy_output
# output_file = 'small_greedy_costs.pickle'
# with open(output_file, 'wb') as handle:
#     pickle.dump(output_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def plot_CT(x, y_greedy, y_celf):
    """Plot the cumulative computation time of GREEDY and CELF.

    Args:
        x: sliceable sequence of x positions (number of nodes selected).
        y_greedy: elapsed seconds after each GREEDY selection.
        y_celf: elapsed seconds after each CELF selection.

    The two series may have different lengths (the algorithms can be run
    with different k or stop early when the budget runs out); each one is
    drawn against the first ``len(series)`` entries of ``x``.
    """
    series = (
        (y_greedy, 'r', 'GREEDY'),
        (y_celf, 'b', 'CELF'),
    )
    for y_values, colour, label in series:
        # Pair the series with just as many x positions as it has points, so
        # a shorter series does not trigger a shape mismatch.
        plt.plot(x[:len(y_values)], y_values, linestyle='--', marker='o', color=colour, label=label)

    plt.xlabel("Nodes selected")
    plt.ylabel("Time in seconds")
    plt.title('Computation time: CELF vs GREEDY')
    plt.legend()
    plt.show()

In [None]:
# plot time
# Size the x axis from the results instead of hard-coding ten points: the
# algorithms may be run with a different k or stop early on the budget.
x_axis = list(range(1, max(len(greedy_output[2]), len(celf_output[2])) + 1))
plot_CT(x_axis, greedy_output[2], celf_output[2])