In [1]:
import os
from pickle import load
from argparse import ArgumentParser
from glob import glob

import pandas as pd
import numpy as np
import random

from math import exp, log, floor, ceil
from igraph import Graph

NUM_ROW = 50000000  # row cap passed as `nrows` when reading the disease-outcome and person CSVs
PERCENT = 0.01  # NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed

In [49]:
DELTA = 7 # look-ahead horizon: an instance is labelled positive when the ego is infected in (day, day + DELTA]
PAST_WINDOW = 3 # look-back horizon: vertices infected in [day - PAST_WINDOW, day] are flagged as infected

def write_vertex_features(vertex_features, subgraph, age_map):
    """
    Write the static per-person features of every vertex in subgraph into
    vertex_features.

    Rows of vertex_features are addressed by person id (the vertex "name"
    attribute). The four columns filled in are:
        0 - age (looked up in age_map)
        1 - coreness (subgraph.coreness())
        2 - authority score, weighted by the "duration" edge attribute
        3 - rareness, i.e. 1/degree, or 0 for a vertex with degree 0

    vertex_features - 2-D numpy array with at least 4 columns and a row for
                      every pid in subgraph; modified in place
    subgraph - an igraph graph with vertex attribute "name" (the pid) and edge
               attribute "duration"
    age_map - mapping from pid to age; a pid missing from it raises KeyError

    returns - vertex_features (the same array object that was passed in)
    """
    subgraph_pids = subgraph.vs["name"]
    rows = list(subgraph_pids)

    # igraph reports per-vertex results in vertex order, which is the order of
    # vs["name"], so the values line up with subgraph_pids without having to
    # look every vertex up by name again.
    coreness = subgraph.coreness()
    authority = subgraph.authority_score(weights="duration")
    degrees = np.asarray(subgraph.degree(), dtype=float)

    # rareness = 1/degree; isolated vertices keep the initial 0 instead of
    # triggering a division by zero.
    rareness = np.zeros_like(degrees)
    np.divide(1.0, degrees, out=rareness, where=degrees > 0)

    vertex_features[rows, 0] = [age_map[pid] for pid in subgraph_pids]
    vertex_features[rows, 1] = coreness
    vertex_features[rows, 2] = authority
    vertex_features[rows, 3] = rareness

    return vertex_features

def process_subgraph(subgraph, si_table, age_map, vertex_features, ego_pid, cutoff=None, max_size=50):
    """
    Build one training instance per distinct infection day seen in subgraph.

    For every distinct "infected" day recorded for any vertex of the subgraph
    (optionally only days >= cutoff) an instance is created with:
      - label: 1 if ego_pid has an infection record in (day, day + DELTA]
      - influence feature 0: 1 for the slot holding ego_pid
      - influence feature 1: 1 for vertices with an infection record in
        [day - PAST_WINDOW, day]
    DELTA and PAST_WINDOW are the module-level constants.

    subgraph - an igraph graph with vertex attribute "name" (the pid) and at
               most max_size vertices
    si_table - DataFrame whose index has the levels "pid" and "infected"
    age_map - mapping from pid to age (forwarded to write_vertex_features)
    vertex_features - per-pid feature matrix, updated in place
    ego_pid - pid of the vertex the subgraph was grown from
    cutoff (optional) - earliest infection day to build instances for;
                        None (the default) keeps every day
    max_size (optional) - number of vertex slots every instance is padded to

    returns - None when no vertex of the subgraph appears in si_table,
              otherwise the tuple
              (labels, adjacency_matrix, influence_feature, vertex_id,
              vertex_features) with shapes (n,), (n, max_size, max_size),
              (n, max_size, 2) and (n, max_size), n being the number of
              instances

    raises - ValueError if the subgraph has more than max_size vertices
    """
    subgraph_pids = subgraph.vs["name"]
    si_subgraph = si_table[si_table.index.get_level_values("pid").isin(subgraph_pids)]

    if len(si_subgraph) == 0:
        return None

    n_real = len(subgraph_pids)
    pad_num = max_size - n_real
    if pad_num < 0:
        raise ValueError(
            f"subgraph has {n_real} vertices, which exceeds max_size={max_size}"
        )

    real_pids = np.array(subgraph_pids)
    # Unused slots are padded with 0. The influence features below are only
    # ever computed over the first n_real slots, so a genuine pid 0 can never
    # turn padding into a fake ego / infected vertex.
    pid_arr = np.pad(real_pids, (0, pad_num), "constant", constant_values=0)
    adj = np.pad(np.array(subgraph.get_adjacency().data), (0, pad_num), "constant", constant_values=0)

    infected_index = si_subgraph.index.get_level_values("infected")
    valid_days = infected_index
    # `is not None` rather than truthiness, so that a cutoff of 0 is honoured.
    if cutoff is not None:
        valid_days = valid_days[valid_days >= cutoff]
    unique_days = valid_days.unique()
    n = len(unique_days)

    vertex_features = write_vertex_features(vertex_features, subgraph, age_map)

    labels = np.zeros((n,), dtype=int)
    influence_feature = np.zeros((n, max_size, 2))
    adjacency_matrix = np.zeros((n, max_size, max_size))
    vertex_id = np.zeros((n, max_size), dtype=int)

    # These parts are identical for every instance, so fill them in one go.
    adjacency_matrix[:] = adj
    vertex_id[:] = pid_arr
    influence_feature[:, :n_real, 0] = (real_pids == ego_pid)

    # Plain arrays of the (pid, infected day) records, extracted once.
    day_values = infected_index.to_numpy()
    pid_values = si_subgraph.index.get_level_values("pid").to_numpy()

    for label_idx, day in enumerate(unique_days):
        # records in [day - PAST_WINDOW, day]
        recently_infected = (day_values >= day - PAST_WINDOW) & (day_values <= day)
        # records in (day, day + DELTA]
        infected_soon = (day_values > day) & (day_values <= day + DELTA)

        labels[label_idx] = np.any(pid_values[infected_soon] == ego_pid)
        influence_feature[label_idx, :n_real, 1] = np.isin(real_pids, pid_values[recently_infected])

    return (labels, adjacency_matrix, influence_feature, vertex_id, vertex_features)

def rand_walk_igraph(graph, start_node, size, restart_prob, wname="weight"):
    """
    Do an edge-weighted random walk with restart starting from start_node
    on graph.

    Edges are followed in either direction, also on a directed graph, so the
    set of candidate edges always matches the neighbours reported by
    graph.neighbors().

    graph - an igraph graph with vertex attributes "name" and edge attributes wname
    start_node - a vertex in graph, given either as an integer vertex id or as
                 an igraph Vertex object (note that this *should not* be the
                 vertex name)
    size - the maximum size of the random walk
    restart_prob - the probability of returning to start_node at each step
    wname (optional) - the name of the edge attribute to weight

    returns - subgraph (an igraph graph produced through the random walk)
    """
    # Normalise the start to an integer id. The walk itself produces integer
    # ids (edge.source / edge.target); keeping a Vertex object in the set next
    # to them would count the start vertex twice once the walk returns to it.
    start = int(getattr(start_node, "index", start_node))
    nodes = {start}
    current = start

    # To avoid a situation where we end up in a loop, we limit the number
    # of steps we can take without adding a new node
    max_step = 200
    stalled = 0

    while len(nodes) < size:
        size_before = len(nodes)  # TODO: preprocess pid_part to ensure degree > 0 => avoid max_step iterations
        if random.random() < restart_prob or len(graph.neighbors(current)) == 0:
            current = start
        else:
            # mode="all" keeps this consistent with graph.neighbors(), which
            # also counts incoming edges; for undirected graphs it is a no-op.
            poss_edges = graph.incident(current, mode="all")
            poss_weights = np.array([graph.es[e][wname] for e in poss_edges], dtype=float)
            total_weight = poss_weights.sum()
            if total_weight == 0:
                # All candidate edges have zero weight: restart instead.
                # TODO: make into exception
                current = start
            else:
                chosen = int(np.random.choice(poss_edges, p=poss_weights / total_weight))
                edge = graph.es[chosen]
                # Step to whichever endpoint we are not currently on.
                current = edge.target if edge.source == current else edge.source
            nodes.add(current)

        if len(nodes) == size_before:
            stalled += 1
            if stalled == max_step:
                break
        else:
            stalled = 0

    return graph.induced_subgraph(list(nodes))

In [64]:
# Locations of the gzip-compressed training CSVs.
disease_outcome_data_path = './data/pandemic/centralized/train/va_disease_outcome_training.csv.gz'
person_data_path = './data/pandemic/centralized/train/va_person.csv.gz'
population_network_data_path = './data/pandemic/centralized/train/va_population_network.csv.gz'


def _read_gzip_csv(path, max_rows):
    """Read at most max_rows rows of the gzip-compressed CSV at path."""
    return pd.read_csv(path, compression='gzip', nrows=max_rows)


disease_outcome_df = _read_gzip_csv(disease_outcome_data_path, NUM_ROW)
person_df = _read_gzip_csv(person_data_path, NUM_ROW)
# Only the first 300000 contact rows of the population network are loaded.
population_network_df = _read_gzip_csv(population_network_data_path, 300000)

In [4]:
# Build the pid -> age lookup from the population table (person_df)
pop_file = person_df
age_map = pop_file.set_index("pid")["age"].to_dict()
max_id = max(age_map)
print(f"Made age map, {max_id} is the max_id")
# One feature row per pid in 0..max_id, four feature columns per row.
vertex_features = np.zeros((max_id + 1, 4))

Made age map, 7688058 is the max_id


In [48]:
# read in subgraph list (disease_outcome_df)
disease_data = disease_outcome_df
is_infected = disease_data["state"] == "I"
is_rec = disease_data["state"] == "R"

# (pid, day) records for the rows in state "I" and in state "R" respectively.
inf_time_df = disease_data.loc[is_infected, ["pid", "day"]]
rec_time_df = disease_data.loc[is_rec, ["pid", "day"]]

# Sorted recovery days per pid, built once so that each lookup below is a
# binary search instead of a scan over the whole recovery table.
rec_days_by_pid = {
    rec_pid: np.sort(rec_days.to_numpy())
    for rec_pid, rec_days in rec_time_df.groupby("pid")["day"]
}


def _first_recovery_on_or_after(person, infected_day):
    """Earliest "R" day of person that is >= infected_day, or 0 if none."""
    rec_days = rec_days_by_pid.get(person)
    if rec_days is None:
        return 0
    # side="left": first position whose day is >= infected_day
    pos = np.searchsorted(rec_days, infected_day, side="left")
    return rec_days[pos] if pos < len(rec_days) else 0


def lookup_rec(row):
    """Return the first recovery day on/after row["day"] for row["pid"] (0 if none)."""
    return _first_recovery_on_or_after(row["pid"], row["day"])


recovery_times = pd.Series(
    [
        _first_recovery_on_or_after(person, infected_day)
        for person, infected_day in zip(inf_time_df["pid"], inf_time_df["day"])
    ],
    index=inf_time_df.index,
)

# One row per (pid, infected day) with the matching recovery day (0 = none
# found). Built as a new frame, so inf_time_df itself is left untouched.
# verify_integrity raises if the same (pid, day) infection record occurs twice.
si_table = (
    inf_time_df.rename(columns={"day": "infected"})
    .assign(recovery=recovery_times)
    .set_index(["pid", "infected"], verify_integrity=True)
)
si = si_table

In [65]:
# (population_network_df)
# Put every contact into canonical (smaller pid, larger pid) order so that
# (a, b) and (b, a) collapse into the same edge below. This is vectorised and
# builds a new frame, so the raw population_network_df is not modified.
edges = population_network_df.assign(
    pid1=np.minimum(population_network_df["pid1"], population_network_df["pid2"]),
    pid2=np.maximum(population_network_df["pid1"], population_network_df["pid2"]),
)

# Total contact duration per unordered pair of people.
collapsed_edges = edges[["pid1", "pid2", "duration"]].groupby(["pid1", "pid2"]).sum()


In [66]:
# Turn the collapsed (pid1, pid2, duration) rows into an igraph graph whose
# vertex names are the pids and whose edges carry the summed duration.
graph_df = collapsed_edges.reset_index()
tuples = list(map(tuple, graph_df.values))
# NOTE(review): the pairs were ordered as (min pid, max pid) before collapsing,
# yet the graph is built as directed — confirm that this is intended.
graph = Graph.TupleList(tuples, directed=True, edge_attrs=['duration'])
vertex_names = sorted(graph.vs["name"])

print(f"There are {len(vertex_names)} vertices")
graph_dict = {}
n_dict = 0

# Grow one random-walk subgraph (at most 50 vertices, restart probability 0.8)
# around every vertex, keyed by that vertex's pid.
pids = vertex_names
node = []
for pid in pids:
    node = graph.vs.find(name=pid)
    subgraph = rand_walk_igraph(graph, node, 50, 0.8, wname="duration")
    graph_dict[pid] = subgraph

subgraphs = graph_dict
print(f"There are {len(subgraphs)} subgraphs")

# Draw (without replacement) the ego pids whose subgraphs get processed.
sample_size = 300000
index_set = np.random.choice(
    list(subgraphs),
    replace=False,
    size=min(sample_size, len(subgraphs)),
)

There are 17749 vertices
There are 17749 subgraphs


In [69]:
# Per-subgraph outputs of process_subgraph, collected for the sampled egos.
label_list, adj_list, inf_list, vert_id = [], [], [], []

for i, subgraph_pid in enumerate(index_set):
    subgraph = subgraphs[subgraph_pid]
    output = process_subgraph(subgraph, si, age_map, vertex_features, subgraph_pid, None)

    if output is None:
        # process_subgraph found nothing to build for this subgraph
        continue

    # vertex_features is threaded through so the next call sees the updates.
    labels, adjacency_matrix, influence_feature, vertex_id, vertex_features = output

    label_list.append(labels)
    adj_list.append(adjacency_matrix)
    inf_list.append(influence_feature)
    vert_id.append(vertex_id)

print(f"Total subgraphs: {len(label_list)}")
print()

Total subgraphs: 23

