# It's Worth A Shot... 
### Urban Density, Endogenous Vaccination Decisions, and Dynamics of Infectious Disease

Data Analysis Notebook <br>
Programmer: Andrew Souther <br>
Date: December 2020

To begin, we import all the packages we need. We use pandas for data manipulation, matplotlib for visualization, and econtools for econometrics work (the econtools import is currently commented out). The csv and networkx packages are needed to clean the network data. 

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
#import econtools.metrics as mt


import csv
import networkx as nx
from networkx.algorithms import community

We will also define a few helpful functions to make repetitive data tasks a bit cleaner. 

In [2]:
#this function imports an experiment data log file and performs some simple cleaning
def import_and_clean(filepath):
    """Load a JSON-lines experiment log and add derived columns.

    The file at ``filepath`` is read with one JSON record per line, then
    three columns are added:

    * ``proportion_vacc`` -- one minus ``unvaccinated / hub_size``
    * ``proportion_inf`` -- ``recovered / hub_size``
    * ``hub_id`` -- ``hub`` cast to a string, concatenated with
      ``inst_unique_id``

    Returns the augmented DataFrame.
    """
    log = pd.read_json(filepath, lines=True)

    hub_size = log["hub_size"]
    log["proportion_vacc"] = 1 - log["unvaccinated"] / hub_size
    log["proportion_inf"] = log["recovered"] / hub_size

    # generate a unique id for each hub
    hub_label = log["hub"].astype(str)
    log["hub_id"] = hub_label + log["inst_unique_id"]

    return log


#this function builds a dataframe of "steady-state" values, averaged for each hub over the last five seasons
def collapse_steady_state(df, min_season=20):
    """Average each hub's outcomes over the late ("steady-state") seasons.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``season`` column, a ``hub_id`` column, and the
        outcome columns listed in ``value_columns`` below.
    min_season : int, optional
        First season included in the average. Defaults to 20, the cutoff
        this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per ``hub_id`` (the index) holding the mean of each
        outcome column over the rows with ``season >= min_season``.
    """
    value_columns = [
        "proportion_inf",
        "proportion_vacc",
        "homophily",
        "hub_density",
        "cost_of_infection",
    ]

    # keep only the late seasons and the columns that get collapsed
    late_seasons = df.loc[df["season"] >= min_season, value_columns + ["hub_id"]]

    # collapse the data to the hub level
    df_hubs = late_seasons.groupby(["hub_id"]).mean()
    return df_hubs


#this function imports and cleans the raw network data
def import_network_data(filepath, simulation_id):
    """Read the raw network log and split one simulation into edges and nodes.

    The JSON-lines file at ``filepath`` is loaded and restricted to rows
    whose ``simulation_id`` equals the given value. Rows flagged
    ``"network"`` supply the edge list (``source``, ``target``); rows
    flagged ``"hub"`` supply the node list (``agent``, ``type``).

    Returns the tuple ``(edges_df, nodes_df)``.
    """
    records = pd.read_json(filepath, lines=True)
    records = records[records["simulation_id"] == simulation_id]

    flags = records["data_flag"]
    edges_df = records.loc[flags == "network", ["source", "target"]]
    nodes_df = records.loc[flags == "hub", ["agent", "type"]]

    return edges_df, nodes_df


#this function takes network data in two pandas dataframes and outputs it in a form that Gephi likes
def build_gexf_from_df(edges_df, nodes_df, filepath):
    """Write the network held in two DataFrames to a GEXF file.

    Parameters
    ----------
    edges_df : pandas.DataFrame
        Edge list; each row is read back as one edge tuple.
    nodes_df : pandas.DataFrame
        Node list; the first column is used as the node name and the
        second as the node's ``type`` attribute.
    filepath : str
        Destination of the GEXF file.

    Side effects
    ------------
    Both frames are first written to ``network_data/nodelist.csv`` and
    ``network_data/edgelist.csv`` and read back with the ``csv`` module,
    so every value reaches the graph as a string. The ``network_data``
    directory is created if it does not exist yet.
    """
    node_csv = 'network_data/nodelist.csv'
    edge_csv = 'network_data/edgelist.csv'

    # to_csv does not create missing directories, so make sure it exists
    os.makedirs('network_data', exist_ok=True)
    nodes_df.to_csv(node_csv, index=False)
    edges_df.to_csv(edge_csv, index=False)

    # newline='' is what the csv module expects; to_csv writes UTF-8 by default
    with open(node_csv, 'r', newline='', encoding='utf-8') as nodecsv:
        nodereader = csv.reader(nodecsv)
        next(nodereader, None)  # skip the header row
        nodes = list(nodereader)

    node_names = [n[0] for n in nodes]  # only the node names

    with open(edge_csv, 'r', newline='', encoding='utf-8') as edgecsv:
        edgereader = csv.reader(edgecsv)
        next(edgereader, None)  # skip the header row
        edges = [tuple(e) for e in edgereader]

    G = nx.Graph()
    G.add_nodes_from(node_names)
    G.add_edges_from(edges)

    # map each node name to the value in its second column
    hub_dict = {node[0]: node[1] for node in nodes}

    nx.set_node_attributes(G, hub_dict, 'type')
    nx.write_gexf(G, filepath)

# Test Code

In [3]:
# Plot average urban vs. rural vaccination per season, one figure per
# distinct probability_choice value, and save each figure to disk.
raw_df = import_and_clean('prob_choice_data.log')
prob_choices = raw_df['probability_choice'].unique().tolist()
prob_choice_dict = {}

# savefig does not create missing directories
os.makedirs('prob_choice_images', exist_ok=True)

for value in prob_choices:
    value_df = raw_df[raw_df["probability_choice"] == value]
    # round before converting: int() alone truncates float error,
    # e.g. 0.29 * 100 == 28.999999999999996 would become 28
    percent = int(round(value * 100))

    # numeric_only=True skips string columns such as hub_id, which
    # pandas >= 2.0 refuses to average
    urban_grouped = value_df.query("hub_density==12").groupby(["season"]).mean(numeric_only=True)
    rural_grouped = value_df.query("hub_density==8").groupby(["season"]).mean(numeric_only=True)

    # each series is plotted against its own season index
    plt.plot(urban_grouped.index, urban_grouped["proportion_vacc"], color='#7f6d5f', label='Urban')
    plt.plot(rural_grouped.index, rural_grouped["proportion_vacc"], color='#557f2d', label='Rural')
    plt.title(f"Average Regional Vaccination over Time, probability_choice = {percent}%") 
    plt.xlabel("Season")
    plt.ylabel("Vaccination Rate")
    plt.legend(loc="best")
    
    plt.savefig(f'prob_choice_images/seasonal_fig_{percent}.png', bbox_inches='tight')
    plt.clf()  # start the next figure from a clean canvas

<Figure size 432x288 with 0 Axes>

In [4]:
# Plot average urban vs. rural vaccination per season, one figure per
# distinct percent_choice value, and save each figure to disk.
raw_df = import_and_clean('fixed_percent_data.log')
percent_choices = raw_df['percent_choice'].unique().tolist()
percent_choice_dict = {}

# savefig does not create missing directories
os.makedirs('fixed_percent_images', exist_ok=True)

for value in percent_choices:
    value_df = raw_df[raw_df["percent_choice"] == value]
    # round before converting: int() alone truncates float error,
    # e.g. 0.29 * 100 == 28.999999999999996 would become 28
    percent = int(round(value * 100))

    # numeric_only=True skips string columns such as hub_id, which
    # pandas >= 2.0 refuses to average
    urban_grouped = value_df.query("hub_density==12").groupby(["season"]).mean(numeric_only=True)
    rural_grouped = value_df.query("hub_density==8").groupby(["season"]).mean(numeric_only=True)

    # each series is plotted against its own season index
    plt.plot(urban_grouped.index, urban_grouped["proportion_vacc"], color='#7f6d5f', label='Urban')
    plt.plot(rural_grouped.index, rural_grouped["proportion_vacc"], color='#557f2d', label='Rural')
    plt.title(f"Average Regional Vaccination over Time, percent_choice = {percent}%") 
    plt.xlabel("Season")
    plt.ylabel("Vaccination Rate")
    plt.legend(loc="best")
    
    plt.savefig(f'fixed_percent_images/seasonal_fig_{percent}.png', bbox_inches='tight')
    plt.clf()  # start the next figure from a clean canvas

KeyError: 'fixed_percent'

In [None]:
#basepath = os.path.dirname(os.path.abspath(__file__))
#filepath = basepath + "/experiment_data.log"
# load the main experiment log and keep only the rows flagged as seasonal data
base_df = import_and_clean("experiment_data.log")
seasonal_rows = base_df["data_flag"] == "seasonal_data"
base_df = base_df[seasonal_rows]
print(base_df.shape)
base_df.head()

In [None]:
# Average vaccination rate per season for the urban (hub_density 12) and
# rural (hub_density 8) hubs of the baseline experiment.
# numeric_only=True skips string columns such as hub_id and data_flag,
# which pandas >= 2.0 refuses to average.
urban_grouped = base_df.query("hub_density==12").groupby(["season"]).mean(numeric_only=True)
rural_grouped = base_df.query("hub_density==8").groupby(["season"]).mean(numeric_only=True)
x = urban_grouped.index

# the rural series is plotted against its own season index rather than x
plt.plot(x, urban_grouped["proportion_vacc"], color='#7f6d5f', label='Urban')
plt.plot(rural_grouped.index, rural_grouped["proportion_vacc"], color='#557f2d', label='Rural')
plt.title("Average Regional Vaccination over Time") 
plt.xlabel("Season")
plt.ylabel("Vaccination Rate")
plt.legend(loc="best")