# Generate Network Structure From .xlsx File

In [1]:
#import libraries 
import pandas as pd
import os 
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx 
import time as pytime 

In [2]:
#read Excel File 
x = pd.read_excel('NationalOutbreakPublicDataTool.xlsx')

In [3]:
#make TWO new dataframes - one for norovirus (x_n) and one for salmonella (x_s)
norovirus = ['Norovirus Genogroup I', 'Norovirus Genogroup II','Norovirus unknown','Norovirus']
salmonella = ['Salmonella enterica']

#make norovirus dataframe
x_n = pd.DataFrame()
for i in norovirus:
    x_sub = x[x.Etiology == i]
    if i == 'Norovirus Genogroup I':
        x_n = x_sub
    else: x_n = pd.concat([x_n, x_sub])

#make salmonella dataframe
x_s = pd.DataFrame()
x_s = x[x.Etiology == salmonella[0]]

x_n = x_n.reset_index()
x_s = x_s.reset_index()

#drop any cases that are attributed to multistates
x_n = x_n[x_n.State != 'Multistate']
x_s = x_s[x_s.State != 'Multistate']

In [4]:
#print preview of norovirus file 
x_n

Unnamed: 0,index,Year,Month,State,Primary Mode,Etiology,Serotype or Genotype,Etiology Status,Setting,Illnesses,...,Deaths,Info on Deaths,Food Vehicle,Food Contaminated Ingredient,IFSAC Category,Water Exposure,Water Type,Animal Type,Animal Type Specify,Water Status
0,31,2009,1,Tennessee,Food,Norovirus Genogroup I,,Confirmed,Grocery store,8,...,0.0,8.0,cake,,Multiple,,,,,
1,32,2009,3,Tennessee,Food,Norovirus Genogroup I,,Confirmed,Restaurant - Sit-down dining,9,...,0.0,9.0,"oysters, raw",oysters,Mollusks,,,,,
2,41,2009,1,Tennessee,Food,Norovirus Genogroup I,,Confirmed,"Restaurant - ""Fast-food""(drive up service or p...",6,...,0.0,6.0,,,,,,,,
3,43,2009,1,Florida,Food,Norovirus Genogroup I,,Confirmed,Restaurant - other or unknown type,24,...,0.0,24.0,"sandwich, wrap; sandwich, wrap",,Multiple,,,,,
4,79,2009,2,Ohio,Food,Norovirus Genogroup I,,Confirmed,Caterer (food prepared off-site from where ser...,70,...,0.0,70.0,egg salad/egg salad sandwich,,Multiple,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6154,21708,2018,11,Illinois,Food,Norovirus,,Suspected,Caterer (food prepared off-site from where ser...,21,...,0.0,20.0,italian beef sandwich,,Multiple,,,,,
6155,21713,2018,2,Georgia,Food,Norovirus,,Suspected,Caterer (food prepared off-site from where ser...,17,...,0.0,17.0,,,,,,,,
6156,21773,2018,3,Washington,Food,Norovirus,,Suspected,Restaurant - Sit-down dining,11,...,0.0,5.0,,,,,,,,
6157,21787,2018,11,Illinois,Food,Norovirus,,Suspected,Restaurant - Sit-down dining,7,...,0.0,7.0,,,,,,,,


In [5]:
#geographic network 
    #connect cases that occur within x months and bordering state 
neighbors = pd.read_csv('neighbors-states.csv')

List Mapping States to Abbreviations 

In [6]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [7]:
state_abb = {v: k for k, v in us_state_abbrev.items()} #flip the state abbreviation - full name dictionary
empty = np.empty(neighbors.shape, dtype = str) 
state_abb['DC'] = 'Washington DC'


#convert the neighbors list from state abbreviations to full name 
for i in range(len(neighbors)):
    neighbors.iloc[i].StateCode = state_abb[neighbors.iloc[i].StateCode]
    neighbors.iloc[i].NeighborStateCode = state_abb[neighbors.iloc[i].NeighborStateCode]
    

In [9]:
#create a dictionary where each state has a key that is a list of neighboring states 
neighbors_dict = dict()
for i in np.unique(neighbors):
    sub_list = []
    for j in range(len(neighbors)):
        if neighbors.iloc[j].StateCode == i:
            sub_list.append(neighbors.iloc[j].NeighborStateCode)
    neighbors_dict[i] = sub_list
    
#make some small fixes 
neighbors_dict['Puerto Rico'] = []
neighbors_dict['Alaska'] = [] #don't count Alaska and Washington as neighbors
neighbors_dict['California'] = ['Nevada', 'Arizona', 'Oregon']
neighbors_dict['Florida'] = ['Georgia', 'Alabama']
neighbors_dict['Indiana'] = ['Michigan', 'Ohio', 'Kentucky', 'Illinois']
neighbors_dict['Kansas'] = ['Nebraska', 'Missouri', 'Oklahoma', 'Colorado']
#neighbors_dict['Ohio'] 

In [13]:
neighbors_dict

{'Alabama': ['Florida', 'Georgia', 'Mississippi', 'Tennessee'],
 'Alaska': ['Washington'],
 'Arizona': ['California', 'Colorado', 'New Mexico', 'Nevada', 'Utah'],
 'Arkansas': ['Louisiana',
  'Missouri',
  'Mississippi',
  'Oklahoma',
  'Tennessee',
  'Texas'],
 'California': ['Hawaii', 'Nevada', 'Oregon'],
 'Colorado': ['Kansas',
  'Nebraska',
  'New Mexico',
  'Oklahoma',
  'Utah',
  'Wyoming'],
 'Connecticut': ['Massachusetts', 'New York', 'Rhode Island'],
 'Delaware': ['Maryland', 'New Jersey', 'Pennsylvania'],
 'Florida': ['Georgia'],
 'Georgia': ['North Carolina', 'South Carolina', 'Tennessee'],
 'Hawaii': [],
 'Idaho': ['Montana', 'Nevada', 'Oregon', 'Utah', 'Washington', 'Wyoming'],
 'Illinois': ['Indiana', 'Kentucky', 'Missouri', 'Wisconsin'],
 'Indiana': ['Kentucky', 'Michigan', 'Ohio'],
 'Iowa': ['Illinois',
  'Minnesota',
  'Missouri',
  'Nebraska',
  'South Dakota',
  'Wisconsin'],
 'Kansas': ['Missouri', 'Nebraska', 'Oklahoma'],
 'Kentucky': ['Missouri', 'Ohio', 'Tennesse

In [10]:
#initialize graph
net = nx.Graph()
#add node for every case
net.add_nodes_from(list(x_n.index))

#initialize list to hold all geographic edges
state_edges = []

start_time = pytime.time()

counter1 = 0 #initialize row counter 
for i in x_n.State: #iterate over EVERY ROW 
    counter2 = 0 #initialize second row counter
    for j in x_n.State: #iterate over EVERY ROW for EVERY ROW 
        if counter1 != counter2: #if they are not the same case 
            if i in neighbors_dict[j]: #AND if the states is neighboring
                state_edges.append((counter1, counter2))
        counter2 += 1
    counter1 += 1
print('code block took ' + str(pytime.time() - start_time) + ' seconds to execute')
print('state edges = ' +str( len(state_edges) / len(x_n)**2) +' % of possible edge combinations')

code block took 17.94823932647705 seconds to execute
state edges = 0.04152571275090888 % of possible edge combinations
