In [None]:
import pandas as pd
pd.set_option('display.max_rows', 999)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import networkx as nx
import geopandas

# **Exploratory Data Analysis**

In [None]:
# Load the COVID forecast and the county-level COVID history.
# The `fips` field is routed through `str` so the codes are kept as text.
covid_f_raw = pd.read_csv('data/covid_forecast_2020-12-03.csv')
covid_h_raw = pd.read_csv(
    'data/COVID_historical_us-counties.csv',
    converters={'fips': str},
)

In [None]:
# Population inputs; every key column is read as text via `str`.
census_raw = pd.read_csv(
    'data/2010_census_pop.csv',
    converters={'Zip Code ZCTA': str},
)
pop_den_raw = pd.read_csv(
    'data/uszips_pop_density.csv',
    converters={'zip': str},
)
fips_pop = pd.read_csv(
    'data/fips_population.csv',
    converters={'FIPS': str},
)

In [None]:
# Social Connectedness Index pairs; both location codes are kept as text.
fb = pd.read_table(
    'data/county_county_aug2020.tsv',
    converters={'user_loc': str, 'fr_loc': str},
)

In [None]:
# ZIP <-> county crosswalk; both code columns are kept as text.
zips_fips_raw = pd.read_csv(
    'data/zips_fips.csv',
    converters={'ZIP': str, 'STCOUNTYFP': str},
)

In [None]:
# County shapefile; later cells merge onto it through its `GEOID` column.
us_map = geopandas.read_file('data/map_1/cb_2018_us_county_20m.shp')

In [None]:
# Rows of `fb` whose user county is San Francisco (code '06075').
# Built here from `fb` so the cell does not depend on a definition that only
# appears much further down the notebook. `.assign` returns a new frame, so
# `fb` is untouched and no SettingWithCopyWarning is triggered.
sf_county = fb.loc[fb['user_loc'] == '06075'].assign(
    GEOID=lambda d: d['fr_loc'],                   # merge key matching the shapefile
    log_sci=lambda d: np.log10(d['scaled_sci']),   # used by the log-scaled maps
)

In [None]:
# Attach San Francisco's SCI rows to the county geometries (inner merge on GEOID).
sf_map = us_map.merge(sf_county, on='GEOID')

In [None]:
sf_map

In [None]:
# Choropleth of San Francisco's connections, classified by percentiles of scaled_sci.
fig, ax = plt.subplots(figsize=(20, 15))
sf_map.plot(
    column='scaled_sci',
    scheme='percentiles',
    cmap='OrRd',
    legend=True,
    ax=ax,
)
# leg = ax.get_legend()
# Fixed view window in map coordinates.
ax.set_ylim(23, 50)
ax.set_xlim(-125, -65)
ax.set(title="Counties Connected to San Francisco, scaled by Social Connectedness Indicator")
ax.axis("off");

In [None]:
# Rows of `fb` whose user county is Kern (code '06029').
# Built here from `fb` so the cell does not depend on a definition that only
# appears much further down the notebook. `.assign` returns a new frame, so
# `fb` is untouched and no SettingWithCopyWarning is triggered.
kern_county = fb.loc[fb['user_loc'] == '06029'].assign(
    GEOID=lambda d: d['fr_loc'],                   # merge key matching the shapefile
    log_sci=lambda d: np.log10(d['scaled_sci']),   # used by the log-scaled maps
)

In [None]:
# Attach Kern County's SCI rows to the county geometries (inner merge on GEOID).
kern_map = us_map.merge(kern_county, on='GEOID')

In [None]:
# Choropleth of Kern County's connections, classified with the box-plot scheme.
fig, ax = plt.subplots(figsize=(20, 15))
kern_map.plot(ax=ax, column='scaled_sci',
            scheme='box_plot',
            cmap='OrRd',
            legend=True)
# leg = ax.get_legend()
ax.set_xlim(-125, -65)
ax.set_ylim(23, 50)
# Title previously said "San Francisco" (copy-paste from the cell above).
ax.set(title="Counties Connected to Kern County, scaled by Social Connectedness Indicator")
ax.axis("off");

In [None]:
# Side-by-side log-SCI maps for the two example counties; saved as SCI_map.png.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(25, 20))
ax1 = sf_map.plot(column='log_sci', scheme='natural_breaks', cmap='Blues', ax=ax1)
ax2 = kern_map.plot(column='log_sci', scheme='natural_breaks', cmap='Blues', legend=True, ax=ax2)

# Both panels share the same title/limits/axis treatment.
for panel, panel_title in ((ax1, "San Francisco County"), (ax2, "Kern County")):
    panel.set(title=panel_title)
    panel.set_xlim(-125, -65)
    panel.set_ylim(23, 50)
    panel.axis("off")

plt.savefig('SCI_map.png');

The same two maps as separate figures, one per county:

In [None]:
# Stand-alone log-SCI map for San Francisco County; saved as SF_county_SCI_map.png.
fig, ax = plt.subplots(figsize=(20, 15))
sf_map.plot(
    column='log_sci',
    scheme='natural_breaks',
    cmap='Blues',
    legend=True,
    ax=ax,
)
# leg = ax.get_legend()
ax.set_ylim(23, 50)
ax.set_xlim(-125, -65)
ax.set(title="San Francisco County")
ax.axis("off")
plt.savefig('SF_county_SCI_map.png');

In [None]:
# Stand-alone log-SCI map for Kern County; saved as Kern_county_SCI_map.png.
fig, ax = plt.subplots(figsize=(20, 15))
kern_map.plot(
    column='log_sci',
    scheme='natural_breaks',
    cmap='Blues',
    legend=True,
    ax=ax,
)
# leg = ax.get_legend()
ax.set_ylim(23, 50)
ax.set_xlim(-125, -65)
ax.set(title="Kern County")
ax.axis("off")
plt.savefig('Kern_county_SCI_map.png');


________________________________________________

In [None]:
# Plain county base map.
# Uses the already-loaded `us_map`. The earlier version re-read the shapefile
# into `sf_map`, overwriting the merged San Francisco frame, and then never
# used the result.
fig, ax = plt.subplots(figsize=(20, 15))
us_map.plot(ax=ax, alpha=1)

# minx, miny, maxx, maxy = states.total_bounds
ax.set_xlim(-125, -65)
ax.set_ylim(23, 50);

**Areas of interest**

```
Challenges:
1) large dataset
- use random samples or duplicates?
2) dispersion of data -> challenging to work with
- use log transformation?

Notes
- how to treat counties?  Have to treat them in pairs or singularly?
- some counties are dropped because of low fb users
- SCI formula
- upscaling/downscaling from fips pop to zip pop to derive cases/zip

How to wrangle data:
1) Take random sample from population ~ 1 million rows
2) Remove duplicates ~ 5 million rows

Data Cleaning
1) add zeros to four digit zip codes - won't do
2) add population data by zip code - done
3) normalize cases/per capita - not started
4) dataset of cities and zip code - not started
5) added log transformation field - done

Views of Analysis:
- two buckets - high/low - won't do
- stratified buckets - 5 or more? 

EDA - questions to answer
The Basics
- Are zip links directional? - done
- What are the highest/smallest SCI indices, by zip? - done
    -> create histogram of distribution of SCI indices - done
- distribution of SCI indices? - done 
- What's the range of SCI indices? - done
- How many zip pairs are there? - done
- Average scaled_sci per zip? Zips with highest, lowest scaled_sci? - done, won't do
- view of zip and average SCI - done

How does the SCI relate to population? Do we see higher SCIs in zips with greater populations?
1a) SCI vs population, all zips - DONE
scatterplot of zip codes with dependent variable as SCI, independent variable as pop
1b) SCI vs population/sq kilometer - DONE
2) SCI vs population, grouped zips by population buckets
- What range does SCI take over various population buckets?

Additional Data Cleaning
- mapping of FIPs to ZIPs
- conversion of FIPs and ZIPs to dtype object vs int

Overall Trends - COVID vs SCI
1) SCI vs total COVID cases - DONE
2) SCI vs total COVID deaths - DONE

Historical spread of COVID, relationship to SCI - timeseries
1) How does the historical spread rate of COVID differ between counties with high SCI scores and those with low SCI scores?

Future spread of COVID, relationship to SCI
- Do prediction models indicate counties with higher SCIs will get more cases?

Can I provide a recommendation to public health experts on potential areas of spread?

Hypothesis testing
- Is the incidence of COVID between zip pairs with high SCI significantly different than the incidence of COVID between zip pairs with low SCI?

Questions
- How am I defining the incidence of COVID? Spread? total number of cases? growth rate over time?
- Recommendations/audience definition

Extra Credit
Network graph approach 
- how should network analysis be used as a tool to fight disease spread?
1) Which nodes (zip codes) have the highest number of connections to other nodes (zip codes)? 
1a) Which nodes w/highest covid spread have highest number of connections to other high disease nodes? 
* edges would be scaled for a minimum SCI
* degree centrality

GIS graphs
- sample two counties - show maps of SCI

Story:
How does a ZIP's SCI impact itself?
0) What SCI is
1) Distribution of SCI
- average
2) SCI vs Population
- examples
3) SCI vs COVID
- muted trends
- maybe covid is too widespread to notice any significant differences?
4) SCI vs COVID timeseries

How does a ZIP's SCI impact other ZIPs?



data sources:
covid_forecast_2020-12-03.csv - nytimes
COVID_historical_us-counties.csv - nytimes
fb_SCI_county_county_aug2020.tsv - facebook data for good
2010_census_pop.csv - US Census Bureau 
uszips_pop_density.csv - https://simplemaps.com/data/us-zips
zip-county-fips - https://www.kaggle.com/danofer/zipcodes-county-fips-crosswalk
2019 census estimate - https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/

c_historical = pd.read_csv('data/COVID_historical_us-counties.csv')
fb_sci = pd.read_csv('data/fb_SCI_county_county_aug2020.tsv', sep='\t')
census = pd.read_csv('data/2010_census_pop.csv')
pop_den = uszips_pop_density.csv
zip_fips = zips_fips.csv
```


## Data Cleaning

In [None]:
# add `log_sci`: the base-10 logarithm of `scaled_sci`, stored as a new column on fb
fb['log_sci'] = np.log10(fb['scaled_sci'])

In [None]:
# Population-density lookup: just the ZIP and its density.
density_columns = ['zip', 'density']
pop_den = pop_den_raw[density_columns]
pop_den

In [None]:
pop_den

## EDA - questions to answer

### The Basics

**Are zip links directional?**

In [None]:
fb['scaled_sci']

**What are the min and max SCIs?**

In [None]:
fb['scaled_sci'].describe().astype(int)

**What is the distribution of SCI indices?**

In [None]:
# Median of the numeric columns only. `fb` also holds the text columns
# user_loc/fr_loc, which recent pandas refuses to reduce unless
# numeric_only=True is passed explicitly.
fb.median(numeric_only=True)

In [None]:
plt.hist(fb['log_sci'], bins=50);

In [None]:
log_sci_hist = fb['log_sci']

In [None]:
log_sci_hist = log_sci_hist.where(log_sci_hist > 0)

In [None]:
log_sci_hist

In [None]:
# Histogram of log SCI over county pairs; saved as log_SCI_distribution.png.
# The style is set BEFORE the axes are created: seaborn styles are applied
# when axes are built, so setting it afterwards did not affect this figure.
sns.set_style("white")

f, ax = plt.subplots(figsize=(12, 5))
sns.histplot(log_sci_hist,
             color='#CB2A52',
             ax=ax)
sns.despine(f)

ax.set_ylim(0, 60000)
ax.set_xlim(0, 7)
ax.set_xlabel('log, SCI')
ax.set_title('Distribution of County-County Pairs by log SCI',
            size=18)
plt.savefig('log_SCI_distribution.png', dpi=1200);



In [None]:
# Raw-scale SCI histogram with a fixed window and plain (non-scientific) tick labels.
fig, ax = plt.subplots(figsize=(12, 5))
hist_result = ax.hist(fb['scaled_sci'], bins=5000)
counts, bins, patches = hist_result
ax.set_xlim(0, 10000000)
ax.set_ylim(0, 50000)
ax.ticklabel_format(useOffset=False, style='plain')

In [None]:
# Box plot of log SCI on its own figure.
# The earlier version formatted `ax` left over from the previous cell, i.e. a
# different figure. Formatting is limited to the y axis because
# ticklabel_format only supports scalar tick formatters.
fig, ax = plt.subplots(figsize=(12, 5))
ax.boxplot(fb['log_sci'])
ax.ticklabel_format(axis='y', useOffset=False, style='plain');

**Average scaled_sci per zip?**

In [None]:
fb.columns

In [None]:
# Per-user_loc totals of the numeric columns, ordered by summed log_sci.
# `sum('scaled_sci')` passed the string as the positional `numeric_only`
# argument; it is not a column selector, so the intent is now spelled out.
(
    fb.groupby(by='user_loc')
    .sum(numeric_only=True)
    .astype(int)
    .sort_values(by='log_sci')
)

In [None]:
fb.sort_values(by='scaled_sci')

### How does the SCI relate to population?

#### SCI vs Population

In [None]:
census_raw

In [None]:
# Join census population onto the SCI rows, then average the numeric columns per code.
# NOTE(review): this matches 'Zip Code ZCTA' against `user_loc`, which other
# cells treat as a county code ('06075' = San Francisco) — confirm the two
# keys are actually comparable.
#
# The joined index is explicitly named 'index'. The two source index names
# differ, and pandas' choice of name for the result is an implementation
# detail; the groupby below (and later cells) need the column to be 'index'.
census_join = (
    census_raw.set_index('Zip Code ZCTA')
    .join(fb.set_index('user_loc'))
    .rename_axis('index')
)
cen_re = (
    census_join.reset_index()
    .dropna(axis=0)
    .drop(columns='fr_loc')
    .groupby(by='index')
    # `.mean('scaled_sci')` passed the string as `numeric_only`; be explicit.
    .mean(numeric_only=True)
    .rename(columns={'index': 'user_zip', '2010 Census Population': 'pop'})
)

In [None]:
cen_re.reset_index()

In [None]:
x = cen_re['pop']
y = cen_re['scaled_sci']

In [None]:
# Scatter of the current x/y pair (population vs. mean SCI).
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.scatter(x, y, color='dodgerblue', alpha=.1)
ax.set_xlabel('Population')
ax.set_ylabel('Mean SCI')
ax.set_title('Mean SCI vs. Population')

#### SCI vs Population, log transformation

In [None]:
# Replace the averaged log_sci with the log10 of the averaged scaled_sci.
cen_re['log_sci'] = np.log10(cen_re['scaled_sci'])

In [None]:
x = cen_re['pop']
y = cen_re['log_sci']

In [None]:
# Scatter of the current x/y pair (population vs. log of mean SCI).
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.scatter(x, y, color='dodgerblue', alpha=.1)
ax.set_xlabel('Population')
ax.set_ylabel('Mean SCI, log')
ax.set_title('Mean log SCI vs. Population');

#### SCI (log transformation) vs Population/sq km

In [None]:
# Add the `density` column by joining the density table on its `zip` key (left join on cen_re's index).
popdensci = cen_re.join(pop_den.set_index('zip'))

In [None]:
# Population density against log SCI.
x = popdensci['density']
y = popdensci['log_sci']

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.scatter(x, y, color='dodgerblue', alpha=.1)
ax.set_xlabel('Population Density')
ax.set_ylabel('SCI, log')
ax.set_title('SCI log vs. Population Density ');

### SCI vs total COVID cases

#### Data Cleaning

In [None]:
# grouping covid cases by fips: keeps the column-wise maximum of every column per fips code
# NOTE(review): presumably cases/deaths are cumulative, so the max is the latest total — confirm
covid_h = covid_h_raw.groupby('fips').max()
# tables used below:
## popdensci, zips_fips_raw, covid_h

In [None]:
covid_h

In [None]:
# Preview the ZIP-level table that the join below starts from.
# (`popdensci_join` itself is only created a few cells further down, so
# displaying it here failed on a fresh kernel.)
popdensci.head()

In [None]:
covid_h = covid_h.reset_index()

In [None]:
# Treat blank FIPS codes as missing and drop incomplete rows.
# The column is reassigned instead of calling replace(..., inplace=True) on a
# column selection, which is not guaranteed to write back into `covid_h`.
covid_h = covid_h.assign(fips=covid_h['fips'].replace('', np.nan)).dropna()

# ZIP-level SCI/population rows -> county code via the crosswalk -> county COVID totals.
popdensci_join = (
    popdensci.join(zips_fips_raw.set_index('ZIP'))
    .reset_index()
    .sort_values('STCOUNTYFP')
    .dropna()
    .rename(columns={'index': 'zips', 'STCOUNTYFP': 'fips'})
    .set_index('fips')
    .join(covid_h.set_index('fips'))
)

In [None]:
popdensci_join = popdensci_join.reset_index()

In [None]:
# upscaling/downscaling data: apportion county totals to ZIPs by population share
popdensci_join['fip_pop'] = popdensci_join['pop'].groupby(popdensci_join['fips']).transform('sum')
popdensci_join['zip_pop_percent'] = popdensci_join['pop']/popdensci_join['fip_pop']
popdensci_join['cases_per_zip'] = popdensci_join['zip_pop_percent'] * popdensci_join['cases']
popdensci_join['deaths_per_zip'] = popdensci_join['zip_pop_percent'] * popdensci_join['deaths']
# Per capita means cases divided by population. The earlier version divided
# by `deaths`, which gives a cases-to-deaths ratio instead.
# NOTE(review): values are now fractions (cases per resident); any axis limit
# tuned for the old ratio should be revisited.
popdensci_join['cases_per_capita'] = popdensci_join['cases_per_zip'] / popdensci_join['pop']


In [None]:
# County case totals against log SCI, one point per row of popdensci_join.
x = popdensci_join['log_sci']
y = popdensci_join['cases']

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.scatter(x, y, color='dodgerblue', alpha=.1)
ax.set_ylabel('COVID Cases')
ax.set_xlabel('SCI, log')
# ax.set_ylim(0, 25)
ax.set_title('SCI log vs. COVID Cases');

In [None]:
# Apportioned deaths against log SCI; the y axis is capped at 100.
x = popdensci_join['log_sci']
y = popdensci_join['deaths_per_zip']

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.scatter(x, y, color='dodgerblue', alpha=.1)
ax.set_ylabel('COVID Deaths')
ax.set_xlabel('SCI, log')
ax.set_title('SCI log vs. COVID Deaths')
ax.set_ylim(0, 100);

In [None]:
# cases_per_capita against log SCI; the y axis is capped at 100.
x = popdensci_join['log_sci']
y = popdensci_join['cases_per_capita']

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.scatter(x, y, color='dodgerblue', alpha=.1)
ax.set_ylabel('COVID Cases per Capita')
ax.set_xlabel('SCI, log')
ax.set_title('SCI log vs. COVID Cases per Capita')
ax.set_ylim(0, 100);

### SCI vs COVID cases, timeseries

#### Data Cleaning

In [None]:
# copy cen_re into new df
# create SCI quantile groups: each flag column holds the string '1' for rows in that
# group and stays NaN elsewhere (lower: <= 25th percentile, upper: >= 75th, middle: strictly between)
cen_re_timeseries = cen_re.copy()
cen_re_timeseries.loc[cen_re_timeseries['log_sci'] <= cen_re_timeseries['log_sci'].quantile(.25), 'SCI_lower_25'] = '1'
cen_re_timeseries.loc[cen_re_timeseries['log_sci'] >= cen_re_timeseries['log_sci'].quantile(.75), 'SCI_upper_25'] = '1'
cen_re_timeseries.loc[(cen_re_timeseries['log_sci'] > cen_re_timeseries['log_sci'].quantile(.25)) & \
           (cen_re_timeseries['log_sci'] < cen_re_timeseries['log_sci'].quantile(.75)) , \
           'SCI_middle_50'] = '1'

# join cen_re_timeseries to zips/fips mapping and covid_h_raw
# (covid_h_raw is the un-aggregated history, so each code is repeated once per matching history row)
cen_re_timeseries = cen_re_timeseries.reset_index().rename(columns={'index':'zip'})
cen_re_timeseries = cen_re_timeseries.set_index('zip').join(zips_fips_raw.set_index('ZIP')).reset_index().rename(columns={'index':'zip'})
cen_re_timeseries = cen_re_timeseries.set_index('STCOUNTYFP').join(covid_h_raw.set_index('fips'))

In [None]:
# cen_re_timeseries = cen_re_timeseries.drop(columns=['pop', 'CLASSFP'])

In [None]:
cen_re_timeseries

In [None]:
def _mean_cases_by_date(flag_col, label):
    """Return the per-date mean of `cases` over rows flagged '1' in `flag_col`.

    The result is a one-column DataFrame indexed by date, with the column
    named `label`. Selecting `cases` explicitly replaces the earlier
    `.mean('cases')` call, where the string was silently taken as the
    `numeric_only` flag, and stops unrelated numeric columns from leaking
    into the combined frame and the line plot.
    """
    flagged = cen_re_timeseries.loc[cen_re_timeseries[flag_col] == '1']
    return flagged.groupby('date')['cases'].mean().rename(label).to_frame()


middle_50 = _mean_cases_by_date('SCI_middle_50', 'middle_50')
lower_25 = _mean_cases_by_date('SCI_lower_25', 'lower_25')
upper_25 = _mean_cases_by_date('SCI_upper_25', 'upper_25')

# One column per SCI group, ordered by date.
quadrants = pd.concat([lower_25, middle_50, upper_25], axis=1).sort_index()


In [None]:
quadrants.plot.line(figsize=(18,10));

__________________________________

# Graph Analysis

## Data Cleaning

In [None]:
'''
0) data clean - get a clean list of fips <-> population, pop density, cases, cases/capita - DONE
1a) Intro - what is a node vs edge
1b) Describe node sizing, edge sizing
1c) Example: San Francisco connections to other zips vs rural county to other zips -> population density as a node attributes
EDIT: change san francisco and Kern county example to only include counties w/SCI greater than a certain threshold
2) Bucket by SCI quadrant -> lower 25%, middle 50%, upper 25%
- create quadrants in fb - done
- create new fb_ dataframes - done
- random sample new fb_ dataframes - done
- plot new fb_ dataframes - done
- calculate stats for each quadrant -> covid cases, connectedness, degree

3) Hypothesis testing - test the idea that higher SCIs are more connected and have higher rates of covid spread

4) Quantitative measures - shortest path, connectedness, eigenvector
'''

In [None]:
# Names built in this section:
# na_covid_cases
# na_fips_pop_covid # final table of fips <> pop <> cases <> cases_per_capita <> deaths
# sf_county # sf county to fips SCI
# kern_county # kern county to fips SCI
# fb_ga_lower_25
# fb_ga_middle_50
# fb_ga_upper_25
# Work on a copy so the quartile flag columns are not added to `fb` itself.
fb_ga = fb.copy()

In [None]:
fb_ga

In [None]:
# Flag each county pair by where its log_sci falls relative to the 25th/75th percentiles,
# then split the frame into one subset per flag.
log_sci_q25 = fb_ga['log_sci'].quantile(.25)
log_sci_q75 = fb_ga['log_sci'].quantile(.75)

at_or_below_q25 = fb_ga['log_sci'] <= log_sci_q25
at_or_above_q75 = fb_ga['log_sci'] >= log_sci_q75
strictly_between = (fb_ga['log_sci'] > log_sci_q25) & (fb_ga['log_sci'] < log_sci_q75)

fb_ga.loc[at_or_below_q25, 'SCI_lower_25'] = '1'
fb_ga.loc[at_or_above_q75, 'SCI_upper_25'] = '1'
fb_ga.loc[strictly_between, 'SCI_middle_50'] = '1'

fb_ga_lower_25 = fb_ga[fb_ga['SCI_lower_25'] == '1']
fb_ga_middle_50 = fb_ga[fb_ga['SCI_middle_50'] == '1']
fb_ga_upper_25 = fb_ga[fb_ga['SCI_upper_25'] == '1']

In [None]:
fb_ga_lower_25

In [None]:
# Cleaning COVID datasets: per-fips column maxima, blanks turned into NaN, incomplete rows removed.
na_covid_cases = (
    covid_h_raw.groupby('fips').max()
    .reset_index()
    .replace("", np.nan)
    .dropna()
)

In [None]:
# County population rows with COVID columns attached by code; rows with any missing value are dropped.
na_fips_pop_covid = fips_pop.set_index('FIPS').join(na_covid_cases.set_index('fips')).dropna()

In [None]:
# NOTE(review): despite the column name, this divides cases by the constant 1000, not by
# each county's population — confirm whether a population column from fips_pop was intended.
na_fips_pop_covid['cases_per_capita'] = na_fips_pop_covid['cases']/1000

In [None]:
na_fips_pop_covid.sort_values('cases_per_capita').reset_index()

## Intro - San Francisco, network

In [None]:
# All SCI rows whose user_loc is '06075' (San Francisco, per the section heading).
sf_county = fb[fb['user_loc'] == '06075']

In [None]:
sf_county

In [None]:
# Graph of San Francisco's connections; both SCI columns are stored as edge attributes.
g_sf = nx.from_pandas_edgelist(
    sf_county,
    source="user_loc",
    target="fr_loc",
    edge_attr=["scaled_sci", "log_sci"],
)
for label, count in (('edges', g_sf.number_of_edges()), ('nodes', g_sf.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
# Move the county code out of the index into a 'FIPS' column.
# Guarded so that re-running the cell does not reset the index a second time
# (which would add a junk 'index' column). The axis is named explicitly so
# the resulting column is always called 'FIPS'.
if 'FIPS' not in na_fips_pop_covid.columns:
    na_fips_pop_covid = na_fips_pop_covid.rename_axis('FIPS').reset_index()

# create nodelist from na_fips_pop_covid: one row per county with its node attribute
nodelist = na_fips_pop_covid[['FIPS', 'cases_per_capita']].copy()

In [None]:
nodelist

In [None]:
# deduplicate nodelists by joining edgelist to it
# node_attributes = sf_county.join(nodelist)

In [None]:
# convert node attributes in nodelist to dict
# Lookup of FIPS code -> cases_per_capita, used for node sizing.
node_attributes_sf = dict(zip(nodelist.FIPS, nodelist.cases_per_capita))

In [None]:
# Network drawing of San Francisco's connections.
# Nodes are sized by cases_per_capita, edges by scaled_sci.
plt.figure(figsize=(15, 12))
pos = nx.kamada_kawai_layout(g_sf)

# Only nodes that are in the graph AND have an attribute can be drawn. The
# attribute dict covers every county in the COVID table, and passing codes
# that are absent from the graph as `nodelist` fails because they have no
# layout position.
sized_nodes = [n for n in g_sf.nodes if n in node_attributes_sf]

nx.draw(g_sf,
        pos,
        nodelist=sized_nodes,
        node_size=[node_attributes_sf[n] for n in sized_nodes],
        with_labels=False,
        font_size=10,
        width=[data['scaled_sci']/50000 for node1, node2, data in g_sf.edges(data=True)],
        linewidths=0,
        alpha=0.65)

plt.show()

## Intro - Kern County, network

In [None]:
# All SCI rows whose user_loc is '06029' (Kern County, per the section heading).
kern_county = fb[fb['user_loc'] == '06029']

In [None]:
# Graph of Kern County's connections; both SCI columns are stored as edge attributes.
g_kern = nx.from_pandas_edgelist(
    kern_county,
    source="user_loc",
    target="fr_loc",
    edge_attr=["scaled_sci", "log_sci"],
)
for label, count in (('edges', g_kern.number_of_edges()), ('nodes', g_kern.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
# The index reset was already done in the San Francisco section, so it stays disabled here:
# na_fips_pop_covid = na_fips_pop_covid.reset_index()

# create nodelist from na_fips_pop_covid (code + node attribute only)
nodelist = na_fips_pop_covid.copy()
nodelist = nodelist[['FIPS', 'cases_per_capita']]

In [None]:
# convert node attributes in nodelist to dict: FIPS code -> cases_per_capita
node_attributes_kern = dict(zip(nodelist.FIPS, nodelist.cases_per_capita))

In [None]:
# Network drawing of Kern County's connections.
# Nodes are sized by cases_per_capita, edges by scaled_sci.
plt.figure(figsize=(15, 12))
pos = nx.kamada_kawai_layout(g_kern)

# Only nodes that are in the graph AND have an attribute can be drawn. The
# attribute dict covers every county in the COVID table, and passing codes
# that are absent from the graph as `nodelist` fails because they have no
# layout position.
sized_nodes = [n for n in g_kern.nodes if n in node_attributes_kern]

nx.draw(g_kern,
        pos,
        nodelist=sized_nodes,
        node_size=[node_attributes_kern[n] for n in sized_nodes],
        with_labels=False,
        font_size=10,
        width=[data['scaled_sci']/50000 for node1, node2, data in g_kern.edges(data=True)],
        linewidths=0,
        alpha=0.65)

plt.show()

## SCI Quantile Exploration

### Lower 25%

In [None]:
fb_ga_lower_25

In [None]:
# Fixed seed so the sampled edges (and the saved figure) are reproducible across runs.
lower_25_edges_sample = fb_ga_lower_25.sample(n=2000, random_state=42)

In [None]:
# Strip the quartile flag columns. Reassigning with errors='ignore' keeps the
# cell safe to re-run; the inplace version raised KeyError the second time.
lower_25_edges_sample = lower_25_edges_sample.drop(
    columns=['SCI_lower_25', 'SCI_upper_25', 'SCI_middle_50'],
    errors='ignore',
)

In [None]:
lower_25_edges_sample

In [None]:
# Graph over the sampled lower-quartile edges; both SCI columns are stored as edge attributes.
g_lower_25_sample = nx.from_pandas_edgelist(
    lower_25_edges_sample,
    source="user_loc",
    target="fr_loc",
    edge_attr=["scaled_sci", "log_sci"],
)
for label, count in (('edges', g_lower_25_sample.number_of_edges()),
                     ('nodes', g_lower_25_sample.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
na_fips_pop_covid

In [None]:
# The index reset was already done in the San Francisco section, so it stays disabled here:
# na_fips_pop_covid = na_fips_pop_covid.reset_index()

# create the node-attribute table from na_fips_pop_covid (code + cases_per_capita only)
node_attributes_lower_25_sample = na_fips_pop_covid.copy()
node_attributes_lower_25_sample = node_attributes_lower_25_sample[['FIPS', 'cases_per_capita']]

In [None]:
node_attributes_lower_25_sample['cases_per_capita'].max()

In [None]:
# Left-join cases_per_capita onto the sampled edges, matching each edge's user_loc to FIPS.
# The name is rebound from the FIPS table to this joined edge table.
node_attributes_lower_25_sample = lower_25_edges_sample.set_index('user_loc').join(node_attributes_lower_25_sample.set_index('FIPS'))

In [None]:
node_attributes_lower_25_sample

In [None]:
# Node -> cases_per_capita for every node of the sampled graph that has COVID data.
# Built straight from the county table, because the earlier zip(fr_loc,
# cases_per_capita) paired each fr_loc with the value joined for its edge's
# user_loc, i.e. the wrong county. It also skipped nodes that only appear as
# user_loc, could carry NaN sizes, and failed when the cell was re-run.
_cases_by_fips = dict(zip(na_fips_pop_covid['FIPS'], na_fips_pop_covid['cases_per_capita']))
node_attributes_lower_25_sample = {
    node: _cases_by_fips[node]
    for node in g_lower_25_sample.nodes
    if node in _cases_by_fips
}

In [None]:
# Lower-quartile network: nodes sized by cases_per_capita, edge width by log SCI.
plt.figure(figsize=(15, 12))
# pos = nx.kamada_kawai_layout(g)
nx.draw(g_lower_25_sample,
        nodelist=node_attributes_lower_25_sample.keys(),
        node_size=[v*5 for v in node_attributes_lower_25_sample.values()],
        with_labels=False,
        font_size=10,
        width=[data['log_sci']/10 for node1, node2, data in g_lower_25_sample.edges(data=True)],
        linewidths=0,
        node_color='#DA4167',
        alpha=0.65)

# File name now matches the quartile being plotted (was 'network_map_lower_50.png').
plt.savefig('network_map_lower_25.png', dpi=1200)
# plt.show()

### Middle 50%

In [None]:
# Preview the middle-50% edge table. `middle_50_sample` was never defined
# anywhere in the notebook, so the original cell raised NameError.
fb_ga_middle_50.head()

In [None]:
# Fixed seed so the sampled edges (and the saved figure) are reproducible across runs.
middle_50_edges_sample = fb_ga_middle_50.sample(n=2000, random_state=42)

In [None]:
# Strip the quartile flag columns. Reassigning with errors='ignore' keeps the
# cell safe to re-run; the inplace version raised KeyError the second time.
middle_50_edges_sample = middle_50_edges_sample.drop(
    columns=['SCI_lower_25', 'SCI_upper_25', 'SCI_middle_50'],
    errors='ignore',
)

In [None]:
middle_50_edges_sample

In [None]:
# Graph over the sampled middle-50% edges; both SCI columns are stored as edge attributes.
g_middle_50_sample = nx.from_pandas_edgelist(
    middle_50_edges_sample,
    source="user_loc",
    target="fr_loc",
    edge_attr=["scaled_sci", "log_sci"],
)
for label, count in (('edges', g_middle_50_sample.number_of_edges()),
                     ('nodes', g_middle_50_sample.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
na_fips_pop_covid

In [None]:
# The index reset was already done in the San Francisco section, so it stays disabled here:
# na_fips_pop_covid = na_fips_pop_covid.reset_index()

# create the node-attribute table from na_fips_pop_covid (code + cases_per_capita only)
node_attributes_middle_50_sample = na_fips_pop_covid.copy()
node_attributes_middle_50_sample = node_attributes_middle_50_sample[['FIPS', 'cases_per_capita']]

In [None]:
# Left-join cases_per_capita onto the sampled edges, matching each edge's user_loc to FIPS.
# The name is rebound from the FIPS table to this joined edge table.
node_attributes_middle_50_sample = middle_50_edges_sample.set_index('user_loc').join(node_attributes_middle_50_sample.set_index('FIPS'))

In [None]:
# Node -> cases_per_capita for every node of the sampled graph that has COVID data.
# Built straight from the county table, because the earlier zip(fr_loc,
# cases_per_capita) paired each fr_loc with the value joined for its edge's
# user_loc, i.e. the wrong county. It also skipped nodes that only appear as
# user_loc, could carry NaN sizes, and failed when the cell was re-run.
_cases_by_fips = dict(zip(na_fips_pop_covid['FIPS'], na_fips_pop_covid['cases_per_capita']))
node_attributes_middle_50_sample = {
    node: _cases_by_fips[node]
    for node in g_middle_50_sample.nodes
    if node in _cases_by_fips
}

In [None]:
# Middle-50% network: nodes sized by cases_per_capita, edge width by log SCI.
plt.figure(figsize=(15, 12))
# pos = nx.kamada_kawai_layout(g)
middle_node_sizes = [v*5 for v in node_attributes_middle_50_sample.values()]
middle_edge_widths = [data['log_sci']/10 for node1, node2, data in g_middle_50_sample.edges(data=True)]
nx.draw(
    g_middle_50_sample,
    nodelist=node_attributes_middle_50_sample.keys(),
    node_size=middle_node_sizes,
    with_labels=False,
    font_size=10,
    width=middle_edge_widths,
    linewidths=0,
    node_color='#DA4167',
    alpha=0.65,
)
# plt.show()
plt.savefig('network_map_middle_50.png', dpi=1200);

### Upper 25%

In [None]:
# Fixed seed so the sampled edges (and the saved figure) are reproducible across runs.
upper_25_edges_sample = fb_ga_upper_25.sample(n=2000, random_state=42)

In [None]:
# Strip the quartile flag columns. Reassigning with errors='ignore' keeps the
# cell safe to re-run; the inplace version raised KeyError the second time.
upper_25_edges_sample = upper_25_edges_sample.drop(
    columns=['SCI_lower_25', 'SCI_upper_25', 'SCI_middle_50'],
    errors='ignore',
)

In [None]:
upper_25_edges_sample

In [None]:
# Graph over the sampled upper-quartile edges; both SCI columns are stored as edge attributes.
g_upper_25_sample = nx.from_pandas_edgelist(
    upper_25_edges_sample,
    source="user_loc",
    target="fr_loc",
    edge_attr=["scaled_sci", "log_sci"],
)
for label, count in (('edges', g_upper_25_sample.number_of_edges()),
                     ('nodes', g_upper_25_sample.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
na_fips_pop_covid

In [None]:
# The index reset was already done in the San Francisco section, so it stays disabled here:
# na_fips_pop_covid = na_fips_pop_covid.reset_index()

# create the node-attribute table from na_fips_pop_covid (code + cases_per_capita only)
node_attributes_upper_25_sample = na_fips_pop_covid.copy()
node_attributes_upper_25_sample = node_attributes_upper_25_sample[['FIPS', 'cases_per_capita']]

In [None]:
# Left-join cases_per_capita onto the sampled edges, matching each edge's user_loc to FIPS.
# The name is rebound from the FIPS table to this joined edge table.
node_attributes_upper_25_sample = upper_25_edges_sample.set_index('user_loc').join(node_attributes_upper_25_sample.set_index('FIPS'))

In [None]:
node_attributes_upper_25_sample['cases_per_capita'].max()

In [None]:
# Node -> cases_per_capita for every node of the sampled graph that has COVID data.
# Built straight from the county table, because the earlier zip(fr_loc,
# cases_per_capita) paired each fr_loc with the value joined for its edge's
# user_loc, i.e. the wrong county. It also skipped nodes that only appear as
# user_loc, could carry NaN sizes, and failed when the cell was re-run.
_cases_by_fips = dict(zip(na_fips_pop_covid['FIPS'], na_fips_pop_covid['cases_per_capita']))
node_attributes_upper_25_sample = {
    node: _cases_by_fips[node]
    for node in g_upper_25_sample.nodes
    if node in _cases_by_fips
}

In [None]:
# Upper-quartile network: nodes sized by cases_per_capita, edge width by log SCI.
plt.figure(figsize=(15, 12))
# pos = nx.kamada_kawai_layout(g)
upper_node_sizes = [v*5 for v in node_attributes_upper_25_sample.values()]
upper_edge_widths = [data['log_sci']/10 for node1, node2, data in g_upper_25_sample.edges(data=True)]
nx.draw(
    g_upper_25_sample,
    nodelist=node_attributes_upper_25_sample.keys(),
    node_size=upper_node_sizes,
    with_labels=False,
    font_size=10,
    width=upper_edge_widths,
    linewidths=0,
    node_color='#DA4167',
    alpha=0.75,
)

# plt.show()
plt.savefig('network_map_upper_25.png', dpi=1200)

## Hypothesis Testing

After taking a sample of upper SCI quartile counties, is the average number of COVID cases per capita significantly greater than the population mean?

1. Formulate hypotheses

$H_0$: High SCI counties have no difference in number of covid cases/capita than lower SCI counties.

$H_a$: High SCI counties have a significantly different number of covid cases/capita than lower SCI counties.

2. Choose a significance level

$\alpha = 0.05$ 

3. Choose a statistical test, find the test statistic

- two tailed t test

- $\bar{X}$ = 

4. Compute probability of results assuming the null hypothesis is true

- outputs the p value

5. Compare p-value to alpha to draw conclusion

- p <= alpha, reject null in favor of the alternative
- p > alpha, fail to reject the null


3a. calculate sample mean from size n, calculate sample standard deviation

3b. calculate the z statistic:

$z = \dfrac{\bar{X} - \mu}{\sigma / \sqrt{n}}$, i.e. (sample mean − population mean) / (population standard deviation / square root of n)



In [None]:
# upper

# sample county pairs from the upper SCI quartile (seeded so the test result is reproducible)
ht_ga_upper_25_sample = fb_ga_upper_25.sample(n=300, random_state=42)

# keep one row per user county
ht_ga_upper_25_sample = ht_ga_upper_25_sample.drop_duplicates('user_loc')

# attach the county COVID columns (incl. cases_per_capita) by county code
ht_ga_upper_25_sample = ht_ga_upper_25_sample.set_index('user_loc').join(na_fips_pop_covid.set_index('FIPS'))

# values that go into the t test: cases_per_capita with missing entries removed
ht_ga_upper_25_sample_ttest = ht_ga_upper_25_sample['cases_per_capita'].dropna()

# number of counties that actually enter the test (this is a count, not an average)
ht_ga_upper_25_sample_ttest.count()

In [None]:
# lower

# sample county pairs from the lower SCI quartile (seeded so the test result is reproducible)
ht_ga_lower_25_sample = fb_ga_lower_25.sample(n=300, random_state=42)

# keep one row per user county
ht_ga_lower_25_sample = ht_ga_lower_25_sample.drop_duplicates('user_loc')

# attach the county COVID columns (incl. cases_per_capita) by county code
ht_ga_lower_25_sample = ht_ga_lower_25_sample.set_index('user_loc').join(na_fips_pop_covid.set_index('FIPS'))

# values that go into the t test: cases_per_capita with missing entries removed
ht_ga_lower_25_sample_ttest = ht_ga_lower_25_sample['cases_per_capita'].dropna()

# number of counties that actually enter the test (this is a count, not an average)
ht_ga_lower_25_sample_ttest.count()

In [None]:
# two-tailed t test (independent samples): lower- vs upper-quartile cases per capita

from scipy import stats
# The import binds the name `stats`, not `scipy`, so the call must go through
# `stats`; `scipy.stats.ttest_ind(...)` raised NameError here.
stats.ttest_ind(ht_ga_lower_25_sample_ttest, ht_ga_upper_25_sample_ttest, axis=0, nan_policy='propagate')

In [None]:
# Preview of the lower-quartile rows, one per user county.
# `ht_ga_lower_25` is only assigned in the following cell, so displaying it
# here failed on a fresh kernel.
fb_ga_lower_25.drop_duplicates('user_loc').head()

In [None]:
# container of lower quartile SCI counties (one row per user county)
ht_ga_lower_25 = fb_ga_lower_25.drop_duplicates('user_loc')

# container of sample of lower quartile SCI counties
# (The earlier version sampled `ht_ga_upper_25` here, a copy-paste from the
# upper-quartile cell, and that name is not defined until the next cell.)
ht_ga_lower_25_sample = ht_ga_lower_25.sample(n=300, random_state=42)

In [None]:
# container of upper quartile SCI counties (one row per user county)
ht_ga_upper_25 = fb_ga_upper_25.drop_duplicates('user_loc')

# container of sample of upper quartile SCI counties
# (seeded so the z test below is reproducible)
ht_ga_upper_25_sample = ht_ga_upper_25.sample(n=300, random_state=42)

In [None]:
ht_ga_upper_25_sample.head()

In [None]:
# container of sample upper quartile SCI counties + COVID/capita
# (left join by county code; user_loc becomes the index, so the cell cannot be re-run as-is)
ht_ga_upper_25_sample = ht_ga_upper_25_sample.set_index('user_loc').join(na_fips_pop_covid.set_index('FIPS'))

In [None]:
ht_ga_upper_25_sample.head()

In [None]:
# One-sample z test: is the upper-quartile sample mean different from the all-county mean?
import scipy.stats as st

all_counties = na_fips_pop_covid['cases_per_capita']
# Counties without COVID data carry NaN after the join; they must not count towards n.
sampled = ht_ga_upper_25_sample['cases_per_capita'].dropna()
n_obs = sampled.count()

# population mean COVID cases/capita
pop_mean = all_counties.mean()
print(f"Population mean: {pop_mean:.2f}")

# population standard deviation COVID cases/capita
pop_std = all_counties.std()
print(f"Population standard deviation: {pop_std:.2f}")

# sample mean of COVID cases/capita
sample_mean = sampled.mean()
print(f"Sample mean: {sample_mean:.2f}")

# sample standard deviation
sample_std = sampled.std()
print(f"Sample standard deviation: {sample_std:.2f}")

# z score = (sample mean - population mean) / standard error.
# The earlier expression lacked the parentheses around the difference, so
# only pop_mean was divided by the standard error. It also hard-coded n = 300
# even though rows may be missing.
z_score = (sample_mean - pop_mean) / (pop_std / np.sqrt(n_obs))
print(f"z score: {z_score:.2f}")

# p value: two-tailed tail probability. norm.pdf returns the density at z,
# which is not a probability; the survival function gives the upper tail.
p_value = 2 * st.norm.sf(abs(z_score))
print(f"p-value: {p_value:.2f}")

In [None]:
# Histogram of cases_per_capita in the upper-quartile sample.
# The style is set before the axes exist so that it applies to this figure.
sns.set_style("white")

f, ax = plt.subplots(figsize=(12, 5))
sns.histplot(ht_ga_upper_25_sample['cases_per_capita'],
             bins=100,
             color='#DA4167',
             ax=ax)
sns.despine(f)

ax.set_ylim(0, 100)
ax.set_xlim(0, 50)
# Label and title previously described the log-SCI histogram this cell was copied from.
ax.set_xlabel('COVID cases per capita')
ax.set_title('Distribution of COVID Cases per Capita, Upper-Quartile SCI Sample',
            size=18);


In [None]:
ht_ga_upper_25_sample.head()

___________________________________________________________________________________________________________________

In [None]:
# Fixed seed so the sampled edges (and the figures built from them) are reproducible.
plotlist = fb.sample(n=2000, random_state=42)

In [None]:
# Graph over the sampled edges in plotlist; both SCI columns are stored as edge attributes.
g = nx.from_pandas_edgelist(
    plotlist,
    source="user_loc",
    target="fr_loc",
    edge_attr=["scaled_sci", "log_sci"],
)
for label, count in (('edges', g.number_of_edges()), ('nodes', g.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
# The index reset was already done in the San Francisco section, so it stays disabled here:
# na_fips_pop_covid = na_fips_pop_covid.reset_index()

# create nodelist from na_fips_pop_covid (code + cases_per_capita only)
nodelist = na_fips_pop_covid.copy()
nodelist = nodelist[['FIPS', 'cases_per_capita']]

In [None]:
# Left-join cases_per_capita onto the sampled edges, matching each edge's user_loc to FIPS.
# `nodelist` is rebound from the FIPS table to this joined edge table (indexed by user_loc).
nodelist = plotlist.set_index('user_loc').join(nodelist.set_index('FIPS'))

In [None]:
# Node -> cases_per_capita for every node of the sampled graph that has COVID data.
# Built straight from the county table, because the earlier zip(fr_loc,
# cases_per_capita) paired each fr_loc with the value joined for its edge's
# user_loc, i.e. the wrong county. It also skipped nodes that only appear as
# user_loc and could carry NaN sizes.
_cases_by_fips = dict(zip(na_fips_pop_covid['FIPS'], na_fips_pop_covid['cases_per_capita']))
node_attributes = {
    node: _cases_by_fips[node]
    for node in g.nodes
    if node in _cases_by_fips
}

In [None]:
# Sampled network: nodes sized by cases_per_capita, edge width by log SCI.
plt.figure(figsize=(15, 12))
# pos = nx.kamada_kawai_layout(g)
sample_node_sizes = [v*5 for v in node_attributes.values()]
sample_edge_widths = [data['log_sci']/4 for node1, node2, data in g.edges(data=True)]
nx.draw(
    g,
    nodelist=node_attributes.keys(),
    node_size=sample_node_sizes,
    with_labels=False,
    font_size=10,
    width=sample_edge_widths,
    linewidths=0,
    alpha=0.65,
)

plt.show()

In [None]:
# Same network, showing only the strong ties (log SCI > 4).
plt.figure(figsize=(15, 12))
# pos = nx.kamada_kawai_layout(g)

# Filter the EDGES, not just the widths. The earlier version drew every edge
# but supplied a shorter width list, so widths no longer lined up with edges.
strong_edges = [(u, v) for u, v, data in g.edges(data=True) if data['log_sci'] > 4]

nx.draw(g,
        nodelist=node_attributes.keys(),
        node_size=[v/15 for v in node_attributes.values()],
        with_labels=False,
        font_size=10,
        edgelist=strong_edges,
        width=[g[u][v]['log_sci']/2 for u, v in strong_edges],
        linewidths=0,
        alpha=0.65)

plt.show()

____________________________________________________________

In [None]:
# Size of the sampled graph. `G` (capital) is only created in a demo cell
# near the end of the notebook, so this cell now uses the existing `g`.
print('# of edges: {}'.format(g.number_of_edges()))
print('# of nodes: {}'.format(g.number_of_nodes()))

In [None]:
import itertools
import copy
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

In [None]:



# Edge table for the manual graph build: a copy of fb without the raw scaled_sci column.
edgelist = fb.copy().drop(columns='scaled_sci')

In [None]:
# Seeded sample of edges, then a graph built row by row.
plotlist = edgelist.sample(n=2000, random_state=42)

g = nx.Graph()

for i, elrow in plotlist.iterrows():
    # First two columns are the endpoints; the rest become edge attributes.
    # They are unpacked as keyword arguments: `attr_dict=` is not a special
    # parameter of add_edge and would store one attribute literally named
    # 'attr_dict'. `.iloc` makes the positional access explicit.
    g.add_edge(elrow.iloc[0], elrow.iloc[1], **elrow.iloc[2:].to_dict())


In [None]:
plotlist

In [None]:
# Inspect the last row seen by the edge loop. `.iloc` is used because
# integer keys on a label-indexed Series are deprecated as positional lookups.
print(elrow.iloc[0]) # node1
print(elrow.iloc[1]) # node2
print(elrow.iloc[2:].to_dict()) # edge attribute dict


In [None]:
# add node attributes
# `nodelist` has no 'zips' column, so every iteration of the earlier loop
# raised KeyError, was swallowed by the except clause, and attached nothing.
# NOTE(review): assumes nodelist is indexed by county code (user_loc) and
# carries `cases_per_capita`, as built in the join cell above — confirm.
for fips, nlrow in nodelist.iterrows():
    if fips in g:  # skip codes that are not nodes of the current graph
        g.nodes[fips]['cases_per_capita'] = nlrow['cases_per_capita']

In [None]:
# add edges and edge attributes
for i, elrow in plotlist.iterrows():
    # Attributes are unpacked as keyword arguments: `attr_dict=` would be
    # stored as a single attribute literally named 'attr_dict'.
    g.add_edge(elrow.iloc[0], elrow.iloc[1], **elrow.iloc[2:].to_dict())

In [None]:
print('# of edges: {}'.format(g.number_of_edges()))
print('# of nodes: {}'.format(g.number_of_nodes()))

%time

In [None]:
# Edge and node counts of the current graph.
for label, count in (('edges', g.number_of_edges()), ('nodes', g.number_of_nodes())):
    print('# of ' + label + ': {}'.format(count))

In [None]:
plt.figure(figsize=(15, 12))
nx.draw(g, node_size=10)
plt.show()

%time

In [None]:
import sys, networkx as nx, matplotlib.pyplot as plt

# Toy example: ten nodes numbered 0..9, each sized and labelled 100 * its number.
nodes = range(10)
node_sizes = [100 * n for n in nodes]
labels = {n: 100 * n for n in nodes}

# Node sizes: [0, 100, 200, 300, 400, 500, 600, 700, 800, 900]

# Chain: every node is linked to the next one
edges = [(i, i + 1) for i in range(len(nodes) - 1)]

# Build the graph, then draw it at random positions with the labels shown
g = nx.Graph()
g.add_nodes_from(nodes)
g.add_edges_from(edges)

nx.draw_random(g, node_size=node_sizes, labels=labels, with_labels=True)
plt.show()

In [None]:
# Toy example: per-edge colour and weight attributes drive the drawing.
G = nx.Graph()
demo_edges = [
    ('31183', '48267', 'r', 2),
    ('54013', '37115', 'g', 4),
    ('20201', '22099', 'b', 6),
    ('39015', '72011', 'y', 3),
    ('16051', '13019', 'm', 1),
    ('16051', '22099', 'm', 1),
]
for src, dst, edge_colour, edge_weight in demo_edges:
    G.add_edge(src, dst, color=edge_colour, weight=edge_weight)

colors = nx.get_edge_attributes(G, 'color').values()
weights = nx.get_edge_attributes(G, 'weight').values()

pos = nx.shell_layout(G)
nx.draw(
    G,
    pos,
    edge_color=colors,
    width=list(weights),
    with_labels=True,
    node_color='lightgreen',
)

In [None]:
import networkx as nx

# Toy example: draw three edges with explicit colours and widths.
G = nx.Graph()
G.add_edge(1,2,color='r',weight=2)
G.add_edge(2,3,color='b',weight=4)
G.add_edge(3,4,color='g',weight=6)

pos = nx.circular_layout(G)

# Materialise the edge view once so colours, widths and the drawn edges all
# follow the same order.
edges = list(G.edges())
colors = [G[u][v]['color'] for u,v in edges]
weights = [G[u][v]['weight'] for u,v in edges]

# The drawing keyword is `edgelist`; `edges=` is not a recognised argument.
nx.draw(G, pos, edgelist=edges, edge_color=colors, width=weights)