In [None]:
!pip install geopandas
!pip install pygeos
!pip install gpdvega

# Dependencies
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import pygeos
import gpdvega 
import seaborn as sns


In [None]:
# --- California county geometries -------------------------------------------
# `county` is a numeric copy of COUNTYFP; later cells merge on it.
counties = gpd.read_file('/work/ca-county-boundaries/CA_Counties/CA_Counties_TIGER2016.shp')
counties = counties.assign(county=pd.to_numeric(counties['COUNTYFP']))

# --- National county geometries ---------------------------------------------
# Numeric `county` / `state` copies of the FIPS text columns serve as merge keys.
us_counties = gpd.read_file('/work/cb_2018_us_county_20m/cb_2018_us_county_20m.shp')
us_counties = us_counties.assign(
    county=pd.to_numeric(us_counties['COUNTYFP']),
    state=pd.to_numeric(us_counties['STATEFP']),
)
# NOTE(review): presumably drops territories with high state codes - confirm.
keep_geometry = us_counties['state'] < 65
us_counties = us_counties[keep_geometry]

# --- Attribute tables ---------------------------------------------------------
ca_full = pd.read_csv('/work/cleaned-csvs/ca_counties_full_dataset.csv')
us_df = pd.read_csv('/work/cleaned-csvs/national_counties_full_dataset.csv')
# NOTE(review): only state codes below 19 are kept, yet this frame feeds the
# "All of US" section - confirm the subset is intentional.
keep_rows = us_df['state'] < 19
us_df = us_df[keep_rows]

# The 2019 slice must be taken while `year` is still an integer; the column is
# converted to a datetime immediately afterwards for the temporal charts.
ca_19 = ca_full[ca_full['year'] == 2019]
ca_full['year'] = pd.to_datetime(ca_full['year'], format='%Y')

# Only the final expression of a cell is displayed, so this value is not shown.
ca_full['year'].unique()
ca_full.head()

The following graph shows the change in inflow and outflow from 2014 through 2019 in all 58 CA counties. For any given county, we see that inflow and outflow are similar in size and highly correlated. We also see that inflow and outflow remain fairly consistent throughout the period of this study. There are two interesting systematic differences. For 2015, all counties with inflow over 20,000 have a noticeable decrease in both inflow and outflow. For 2017, all counties with an inflow greater than 20,000 have an increase in both inflow and outflow. This systematic difference has been documented by researchers at the University of Minnesota, who caution against using this data after the IRS took over managing it from the US Census, noting, "Specifically, after 2012, out- and inmigration fall precipitously through 2014, increase dramatically through 2016, and then
sharply increase or decrease thereafter. The correspondence between the levels and changes
of out- and in-migration after (versus before) 2011 is also noteworthy." (page 4)

In [None]:


# Everything the two panels share: a line mark with a temporal x-axis and one
# colour (and tooltip) per county.
county_lines = alt.Chart(ca_full).mark_line().encode(
    x=alt.X('year:T'),
    color=alt.Color('county_name'),
    tooltip=alt.Tooltip('county_name'),
)

# The panels differ only in the quantity drawn on the y-axis.
outflow = county_lines.encode(y=alt.Y('individual_outflow'))
inflow = county_lines.encode(y=alt.Y('individual_inflow'))

# Outflow on the left, inflow on the right.
alt.hconcat(outflow, inflow).properties(
    title="County inflow and outflow over time (each line represents a county)"
)

In [None]:

# Dot plots of every county-year observation: one row per county, with `year`
# encoded as a nominal colour.
outflow = alt.Chart(ca_full).mark_point().encode(
    x='individual_outflow',
    y='county_name',
    color='year:N',
    tooltip='county_name'
)

# Fix: this panel previously encoded `individual_outflow` as well, so both
# panels showed outflow and inflow was never plotted.
inflow = alt.Chart(ca_full).mark_point().encode(
    x='individual_inflow',
    y='county_name',
    color='year:N',
    tooltip='county_name'
)

# Inflow on the left, outflow on the right.
inflow | outflow

# TODO: hide axes on outflow

In [None]:
# 2019 inflow and outflow per county, layered on one shared set of axes.
#
# Each layer is tagged with a constant `flow` label so the two series get
# different colours and a legend. Colouring both layers by `year` (a single
# value in ca_19) drew inflow and outflow points in the same colour, which made
# them impossible to tell apart.
ind_in = alt.Chart(ca_19).transform_calculate(
    flow="'inflow'"
).mark_point().encode(
    x=alt.X('individual_inflow', title='individuals (2019)'),
    y='county_name',
    color=alt.Color('flow:N', title='2019 flow'),
    tooltip='county_name'
)

ind_out = alt.Chart(ca_19).transform_calculate(
    flow="'outflow'"
).mark_point().encode(
    # Same axis title as the inflow layer so the merged axis shows it once.
    x=alt.X('individual_outflow', title='individuals (2019)'),
    y='county_name',
    color=alt.Color('flow:N', title='2019 flow'),
    tooltip='county_name'
)

ind_in + ind_out

In [None]:

 chart = alt.Chart(ca_full).mark_circle().encode(
    x='individual_inflow',
    y='individual_outflow',
    color=alt.Color('county_name'),
    tooltip=['county_name','year']
).properties(
    title={
      "text": ['CA County Inflow vs. Outflow 2014 - 2019'], 
      "subtitle": ["Each color represents a county"],
    }
    ) 

text1 = alt.Chart({'values':[{'x': 125000, 'y': 200000}]}).mark_text(
    text='LA County in 2015 ➟',
).encode(
    x='x:Q', y='y:Q'
)

text2 = alt.Chart({'values':[{'x': 195000, 'y': 340000}]}).mark_text(
    text='LA County in 2017 ➟',# angle=327
).encode(
    x='x:Q', y='y:Q'
)
text3 = alt.Chart({'values':[{'x': 110000, 'y': 260000}]}).mark_text(
    text='LA County in 2014, 2016, 2018 + 2019 ➟'
).encode(
    x='x:Q', y='y:Q'
)

chart + text1 + text2 + text3

In [None]:


# Attach the 2019 attribute rows to the county geometries. A left join keeps
# every row of `counties`, matched on the numeric `county` code.
ca_all = pd.merge(counties, ca_19, how='left', on='county')

# Three candidate targets built from inflow and outflow:
#   ratio_inflow_outflow = inflow / outflow
#   dif_inflow_outflow   = inflow - outflow
#   per_change_pop       = (inflow - outflow) / total population
moved_in = ca_all['individual_inflow']
moved_out = ca_all['individual_outflow']
ca_all['ratio_inflow_outflow'] = moved_in / moved_out
ca_all['dif_inflow_outflow'] = moved_in - moved_out
ca_all['per_change_pop'] = (moved_in - moved_out) / ca_all['total_population']

# Rank each metric in descending order: rank 1 is the county gaining the most
# by that metric, and the largest rank is the county losing the most.
for metric in ('ratio_inflow_outflow', 'dif_inflow_outflow', 'per_change_pop'):
    ca_all['rank_' + metric] = ca_all[metric].rank(ascending=False)

# Preview the five counties ranked highest by percent change in population.
preview_cols = [
    'NAME',
    'per_change_pop',
    'rank_per_change_pop',
    'dif_inflow_outflow',
    'rank_dif_inflow_outflow',
    'rank_ratio_inflow_outflow',
    'ratio_inflow_outflow',
]
ca_all[preview_cols].sort_values(by='rank_per_change_pop').head()


In [None]:
# Columns entering the correlation matrix: county attributes followed by the
# migration measures and the derived targets.
col = [
    'perc_poverty', 'republican_pct', 'perc_unemployed',
    'per_capita_farm_proprieter_jobs', 'latitude', 'perc_white',
    'perc_65_over', 'housing_per_capita', 'perc_vacant',
    'per_capita_num_violent_crimes', 'perc_american_indian',
    'perc_enrolled_undergrad', 'perc_owner', 'avg_temp', 'perc_renter',
    'perc_hispanic', 'area_land', 'area_water', 'longitude', 'perc_black',
    'democrat_pct', 'median_income', 'median_rent', 'median_home_value',
    'educational_attainment', 'perc_asian', 'av_commute_time',
    'total_population', 'individual_inflow', 'individual_outflow',
    'per_change_pop', 'ratio_inflow_outflow', 'dif_inflow_outflow',
]

# Pairwise correlations, with both axes ordered by each variable's correlation
# with individual_inflow.
ca_corr = ca_all[col].corr()
ca_corr_by_inflow = (
    ca_corr
    .sort_values(by='individual_inflow', axis=1)
    .sort_values(by='individual_inflow', axis=0)
)

plt.figure(figsize=(10, 10))
sns.heatmap(
    ca_corr_by_inflow,
    xticklabels=True,   # label every column
    yticklabels=True,   # label every row
    square=True,
    linewidths=.25,
).set(title='Correlation between CA Variables')

In [None]:
# Correlation of every numeric column with the five migration measures.
col = ['individual_outflow', 'individual_inflow', 'per_change_pop', 'ratio_inflow_outflow', 'dif_inflow_outflow']

# ca_all also holds text and geometry columns. Recent pandas versions no longer
# drop those silently in DataFrame.corr() and raise instead, so select the
# numeric columns explicitly before correlating.
cor_df = ca_all.select_dtypes(include='number').corr()

plt.figure(figsize=(5, 15))
# Drop rows with a missing correlation, then order the rows by their
# correlation with individual_inflow.
sns.heatmap(cor_df[col].dropna(axis=0).sort_values(by='individual_inflow'), linewidths=.15, annot=True)

### Which counties had the biggest increase or decrease in population?

There are many different ways to compare counties and their relative inflow and outflow. We could take the ratio of inflow to outflow, the net inflow (inflow minus outflow), or the percent change in population, (inflow - outflow)/population. Each method will yield a different ranking of the counties with the highest inflow. This is an investigation of the variation and similarity between these methods. If we want larger counties to hold a larger weight in our model than smaller counties, using raw numbers is most logical (inflow, outflow, or inflow - outflow). If, instead, we want all counties to hold equal weight in the model, normalized versions of these metrics are more logical (percent change in population or ratio of inflow to outflow). Here is a look at the similarities and differences between the methods. 

In [None]:
# County choropleths of the 2019 measures. Only the inflow/outflow pair is
# displayed by this cell; `difference` is displayed by a later cell.
inflow = alt.Chart(ca_all).mark_geoshape().encode(
    tooltip='NAME',
    color='individual_inflow'
).properties(title='2019 Individual Inflow (number of people who move to a county)')

outflow = alt.Chart(ca_all).mark_geoshape().encode(
    tooltip='NAME',
    color='individual_outflow'
).properties(title='2019 Individual Outflow (number of people who leave a county)')

# Fix: the title was copy-pasted from the outflow map although this chart
# colours counties by total_population.
population = alt.Chart(ca_all).mark_geoshape().encode(
    tooltip='NAME',
    color='total_population'
).properties(title='2019 Total Population')

# Diverging colour scheme with a symmetric domain, so zero net migration sits
# at the midpoint of the scale.
difference = alt.Chart(ca_all).mark_geoshape().encode(
    tooltip=['NAME','dif_inflow_outflow'],
    color = alt.Color('dif_inflow_outflow',scale=alt.Scale(scheme='blueorange', domain=[-9000, 9000]))
).properties(title='2019 Difference between Inflow and Outflow')

(inflow | outflow) 

In [None]:
# Rows of ca_all whose county_name is 'butte'; they serve only as the data
# source for the text label built below.
butte_rows = ca_all[ca_all['county_name'] == 'butte']

# Text mark positioned by the row's longitude/latitude columns and showing the
# county name. `butte` ends up bound to this chart, as before.
butte = alt.Chart(butte_rows).mark_text().encode(
    longitude=alt.Longitude('longitude:Q'),
    latitude=alt.Latitude('latitude:Q'),
    text=alt.Text('county_name:N'),
)

In [None]:
# Layer the Butte text label on top of the inflow-minus-outflow county map.
difference + butte

In [None]:
# Both maps use the same geoshape base and the diverging 'blueorange' scheme;
# each domain is centred on that metric's "no net change" value.
ca_shapes = alt.Chart(ca_all).mark_geoshape()

ratio_scale = alt.Scale(scheme='blueorange', domain=[.5, 1.5])           # centred on 1
percent_scale = alt.Scale(scheme='blueorange', domain=[-0.02, 0.02])     # centred on 0

ratio_inflow_outflow = ca_shapes.encode(
    tooltip=['NAME', 'ratio_inflow_outflow'],
    color=alt.Color('ratio_inflow_outflow', scale=ratio_scale),
).properties(title='2019 Ratio of Inflow to Outflow')

per_change_pop = ca_shapes.encode(
    tooltip=['NAME', 'per_change_pop'],
    # Legend labels are formatted as whole percentages.
    color=alt.Color('per_change_pop', scale=percent_scale, legend=alt.Legend(format=".0%")),
).properties(title='2019 Percent Change in Population from Migration')

per_change_pop

In [None]:
# Display the inflow-to-outflow ratio map built in the previous cell.
ratio_inflow_outflow

Here we notice that Butte County stands out with the lowest ratio of inflow to outflow (0.44). This means that 44 people move to this county for every 100 that leave. Between 2018 and 2019, approximately 3.7% of the population left the county, making it the county with the sharpest decrease in population, far outpacing the next closest county, San Francisco, which had a 1.5% decrease in population. This is largely due to the 2018 wildfire that hit Paradise, a town within the county. This may also account for the large increase in population in some surrounding counties, including Yuba (0.8% increase) and Placer (1% increase). 

In [None]:
# Five counties with the smallest inflow-to-outflow ratio, i.e. the fewest
# arrivals per departure. The frame is sorted once and both series are read
# from the same five rows; previously the identical sort ran three times and
# `most_leaving` was assigned twice.
lowest_ratio_rows = ca_all.sort_values(by='ratio_inflow_outflow', axis=0, ascending=True).head(5)

most_leaving = lowest_ratio_rows['NAME']
most_leaving_ratios = lowest_ratio_rows['ratio_inflow_outflow']

# Wording fix: the metric compares people leaving with people arriving, not
# with people staying.
print("These are the top 5 counties with the highest ratio of leaving to arriving: \n{}".format(most_leaving.values))

print("These are those counties' corresponding ratios of inflow to outflow: \n{}".format(most_leaving_ratios.values))

In [None]:
# One choropleth per ranking column; the tooltip pairs each rank with the
# underlying metric value.
rank_map_base = alt.Chart(ca_all).mark_geoshape()

ranking_ratio_inflow_outflow = rank_map_base.encode(
    color=alt.Color('rank_ratio_inflow_outflow'),
    tooltip=['NAME', 'rank_ratio_inflow_outflow', 'ratio_inflow_outflow'],
).properties(title='2019 Ranking of Ratio of Inflow to Outflow')

ranking_dif_inflow_outflow = rank_map_base.encode(
    color=alt.Color('rank_dif_inflow_outflow'),
    tooltip=['NAME', 'rank_dif_inflow_outflow', 'dif_inflow_outflow'],
).properties(title='2019 Ranking of Difference between Inflow and Outflow')

ranking_per_change_pop = rank_map_base.encode(
    color=alt.Color('rank_per_change_pop'),
    tooltip=['NAME', 'rank_per_change_pop', 'per_change_pop'],
).properties(title='2019 Ranking of Percent Change in Population')

# Top row: ratio and difference rankings; bottom row: percent-change ranking.
top_row = ranking_ratio_inflow_outflow | ranking_dif_inflow_outflow
top_row & ranking_per_change_pop

In [None]:
# Kendall rank correlation between the three county rankings.
# Author's observation: the ratio ranking resembles the percent-change ranking,
# and both differ strongly from the inflow-minus-outflow ranking.
ranking_columns = [
    'rank_ratio_inflow_outflow',
    'rank_per_change_pop',
    'rank_dif_inflow_outflow',
]
df = ca_all[ranking_columns].corr(method='kendall')

# Colour scale fixed to [0, 1]; each cell is annotated with its value.
# NOTE(review): `label` is not a documented heatmap argument - presumably it is
# just forwarded to the underlying matplotlib call; confirm it is needed.
sns.heatmap(df, label=True, vmin=0, vmax=1, annot=True)

In [None]:
# Pairwise scatter plots of the three rankings, one point per row of ca_all.
rank_points = alt.Chart(ca_all).mark_point()

per_change_v_dif = rank_points.encode(
    x=alt.X('rank_dif_inflow_outflow'),
    y=alt.Y('rank_per_change_pop'),
    tooltip=['NAME', 'rank_per_change_pop', 'rank_dif_inflow_outflow'],
)

per_change_v_ratio = rank_points.encode(
    x=alt.X('rank_ratio_inflow_outflow'),
    y=alt.Y('rank_per_change_pop'),
    tooltip=['NAME', 'rank_ratio_inflow_outflow', 'rank_per_change_pop'],
)

# Only this panel carries a title.
ratio_v_dif = rank_points.encode(
    x=alt.X('rank_ratio_inflow_outflow'),
    y=alt.Y('rank_dif_inflow_outflow'),
    tooltip=['NAME', 'rank_ratio_inflow_outflow', 'rank_dif_inflow_outflow'],
).properties(title='Rank of the ratio of inflow to outflow vs. rank of the difference between inflow and outflow')

# Three panels side by side.
ratio_v_dif | per_change_v_ratio | per_change_v_dif

In [None]:
# Default (Pearson) correlation between the three raw metrics - the unranked
# counterpart of the ranking comparison.
# Author's observation: ratio and percent change behave alike, and both differ
# strongly from inflow minus outflow.
metric_columns = ['ratio_inflow_outflow', 'per_change_pop', 'dif_inflow_outflow']
metric_corr = ca_all[metric_columns].corr()

# Colour scale fixed to [0, 1]; each cell is annotated with its value.
# NOTE(review): `label` is not a documented heatmap argument - presumably it is
# just forwarded to the underlying matplotlib call; confirm it is needed.
sns.heatmap(metric_corr, label=True, vmin=0, vmax=1, annot=True)

In [None]:
# Pairwise scatter plots of the three raw metrics, one point per row of ca_all.
# These rebind the names used earlier for the rank-based versions.
metric_points = alt.Chart(ca_all).mark_point()

per_change_v_dif = metric_points.encode(
    x=alt.X('per_change_pop'),
    y=alt.Y('dif_inflow_outflow'),
    tooltip=['NAME', 'per_change_pop', 'dif_inflow_outflow'],
)

per_change_v_ratio = metric_points.encode(
    x=alt.X('ratio_inflow_outflow'),
    y=alt.Y('per_change_pop'),
    tooltip=['NAME', 'ratio_inflow_outflow', 'per_change_pop'],
)

# Only this panel carries a title.
ratio_v_dif = metric_points.encode(
    x=alt.X('ratio_inflow_outflow'),
    y=alt.Y('dif_inflow_outflow'),
    tooltip=['NAME', 'ratio_inflow_outflow', 'dif_inflow_outflow'],
).properties(title='Ratio of inflow to outflow vs. Difference between inflow and outflow')

alt.hconcat(ratio_v_dif, per_change_v_ratio, per_change_v_dif)

# All of US

In [None]:
# Columns of the national table that enter the correlation matrix.
col = [
    'total_population', 'median_income', 'median_rent', 'median_home_value',
    'educational_attainment', 'av_commute_time', 'perc_poverty', 'perc_white',
    'perc_black', 'perc_american_indian', 'perc_asian', 'perc_hispanic',
    'perc_65_over', 'perc_enrolled_undergrad', 'perc_unemployed',
    'housing_per_capita', 'perc_owner', 'perc_renter', 'perc_vacant',
    'individual_inflow', 'individual_outflow', 'area_land', 'area_water',
    'longitude', 'latitude',
]

# Pairwise correlations over all rows of us_df, in list order, with every tick
# labelled.
us_corr = us_df[col].corr()
sns.heatmap(us_corr, xticklabels=True, yticklabels=True)

In [None]:
# 2019 rows of the national table; later cells reuse `df_19`.
df_19 = us_df[us_df['year']==2019]

# Show the spread and the centre of 2019 outflow together. As two separate
# expressions, only the last one (the mean) was displayed and the standard
# deviation was silently discarded.
df_19['individual_outflow'].agg(['std', 'mean'])


In [None]:


# 2019 inflow against outflow, one point per row of df_19, coloured by the
# state code treated as a nominal category.
us_flow_points = alt.Chart(df_19).mark_point()
us_flow_points.encode(
    x=alt.X('individual_inflow'),
    y=alt.Y('individual_outflow'),
    color=alt.Color('state:N'),
    tooltip=alt.Tooltip('name'),
).properties(title='2019 inflow and outflow')



In [None]:
# 2019 median home value against median rent, one point per row of df_19,
# coloured by the state code treated as a nominal category.
us_housing_points = alt.Chart(df_19).mark_point()
us_housing_points.encode(
    x=alt.X('median_home_value'),
    y=alt.Y('median_rent'),
    color=alt.Color('state:N'),
    tooltip=alt.Tooltip('name'),
).properties(title='2019 rent and home value')

In [None]:
# Inspection only: list the columns available in the 2019 national subset.
df_19.columns

In [None]:
# Inspection only: list the columns available in the national table.
us_df.columns

In [None]:
# Attach the 2019 attribute rows to the national county geometries, matching on
# the numeric (state, county) code pair. The left join keeps every geometry row.
# NOTE(review): us_df was filtered to state codes below 19 when it was loaded,
# so geometries for other states presumably get missing values here - confirm.
us_all = pd.merge(us_counties,df_19,how='left',on=['state','county'])


In [None]:
# Drop rows for state codes 15 and 2 before mapping.
# NOTE(review): presumably these are Hawaii and Alaska, removed so the map shows
# only the contiguous states - confirm.
excluded_state_codes = [15, 2]
us_all = us_all[~us_all['state'].isin(excluded_state_codes)]

In [None]:
# County map coloured by the perc_black column, with the county name as tooltip.
alt.Chart(us_all).mark_geoshape().encode(
    color=alt.Color('perc_black'),
    tooltip=alt.Tooltip('name'),
).properties(title='Percent Black in 2019')

In [None]:
# County map coloured by median rent; the tooltip shows the name and the value.
alt.Chart(us_all).mark_geoshape().encode(
    color=alt.Color('median_rent'),
    tooltip=['name', 'median_rent'],
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f6c76417-5fde-42f3-8920-755838dec3fa' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>