# Income Inequality
## Is it more beneficial to be poor in a rich county or rich in a poor county?

There's a few ways to answer that question.
1. Find information on individuals' income and health that can be mapped to a metro area or county
2. Find neighboring counties in which the income inequality is high and then compare the health outcomes between those high inequality counties to poor counties that are surrounded by poor counties and rich counties that are surrounded by rich counties - this is probably the best choice since it's difficult to get health information about specific individuals.
    a. More precise details for 2 - we form a baseline for rich and poor counties by finding counties that are surrounded by counties of similar income (so a rich county would form a baseline if it was only adjacent to counties that are +/- 1.0 weighted stub average), then for the comparison counties, we find a poor county that is adjacent to a county that is i.e. +2.5 weighted stub average. We end up with baselines as a control group and the inequal county pairs as the test group.

In [1]:
import clickhouse_connect
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gp
import pandas as pd

In [2]:
client = clickhouse_connect.get_client(host='hub.publichealthhq.xyz', port=18123, username='default', password='Password123!')

In [3]:
result = client.query("""
SELECT (sum(ADJUSTED_GROSS_INCOME) / sum(NUM_RETURNS)) as agiavg, (sum(TAXES_PAID_AMOUNT) / sum(NUM_RETURNS)) as taxavg, any(STATE_NAME), any(COUNTY_NAME), MEASURE, avg(DATA_VALUE), median(DATA_VALUE) 
FROM cps_00004.places_county 
JOIN cps_00004.income_tax 
ON cps_00004.places_county.COUNTY_FIPS = cps_00004.income_tax.COUNTYFIP 
GROUP BY COUNTY_FIPS, MEASURE
ORDER BY agiavg DESC
""")
print(next(result.named_results()))
df = pl.from_dicts(result.named_results(), infer_schema_length=400)

{'agiavg': 248351.2928022362, 'taxavg': 7794.619147449336, 'any(STATE_NAME)': 'Wyoming', 'any(COUNTY_NAME)': 'Teton', 'MEASURE': 'Mammography use among women aged 50-74 years', 'avg(DATA_VALUE)': 71.7, 'median(DATA_VALUE)': Decimal('71.7')}


In [4]:
def query(q):
    result = client.query(q)
    return pl.from_dicts(result.named_results(), infer_schema_length=400)

In [5]:
df = query("""
    SELECT DISTINCT STATE_ABBREV, COUNTY_NAME, AGI_STUB, NUM_RETURNS, COUNTYFIP, ADJUSTED_GROSS_INCOME
    FROM cps_00004.income_tax 
    """)
returns_per_county = query("""
    SELECT DISTINCT any(STATE_ABBREV) as STATE_ABBREV, any(COUNTY_NAME) as COUNTY_NAME, sum(NUM_RETURNS) as TOTAL_RETURNS, COUNTYFIP
    FROM cps_00004.income_tax
    GROUP BY COUNTYFIP
    """)

In [6]:
df.filter(pl.col('COUNTY_NAME') == 'Alexandria city')

STATE_ABBREV,COUNTY_NAME,AGI_STUB,NUM_RETURNS,COUNTYFIP,ADJUSTED_GROSS_INCOME
str,str,i64,i64,i64,i64
"""VA""","""Alexandria cit…",1,780,51510,-70287000
"""VA""","""Alexandria cit…",2,6950,51510,36209000
"""VA""","""Alexandria cit…",3,11910,51510,206421000
"""VA""","""Alexandria cit…",4,16540,51510,614136000
"""VA""","""Alexandria cit…",5,13200,51510,818113000
"""VA""","""Alexandria cit…",6,9850,51510,854645000
"""VA""","""Alexandria cit…",7,16690,51510,2323196000
"""VA""","""Alexandria cit…",8,8940,51510,3861600000


In [7]:
# We want a weight average AGI_STUB value
df.filter(pl.col('COUNTY_NAME') == 'Alexandria city').group_by(['COUNTYFIP']).agg(pl.col('NUM_RETURNS').sum())

COUNTYFIP,NUM_RETURNS
i64,i64
51510,84860


In [8]:
returns_per_county

STATE_ABBREV,COUNTY_NAME,TOTAL_RETURNS,COUNTYFIP
str,str,i64,i64
"""SD""","""Lyman County""",1720,46085
"""PA""","""Juniata County…",11210,42067
"""GA""","""Hart County""",10180,13147
"""GA""","""Twiggs County""",3580,13289
"""WI""","""Kenosha County…",80820,55059
"""VA""","""Arlington Coun…",125580,51013
"""VA""","""Washington Cou…",22590,51191
"""KY""","""Christian Coun…",30060,21047
"""TN""","""Carroll County…",11430,47017
"""IL""","""Logan County""",12570,17107


In [9]:
proportion_stub = df.join(returns_per_county, on='COUNTYFIP').with_columns( PROPORTION_RETURNS=(pl.col('NUM_RETURNS') / pl.col('TOTAL_RETURNS')) )

In [10]:
weighted_average_stub = proportion_stub.group_by('COUNTYFIP').agg( (pl.col('AGI_STUB') * pl.col('PROPORTION_RETURNS')).sum() / pl.col('PROPORTION_RETURNS').sum() )

In [11]:
was = weighted_average_stub.with_columns(WEIGHTED_COUNTY_STUB_AVG=pl.col('AGI_STUB')).sort('WEIGHTED_COUNTY_STUB_AVG', descending=True).join(proportion_stub, on='COUNTYFIP')

In [12]:
normalized_was = was.with_columns(NORMALIZED_COUNTY_STUB_AVG=pl.col('WEIGHTED_COUNTY_STUB_AVG') / pl.col('WEIGHTED_COUNTY_STUB_AVG').mean() )\
.select(['STATE_ABBREV', 'COUNTY_NAME', 'WEIGHTED_COUNTY_STUB_AVG', 'NORMALIZED_COUNTY_STUB_AVG', 'COUNTYFIP'])\
.sort(['NORMALIZED_COUNTY_STUB_AVG'])

In [13]:
# counties = proportion_stub\
# .with_columns(AVG_COUNTY_AGI=pl.col('ADJUSTED_GROSS_INCOME') / pl.col('NUM_RETURNS'))\
# .group_by(['COUNTYFIP'])\
# .agg( 
#     pl.col('AVG_COUNTY_AGI'), 
#     pl.col('TOTAL_RETURNS')
# )

# def gini(indivs):
#     difference_sum = 0
#     n = len(indivs)
#     avg = sum(indivs) / n
    
#     for i in indivs:
#         for j in indivs:
#             difference_sum += abs(i - j)
#     g = difference_sum / (2 * (n**2) * avg)
#     return g

# # for (county, frame) in counties:
# #     indivs = []
# #     for row in frame.rows(named=True):
# #         indivs.extend( [row['AVG_COUNTY_AGI'] for _ in range(row['TOTAL_RETURNS'])] )
# #     g = gini(indivs)
# #     print(f'{county[0]}: {g}')
    
# counties

In [28]:
counties = proportion_stub\
    .with_columns(AVG_COUNTY_AGI=pl.col('ADJUSTED_GROSS_INCOME') / pl.col('NUM_RETURNS'))\
    .group_by(['COUNTYFIP'])

def part(groups):
    pos = []
    neg = []

    for (agi, count) in groups:
        if agi < 0:
            neg.append((agi, count))
        else:
            pos.append((agi, count))
    return neg, pos

def gini(groups):

    if len(groups) == 1:
        # return 0.0 to indicate that everyone in this county is perfectly equal.
        # Which is not technically true, but it's the best we can do with the
        # granularity of available information
        return 0.0
    
    
    neg, pos = part(groups)
    T_n = sum([ abs(agi)*count for (agi, count) in neg ])
    T_a = sum([ agi*count for (agi, count) in pos ])

    S = 0

    M = len(neg) + len(pos)
    
    for (agi_i, count_i) in groups:
        for (agi_j, count_j) in groups:
            S += abs(agi_i - agi_j) * count_j
    G = S / (2 * (M - 1) * (T_a - T_n))
    return G
        
import time
import json

print("Running group by")
ginis = []
with open('county_tax.json', 'w') as f:
    cd = {}
    for (county, frame) in counties:
        # print(f'county: {county[0]}')
        indivs = []
        for row in frame.rows(named=True):
            indivs.append( (row['AVG_COUNTY_AGI'], row['TOTAL_RETURNS']) )
        G = gini(indivs)
        cd[county[0]] = gini(indivs)
        ginis.append( [county[0], G] )
        # print(f'county: {county[0]} - gini: {cd[county[0]]}')
    json.dump(cd, f)
    

Running group by


In [33]:
import csv

with open('county_ginis.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['county_fips', 'gini'])
    for g in ginis:
        writer.writerow({'county_fips': g[0], 'gini': g[1]})
        
    

In [92]:
res = client.query('SELECT STATE_COUNTY_FIPS, GINI FROM cps_00004.county_gini')
gini_df = pl.from_dicts(res.named_results(), infer_schema_length=400)

In [95]:
gini_df = gini_df.to_pandas()

In [257]:
res = client.query('SELECT COUNTYFIP, (sum(ADJUSTED_GROSS_INCOME) / sum(NUM_RETURNS)) as avg_agi FROM cps_00004.income_tax GROUP BY COUNTYFIP')
tax_df = pl.from_dicts(res.named_results(), infer_schema_length=400).to_pandas()
tax_df['STATE_COUNTY_FIPS'] = tax_df['COUNTYFIP']

In [None]:
proportion_stub.group_by(['COUNTYFIP', 'AGI_STUB']).agg( pl.col('ADJUSTED_GROSS_INCOME').sum() / pl.col('NUM_RETURNS').sum() )

In [None]:
unique_normalized = normalized_was.unique()
pd_unique = unique_normalized.to_pandas()
pd_unique

In [None]:
import plotly as pt
import plotly.express as px
import json

In [None]:
# with open('./counties.geojson', 'r').read() 
counties = json.load(open('./counties.geojson', 'r'))

In [None]:
len(counties['features'])

In [None]:
fig = px.choropleth_mapbox(pd_unique, 
                           geojson=counties, 
                           locations='COUNTYFIP', 
                           color='NORMALIZED_COUNTY_STUB_AVG',
                           color_continuous_scale="Viridis",
                           range_color=(0.74, 1.35),
                           mapbox_style="carto-positron",
                           zoom=3, 
                           center={"lat": 37.0902, "lon": -95.7129},
                           opacity=0.5,
                           labels={'NORMALIZED_COUNTY_STUB_AVG':'Nationally Normalized AGI Stub Avg', 'COUNTY_NAME': 'County', 'STATE_ABBREV': 'State'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
gdf = gp.read_file('./counties.geojson')

In [None]:
gdf['COUNTYFIP'] = pd.to_numeric(gdf['STATE'].astype(str) + gdf['COUNTY'].astype(str))

In [None]:
merged = gdf.merge(pd_unique, on='COUNTYFIP', how='inner')        
merged

In [None]:
merged['buffered'] = merged['geometry'].buffer(1.0)

joined = merged.sjoin(merged, how='inner', predicate='touches')
joined

In [None]:
joined[ (0.85 * joined['NORMALIZED_COUNTY_STUB_AVG_left'] <=  joined['NORMALIZED_COUNTY_STUB_AVG_right']) &  (joined['NORMALIZED_COUNTY_STUB_AVG_right'] <= 1.15*joined['NORMALIZED_COUNTY_STUB_AVG_left']) ]

In [None]:
# js = joined['geometry'].unique()
# js.to_json()
joined = joined.drop('buffered_left' ,axis=1)
joined.to_file('./data.geojson', driver='GeoJSON')
# full_js = joined.to_json()

In [None]:
counties = json.load(open('./data.geojson', 'r'))

In [None]:
fig = px.choropleth_mapbox(pd_unique, 
                           geojson=counties, 
                           locations='COUNTYFIP', 
                           color='NORMALIZED_COUNTY_STUB_AVG',
                           color_continuous_scale="Viridis",
                           range_color=(0.74, 1.35),
                           mapbox_style="carto-positron",
                           zoom=3, 
                           center={"lat": 37.0902, "lon": -95.7129},
                           opacity=0.5,
                           labels={'NORMALIZED_COUNTY_STUB_AVG':'Nationally Normalized AGI Stub Avg', 'COUNTY_NAME': 'County', 'STATE_ABBREV': 'State'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [65]:
gdf = gp.read_file('counties.geojson')

In [240]:
gini_df['STATE_COUNTY_FIPS'] = gini_df['STATE_COUNTY_FIPS'].astype(int)
# Get rid of extreme outliers
gini_filtered = gini_df[ (0 < gini_df['GINI']) & (gini_df['GINI'] < 100) ]

In [260]:
tax_gini_filtered = pd.merge(gini_filtered, tax_df, on='STATE_COUNTY_FIPS', how='inner', validate='1:1')

In [66]:
gdf['buffered'] = gdf.buffer(1)


  gdf['buffered'] = gdf.buffer(1)


In [167]:
gdf['STATE_COUNTY_FIPS'] = (gdf['STATE'].astype(str) + gdf['COUNTY'].astype(str)).astype(int)
gdf.columns

Index(['id', 'GEO_ID', 'STATE', 'COUNTY', 'NAME', 'LSAD', 'CENSUSAREA',
       'geometry', 'buffered', 'boundary', 'STATE_COUNTY_FIPS'],
      dtype='object')

In [261]:
joined_gini = pd.merge(gdf, tax_gini_filtered, on=['STATE_COUNTY_FIPS'], how='inner', validate='1:1')
joined_gini

Unnamed: 0,id,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry,buffered,boundary,STATE_COUNTY_FIPS,GINI,COUNTYFIP,avg_agi
0,01001,0500000US01001,01,001,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...","POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",1001,0.715646,1001,53881.928214
1,01009,0500000US01009,01,009,Blount,County,644.776,"POLYGON ((-86.57780 33.76532, -86.75914 33.840...","POLYGON ((-87.90867 33.52129, -87.91261 33.531...","LINESTRING (-86.57780 33.76532, -86.75914 33.8...",1009,0.791975,1009,48101.044505
2,01017,0500000US01017,01,017,Chambers,County,596.531,"POLYGON ((-85.18413 32.87053, -85.12342 32.772...","POLYGON ((-84.26023 32.26765, -84.29074 32.213...","LINESTRING (-85.18413 32.87053, -85.12342 32.7...",1017,0.815764,1017,35732.164948
3,01021,0500000US01021,01,021,Chilton,County,692.854,"POLYGON ((-86.51734 33.02057, -86.51596 32.929...","POLYGON ((-87.80486 33.45455, -87.76922 33.532...","LINESTRING (-86.51734 33.02057, -86.51596 32.9...",1021,0.855137,1021,44896.473988
4,01033,0500000US01033,01,033,Colbert,County,592.619,"POLYGON ((-88.13999 34.58170, -88.13925 34.587...","POLYGON ((-88.16394 33.58199, -88.26176 33.589...","LINESTRING (-88.13999 34.58170, -88.13925 34.5...",1033,0.880750,1033,49426.298016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3131,51001,0500000US51001,51,001,Accomack,County,449.496,"MULTIPOLYGON (((-75.24227 38.02721, -75.29687 ...","POLYGON ((-76.96117 37.54740, -76.97819 37.589...","MULTILINESTRING ((-75.24227 38.02721, -75.2968...",51001,0.826645,51001,43660.891089
3132,51021,0500000US51021,51,021,Bland,County,357.725,"POLYGON ((-81.22510 37.23487, -81.20477 37.243...","POLYGON ((-82.00579 37.87439, -81.90636 37.966...","LINESTRING (-81.22510 37.23487, -81.20477 37.2...",51021,0.781937,51021,47413.618677
3133,51027,0500000US51027,51,027,Buchanan,County,502.763,"POLYGON ((-81.96830 37.53780, -81.92787 37.512...","POLYGON ((-83.06569 36.64372, -83.12192 36.723...","LINESTRING (-81.96830 37.53780, -81.92787 37.5...",51027,0.755729,51027,44295.212766
3134,51037,0500000US51037,51,037,Charlotte,County,475.271,"POLYGON ((-78.44332 37.07940, -78.49303 36.891...","POLYGON ((-79.54353 36.25397, -79.55788 36.265...","LINESTRING (-78.44332 37.07940, -78.49303 36.8...",51037,0.806326,51037,41544.793713


In [262]:
joined = gp.sjoin(joined_gini, joined_gini, how='inner', predicate='intersects')
joined

Unnamed: 0,id_left,GEO_ID_left,STATE_left,COUNTY_left,NAME_left,LSAD_left,CENSUSAREA_left,geometry,buffered_left,boundary_left,...,COUNTY_right,NAME_right,LSAD_right,CENSUSAREA_right,buffered_right,boundary_right,STATE_COUNTY_FIPS_right,GINI_right,COUNTYFIP_right,avg_agi_right
0,01001,0500000US01001,01,001,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...","POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",...,085,Lowndes,County,715.911,"POLYGON ((-87.74894 31.51031, -87.78527 31.569...","LINESTRING (-86.85758 31.96217, -86.85658 32.0...",1085,0.843279,1085,36537.242798
0,01001,0500000US01001,01,001,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...","POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",...,101,Montgomery,County,784.247,"POLYGON ((-87.39479 31.90690, -87.41075 31.938...","LINESTRING (-86.41117 32.40994, -86.23707 32.4...",1101,1.025438,1101,51891.872438
0,01001,0500000US01001,01,001,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...","POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",...,047,Dallas,County,978.695,"POLYGON ((-88.40600 32.66675, -88.40565 32.669...","LINESTRING (-86.90696 32.04797, -87.17807 32.0...",1047,0.802473,1047,38101.175015
0,01001,0500000US01001,01,001,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...","POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",...,021,Chilton,County,692.854,"POLYGON ((-87.80486 33.45455, -87.76922 33.532...","LINESTRING (-86.51734 33.02057, -86.51596 32.9...",1021,0.855137,1021,44896.473988
0,01001,0500000US01001,01,001,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...","POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",...,001,Autauga,County,594.436,"POLYGON ((-87.01652 33.65868, -87.01225 33.660...","LINESTRING (-86.49677 32.34444, -86.71790 32.4...",1001,0.715646,1001,53881.928214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3135,51041,0500000US51041,51,041,Chesterfield,County,423.297,"POLYGON ((-77.85180 37.35487, -77.85515 37.418...","POLYGON ((-78.53065 36.62059, -78.59824 36.689...","LINESTRING (-77.85180 37.35487, -77.85515 37.4...",...,041,Chesterfield,County,423.297,"POLYGON ((-78.53065 36.62059, -78.59824 36.689...","LINESTRING (-77.85180 37.35487, -77.85515 37.4...",51041,0.862679,51041,70447.541886
3135,51041,0500000US51041,51,041,Chesterfield,County,423.297,"POLYGON ((-77.85180 37.35487, -77.85515 37.418...","POLYGON ((-78.53065 36.62059, -78.59824 36.689...","LINESTRING (-77.85180 37.35487, -77.85515 37.4...",...,760,Richmond,city,59.805,"POLYGON ((-78.20329 36.75402, -78.28103 36.827...","LINESTRING (-77.42085 37.44708, -77.45922 37.4...",51760,1.017935,51760,63016.118614
3135,51041,0500000US51041,51,041,Chesterfield,County,423.297,"POLYGON ((-77.85180 37.35487, -77.85515 37.418...","POLYGON ((-78.53065 36.62059, -78.59824 36.689...","LINESTRING (-77.85180 37.35487, -77.85515 37.4...",...,087,Henrico,County,233.697,"POLYGON ((-78.03417 36.63737, -78.10426 36.669...","LINESTRING (-77.33384 37.57686, -77.22850 37.5...",51087,0.820531,51087,71220.850378
3135,51041,0500000US51041,51,041,Chesterfield,County,423.297,"POLYGON ((-77.85180 37.35487, -77.85515 37.418...","POLYGON ((-78.53065 36.62059, -78.59824 36.689...","LINESTRING (-77.85180 37.35487, -77.85515 37.4...",...,145,Powhatan,County,260.221,"POLYGON ((-79.03139 37.94032, -78.99928 38.032...","LINESTRING (-78.07226 37.65736, -77.95782 37.6...",51145,0.831699,51145,76543.939394


In [263]:
joined.columns

Index(['id_left', 'GEO_ID_left', 'STATE_left', 'COUNTY_left', 'NAME_left',
       'LSAD_left', 'CENSUSAREA_left', 'geometry', 'buffered_left',
       'boundary_left', 'STATE_COUNTY_FIPS_left', 'GINI_left',
       'COUNTYFIP_left', 'avg_agi_left', 'index_right', 'id_right',
       'GEO_ID_right', 'STATE_right', 'COUNTY_right', 'NAME_right',
       'LSAD_right', 'CENSUSAREA_right', 'buffered_right', 'boundary_right',
       'STATE_COUNTY_FIPS_right', 'GINI_right', 'COUNTYFIP_right',
       'avg_agi_right'],
      dtype='object')

In [279]:
selected = joined[['STATE_left',  'STATE_COUNTY_FIPS_left', 'GINI_left', 'avg_agi_left','STATE_right', 'STATE_COUNTY_FIPS_right', 'GINI_right', 'avg_agi_right']]

In [280]:
selected

Unnamed: 0,STATE_left,STATE_COUNTY_FIPS_left,GINI_left,avg_agi_left,STATE_right,STATE_COUNTY_FIPS_right,GINI_right,avg_agi_right
0,01,1001,0.715646,53881.928214,01,1085,0.843279,36537.242798
0,01,1001,0.715646,53881.928214,01,1101,1.025438,51891.872438
0,01,1001,0.715646,53881.928214,01,1047,0.802473,38101.175015
0,01,1001,0.715646,53881.928214,01,1021,0.855137,44896.473988
0,01,1001,0.715646,53881.928214,01,1001,0.715646,53881.928214
...,...,...,...,...,...,...,...,...
3135,51,51041,0.862679,70447.541886,51,51041,0.862679,70447.541886
3135,51,51041,0.862679,70447.541886,51,51760,1.017935,63016.118614
3135,51,51041,0.862679,70447.541886,51,51087,0.820531,71220.850378
3135,51,51041,0.862679,70447.541886,51,51145,0.831699,76543.939394


### Defining counties that have similar levels of inequality
There's two things we want to do here. The first is to group counties into four groups:
1. Rich, low inequality
2. Rich, high inequality
3. Poor, low inequality
4. Poor, high inequality

Then we want to take those four groups and check healthcare outcomes for them, see which one fair best.

The second independent thing is to compare a poor county that is surrounded only by other poor counties, with a poor county that borders at least one rich county. This second part requires two things: 

1. Define "poor" (i.e. bottom 25 percentile?) and rich
2. Group nearby counties
   
This is actually the part that we need the Geographic functionality for. The first part does not require any kind of geospatial processing.

In [314]:
std = selected['GINI_left'].std()
mean = selected['GINI_left'].mean()
poor = selected['avg_agi_right'].quantile(0.10)
rich = selected['avg_agi_right'].quantile(0.90)


selected_pl = pl.from_pandas(selected)

# Just showing the same thing with polars
# poor_pl = selected_pl['avg_agi_left'].quantile(0.10)

print(f'poor: ${poor:.2f}')
print(f'rich: ${rich:.2f}')

poor_near_poor = selected_pl\
.filter(pl.col('avg_agi_left') <= poor )\
.group_by('STATE_COUNTY_FIPS_left')\
    .agg( avg_agi_right_max=pl.col('avg_agi_right').max() )\
.filter( pl.col('avg_agi_right_max') <= poor )

rich_near_rich = selected_pl\
.filter( pl.col('avg_agi_left') >= rich )\
.group_by('STATE_COUNTY_FIPS_left')\
    .agg( avg_agi_right_max=pl.col('avg_agi_right').max() )\
.filter( pl.col('avg_agi_right_max') >= rich )

poor_near_rich = selected_pl\
.filter(pl.col('avg_agi_left') <= poor )\
.group_by('STATE_COUNTY_FIPS_left')\
    .agg( avg_agi_right_max=pl.col('avg_agi_right').max() )\
.filter( pl.col('avg_agi_right_max') >= rich )

# Notice that this uses min in the agg, not max
rich_near_poor = selected_pl\
.filter(pl.col('avg_agi_left') >= rich )\
.group_by('STATE_COUNTY_FIPS_left')\
    .agg( avg_agi_right_max=pl.col('avg_agi_right').min() )\
.filter( pl.col('avg_agi_right_max') <= poor )

poor: $38872.43
rich: $66707.41


In [315]:
print(f'poor_near_poor: {len(poor_near_poor)}')
print(f'rich_near_rich: {len(rich_near_rich)}')
print(f'poor_near_rich: {len(poor_near_rich)}')
print(f'rich_near_poor: {len(rich_near_poor)}')

poor_near_poor: 9
rich_near_rich: 328
poor_near_rich: 30
rich_near_poor: 27


In [276]:
# This is within 1 standard deviation, so less than or equal to 1 standard deviation
low_inequality = selected[ selected['GINI_left'] < mean - 1*std   ]
# This is 2 standard deviations away
high_inequality = selected[ mean + 1*std < selected['GINI_left'] ]

In [316]:
poor_near_poor

STATE_COUNTY_FIPS_left,avg_agi_right_max
i64,f64
13243,38007.474227
48215,37812.816308
48489,37812.816308
48061,37812.816308
29179,38379.421769
51595,37671.240106
21057,37321.0
28119,38687.067773
21129,37319.082569


In [320]:
pnp_list = list(poor_near_poor['STATE_COUNTY_FIPS_left'])
rnr_list = list(rich_near_rich['STATE_COUNTY_FIPS_left'])
pnr_list = list(poor_near_rich['STATE_COUNTY_FIPS_left'])
rnp_list = list(rich_near_poor['STATE_COUNTY_FIPS_left'])