In [11]:
import clickhouse_connect
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
client = clickhouse_connect.get_client(host='hub.publichealthhq.xyz', port=18123, username='default', password='Password123!')
result = client.query("""
SELECT DISTINCT STATE_NAME, COUNTY_NAME, TOTAL_POPULATION, RUCC
FROM cps_00004.places_county 
JOIN cps_00004.rural_urban_codes 
ON cps_00004.places_county.COUNTY_FIPS = rural_urban_codes.FIPS 
""")

df = pl.from_dicts(result.named_results(), infer_schema_length=400)

In [2]:
len(df)

3135

In [3]:
total_pop = df.group_by(['STATE_NAME']).agg( pl.col('TOTAL_POPULATION').sum() )
total_pop

STATE_NAME,TOTAL_POPULATION
str,i64
"""New Hampshire""",1388992
"""South Carolina…",5190705
"""Arkansas""",3025891
"""North Dakota""",774948
"""Montana""",1104271
"""Georgia""",10799566
"""Vermont""",645570
"""West Virginia""",1782959
"""Arizona""",7276316
"""Hawaii""",1441553


In [4]:
pop_by_county = df.join(total_pop, 'STATE_NAME')\
.group_by(['STATE_NAME', 'COUNTY_NAME'])\
.agg( pl.col('TOTAL_POPULATION').sum() / pl.col('TOTAL_POPULATION_right').first() )

In [5]:
pop_by_county

STATE_NAME,COUNTY_NAME,TOTAL_POPULATION
str,str,f64
"""Texas""","""Wood""",0.001554
"""Washington""","""Asotin""",0.002894
"""Texas""","""Swisher""",0.000237
"""Virginia""","""Nottoway""",0.001804
"""Utah""","""Emery""",0.002986
"""Virginia""","""Richmond City""",0.02622
"""Utah""","""Uintah""",0.010846
"""Virginia""","""Highland""",0.000258
"""Virginia""","""Albemarle""",0.013137
"""Virginia""","""Augusta""",0.008975


In [6]:
pop_rcc_by_county = pop_by_county.join(
    df.select(['STATE_NAME', 'COUNTY_NAME', 'RUCC']), on=['STATE_NAME', 'COUNTY_NAME'], how='inner')\
.select(['STATE_NAME', 'COUNTY_NAME', 'TOTAL_POPULATION', 'RUCC'])

In [7]:
result = pop_rcc_by_county\
.with_columns( WEIGHTED_RUCC=pl.col('TOTAL_POPULATION') * pl.col('RUCC'))\
.group_by(['STATE_NAME'])\
.agg( pl.col('WEIGHTED_RUCC').sum()).sort('WEIGHTED_RUCC', descending=True)

In [8]:
type(result)

polars.dataframe.frame.DataFrame

In [9]:
result
df = pl.from_dicts(result, infer_schema_length=400)

TypeError: the truth value of a DataFrame is ambiguous

Hint: to check if a DataFrame contains any values, use `is_empty()`.

In [None]:
result.write_csv("testfile9999.csv")

In [74]:
result = client.query("""
SELECT any(STATE_FIPS) as STATE_FIPS, any(STATE_NAME) as STATE_NAME, COUNTY as COUNTY_NAME, round(avg(LIFE_EXPECTANCY), 2) as LIFE_EXP 
FROM cps_00004.life_expectancy 
JOIN cps_00004.state_fips 
ON state_fips.STATE_NAME = life_expectancy.STATE_NAME 
WHERE COUNTY_NAME IS NOT NULL
GROUP BY COUNTY_NAME
ORDER BY STATE_FIPS
""")
df = pl.from_dicts(result.named_results(), infer_schema_length=400)

In [75]:
df

STATE_FIPS,STATE_NAME,COUNTY_NAME,LIFE_EXP
i64,str,str,f64
1,"""Alabama""","""Henry County, …",75.82
1,"""Alabama""","""Bibb County, A…",73.55
1,"""Alabama""","""Mobile County""",
1,"""Alabama""","""Lee County, AL…",77.55
1,"""Alabama""","""Perry County, …",73.9
1,"""Alabama""","""Coosa County, …",77.9
1,"""Alabama""","""Dale County, A…",76.45
1,"""Alabama""","""Clay County, A…",74.6
1,"""Alabama""","""Pike County, A…",75.43
1,"""Alabama""","""Baldwin County…",


In [79]:
updated = df.map_rows(lambda row: (row[0], row[1], ''.join(row[2].split(',')[0:-1]), row[3]) ).rename({
    'column_0': 'STATE_FIPS',
    'column_1': 'STATE_NAME',
    'column_2': 'COUNTY_NAME',
    'column_3': 'LIFE_EXP',
}) 
len(updated)

3386

In [90]:
county_result = client.query("SELECT STATE_FIPS, COUNTY_NAME, COUNTY_FIPS FROM cps_00004.county_fips ORDER BY STATE_FIPS, COUNTY_NAME")
county_df = pl.from_dicts(county_result.named_results(), infer_schema_length=400)

In [114]:
le_with_fips = updated\
.join(county_df, on=['STATE_FIPS', 'COUNTY_NAME'], how='inner')\
.sort(['STATE_NAME', 'COUNTY_NAME'])\
.map_rows(lambda row: (row[0], row[1], row[2], row[3], row[4], int(f'{row[0]}{row[4]:03d}')  ) )\
.rename({
    'column_0': 'STATE_FIPS',
    'column_1': 'STATE_NAME',
    'column_2': 'COUNTY_NAME',
    'column_3': 'LIFE_EXP',
    'column_4': 'COUNTY_FIPS',
    'column_5': 'STATE_COUNTY_FIPS'
})

In [115]:
rucc_result = client.query(""" SELECT * FROM cps_00004.rural_urban_codes """)
rucc_df = pl.from_dicts(rucc_result.named_results(), infer_schema_length=400).rename({'FIPS': 'STATE_COUNTY_FIPS'})

In [116]:
rucc_df

STATE_COUNTY_FIPS,STATE_ABBREV,COUNTY_NAME,POP,RUCC,DESCR
i64,str,str,i64,i64,str
1001,"""AL""","""Autauga County…",58805,2,"""Metro - Counti…"
1003,"""AL""","""Baldwin County…",231767,3,"""Metro - Counti…"
1005,"""AL""","""Barbour County…",25223,6,"""Nonmetro - Urb…"
1007,"""AL""","""Bibb County""",22293,1,"""Metro - Counti…"
1009,"""AL""","""Blount County""",59134,1,"""Metro - Counti…"
1011,"""AL""","""Bullock County…",10357,8,"""Nonmetro - Urb…"
1013,"""AL""","""Butler County""",19051,6,"""Nonmetro - Urb…"
1015,"""AL""","""Calhoun County…",116441,3,"""Metro - Counti…"
1017,"""AL""","""Chambers Count…",34772,6,"""Nonmetro - Urb…"
1019,"""AL""","""Cherokee Count…",24971,8,"""Nonmetro - Urb…"


In [119]:
rucc_le_fips = le_with_fips.join(rucc_df, on='STATE_COUNTY_FIPS')\
.select('STATE_FIPS', 'STATE_NAME', 'COUNTY_NAME', 'LIFE_EXP', 'STATE_COUNTY_FIPS', 'POP', 'RUCC', 'DESCR')

In [127]:
places_result = client.query("""SELECT COUNTY_FIPS, MEASURE, DATA_VALUE FROM cps_00004.places_county""")
places_df = pl.from_dicts(places_result.named_results(), infer_schema_length=400).rename({'COUNTY_FIPS': 'STATE_COUNTY_FIPS'})

In [128]:
places_df

STATE_COUNTY_FIPS,MEASURE,DATA_VALUE
i64,str,"decimal[*,1]"
13299,"""Stroke among a…",4.6
18115,"""Stroke among a…",3.8
18119,"""Stroke among a…",4
19095,"""Stroke among a…",3.3
19125,"""Stroke among a…",3
48117,"""Obesity among …",42
48175,"""Chronic obstru…",8.5
1047,"""Obesity among …",44.3
2195,"""Obesity among …",31
2275,"""Stroke among a…",4.4


In [131]:
triple_df = rucc_le_fips.join(places_df, on='STATE_COUNTY_FIPS', how='inner').sort(['STATE_NAME', 'COUNTY_NAME'])

In [132]:
triple_df

STATE_FIPS,STATE_NAME,COUNTY_NAME,LIFE_EXP,STATE_COUNTY_FIPS,POP,RUCC,DESCR,MEASURE,DATA_VALUE
i64,str,str,f64,i64,i64,i64,str,str,"decimal[*,1]"
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Self-care disa…",4
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Chronic obstru…",6.8
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Cervical cance…",84.1
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Any disability…",30.2
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""High cholester…",36.9
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Stroke among a…",3.4
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Depression amo…",22.2
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Chronic kidney…",2.9
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Visits to doct…",76
1,"""Alabama""","""Autauga County…",75.32,1001,58805,2,"""Metro - Counti…","""Binge drinking…",14.4


In [151]:
print(triple_df.dtypes)
pandas_df = triple_df.to_pandas()
pandas_df['DATA_VALUE'] = pandas_df['DATA_VALUE'].astype('Float64')
pandas_df.to_csv('statecountyrucchealthmetrics.csv', index=False)
pandas_df


[Int64, String, String, Float64, Int64, Int64, Int64, String, String, Decimal(precision=None, scale=1)]


Unnamed: 0,STATE_FIPS,STATE_NAME,COUNTY_NAME,LIFE_EXP,STATE_COUNTY_FIPS,POP,RUCC,DESCR,MEASURE,DATA_VALUE
0,1,Alabama,Autauga County,75.32,1001,58805,2,"Metro - Counties in metro areas of 250,000 to ...",Self-care disability among adults aged >=18 years,4.0
1,1,Alabama,Autauga County,75.32,1001,58805,2,"Metro - Counties in metro areas of 250,000 to ...",Chronic obstructive pulmonary disease among ad...,6.8
2,1,Alabama,Autauga County,75.32,1001,58805,2,"Metro - Counties in metro areas of 250,000 to ...",Cervical cancer screening among adult women ag...,84.1
3,1,Alabama,Autauga County,75.32,1001,58805,2,"Metro - Counties in metro areas of 250,000 to ...",Any disability among adults aged >=18 years,30.2
4,1,Alabama,Autauga County,75.32,1001,58805,2,"Metro - Counties in metro areas of 250,000 to ...",High cholesterol among adults aged >=18 years ...,36.9
...,...,...,...,...,...,...,...,...,...,...
226307,56,Wyoming,Weston County,78.80,56045,6838,9,"Nonmetro - Urban population of fewer than 5,00...",Taking medicine for high blood pressure contro...,56.1
226308,56,Wyoming,Weston County,78.80,56045,6838,9,"Nonmetro - Urban population of fewer than 5,00...",Independent living disability among adults age...,7.2
226309,56,Wyoming,Weston County,78.80,56045,6838,9,"Nonmetro - Urban population of fewer than 5,00...",Stroke among adults aged >=18 years,2.8
226310,56,Wyoming,Weston County,78.80,56045,6838,9,"Nonmetro - Urban population of fewer than 5,00...",Older adult men aged >=65 years who are up to ...,36.9
