In [1]:
import pandas as pd
import numpy as np

# Reconciling state and county data

The NCI data is available at both the state and county level but many counties are missing data due to less detailed reporting or low data counts. The goal of this script is to combine the local and state values to fill in estimates of the missing county-level values.

## Read initial data
Take in the procesed state and county data and population data from the Social Vulnerability Index

In [2]:
# Already processed state-level NCI data
state = pd.read_csv("../../data/processed/nci_state.csv")
state.head()

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,cervical_cancer_incidence_rate_per_100000,cervical_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,...,melanoma_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,hodgkins_lymphoma_cancer_mortality_rate_per_100000,hodgkins_lymphoma_cancer_incidence_rate_per_100000,colorectal_screening_percent,smoking_percent,hpv_vaccine_percent
0,1,451.5,173.42,121.6,21.54,9.4,3.46,43.6,15.46,64.9,...,2.38,12.3,6.4,16.2,5.02,0.31,2.42,76.07,43.94,34.9
1,2,417.2,152.62,120.8,18.8,7.1,1.65,42.6,15.05,55.8,...,2.19,11.8,5.55,17.1,5.5,,2.13,68.36,43.22,32.3
2,4,386.7,137.88,114.4,18.47,6.5,2.1,32.6,12.62,46.8,...,2.48,11.0,5.85,15.5,4.67,0.28,2.0,75.32,40.35,39.9
3,5,472.8,177.63,118.2,20.33,9.5,3.43,44.1,15.59,77.2,...,2.12,14.5,6.79,18.2,5.3,0.3,2.62,73.39,47.06,26.8
4,6,404.6,140.3,121.5,19.35,7.2,2.24,35.1,12.5,41.5,...,2.06,12.5,5.86,18.4,5.17,0.31,2.18,75.6,33.45,38.8


In [3]:
# Already processed county-level NCI data
county = pd.read_csv("../../data/processed/nci_county.csv")
county

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,...,58786.0,0.7,32.5,12.0,20.0,4563.0,1.1,19.1,,
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,...,55962.0,1.4,33.9,,25.7,4492.0,0.8,9.5,4.5,86.3
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,...,,1.5,,21.5,18.5,4642.0,,47.6,4.3,
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,...,,0.7,33.7,10.5,17.5,4499.0,,22.3,,
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,...,48695.0,1.5,33.9,10.2,25.2,4416.0,,1.5,9.1,95.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,,,,21.63,,14.20,,40.71,136.1,...,73008.0,2.4,,10.0,23.6,4577.0,1.1,,16.0,93.1
3138,56039,,,141.0,,,,,,125.5,...,83831.0,5.0,34.6,,36.4,4224.0,1.4,1.2,14.9,90.3
3139,56041,,,139.5,31.52,,,,,,...,58235.0,,,11.0,21.2,4532.0,,,9.1,93.4
3140,56043,,,130.2,,44.7,,,,86.4,...,53426.0,1.3,33.9,,22.7,4341.0,,,14.2,89.7


In [4]:
# Shannon County, SD (fips 46113) was renamed to Oglala Lakota County (fips 46102)
# Combine those two rows
county.loc[county["fips"] == 46113, "fips"] = 46102
county = county.groupby("fips").max().reset_index()

county[(county["fips"] == 46113) | (county["fips"] == 46102)]

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent
2410,46102,433.4,212.98,,,54.0,,84.5,56.26,95.6,...,,1.6,,45.1,16.6,4299.0,,,3.9,


In [5]:
# Wade Hampton Census Area, AK (fips 02270) was renamed to Kusilvak Census Area (fips 02158)
# Combine those two rows
county.loc[county["fips"] == 2270, "fips"] = 2158
county = county.groupby("fips").max().reset_index()

county[(county["fips"] == 2158) | (county["fips"] == 2158)]

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent
80,2158,,171.79,,,102.1,,,,,...,,3.7,,39.0,15.6,,,,,


In [6]:
# Read population counts from the Social Vulnerability Index
svi = pd.read_csv("../../data/processed/social_vulnerability_index.csv")
svi = svi[["fips", "population", "population_over_65"]]
svi.head()

Unnamed: 0,fips,population,population_over_65
0,1001,55639,8490
1,1003,218289,44716
2,1005,25026,4777
3,1007,22374,3676
4,1009,57755,10382


In [7]:
# Shannon County, SD (fips 46113) was renamed to Oglala Lakota County (fips 46102)
svi.loc[svi["fips"] == 46113, "fips"] = 46102
svi = svi.groupby("fips").max().reset_index()
# Wade Hampton Census Area, AK (fips 02270) was renamed to Kusilvak Census Area (fips 02158)
svi.loc[svi["fips"] == 2270, "fips"] = 2158
svi = svi.groupby("fips").max().reset_index()

## Pre-process data frames
We need to join tables so both the state and county dataframes have the state fips code and population

In [8]:
# Add in the state FIPs code for cross-referencing
county["state_fips"] = (county["fips"]/1000).apply(np.floor).astype(int)

# Join county with the population data
county = county.set_index("fips").join(svi.set_index("fips")).reset_index()

county.head()

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent,state_fips,population,population_over_65
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,...,12.0,20.0,4563.0,1.1,19.1,,,1,55639.0,8490.0
1,1003,452.4,165.47,128.1,22.2,40.4,13.93,70.0,47.25,88.4,...,,25.7,4492.0,0.8,9.5,4.5,86.3,1,218289.0,44716.0
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,...,21.5,18.5,4642.0,,47.6,4.3,,1,25026.0,4777.0
3,1007,467.3,187.6,140.7,22.05,47.0,14.3,78.3,54.26,112.7,...,10.5,17.5,4499.0,,22.3,,,1,22374.0,3676.0
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,...,10.2,25.2,4416.0,,1.5,9.1,95.5,1,57755.0,10382.0


In [9]:
# Join state with the population data
svi["state_fips"] = (svi["fips"] / 1000).apply(np.floor).astype(int)
svi_state = (
    svi.groupby("state_fips")
    .agg({"population": "sum"})
    .reset_index()
    .rename(columns={"state_fips": "fips"})
)

state = state.set_index("fips").join(svi_state.set_index("fips")).reset_index()
state = state.rename(columns={"fips": "state_fips"})

state.head()

Unnamed: 0,state_fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,cervical_cancer_incidence_rate_per_100000,cervical_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,...,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,hodgkins_lymphoma_cancer_mortality_rate_per_100000,hodgkins_lymphoma_cancer_incidence_rate_per_100000,colorectal_screening_percent,smoking_percent,hpv_vaccine_percent,population
0,1,451.5,173.42,121.6,21.54,9.4,3.46,43.6,15.46,64.9,...,12.3,6.4,16.2,5.02,0.31,2.42,76.07,43.94,34.9,4893186
1,2,417.2,152.62,120.8,18.8,7.1,1.65,42.6,15.05,55.8,...,11.8,5.55,17.1,5.5,,2.13,68.36,43.22,32.3,736990
2,4,386.7,137.88,114.4,18.47,6.5,2.1,32.6,12.62,46.8,...,11.0,5.85,15.5,4.67,0.28,2.0,75.32,40.35,39.9,7174064
3,5,472.8,177.63,118.2,20.33,9.5,3.43,44.1,15.59,77.2,...,14.5,6.79,18.2,5.3,0.3,2.62,73.39,47.06,26.8,3011873
4,6,404.6,140.3,121.5,19.35,7.2,2.24,35.1,12.5,41.5,...,12.5,5.86,18.4,5.17,0.31,2.18,75.6,33.45,38.8,39346023


## Extrapolate cancer rates

Using the population data, subtract the known county data from the state data and use that rate for the remaining counties.

In [10]:
# Fills in the data for metric in the county data frame (c) using the state data frame (s)
def fill_data(s, c, metric):
    # Get the relevant columns
    df_county = c[["fips", "state_fips", "population", metric]]
    df_state = s[["state_fips", "population", metric]]
    
    # Find the total number of incidences in the counties with known rates
    df_county["county_total"] = df_county["population"] * df_county[metric] / 100000
    
    # Sum up the populations of all counties in each state that don't have data for the metric
    missing_counties = df_county[df_county[metric].isna()]
    df_state = df_state.set_index("state_fips").join(
        missing_counties.groupby("state_fips")
        .agg({"population": "sum"})
        .rename(columns={"population": "missing_data_population"})
    ).reset_index()
    
    # Calculate the total number of incidences in each state
    df_state["state_total"] = df_state["population"] * df_state[metric] / 100000
    
    # Find the number of cases not attributed to counties
    county_total = df_county.groupby("state_fips").agg({"county_total": "sum"})

    # Join with the state data
    df_state = df_state.set_index("state_fips").join(county_total).reset_index()

    # Due to age adjustment, some states have slightly negative values. Those should
    # be reset to 0 when calculating the diff
    df_state["diff"] = np.maximum(df_state["state_total"] - df_state["county_total"], 0)
    
    # Calculate values for all counties with missing data.
    df_state[metric] = df_state["diff"] / df_state["missing_data_population"] * 100000
    
    # Go through all values of metric and fill in the missing ones
    for i in county.index:
        if np.isnan(county.iloc[i].loc[metric]):
            val = df_state[
                df_state["state_fips"] == (county.iloc[i].loc["state_fips"].astype(int))
            ].iloc[0].loc[metric]
            county.iloc[i, county.columns.get_loc(metric)] = val


county

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent,state_fips,population,population_over_65
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,...,12.0,20.0,4563.0,1.1,19.1,,,1,55639.0,8490.0
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,...,,25.7,4492.0,0.8,9.5,4.5,86.3,1,218289.0,44716.0
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,...,21.5,18.5,4642.0,,47.6,4.3,,1,25026.0,4777.0
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,...,10.5,17.5,4499.0,,22.3,,,1,22374.0,3676.0
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,...,10.2,25.2,4416.0,,1.5,9.1,95.5,1,57755.0,10382.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,,,,21.63,,14.20,,40.71,136.1,...,10.0,23.6,4577.0,1.1,,16.0,93.1,56,43352.0,5292.0
3138,56039,,,141.0,,,,,,125.5,...,,36.4,4224.0,1.4,1.2,14.9,90.3,56,23356.0,3481.0
3139,56041,,,139.5,31.52,,,,,,...,11.0,21.2,4532.0,,,9.1,93.4,56,20374.0,2868.0
3140,56043,,,130.2,,44.7,,,,86.4,...,,22.7,4341.0,,,14.2,89.7,56,7933.0,1716.0


In [11]:
# Iterate over all cancer metrics to fill in missing data
for m in list(filter(lambda x: "rate" in x, county.columns)):
    fill_data(state, county, m)

# Drop columns used to process data
county = county.drop(columns=["state_fips", "population", "population_over_65"])

county

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_county["county_total"] = df_county["population"] * df_county[metric] / 100000


Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent
0,1001,490.100000,163.480000,129.900000,21.590000,49.50000,15.190000,67.800000,48.430000,133.200000,...,58786.0,0.7,32.5,12.0,20.0,4563.0,1.1,19.1,,
1,1003,452.400000,165.470000,128.100000,22.200000,40.40000,13.930000,70.000000,47.250000,88.400000,...,55962.0,1.4,33.9,,25.7,4492.0,0.8,9.5,4.5,86.3
2,1005,429.617003,182.940000,104.428701,17.199895,44.10000,15.420000,63.100000,54.490000,146.200000,...,,1.5,,21.5,18.5,4642.0,,47.6,4.3,
3,1007,467.300000,187.600000,140.700000,22.050000,47.00000,14.300000,78.300000,54.260000,112.700000,...,,0.7,33.7,10.5,17.5,4499.0,,22.3,,
4,1009,432.000000,176.720000,130.000000,20.360000,36.70000,14.170000,70.300000,53.940000,96.900000,...,48695.0,1.5,33.9,10.2,25.2,4416.0,,1.5,9.1,95.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,386.394603,134.611466,95.349904,21.630000,30.40499,14.200000,41.872859,40.710000,136.100000,...,73008.0,2.4,,10.0,23.6,4577.0,1.1,,16.0,93.1
3138,56039,386.394603,134.611466,141.000000,13.192843,30.40499,8.482755,41.872859,26.764554,125.500000,...,83831.0,5.0,34.6,,36.4,4224.0,1.4,1.2,14.9,90.3
3139,56041,386.394603,134.611466,139.500000,31.520000,30.40499,8.482755,41.872859,26.764554,62.069944,...,58235.0,,,11.0,21.2,4532.0,,,9.1,93.4
3140,56043,386.394603,134.611466,130.200000,13.192843,44.70000,8.482755,41.872859,26.764554,86.400000,...,53426.0,1.3,33.9,,22.7,4341.0,,,14.2,89.7


In [12]:
# Write out the cleaned data
county.to_csv("../../data/processed/nci.csv", index=False)