In [4]:
import pandas as pd
import numpy as np

# Data by county

In [7]:
# Read one raw export, keep only the requested columns, and outer-join it onto the running table
def add_file(old_df, file, cols, raw_dir="../../data/raw/"):
    """Read a raw CSV, rename and select its columns, and outer-join it to ``old_df`` on ``fips``.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Table built so far; it must contain a ``fips`` column.
    file : str
        Name of the CSV file inside ``raw_dir``.
    cols : dict
        Maps source column names to output column names. Only the mapped
        columns are kept, and one of them must be renamed to ``"fips"``.
    raw_dir : str, optional
        Prefix that is concatenated with ``file`` as-is, so it must end with a
        path separator. Defaults to the notebook's raw-data directory.

    Returns
    -------
    pandas.DataFrame
        ``old_df`` plus the newly read columns, with ``fips`` as a regular
        column again. Because the join is an outer join, FIPS codes that occur
        in only one of the two tables are kept and padded with NaN.

    Raises
    ------
    KeyError
        If a source column named in ``cols`` is not present in the file.
    """
    # header=1: the column names are taken from the second line of the file
    df_new = pd.read_csv(raw_dir + file, header=1)

    # rename() silently skips unknown keys, which would otherwise surface below
    # as a KeyError about the *renamed* column; report the real culprit instead.
    missing = [source for source in cols if source not in df_new.columns]
    if missing:
        raise KeyError(
            "{}: expected column(s) {} not found; available columns: {}".format(
                file, missing, list(df_new.columns)
            )
        )

    # An explicit list keeps the selection independent of how pandas treats dict views
    df_new = df_new.rename(columns=cols)[list(cols.values())]
    return old_df.set_index("fips").join(df_new.set_index("fips"), how="outer").reset_index()

In [3]:
# Seed the county table with the all-sites incidence rate (both sexes)
file = "Incidence_US_by_County_All_Races_All_Cancer_Sites_(Both_Sexes)_2013_-_2017.csv"
incidence_columns = {
    "County FIPS": "fips",
    "Age-Adjusted Rate per 100,000": "cancer_incidence_rate_per_100000",
}
df = (
    pd.read_csv("../../data/raw/" + file, header=1)  # column names sit on the second line
    .rename(columns=incidence_columns)
    .loc[:, ["fips", "cancer_incidence_rate_per_100000"]]
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000
0,12125,1143.5
1,31113,739.4
2,40055,710.4
3,21197,648.2
4,31163,629.5
...,...,...
1995,31107,431.4
1996,48497,431.4
1997,51113,431.4
1998,46019,431.4


In [4]:
# All-malignant-cancers mortality rate (both sexes), outer-joined on fips
mortality_file = (
    "Mortality_US_by_County_All_Races_All_Malignant_Cancers_(Both_Sexes)_2014_-_2018.csv"
)
mortality_columns = {
    "County FIPS": "fips",
    "Age-Adjusted Rate per 100,000": "cancer_mortality_rate_per_100000",
}
df = add_file(df, mortality_file, mortality_columns)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000
0,1001,490.1,163.48
1,1003,452.4,165.47
2,1005,,182.94
3,1007,467.3,187.60
4,1009,432.0,176.72
...,...,...,...
2472,56009,443.3,
2473,56017,445.2,
2474,56021,443.5,
2475,56029,438.8,


In [5]:
# Incidence and mortality for other cancers
def add_cancer_rates(df_old, file_name, formatted_name, level="County"):
    """Join the incidence and mortality rates of one cancer site onto ``df_old``.

    Two files are read through ``add_file``, incidence first and mortality second:

    * ``Incidence_US_by_<level>_All_Races_<file_name>_2013_-_2017.csv``
    * ``Mortality_US_by_<level>_All_Races_<file_name>_2014_-_2018.csv``

    Parameters
    ----------
    df_old : pandas.DataFrame
        Table built so far; passed straight to ``add_file``.
    file_name : str
        Cancer-site part of the raw file names, e.g. ``"Breast_(Female)"``.
    formatted_name : str
        Prefix for the two output columns
        ``<formatted_name>_cancer_incidence_rate_per_100000`` and
        ``<formatted_name>_cancer_mortality_rate_per_100000``.
    level : str, optional
        Geography used both in the file names and in the ``"<level> FIPS"``
        source column. Defaults to ``"County"``; pass ``"State"`` for the
        state-level exports.

    Returns
    -------
    pandas.DataFrame
        Whatever ``add_file`` returns after the second join.
    """
    fips_column = level + " FIPS"
    rate_column = "Age-Adjusted Rate per 100,000"
    # (measure, trailing part of the file name, suffix of the output column)
    sources = (
        ("Incidence", "_2013_-_2017.csv", "_cancer_incidence_rate_per_100000"),
        ("Mortality", "_2014_-_2018.csv", "_cancer_mortality_rate_per_100000"),
    )

    df_new = df_old
    for measure, file_suffix, column_suffix in sources:
        df_new = add_file(
            df_new,
            measure + "_US_by_" + level + "_All_Races_" + file_name + file_suffix,
            {fips_column: "fips", rate_column: formatted_name + column_suffix},
        )
    return df_new


# (cancer-site part of the raw file names, prefix for the output columns)
county_cancer_sites = [
    ("Breast_(Female)", "breast"),
    ("Colon_&_Rectum_(Both_Sexes)", "colorectal"),
    ("Lung_&_Bronchus_(Both_Sexes)", "lung"),
    ("Prostate_(Male)", "prostate"),
    ("Leukemia_(Both_Sexes)", "leukemia"),
    ("Non-Hodgkin_Lymphoma_(Both_Sexes)", "non_hodgkin_lymphoma"),
]
# Each pass adds one incidence and one mortality column, in the order listed
for site_file_name, site_column_prefix in county_cancer_sites:
    df = add_cancer_rates(df, site_file_name, site_column_prefix)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,18.07,13.7,no data/suppressed,15.0,4.90
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,17.98,14.0,5.69,17.0,4.24
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,22.79,10.7,no data/suppressed,17.0,no data/suppressed
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,,,no data/suppressed,16.3,no data/suppressed
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,17.60,13.9,6.74,17.4,5.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3073,56037,,,,21.63,,14.20,,40.71,136.1,,10.6,no data/suppressed,21.2,no data/suppressed
3074,56039,,,141.0,,,,,,125.5,,20.3,no data/suppressed,20.0,no data/suppressed
3075,56041,,,139.5,31.52,,,,,,,,no data/suppressed,17.8,no data/suppressed
3076,56043,,,130.2,,44.7,,,,86.4,,no data/suppressed,no data/suppressed,,


In [6]:
# Median household income, taken from the "Dollars" column of the demographics export
income_columns = {"County FIPS": "fips", "Dollars": "median_household_income"}
df = add_file(
    old_df=df,
    file="Demographics_US_by_County_Income_Median_Household_Income_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
    cols=income_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,median_household_income
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,18.07,13.7,no data/suppressed,15.0,4.90,58786.0
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,17.98,14.0,5.69,17.0,4.24,55962.0
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,22.79,10.7,no data/suppressed,17.0,no data/suppressed,
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,,,no data/suppressed,16.3,no data/suppressed,
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,17.60,13.9,6.74,17.4,5.80,48695.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3119,56037,,,,21.63,,14.20,,40.71,136.1,,10.6,no data/suppressed,21.2,no data/suppressed,73008.0
3120,56039,,,141.0,,,,,,125.5,,20.3,no data/suppressed,20.0,no data/suppressed,83831.0
3121,56041,,,139.5,31.52,,,,,,,,no data/suppressed,17.8,no data/suppressed,58235.0
3122,56043,,,130.2,,44.7,,,,86.4,,no data/suppressed,no data/suppressed,,,53426.0


In [7]:
# Language isolation, stored as a percentage
language_isolation_columns = {
    "County FIPS": "fips",
    "Percent": "language_isolation_percent",
}
df = add_file(
    old_df=df,
    file="Demographics_US_by_County_Non-English_Language_Language_Isolation_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
    cols=language_isolation_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,18.07,13.7,no data/suppressed,15.0,4.90,58786.0,0.7
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,17.98,14.0,5.69,17.0,4.24,55962.0,1.4
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,22.79,10.7,no data/suppressed,17.0,no data/suppressed,,1.5
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,,,no data/suppressed,16.3,no data/suppressed,,0.7
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,17.60,13.9,6.74,17.4,5.80,48695.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3133,56037,,,,21.63,,14.20,,40.71,136.1,,10.6,no data/suppressed,21.2,no data/suppressed,73008.0,2.4
3134,56039,,,141.0,,,,,,125.5,,20.3,no data/suppressed,20.0,no data/suppressed,83831.0,5.0
3135,56041,,,139.5,31.52,,,,,,,,no data/suppressed,17.8,no data/suppressed,58235.0,
3136,56043,,,130.2,,44.7,,,,86.4,,no data/suppressed,no data/suppressed,,,53426.0,1.3


In [8]:
# Age demographics: percent of the population aged 65 and over (per the file name)
over_65_columns = {"County FIPS": "fips", "Percent": "over_65_percent"}
df = add_file(
    old_df=df,
    file="Demographics_US_by_County_Population_Age_65_and_Over_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
    cols=over_65_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,18.07,13.7,no data/suppressed,15.0,4.90,58786.0,0.7,32.5
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,17.98,14.0,5.69,17.0,4.24,55962.0,1.4,33.9
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,22.79,10.7,no data/suppressed,17.0,no data/suppressed,,1.5,
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,,,no data/suppressed,16.3,no data/suppressed,,0.7,33.7
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,17.60,13.9,6.74,17.4,5.80,48695.0,1.5,33.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3134,56037,,,,21.63,,14.20,,40.71,136.1,,10.6,no data/suppressed,21.2,no data/suppressed,73008.0,2.4,
3135,56039,,,141.0,,,,,,125.5,,20.3,no data/suppressed,20.0,no data/suppressed,83831.0,5.0,34.6
3136,56041,,,139.5,31.52,,,,,,,,no data/suppressed,17.8,no data/suppressed,58235.0,,
3137,56043,,,130.2,,44.7,,,,86.4,,no data/suppressed,no data/suppressed,,,53426.0,1.3,33.9


In [9]:
# Percent of families below the poverty line
poverty_columns = {"County FIPS": "fips", "Percent": "below_poverty_percent"}
df = add_file(
    old_df=df,
    file="Demographics_US_by_County_Poverty_Families_Below_Poverty_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
    cols=poverty_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,18.07,13.7,no data/suppressed,15.0,4.90,58786.0,0.7,32.5,12.0
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,17.98,14.0,5.69,17.0,4.24,55962.0,1.4,33.9,
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,22.79,10.7,no data/suppressed,17.0,no data/suppressed,,1.5,,21.5
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,,,no data/suppressed,16.3,no data/suppressed,,0.7,33.7,10.5
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,17.60,13.9,6.74,17.4,5.80,48695.0,1.5,33.9,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3135,56037,,,,21.63,,14.20,,40.71,136.1,,10.6,no data/suppressed,21.2,no data/suppressed,73008.0,2.4,,10.0
3136,56039,,,141.0,,,,,,125.5,,20.3,no data/suppressed,20.0,no data/suppressed,83831.0,5.0,34.6,
3137,56041,,,139.5,31.52,,,,,,,,no data/suppressed,17.8,no data/suppressed,58235.0,,,11.0
3138,56043,,,130.2,,44.7,,,,86.4,,no data/suppressed,no data/suppressed,,,53426.0,1.3,33.9,


In [10]:
# Uninsured percentage.
# NOTE(review): the file name reads "Ages__65_at_or_below_138%_of_Poverty" - presumably
# "under 65" with a stripped "<"; confirm the population against the source export.
uninsured_columns = {"County FIPS": "fips", "Percent": "uninsured_percent"}
df = add_file(
    old_df=df,
    file="Demographics_US_by_County_Uninsured_Ages__65_at_or_below_138%_of_Poverty_(Both_Sexes_-_2018)_2014_-_2018.csv",
    cols=uninsured_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,18.07,13.7,no data/suppressed,15.0,4.90,58786.0,0.7,32.5,12.0,20.0
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,17.98,14.0,5.69,17.0,4.24,55962.0,1.4,33.9,,25.7
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,22.79,10.7,no data/suppressed,17.0,no data/suppressed,,1.5,,21.5,18.5
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,,,no data/suppressed,16.3,no data/suppressed,,0.7,33.7,10.5,17.5
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,17.60,13.9,6.74,17.4,5.80,48695.0,1.5,33.9,10.2,25.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3135,56037,,,,21.63,,14.20,,40.71,136.1,,10.6,no data/suppressed,21.2,no data/suppressed,73008.0,2.4,,10.0,23.6
3136,56039,,,141.0,,,,,,125.5,,20.3,no data/suppressed,20.0,no data/suppressed,83831.0,5.0,34.6,,36.4
3137,56041,,,139.5,31.52,,,,,,,,no data/suppressed,17.8,no data/suppressed,58235.0,,,11.0,21.2
3138,56043,,,130.2,,44.7,,,,86.4,,no data/suppressed,no data/suppressed,,,53426.0,1.3,33.9,,22.7


In [11]:
# Ultraviolet exposure, read from the "Watt-Hours Per Square Meter" column
uv_columns = {"County FIPS": "fips", "Watt-Hours Per Square Meter": "uv_exposure"}
df = add_file(
    old_df=df,
    file="Screening_and_Risk_Factors_US_by_County_(UV_Only)_UV_Exposure_Data_Ultraviolet_Exposure.csv",
    cols=uv_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,...,13.7,no data/suppressed,15.0,4.90,58786.0,0.7,32.5,12.0,20.0,4563.0
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,...,14.0,5.69,17.0,4.24,55962.0,1.4,33.9,,25.7,4492.0
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,...,10.7,no data/suppressed,17.0,no data/suppressed,,1.5,,21.5,18.5,4642.0
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,...,,no data/suppressed,16.3,no data/suppressed,,0.7,33.7,10.5,17.5,4499.0
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,...,13.9,6.74,17.4,5.80,48695.0,1.5,33.9,10.2,25.2,4416.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,,,,21.63,,14.20,,40.71,136.1,...,10.6,no data/suppressed,21.2,no data/suppressed,73008.0,2.4,,10.0,23.6,4577.0
3138,56039,,,141.0,,,,,,125.5,...,20.3,no data/suppressed,20.0,no data/suppressed,83831.0,5.0,34.6,,36.4,4224.0
3139,56041,,,139.5,31.52,,,,,,...,,no data/suppressed,17.8,no data/suppressed,58235.0,,,11.0,21.2,4532.0
3140,56043,,,130.2,,44.7,,,,86.4,...,no data/suppressed,no data/suppressed,,,53426.0,1.3,33.9,,22.7,4341.0


In [12]:
# Racial demographic data: one population-share file per group, joined in the order listed
# (raw file, name of the resulting percentage column)
race_sources = [
    (
        "Demographics_US_by_County_Population_Asian_Pacific_Islander_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "asian_pacific_islander_percent",
    ),
    (
        "Demographics_US_by_County_Population_Black_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "black_percent",
    ),
    (
        "Demographics_US_by_County_Population_Hispanic_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "hispanic_percent",
    ),
    (
        "Demographics_US_by_County_Population_White_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "white_percent",
    ),
]
for race_file, race_column in race_sources:
    df = add_file(df, race_file, {"County FIPS": "fips", "Percent": race_column})
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent
0,1001,490.1,163.48,129.9,21.59,49.5,15.19,67.8,48.43,133.2,...,58786.0,0.7,32.5,12.0,20.0,4563.0,1.1,19.1,,
1,1003,452.4,165.47,128.1,22.20,40.4,13.93,70.0,47.25,88.4,...,55962.0,1.4,33.9,,25.7,4492.0,0.8,9.5,4.5,86.3
2,1005,,182.94,,,44.1,15.42,63.1,54.49,146.2,...,,1.5,,21.5,18.5,4642.0,,47.6,4.3,
3,1007,467.3,187.60,140.7,22.05,47.0,14.30,78.3,54.26,112.7,...,,0.7,33.7,10.5,17.5,4499.0,,22.3,,
4,1009,432.0,176.72,130.0,20.36,36.7,14.17,70.3,53.94,96.9,...,48695.0,1.5,33.9,10.2,25.2,4416.0,,1.5,9.1,95.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,,,,21.63,,14.20,,40.71,136.1,...,73008.0,2.4,,10.0,23.6,4577.0,1.1,,16.0,93.1
3138,56039,,,141.0,,,,,,125.5,...,83831.0,5.0,34.6,,36.4,4224.0,1.4,1.2,14.9,90.3
3139,56041,,,139.5,31.52,,,,,,...,58235.0,,,11.0,21.2,4532.0,,,9.1,93.4
3140,56043,,,130.2,,44.7,,,,86.4,...,53426.0,1.3,33.9,,22.7,4341.0,,,14.2,89.7


In [13]:
# Cells holding the "no data/suppressed" marker become missing values (NaN)
df = df.replace(to_replace="no data/suppressed", value=np.nan)

In [14]:
# Save the cleaned county table; index=False leaves the row index out of the file
county_output_path = "../../data/processed/nci_county.csv"
df.to_csv(county_output_path, index=False)

# Data by state

In [39]:
# Seed the state table with an empty frame whose only column is the join key,
# so that every state file can be attached through add_file
df = pd.DataFrame(data=[], columns=["fips"])
df

Unnamed: 0,fips


In [40]:
# All-sites incidence first, then all-malignant-cancers mortality
# (raw file, name of the resulting rate column)
state_overall_sources = [
    (
        "Incidence_US_by_State_All_Races_All_Cancer_Sites_(Both_Sexes)_2013_-_2017.csv",
        "cancer_incidence_rate_per_100000",
    ),
    (
        "Mortality_US_by_State_All_Races_All_Malignant_Cancers_(Both_Sexes)_2014_-_2018.csv",
        "cancer_mortality_rate_per_100000",
    ),
]
for overall_file, overall_column in state_overall_sources:
    df = add_file(
        df,
        overall_file,
        {"State FIPS": "fips", "Age-Adjusted Rate per 100,000": overall_column},
    )
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000
0,1,451.5,173.42
1,2,417.2,152.62
2,4,386.7,137.88
3,5,472.8,177.63
4,6,404.6,140.3
5,8,399.1,133.89
6,9,470.6,141.16
7,10,488.2,164.83
8,11,429.1,160.41
9,12,457.1,147.29


In [41]:
# Incidence and mortality for other cancers
def add_cancer_rates(df_old, file_name, formatted_name, level="State"):
    """Join the incidence and mortality rates of one cancer site onto ``df_old``.

    Two files are read through ``add_file``, incidence first and mortality second:

    * ``Incidence_US_by_<level>_All_Races_<file_name>_2013_-_2017.csv``
    * ``Mortality_US_by_<level>_All_Races_<file_name>_2014_-_2018.csv``

    Parameters
    ----------
    df_old : pandas.DataFrame
        Table built so far; passed straight to ``add_file``.
    file_name : str
        Cancer-site part of the raw file names, e.g. ``"Cervix_(Female)"``.
    formatted_name : str
        Prefix for the two output columns
        ``<formatted_name>_cancer_incidence_rate_per_100000`` and
        ``<formatted_name>_cancer_mortality_rate_per_100000``.
    level : str, optional
        Geography used both in the file names and in the ``"<level> FIPS"``
        source column. Defaults to ``"State"``; pass ``"County"`` for the
        county-level exports.

    Returns
    -------
    pandas.DataFrame
        Whatever ``add_file`` returns after the second join.
    """
    fips_column = level + " FIPS"
    rate_column = "Age-Adjusted Rate per 100,000"
    # (measure, trailing part of the file name, suffix of the output column)
    sources = (
        ("Incidence", "_2013_-_2017.csv", "_cancer_incidence_rate_per_100000"),
        ("Mortality", "_2014_-_2018.csv", "_cancer_mortality_rate_per_100000"),
    )

    df_new = df_old
    for measure, file_suffix, column_suffix in sources:
        df_new = add_file(
            df_new,
            measure + "_US_by_" + level + "_All_Races_" + file_name + file_suffix,
            {fips_column: "fips", rate_column: formatted_name + column_suffix},
        )
    return df_new


# (cancer-site part of the raw file names, prefix for the output columns)
state_cancer_sites = [
    ("Breast_(Female)", "breast"),
    ("Cervix_(Female)", "cervical"),
    ("Colon_&_Rectum_(Both_Sexes)", "colorectal"),
    ("Lung_&_Bronchus_(Both_Sexes)", "lung"),
    ("Prostate_(Male)", "prostate"),
    ("Melanoma_of_the_Skin_(Both_Sexes)", "melanoma"),
    ("Leukemia_(Both_Sexes)", "leukemia"),
    ("Non-Hodgkin_Lymphoma_(Both_Sexes)", "non_hodgkin_lymphoma"),
]
# Each pass adds one incidence and one mortality column, in the order listed
for site_file_name, site_column_prefix in state_cancer_sites:
    df = add_cancer_rates(df, site_file_name, site_column_prefix)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,cervical_cancer_incidence_rate_per_100000,cervical_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,melanoma_cancer_incidence_rate_per_100000,melanoma_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000
0,1,451.5,173.42,121.6,21.54,9.4,3.46,43.6,15.46,64.9,48.55,121.1,20.97,21.6,2.38,12.3,6.4,16.2,5.02
1,2,417.2,152.62,120.8,18.8,7.1,1.65,42.6,15.05,55.8,36.59,83.6,18.57,14.4,2.19,11.8,5.55,17.1,5.5
2,4,386.7,137.88,114.4,18.47,6.5,2.1,32.6,12.62,46.8,31.32,79.2,17.32,24.5,2.48,11.0,5.85,15.5,4.67
3,5,472.8,177.63,118.2,20.33,9.5,3.43,44.1,15.59,77.2,52.8,112.4,18.42,21.7,2.12,14.5,6.79,18.2,5.3
4,6,404.6,140.3,121.5,19.35,7.2,2.24,35.1,12.5,41.5,28.08,93.0,19.86,22.9,2.06,12.5,5.86,18.4,5.17
5,8,399.1,133.89,127.6,18.9,6.2,1.53,33.2,11.78,41.4,26.48,92.6,21.36,22.3,2.64,13.3,5.75,17.2,4.59
6,9,470.6,141.16,140.5,17.43,6.1,1.35,35.3,10.69,59.6,33.15,111.3,17.79,20.4,2.07,14.5,5.95,21.1,5.19
7,10,488.2,164.83,134.7,21.44,7.8,2.53,37.9,13.27,66.9,43.18,124.5,17.16,29.5,2.78,12.8,6.75,20.2,5.65
8,11,429.1,160.41,139.4,26.2,8.8,2.7,38.4,15.15,46.6,30.4,127.4,28.25,10.2,1.03,8.4,4.78,15.3,4.48
9,12,457.1,147.29,118.3,18.82,8.9,2.6,36.3,12.89,57.7,37.45,93.9,16.56,24.8,2.38,18.3,6.19,23.8,5.03


In [42]:
# Hodgkin lymphoma is handled on its own because its incidence comes from a
# differently shaped export: that file is matched by state name (its state
# column is headed "United States") rather than by FIPS code, and its header
# row is one line further down (header=2 instead of header=1).
df_inc = (
    pd.read_csv(
        "../../data/raw/Invasive-Cancer-Incidence-Rates-by-State-in-North-America-Hodgkin-Lymphoma-2015---2019.csv",
        header=2,
    )
    .rename(
        columns={
            "United States": "state",
            "Age-adjusted Rate": "hodgkins_lymphoma_cancer_incidence_rate_per_100000",
        }
    )
    .loc[:, ["state", "hodgkins_lymphoma_cancer_incidence_rate_per_100000"]]
)

# The mortality export carries both the state name ("Area") and the FIPS code,
# so it serves as the bridge between state names and the fips join key.
df_mort = (
    pd.read_csv(
        "../../data/raw/Mortality_US_by_State_All_Races_Hodgkin_Lymphoma_(Both_Sexes)_2014_-_2018.csv",
        header=1,
    )
    .rename(
        columns={
            "Area": "state",
            "State FIPS": "fips",
            "Age-Adjusted Rate per 100,000": "hodgkins_lymphoma_cancer_mortality_rate_per_100000",
        }
    )
    .loc[:, ["fips", "state", "hodgkins_lymphoma_cancer_mortality_rate_per_100000"]]
)

# join() defaults to a left join, so the rows are those of the mortality file;
# the state name is dropped once it has done its job as the matching key.
df_hodgkin = (
    df_mort.set_index("state")
    .join(df_inc.set_index("state"))
    .reset_index()
    .drop(columns=["state"])
)

# Attach both Hodgkin columns to the main state table, keeping every FIPS code
df = df.set_index("fips").join(df_hodgkin.set_index("fips"), how="outer").reset_index()

In [43]:
# Risk factors: each export contributes its "Percent" column under a new name
# (raw file, name of the resulting percentage column)
risk_factor_sources = [
    # Colon cancer screening
    (
        "Screening_and_Risk_Factors_US_by_State_Colorectal_Screening_Home-based_FOBT_in_the_past_two_years_or_ever_had_a_colorectal_endoscopy_(Ages_50+_Both_Sexes_2018).csv",
        "colorectal_screening_percent",
    ),
    # Smoking (per the file name: ever smoked 100 cigarettes)
    (
        "Screening_and_Risk_Factors_US_by_State_Smoking_Ever_Smoked_100_Cigarettes_(Ages_18+_Both_Sexes_2018).csv",
        "smoking_percent",
    ),
    # HPV vaccination
    (
        "Screening_and_Risk_Factors_US_by_State_Women's_Health_Percent_who_received_3+_doses_of_HPV_Vaccine_(Ages_13-17_Both_Sexes_2018).csv",
        "hpv_vaccine_percent",
    ),
]
for risk_file, risk_column in risk_factor_sources:
    df = add_file(df, risk_file, {"State FIPS": "fips", "Percent": risk_column})
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,cervical_cancer_incidence_rate_per_100000,cervical_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,...,melanoma_cancer_mortality_rate_per_100000,leukemia_cancer_incidence_rate_per_100000,leukemia_cancer_mortality_rate_per_100000,non_hodgkin_lymphoma_cancer_incidence_rate_per_100000,non_hodgkin_lymphoma_cancer_mortality_rate_per_100000,hodgkins_lymphoma_cancer_mortality_rate_per_100000,hodgkins_lymphoma_cancer_incidence_rate_per_100000,colorectal_screening_percent,smoking_percent,hpv_vaccine_percent
0,1,451.5,173.42,121.6,21.54,9.4,3.46,43.6,15.46,64.9,...,2.38,12.3,6.4,16.2,5.02,0.31,2.42,76.07,43.94,34.9
1,2,417.2,152.62,120.8,18.8,7.1,1.65,42.6,15.05,55.8,...,2.19,11.8,5.55,17.1,5.5,no data/suppressed,2.13,68.36,43.22,32.3
2,4,386.7,137.88,114.4,18.47,6.5,2.1,32.6,12.62,46.8,...,2.48,11.0,5.85,15.5,4.67,0.28,2.0,75.32,40.35,39.9
3,5,472.8,177.63,118.2,20.33,9.5,3.43,44.1,15.59,77.2,...,2.12,14.5,6.79,18.2,5.3,0.30,2.62,73.39,47.06,26.8
4,6,404.6,140.3,121.5,19.35,7.2,2.24,35.1,12.5,41.5,...,2.06,12.5,5.86,18.4,5.17,0.31,2.18,75.6,33.45,38.8
5,8,399.1,133.89,127.6,18.9,6.2,1.53,33.2,11.78,41.4,...,2.64,13.3,5.75,17.2,4.59,0.27,2.43,76.11,40.43,40.0
6,9,470.6,141.16,140.5,17.43,6.1,1.35,35.3,10.69,59.6,...,2.07,14.5,5.95,21.1,5.19,0.25,3.25,80.21,39.47,31.8
7,10,488.2,164.83,134.7,21.44,7.8,2.53,37.9,13.27,66.9,...,2.78,12.8,6.75,20.2,5.65,0.28,2.79,79.32,42.27,39.3
8,11,429.1,160.41,139.4,26.2,8.8,2.7,38.4,15.15,46.6,...,1.03,8.4,4.78,15.3,4.48,no data/suppressed,2.4,78.63,34.72,53.3
9,12,457.1,147.29,118.3,18.82,8.9,2.6,36.3,12.89,57.7,...,2.38,18.3,6.19,23.8,5.03,0.27,3.14,77.86,41.89,30.4


In [44]:
# Cells holding the "no data/suppressed" marker become missing values (NaN)
df = df.replace(to_replace="no data/suppressed", value=np.nan)
# Save the cleaned state table; index=False leaves the row index out of the file
state_output_path = "../../data/processed/nci_state.csv"
df.to_csv(state_output_path, index=False)