In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [7]:
# Create a function that allows you to read a file, rename columns, and add it to the dataframe
def add_file(old_df, file, cols, data_dir="../../data/raw/"):
    """Read one raw CSV, keep its renamed columns, and left-join them onto ``old_df``.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame to extend. It must contain a ``fips`` column, which is the join key.
    file : str
        Name of the CSV file, resolved relative to ``data_dir``.
    cols : dict
        Mapping of raw column name -> output column name. Only the mapped
        columns are kept, and one of the output names must be ``"fips"``.
    data_dir : str or os.PathLike, optional
        Directory that holds the raw CSV files. Defaults to the location the
        notebook has always read from, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``old_df`` followed by the non-key
        columns from the file. Rows of ``old_df`` whose ``fips`` has no match
        in the file get NaN in the new columns.
    """
    # header=1: the second line of the file holds the column names; the first is skipped.
    raw = pd.read_csv(Path(data_dir) / file, header=1)
    # Materialize the dict view so the column selection is an explicit list.
    selected = raw.rename(columns=cols)[list(cols.values())]
    return old_df.join(selected.set_index("fips"), on="fips")

In [8]:
# Seed the county table with the all-sites cancer incidence rate (both sexes);
# the cells below join further columns onto it by FIPS code.
file = "Incidence_US_by_County_All_Races_All_Cancer_Sites_(Both_Sexes)_2013_-_2017.csv"
incidence_columns = {
    "County FIPS": "fips",
    "Age-Adjusted Rate per 100,000": "cancer_incidence_rate_per_100000",
}
df = (
    pd.read_csv("../../data/raw/" + file, header=1)
    .rename(columns=incidence_columns)
    .loc[:, ["fips", "cancer_incidence_rate_per_100000"]]
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000
0,12125,1143.5
1,31113,739.4
2,40055,710.4
3,21197,648.2
4,31163,629.5
...,...,...
1995,31107,431.4
1996,48497,431.4
1997,51113,431.4
1998,46019,431.4


In [9]:
# All-malignant-cancers mortality rate (both sexes), joined by FIPS
mortality_columns = {
    "County FIPS": "fips",
    "Age-Adjusted Rate per 100,000": "cancer_mortality_rate_per_100000",
}
df = add_file(
    old_df=df,
    file="Mortality_US_by_County_All_Races_All_Malignant_Cancers_(Both_Sexes)_2014_-_2018.csv",
    cols=mortality_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000
0,12125,1143.5,454.48
1,31113,739.4,
2,40055,710.4,209.41
3,21197,648.2,246.56
4,31163,629.5,
...,...,...,...
1995,31107,431.4,
1996,48497,431.4,181.40
1997,51113,431.4,181.05
1998,46019,431.4,178.03


In [10]:
# Incidence and mortality for other cancers
def add_cancer_rates(df_old, file_name, formatted_name):
    """Join the incidence and mortality rate columns for one cancer site onto ``df_old``.

    ``file_name`` is the site-specific part of the raw CSV names (for example
    ``"Breast_(Female)"``). ``formatted_name`` prefixes the two new columns,
    ``<formatted_name>_cancer_incidence_rate_per_100000`` and
    ``<formatted_name>_cancer_mortality_rate_per_100000``. Incidence is joined
    first, then mortality, each through ``add_file``; the result of the second
    join is returned.
    """
    # (file-name prefix, file-name suffix, column-name suffix) for each measure
    measures = (
        (
            "Incidence_US_by_County_All_Races_",
            "_2013_-_2017.csv",
            "_cancer_incidence_rate_per_100000",
        ),
        (
            "Mortality_US_by_County_All_Races_",
            "_2014_-_2018.csv",
            "_cancer_mortality_rate_per_100000",
        ),
    )
    merged = df_old
    for name_prefix, name_suffix, column_suffix in measures:
        merged = add_file(
            merged,
            name_prefix + file_name + name_suffix,
            {
                "County FIPS": "fips",
                "Age-Adjusted Rate per 100,000": formatted_name + column_suffix,
            },
        )
    return merged


# (site-specific part of the raw file names, column prefix), joined in this order
for site_file_part, site_label in (
    ("Breast_(Female)", "breast"),
    ("Colon_&_Rectum_(Both_Sexes)", "colorectal"),
    ("Lung_&_Bronchus_(Both_Sexes)", "lung"),
    ("Prostate_(Male)", "prostate"),
):
    df = add_cancer_rates(df, site_file_part, site_label)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed
1,31113,739.4,,,,,,,,,
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed
4,31163,629.5,,157.1,,,,,,161.8,
...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,


In [11]:
# Median household income (the "Dollars" column of the raw file)
income_file = "Demographics_US_by_County_Income_Median_Household_Income_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv"
income_columns = {"County FIPS": "fips", "Dollars": "median_household_income"}
df = add_file(df, income_file, income_columns)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,median_household_income
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed,
1,31113,739.4,,,,,,,,,,51607.0
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed,
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed,
4,31163,629.5,,157.1,,,,,,161.8,,51172.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,,49752.0
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48,61709.0
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,,54197.0
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,,


In [12]:
# Language isolation percentage
df = add_file(
    old_df=df,
    file="Demographics_US_by_County_Non-English_Language_Language_Isolation_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
    cols={"County FIPS": "fips", "Percent": "language_isolation_percent"},
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed,,
1,31113,739.4,,,,,,,,,,51607.0,1.8
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed,,0.7
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed,,
4,31163,629.5,,157.1,,,,,,161.8,,51172.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,,49752.0,0.8
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48,61709.0,3.3
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,,54197.0,0.5
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,,,


In [13]:
# Share of the population aged 65 and over
over_65_file = "Demographics_US_by_County_Population_Age_65_and_Over_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv"
df = add_file(df, over_65_file, {"County FIPS": "fips", "Percent": "over_65_percent"})
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed,,,34.6
1,31113,739.4,,,,,,,,,,51607.0,1.8,
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed,,0.7,33.5
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed,,,33.5
4,31163,629.5,,157.1,,,,,,161.8,,51172.0,,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,,49752.0,0.8,
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48,61709.0,3.3,33.6
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,,54197.0,0.5,33.1
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,,,,


In [14]:
# Percentage of families below the poverty line
poverty_columns = {"County FIPS": "fips", "Percent": "below_poverty_percent"}
df = add_file(
    df,
    "Demographics_US_by_County_Poverty_Families_Below_Poverty_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
    poverty_columns,
)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed,,,34.6,17.6
1,31113,739.4,,,,,,,,,,51607.0,1.8,,
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed,,0.7,33.5,14.7
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed,,,33.5,16.4
4,31163,629.5,,157.1,,,,,,161.8,,51172.0,,34.0,9.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,,49752.0,0.8,,
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48,61709.0,3.3,33.6,9.2
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,,54197.0,0.5,33.1,
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,,,,,


In [15]:
# Uninsured percentage.
# NOTE(review): the file name suggests this covers a specific age/poverty band
# ("Ages__65_at_or_below_138%_of_Poverty") rather than everyone — confirm.
uninsured_file = "Demographics_US_by_County_Uninsured_Ages__65_at_or_below_138%_of_Poverty_(Both_Sexes_-_2018)_2014_-_2018.csv"
uninsured_columns = {"County FIPS": "fips", "Percent": "uninsured_percent"}
df = add_file(old_df=df, file=uninsured_file, cols=uninsured_columns)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed,,,34.6,17.6,18.6
1,31113,739.4,,,,,,,,,,51607.0,1.8,,,16.9
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed,,0.7,33.5,14.7,19.0
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed,,,33.5,16.4,
4,31163,629.5,,157.1,,,,,,161.8,,51172.0,,34.0,9.1,19.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,,49752.0,0.8,,,18.0
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48,61709.0,3.3,33.6,9.2,36.1
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,,54197.0,0.5,33.1,,20.9
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,,,,,,21.9


In [16]:
# Ultraviolet exposure, in watt-hours per square metre
uv_file = "Screening_and_Risk_Factors_US_by_County_(UV_Only)_UV_Exposure_Data_Ultraviolet_Exposure.csv"
uv_columns = {"County FIPS": "fips", "Watt-Hours Per Square Meter": "uv_exposure"}
df = add_file(df, uv_file, uv_columns)
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,prostate_cancer_mortality_rate_per_100000,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,no data/suppressed,,,34.6,17.6,18.6,4668.0
1,31113,739.4,,,,,,,,,,51607.0,1.8,,,16.9,4378.0
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,no data/suppressed,,0.7,33.5,14.7,19.0,4937.0
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,no data/suppressed,,,33.5,16.4,,
4,31163,629.5,,157.1,,,,,,161.8,,51172.0,,34.0,9.1,19.8,4401.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,,49752.0,0.8,,,18.0,4206.0
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,18.48,61709.0,3.3,33.6,9.2,36.1,4903.0
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,,54197.0,0.5,33.1,,20.9,4119.0
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,,,,,,21.9,4201.0


In [17]:
# Racial and ethnic population percentages: one raw file per group, joined in order
race_sources = (
    (
        "Demographics_US_by_County_Population_Asian_Pacific_Islander_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "asian_pacific_islander_percent",
    ),
    (
        "Demographics_US_by_County_Population_Black_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "black_percent",
    ),
    (
        "Demographics_US_by_County_Population_Hispanic_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "hispanic_percent",
    ),
    (
        "Demographics_US_by_County_Population_White_(Both_Sexes_-_2014_to_2018)_2014_-_2018.csv",
        "white_percent",
    ),
)
for race_file, race_column in race_sources:
    df = add_file(df, race_file, {"County FIPS": "fips", "Percent": race_column})
df

Unnamed: 0,fips,cancer_incidence_rate_per_100000,cancer_mortality_rate_per_100000,breast_cancer_incidence_rate_per_100000,breast_cancer_mortality_rate_per_100000,colorectal_cancer_incidence_rate_per_100000,colorectal_cancer_mortality_rate_per_100000,lung_cancer_incidence_rate_per_100000,lung_cancer_mortality_rate_per_100000,prostate_cancer_incidence_rate_per_100000,...,median_household_income,language_isolation_percent,over_65_percent,below_poverty_percent,uninsured_percent,uv_exposure,asian_pacific_islander_percent,black_percent,hispanic_percent,white_percent
0,12125,1143.5,454.48,190.2,no data/suppressed,121.7,45.14,196.3,147.94,230.6,...,,,34.6,17.6,18.6,4668.0,0.6,21.9,5.5,
1,31113,739.4,,,,,,,,,...,51607.0,1.8,,,16.9,4378.0,,,2.9,98.6
2,40055,710.4,209.41,234.0,no data/suppressed,60.4,,95.4,61.15,140.3,...,,0.7,33.5,14.7,19.0,4937.0,,8.3,11.3,
3,21197,648.2,246.56,128.8,,73.2,26.48,132.5,90.67,,...,,,33.5,16.4,,,0.6,,,96.8
4,31163,629.5,,157.1,,,,,,161.8,...,51172.0,,34.0,9.1,19.8,4401.0,0.7,,,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,31107,431.4,,160.0,,,,,,110.7,...,49752.0,0.8,,,18.0,4206.0,,,,86.8
1996,48497,431.4,181.40,,28.48,43.7,16.51,72.1,55.09,,...,61709.0,3.3,33.6,9.2,36.1,4903.0,0.6,1.3,19.0,93.3
1997,51113,431.4,181.05,,,45.8,18.40,75.6,42.07,,...,54197.0,0.5,33.1,,20.9,4119.0,0.8,6.9,,85.8
1998,46019,431.4,178.03,,,52.7,,64.0,53.59,,...,,,,,21.9,4201.0,,,3.5,94.3


In [18]:
# Turn the "no data/suppressed" placeholder into a real missing value (NaN).
# NOTE(review): columns that carried the placeholder appear to hold text
# (values display as e.g. 181.40) and may stay text-typed after this step —
# consider pd.to_numeric if numeric dtypes are needed here; confirm.
df = df.replace(to_replace="no data/suppressed", value=np.nan)

In [19]:
# Save the merged table to the processed-data folder, without the row index
processed_path = "../../data/processed/nci.csv"
df.to_csv(processed_path, index=False)