In [55]:
# Import libraries
import numpy as np
import pandas as pd

In [56]:
# Read csv file
file = "./CDC_Disease_Data_2020_2023.csv"

diseases_df = pd.read_csv(file)

diseases_df.head()

# information on df
print(diseases_df.columns)
print("========================================================")
print(diseases_df.describe())

Index(['Data As Of', 'Jurisdiction of Occurrence', 'MMWR Year', 'MMWR Week',
       'Week Ending Date', 'All Cause', 'Natural Cause',
       'Septicemia (A40-A41)', 'Malignant neoplasms (C00-C97)',
       'Diabetes mellitus (E10-E14)', 'Alzheimer disease (G30)',
       'Influenza and pneumonia (J09-J18)',
       'Chronic lower respiratory diseases (J40-J47)',
       'Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)',
       'Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)',
       'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)',
       'Diseases of heart (I00-I09,I11,I13,I20-I51)',
       'Cerebrovascular diseases (I60-I69)',
       'COVID-19 (U071, Multiple Cause of Death)',
       'COVID-19 (U071, Underlying Cause of Death)', 'flag_allcause',
       'flag_natcause', 'flag_sept', 'flag_neopl', 'flag_diab', 'flag_alz',
       'flag_inflpn', 'flag_clrd', 'flag_otherresp', 'flag_nephr',
       'fl

In [57]:
# Clean data
# Drop columns not required and/or with nan values
# Finding nan valued columns
diseases_df.columns[diseases_df.isna().any()].tolist()

# Replace nan values with 0
diseases_df.fillna(0, inplace=True)

# check
# Finding nan valued columns - should be empty
diseases_df.columns[diseases_df.isna().any()].tolist()

# Drop columns not required and/or filled with nan values
diseases_df.drop(columns = ['Data As Of',
                            'MMWR Year', 'MMWR Week',
                            'flag_allcause', 'flag_natcause', 'flag_sept',
                            'flag_neopl', 'flag_diab', 'flag_alz', 'flag_inflpn',
                            'flag_clrd', 'flag_otherresp', 'flag_nephr',
                            'flag_otherunk', 'flag_hd', 'flag_stroke',
                            'flag_cov19mcod', 'flag_cov19ucod'],
                 inplace=True)

# verify
print(diseases_df.columns)
print("========================================================")
print(diseases_df.head())

Index(['Jurisdiction of Occurrence', 'Week Ending Date', 'All Cause',
       'Natural Cause', 'Septicemia (A40-A41)',
       'Malignant neoplasms (C00-C97)', 'Diabetes mellitus (E10-E14)',
       'Alzheimer disease (G30)', 'Influenza and pneumonia (J09-J18)',
       'Chronic lower respiratory diseases (J40-J47)',
       'Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)',
       'Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)',
       'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)',
       'Diseases of heart (I00-I09,I11,I13,I20-I51)',
       'Cerebrovascular diseases (I60-I69)',
       'COVID-19 (U071, Multiple Cause of Death)',
       'COVID-19 (U071, Underlying Cause of Death)'],
      dtype='object')
  Jurisdiction of Occurrence Week Ending Date  All Cause  Natural Cause  \
0              United States       2020-01-04      60179          55010   
1              United States       2020-01-1

In [58]:
# Filter data to only keep 2023
# Create date format column and find year only
diseases_df['Week Ending Date'] = pd.to_datetime(diseases_df['Week Ending Date'])
# diseases_df['Week Ending Date'].dt.year

# Keep only year 2023
diseases2023_df = diseases_df[diseases_df['Week Ending Date'].dt.year == 2023]
diseases2023_df.head()

Unnamed: 0,Jurisdiction of Occurrence,Week Ending Date,All Cause,Natural Cause,Septicemia (A40-A41),Malignant neoplasms (C00-C97),Diabetes mellitus (E10-E14),Alzheimer disease (G30),Influenza and pneumonia (J09-J18),Chronic lower respiratory diseases (J40-J47),"Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)","Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)","Diseases of heart (I00-I09,I11,I13,I20-I51)",Cerebrovascular diseases (I60-I69),"COVID-19 (U071, Multiple Cause of Death)","COVID-19 (U071, Underlying Cause of Death)"
157,United States,2023-01-07,69134,62822,993.0,11943.0,2220.0,2552.0,1770.0,3384.0,1079.0,1260.0,878.0,15038.0,3522.0,3874.0,2710.0
158,United States,2023-01-14,66490,60697,939.0,11846.0,2031.0,2551.0,1452.0,3391.0,1056.0,1154.0,792.0,14534.0,3367.0,3693.0,2553.0
159,United States,2023-01-21,64321,58596,855.0,12006.0,1921.0,2430.0,1192.0,3100.0,1133.0,1132.0,835.0,14043.0,3280.0,3247.0,2122.0
160,United States,2023-01-28,62537,57061,905.0,11685.0,1896.0,2362.0,1130.0,3025.0,1050.0,1124.0,811.0,13795.0,3279.0,2910.0,1902.0
161,United States,2023-02-04,62865,57141,854.0,11858.0,2042.0,2273.0,975.0,3021.0,1007.0,1145.0,864.0,13849.0,3300.0,2644.0,1663.0


In [59]:
diseases2023_df.rename(columns={
    "Septicemia (A40-A41)": "Septicemia",
    "Malignant neoplasms (C00-C97)": "Malignant neoplasms",
    "Diabetes mellitus (E10-E14)": "Diabetes mellitus",
    "Alzheimer disease (G30)": "Alzheimer disease",
    "Influenza and pneumonia (J09-J18)": "Influenza and pneumonia",
    "Chronic lower respiratory diseases (J40-J47)": "Chronic lower respiratory diseases",
    "Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)": "Other diseases of respiratory system",
    "Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)": "Nephritis, nephrotic syndrome and nephrosis",
    "Diseases of heart (I00-I09,I11,I13,I20-I51)": "Diseases of heart",
    "Cerebrovascular diseases (I60-I69)": "Cerebrovascular diseases",
    "COVID-19 (U071, Multiple Cause of Death)": "COVID-19 (Multiple Cause of Death)",
    "COVID-19 (U071, Underlying Cause of Death)": "COVID-19 (Underlying Cause of Death)"
}, inplace=True)
diseases2023_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases2023_df.rename(columns={


Unnamed: 0,Jurisdiction of Occurrence,Week Ending Date,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death)
157,United States,2023-01-07,69134,62822,993.0,11943.0,2220.0,2552.0,1770.0,3384.0,1079.0,1260.0,878.0,15038.0,3522.0,3874.0,2710.0
158,United States,2023-01-14,66490,60697,939.0,11846.0,2031.0,2551.0,1452.0,3391.0,1056.0,1154.0,792.0,14534.0,3367.0,3693.0,2553.0
159,United States,2023-01-21,64321,58596,855.0,12006.0,1921.0,2430.0,1192.0,3100.0,1133.0,1132.0,835.0,14043.0,3280.0,3247.0,2122.0
160,United States,2023-01-28,62537,57061,905.0,11685.0,1896.0,2362.0,1130.0,3025.0,1050.0,1124.0,811.0,13795.0,3279.0,2910.0,1902.0
161,United States,2023-02-04,62865,57141,854.0,11858.0,2042.0,2273.0,975.0,3021.0,1007.0,1145.0,864.0,13849.0,3300.0,2644.0,1663.0


In [60]:
pop_df = pd.read_csv('./population.csv')
pop_df = pop_df[['NAME', 'POPESTIMATE2023']]
pop_df.rename(columns={
    'NAME': 'Jurisdiction of Occurrence',
    'POPESTIMATE2023': 'Population'
}, inplace=True)
pop_df

Unnamed: 0,Jurisdiction of Occurrence,Population
0,United States,334914895
1,Northeast Region,56983517
2,New England,15159777
3,Middle Atlantic,41823740
4,Midwest Region,68909283
...,...,...
61,Washington,7812880
62,West Virginia,1770071
63,Wisconsin,5910955
64,Wyoming,584057


In [61]:
diseases2023_df = diseases2023_df.merge(pop_df, on='Jurisdiction of Occurrence', how='inner')
diseases2023_df

Unnamed: 0,Jurisdiction of Occurrence,Week Ending Date,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death),Population
0,United States,2023-01-07,69134,62822,993.0,11943.0,2220.0,2552.0,1770.0,3384.0,1079.0,1260.0,878.0,15038.0,3522.0,3874.0,2710.0,334914895
1,United States,2023-01-14,66490,60697,939.0,11846.0,2031.0,2551.0,1452.0,3391.0,1056.0,1154.0,792.0,14534.0,3367.0,3693.0,2553.0,334914895
2,United States,2023-01-21,64321,58596,855.0,12006.0,1921.0,2430.0,1192.0,3100.0,1133.0,1132.0,835.0,14043.0,3280.0,3247.0,2122.0,334914895
3,United States,2023-01-28,62537,57061,905.0,11685.0,1896.0,2362.0,1130.0,3025.0,1050.0,1124.0,811.0,13795.0,3279.0,2910.0,1902.0,334914895
4,United States,2023-02-04,62865,57141,854.0,11858.0,2042.0,2273.0,975.0,3021.0,1007.0,1145.0,864.0,13849.0,3300.0,2644.0,1663.0,334914895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,Puerto Rico,2023-08-19,612,590,0.0,92.0,48.0,48.0,22.0,15.0,13.0,17.0,29.0,112.0,27.0,15.0,12.0,3205691
1957,Puerto Rico,2023-08-26,657,624,15.0,110.0,48.0,64.0,13.0,15.0,11.0,23.0,18.0,120.0,30.0,21.0,18.0,3205691
1958,Puerto Rico,2023-09-02,580,552,16.0,97.0,70.0,62.0,10.0,14.0,0.0,22.0,21.0,98.0,21.0,16.0,12.0,3205691
1959,Puerto Rico,2023-09-09,533,516,10.0,81.0,50.0,47.0,14.0,19.0,11.0,19.0,23.0,99.0,18.0,16.0,13.0,3205691


In [62]:
diseases2023_df = diseases2023_df.assign(**{col: (diseases2023_df[col] / diseases2023_df['Population']) * 100000 for col in ['All Cause',
'Natural Cause', 'Septicemia', 'Malignant neoplasms',
'Diabetes mellitus', 'Alzheimer disease', 'Influenza and pneumonia',
'Chronic lower respiratory diseases',
'Other diseases of respiratory system',
'Nephritis, nephrotic syndrome and nephrosis',
'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)',
'Diseases of heart', 'Cerebrovascular diseases',
'COVID-19 (Multiple Cause of Death)',
'COVID-19 (Underlying Cause of Death)']})
diseases2023_df

Unnamed: 0,Jurisdiction of Occurrence,Week Ending Date,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death),Population
0,United States,2023-01-07,20.642259,18.757601,0.296493,3.565981,0.662855,0.761985,0.528492,1.010406,0.322171,0.376215,0.262156,4.490096,1.051610,1.156712,0.809161,334914895
1,United States,2023-01-14,19.852805,18.123112,0.280370,3.537018,0.606423,0.761686,0.433543,1.012496,0.315304,0.344565,0.236478,4.339610,1.005330,1.102668,0.762283,334914895
2,United States,2023-01-21,19.205177,17.495788,0.255289,3.584791,0.573579,0.725557,0.355911,0.925608,0.338295,0.337996,0.249317,4.193006,0.979353,0.969500,0.633594,334914895
3,United States,2023-01-28,18.672505,17.037463,0.270218,3.488946,0.566114,0.705254,0.337399,0.903215,0.313512,0.335608,0.242151,4.118957,0.979055,0.868877,0.567905,334914895
4,United States,2023-02-04,18.770440,17.061349,0.254990,3.540601,0.609707,0.678680,0.291119,0.902020,0.300673,0.341878,0.257976,4.135080,0.985325,0.789454,0.496544,334914895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,Puerto Rico,2023-08-19,19.091048,18.404768,0.000000,2.869896,1.497337,1.497337,0.686279,0.467918,0.405529,0.530307,0.904641,3.493787,0.842252,0.467918,0.374334,3205691
1957,Puerto Rico,2023-08-26,20.494801,19.465382,0.467918,3.431397,1.497337,1.996449,0.405529,0.467918,0.343140,0.717474,0.561501,3.743343,0.935836,0.655085,0.561501,3205691
1958,Puerto Rico,2023-09-02,18.092823,17.219376,0.499112,3.025869,2.183617,1.934060,0.311945,0.436723,0.000000,0.686279,0.655085,3.057063,0.655085,0.499112,0.374334,3205691
1959,Puerto Rico,2023-09-09,16.626680,16.096374,0.311945,2.526756,1.559726,1.466143,0.436723,0.592696,0.343140,0.592696,0.717474,3.088258,0.561501,0.499112,0.405529,3205691


In [65]:
diseases2023_df = diseases2023_df.round(decimals=2)
diseases2023_df

Unnamed: 0,Jurisdiction of Occurrence,Week Ending Date,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death),Population
0,United States,2023-01-07,20.64,18.76,0.30,3.57,0.66,0.76,0.53,1.01,0.32,0.38,0.26,4.49,1.05,1.16,0.81,334914895
1,United States,2023-01-14,19.85,18.12,0.28,3.54,0.61,0.76,0.43,1.01,0.32,0.34,0.24,4.34,1.01,1.10,0.76,334914895
2,United States,2023-01-21,19.21,17.50,0.26,3.58,0.57,0.73,0.36,0.93,0.34,0.34,0.25,4.19,0.98,0.97,0.63,334914895
3,United States,2023-01-28,18.67,17.04,0.27,3.49,0.57,0.71,0.34,0.90,0.31,0.34,0.24,4.12,0.98,0.87,0.57,334914895
4,United States,2023-02-04,18.77,17.06,0.25,3.54,0.61,0.68,0.29,0.90,0.30,0.34,0.26,4.14,0.99,0.79,0.50,334914895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,Puerto Rico,2023-08-19,19.09,18.40,0.00,2.87,1.50,1.50,0.69,0.47,0.41,0.53,0.90,3.49,0.84,0.47,0.37,3205691
1957,Puerto Rico,2023-08-26,20.49,19.47,0.47,3.43,1.50,2.00,0.41,0.47,0.34,0.72,0.56,3.74,0.94,0.66,0.56,3205691
1958,Puerto Rico,2023-09-02,18.09,17.22,0.50,3.03,2.18,1.93,0.31,0.44,0.00,0.69,0.66,3.06,0.66,0.50,0.37,3205691
1959,Puerto Rico,2023-09-09,16.63,16.10,0.31,2.53,1.56,1.47,0.44,0.59,0.34,0.59,0.72,3.09,0.56,0.50,0.41,3205691


In [66]:
# information on new df
print(diseases2023_df.columns)
print("========================================================")
print(diseases2023_df.dtypes)
print("========================================================")
print(diseases2023_df.describe())

Index(['Jurisdiction of Occurrence', 'Week Ending Date', 'All Cause',
       'Natural Cause', 'Septicemia', 'Malignant neoplasms',
       'Diabetes mellitus', 'Alzheimer disease', 'Influenza and pneumonia',
       'Chronic lower respiratory diseases',
       'Other diseases of respiratory system',
       'Nephritis, nephrotic syndrome and nephrosis',
       'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)',
       'Diseases of heart', 'Cerebrovascular diseases',
       'COVID-19 (Multiple Cause of Death)',
       'COVID-19 (Underlying Cause of Death)', 'Population'],
      dtype='object')
Jurisdiction of Occurrence                                                                                   object
Week Ending Date                                                                                     datetime64[ns]
All Cause                                                                                                   float64
Natura

In [67]:
# Group by state for whole year
disease_count_state = diseases2023_df.groupby(['Jurisdiction of Occurrence']).sum(numeric_only=True)
disease_count_state.head()

Unnamed: 0_level_0,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death),Population
Jurisdiction of Occurrence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Alabama,775.1,713.52,14.84,140.77,18.37,30.53,11.15,41.44,15.68,16.97,18.76,190.01,41.11,16.1,10.47,189013316
Alaska,464.42,386.3,0.0,86.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.19,1.36,0.0,0.0,27136022
Arizona,663.65,590.84,4.51,123.1,22.3,25.3,8.82,33.44,11.31,7.45,22.12,138.27,29.69,13.5,8.14,274959728
Arkansas,797.27,739.9,13.22,150.33,28.98,33.15,10.34,52.81,8.43,16.1,11.69,187.23,39.15,13.82,7.32,113506084
California,529.37,487.69,3.21,108.45,19.95,28.54,9.31,21.82,6.1,8.82,16.25,114.05,31.84,12.51,8.66,1441712141


In [68]:
# Group by state and month
disease_count_state_month = diseases2023_df.groupby(['Jurisdiction of Occurrence',
                              diseases2023_df['Week Ending Date'].dt.year,
                              diseases2023_df['Week Ending Date'].dt.month]).sum(numeric_only=True)
disease_count_state_month.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death),Population
Jurisdiction of Occurrence,Week Ending Date,Week Ending Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama,2023,1,94.75,86.7,2.07,16.15,2.04,3.7,1.84,5.3,1.78,2.2,1.34,23.3,4.7,4.56,3.5,20433872
Alabama,2023,2,94.08,86.19,1.8,16.66,2.55,3.72,1.27,5.1,1.54,2.26,1.48,23.24,4.84,3.39,2.61,20433872
Alabama,2023,3,86.94,78.74,1.54,15.21,2.12,3.13,1.56,5.06,1.74,1.93,1.6,21.47,4.45,1.82,1.24,20433872
Alabama,2023,4,107.54,98.01,1.97,19.32,2.69,4.34,1.22,5.85,2.35,2.55,2.01,26.46,5.56,1.71,0.87,25542340
Alabama,2023,5,86.53,78.34,1.59,16.05,1.98,3.19,1.34,4.15,1.66,1.88,1.9,21.57,4.58,1.05,0.45,20433872


In [69]:
# load dataframes into csv
# Full cleaned disease dataframe
diseases_df.to_csv("2020_to_2023_Count_of_Death_by_State_and_Cause_cleaned.csv")

# 2023 Disease data
diseases2023_df.to_csv("2023_Count_of_Death_by_State_and_Cause.csv")

# Disease Count by State
disease_count_state.to_csv("2023_Count_of_Death_by_State.csv")

# Disease Count by state and month dataframe
disease_count_state_month.to_csv("2023_Count_of_Death_by_State_and_Month.csv")

### GeoJSON Preparation

In [70]:
import json

In [71]:
# Retrieve death counts by state and make it a DataFrame
death_counts = pd.read_csv('./2023_Count_of_Death_by_State.csv')
death_counts.head()

Unnamed: 0,Jurisdiction of Occurrence,All Cause,Natural Cause,Septicemia,Malignant neoplasms,Diabetes mellitus,Alzheimer disease,Influenza and pneumonia,Chronic lower respiratory diseases,Other diseases of respiratory system,"Nephritis, nephrotic syndrome and nephrosis","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",Diseases of heart,Cerebrovascular diseases,COVID-19 (Multiple Cause of Death),COVID-19 (Underlying Cause of Death),Population
0,Alabama,775.1,713.52,14.84,140.77,18.37,30.53,11.15,41.44,15.68,16.97,18.76,190.01,41.11,16.1,10.47,189013316
1,Alaska,464.42,386.3,0.0,86.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.19,1.36,0.0,0.0,27136022
2,Arizona,663.65,590.84,4.51,123.1,22.3,25.3,8.82,33.44,11.31,7.45,22.12,138.27,29.69,13.5,8.14,274959728
3,Arkansas,797.27,739.9,13.22,150.33,28.98,33.15,10.34,52.81,8.43,16.1,11.69,187.23,39.15,13.82,7.32,113506084
4,California,529.37,487.69,3.21,108.45,19.95,28.54,9.31,21.82,6.1,8.82,16.25,114.05,31.84,12.51,8.66,1441712141


In [72]:
# Save our GeoJSON file to a variable
with open('state_outlines.json', 'r') as file:
    data = json.load(file)

In [73]:
# Loop through every state in our GeoJSON
for state in data['features']:
    try:
        # Find the row in our DataFrame that corresponds to the current state
        state_data = death_counts.loc[death_counts['Jurisdiction of Occurrence'] == state['properties']['NAME']].values
    except KeyError:
        continue
    # Add key information from our DataFrame to the 'properties' field of the current state
    state['properties']['All Cause'] = state_data[0][1]
    state['properties']['Natural Cause'] = state_data[0][2]
    state['properties']['Septicemia'] = state_data[0][3]
    state['properties']['Malignant neoplasms'] = state_data[0][4]
    state['properties']['Diabetes mellitus'] = state_data[0][5]
    state['properties']['Alzheimer disease'] = state_data[0][6]
    state['properties']['Influenza and pneumonia'] = state_data[0][7]
    state['properties']['Chronic lower respiratory diseases'] = state_data[0][8]
    state['properties']['Other diseases of respiratory system'] = state_data[0][9]
    state['properties']['Nephritis, nephrotic syndrome and nephrosis'] = state_data[0][10]
    state['properties']['Not elsewhere classified'] = state_data[0][11]
    state['properties']['Diseases of heart'] = state_data[0][12]
    state['properties']['Cerebrovascular diseases'] = state_data[0][13]
    state['properties']['COVID-19 (Multiple Cause of Death)'] = state_data[0][14]
    state['properties']['COVID-19 (Underlying Cause of Death)'] = state_data[0][15]

In [74]:
# Initialize the metadata field of our GeoJSON object
data['metadata'] = {}

# Initialize arrays to hold the death counts of each of our diseases
data['metadata']['All Cause'] = []
data['metadata']['Natural Cause'] = []
data['metadata']['Septicemia'] = []
data['metadata']['Malignant neoplasms'] = []
data['metadata']['Diabetes mellitus'] = []
data['metadata']['Alzheimer disease'] = []
data['metadata']['Influenza and pneumonia'] = []
data['metadata']['Chronic lower respiratory diseases'] = []
data['metadata']['Other diseases of respiratory system'] = []
data['metadata']['Nephritis, nephrotic syndrome and nephrosis'] = []
data['metadata']['Not elsewhere classified'] = []
data['metadata']['Diseases of heart'] = []
data['metadata']['Cerebrovascular diseases'] = []
data['metadata']['COVID-19 (Multiple Cause of Death)'] = []
data['metadata']['COVID-19 (Underlying Cause of Death)'] = []

In [75]:
# Loop through all the states in our GeoJSON and add relevant data to our arrays
for state in data['features']:
    data['metadata']['All Cause'].append(state['properties']['All Cause'])
    data['metadata']['Natural Cause'].append(state['properties']['Natural Cause'])
    data['metadata']['Septicemia'].append(state['properties']['Septicemia'])
    data['metadata']['Malignant neoplasms'].append(state['properties']['Malignant neoplasms'])
    data['metadata']['Diabetes mellitus'].append(state['properties']['Diabetes mellitus'])
    data['metadata']['Alzheimer disease'].append(state['properties']['Alzheimer disease'])
    data['metadata']['Influenza and pneumonia'].append(state['properties']['Influenza and pneumonia'])
    data['metadata']['Chronic lower respiratory diseases'].append(state['properties']['Chronic lower respiratory diseases'])
    data['metadata']['Other diseases of respiratory system'].append(state['properties']['Other diseases of respiratory system'])
    data['metadata']['Nephritis, nephrotic syndrome and nephrosis'].append(state['properties']['Nephritis, nephrotic syndrome and nephrosis'])
    data['metadata']['Not elsewhere classified'].append(state['properties']['Not elsewhere classified'])
    data['metadata']['Diseases of heart'].append(state['properties']['Diseases of heart'])
    data['metadata']['Cerebrovascular diseases'].append(state['properties']['Cerebrovascular diseases'])
    data['metadata']['COVID-19 (Multiple Cause of Death)'].append(state['properties']['COVID-19 (Multiple Cause of Death)'])
    data['metadata']['COVID-19 (Underlying Cause of Death)'].append(state['properties']['COVID-19 (Underlying Cause of Death)'])

In [76]:
# Store our updated data in JSON format
new_geojson = json.dumps(data, indent=4)

# Rewrite the original file
with open('state_outlines.json', 'w') as file:
    file.write(new_geojson)