In [105]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import urllib.request
import PyPDF2
import re

In [14]:
# Download the latest case-hosp-death, by-sex and by-age CSV files from the
# nychealth/coronavirus-data GitHub repository.
nyc_file_url = "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/case-hosp-death.csv"
nyc_by_sex_url = "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/by-sex.csv"
nyc_by_age_url = "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/by-age.csv"

# Local destinations, relative to the current working directory.
download_file_chd = "./case-hosp-death.csv"
download_file_bs = "./by_sex.csv"
download_file_ba = "./by_age.csv"

urllib.request.urlretrieve(nyc_file_url, download_file_chd)
urllib.request.urlretrieve(nyc_by_sex_url, download_file_bs)
# Kept as the final bare expression so the cell still echoes this call's result.
urllib.request.urlretrieve(nyc_by_age_url, download_file_ba)

('./by_age.csv', <http.client.HTTPMessage at 0x7fa3f5880b10>)

In [91]:
# Download the PDF with the COVID-19 deaths by race/ethnicity table.
nyc_by_race_url = "https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-deaths-race-ethnicity-04082020-1.pdf"
download_file_race = "./covid-19-deaths-race-ethnicity-04082020-1.pdf"
urllib.request.urlretrieve(url=nyc_by_race_url, filename=download_file_race)


('./covid-19-deaths-race-ethnicity-04082020-1.pdf',
 <http.client.HTTPMessage at 0x7fa3f2280550>)

In [110]:
def get_number_dead(race,line):
    """Return the first newline-terminated number that follows `race` in `line`.

    The search starts at the first occurrence of the label `race` inside
    `line`. From there, the first run of digits that is immediately followed
    by a newline is converted to a float and returned.

    Returns None when `race` does not occur in `line`.
    Raises IndexError when the label is present but no digit run followed by
    a newline comes after it.
    """
    label_start = line.find(race)
    if label_start == -1:
        # Label absent: nothing to report for this race.
        return None
    text_from_label = line[label_start:]
    digit_runs = re.findall(r'\d+\n', text_from_label)
    # float() tolerates the trailing newline captured by the pattern.
    return float(digit_runs[0])

In [106]:
def get_text_race_table(fn):
    """Return the table portion of the text on the first page of a PDF.

    Parameters
    ----------
    fn : str
        Path of the PDF file to read. Only the first page (index 0) is used.

    Returns
    -------
    str
        Everything after the fifth "." in the text extracted from the page.

    Raises
    ------
    IndexError
        If the extracted text contains fewer than five "." characters.
    """
    # The context manager guarantees the file handle is closed, even when
    # parsing or text extraction raises.
    with open(fn, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # our file has just 1 page
        pageObj = pdfReader.getPage(0)
        # Extract inside the `with` block: the reader may still need to read
        # from the open stream at this point.
        pagecontent = pageObj.extractText()
    # Split on the first five periods; the sixth piece is the remainder,
    # which is where the table data starts.
    lines = pagecontent.split(".",maxsplit=5)
    return lines[5]

In [77]:
# Load the three downloaded CSV files into DataFrames: by-age, by-sex and the
# general case/hospitalization/death file.
by_age_df, by_sex_df, chd_df = (
    pd.read_csv(csv_path)
    for csv_path in (download_file_ba, download_file_bs, download_file_chd)
)

In [117]:
#setup dict with covid death rate by race
covid_racial_death_rate_text = get_text_race_table(download_file_race)
# Semicolon-separated labels to look up in the extracted text. The embedded
# "\n" characters (and the leading space on the third label) are presumably
# there to match how the PDF text is extracted; verify against the PDF before
# changing them.
race_labels = "All Hispanic;Non\n-Hispanic/Latino: Black, African \nAmerica\nn;\
 Non\n-Hispanic/Latino: White;Non\n-Hispanic/Latino: Asian;Non\n-Hispanic/Latino: Other;\
Total \nUnknown \nRace/\nEthnicity"
# Map each label to the number found after it in the text. A label that is
# not present in the text maps to None.
by_race_dict = {
    race: get_number_dead(race, covid_racial_death_rate_text)
    for race in race_labels.split(";")
}

In [78]:
def drop_last_row(df):
    """Remove the final row of `df` in place.

    The row is removed by its index label, and `df` itself is modified;
    nothing is returned. An empty frame is left unchanged.
    """
    last_row_label = df.index[-1:]
    df.drop(index=last_row_label, inplace=True)

In [79]:
def plot_rate(df, xcol,ycol,plot_title="NYC rate"):
    """Show a Plotly Express bar chart of `ycol` against `xcol`.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    xcol : name of the column used for the x axis; it is labelled "DATE".
    ycol : name of the column used for the bar heights.
    plot_title : title placed on the figure.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    # Plotly Express keys `labels` by column name, so the override has to be
    # keyed by `xcol`. A literal 'x' key matches no column and is ignored.
    fig = px.bar(df, x=xcol, y=ycol, labels={xcol: 'DATE'})
    fig.update_layout(title=plot_title)
    fig.show()

In [80]:
def plot_pie(df, values_col, names_col, plot_title="NYC Age/Sex Covid Rate"):
    """Show a Plotly Express pie chart built from `df`.

    `values_col` names the column that sizes the slices, `names_col` names
    the column that labels them, and `plot_title` is the figure title.
    The figure is displayed; nothing is returned.
    """
    pie_options = dict(values=values_col, names=names_col, title=plot_title)
    px.pie(df, **pie_options).show()

In [131]:
#Preprocessing
#Fill unavailable data with 0. fillna returns a new frame instead of changing
#the one it is called on, so its result must be used: the mod_* frames below
#are built from the filled copies and the frames read from CSV stay untouched.
# reset_index also returns a new frame (with the old index added as a column),
# so dropping rows from mod_* below does not affect the original frames.
mod_bs = by_sex_df.fillna(0).reset_index()
mod_ba = by_age_df.fillna(0).reset_index()
mod_chd = chd_df.fillna(0).reset_index()
# Needed to remove citywide totals before pie plot generation
# (assumes the totals are the last row of each file -- TODO confirm).
drop_last_row(mod_bs)
drop_last_row(mod_ba)
#generate necessary dataframe from race dictionary
# mydict is created here so the cell runs on a fresh kernel and does not
# depend on a variable left over from an earlier session.
mydict = {
    "Race Label": list(by_race_dict.keys()),
    "Number dead": list(by_race_dict.values()),
}
by_race_df = pd.DataFrame.from_dict(mydict)

In [132]:
# Bar chart of NEW_COVID_CASE_COUNT per DATE_OF_INTEREST.
plot_rate(
    df=mod_chd,
    xcol="DATE_OF_INTEREST",
    ycol="NEW_COVID_CASE_COUNT",
    plot_title="NYC Covid New Case Rate",
)

In [133]:
# Bar chart of HOSPITALIZED_CASE_COUNT per DATE_OF_INTEREST.
plot_rate(
    df=mod_chd,
    xcol="DATE_OF_INTEREST",
    ycol="HOSPITALIZED_CASE_COUNT",
    plot_title="NYC Covid Hospitalized Case Rate",
)

In [134]:
# Bar chart of DEATH_COUNT per DATE_OF_INTEREST.
plot_rate(
    df=mod_chd,
    xcol="DATE_OF_INTEREST",
    ycol="DEATH_COUNT",
    plot_title="NYC Covid Death Rate",
)

In [135]:
# Pie chart of COVID_CASE_RATE split by SEX_GROUP.
plot_pie(
    df=mod_bs,
    values_col="COVID_CASE_RATE",
    names_col="SEX_GROUP",
    plot_title="NYC Covid Case Rate By Gender",
)

In [136]:
# Pie chart of HOSPITALIZED_CASE_RATE split by SEX_GROUP.
# Title spelling corrected to "Hospitalized".
plot_pie(mod_bs, "HOSPITALIZED_CASE_RATE","SEX_GROUP","NYC Covid Hospitalized Case Rate By Gender")

In [137]:
# Pie chart of DEATH_RATE split by SEX_GROUP.
plot_pie(
    df=mod_bs,
    values_col="DEATH_RATE",
    names_col="SEX_GROUP",
    plot_title="NYC Covid Death Rate by Sex",
)

In [138]:
# Pie chart of COVID_CASE_RATE split by AGE_GROUP.
plot_pie(
    df=mod_ba,
    values_col="COVID_CASE_RATE",
    names_col="AGE_GROUP",
    plot_title="NYC Covid Case Rate by Age",
)

In [139]:
# Pie chart of HOSPITALIZED_CASE_RATE split by AGE_GROUP.
plot_pie(
    df=mod_ba,
    values_col="HOSPITALIZED_CASE_RATE",
    names_col="AGE_GROUP",
    plot_title="NYC Hospitalization Case Rate by Age",
)

In [140]:
# Pie chart of DEATH_RATE split by AGE_GROUP.
plot_pie(
    df=mod_ba,
    values_col="DEATH_RATE",
    names_col="AGE_GROUP",
    plot_title="NYC Covid Death Rate by Age",
)

In [141]:
# Pie chart of the "Number dead" column split by "Race Label".
plot_pie(
    df=by_race_df,
    values_col="Number dead",
    names_col="Race Label",
    plot_title="NYC Covid Death Rate by Race",
)