In [1]:
# import requests

# url = "https://data.brasil.io/dataset/covid19/caso_full.csv.gz"
# with open("caso_full.csv.gz", "wb") as f:
#     r = requests.get(url)
#     f.write(r.content)

In [2]:
# import gzip
# import shutil
# with gzip.open('caso_full.csv.gz', 'rb') as f_in:
#     with open('caso_full.csv', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [3]:
import pandas as pd

# Load caso_full.csv, shorten the two "last_available_*" count columns, and keep
# only rows that carry an IBGE code so that column can be cast to int.
df = (
    pd.read_csv("caso_full.csv")
    .rename(columns={"last_available_confirmed": "confirmed", "last_available_deaths": "deaths"})
    .loc[lambda frame: frame.city_ibge_code.notnull()]
    .astype({"city_ibge_code": int})
)
print(df)

city  city_ibge_code        date  epidemiological_week  \
0       São Paulo         3550308  2020-02-25                     9   
1             NaN              35  2020-02-25                     9   
2       São Paulo         3550308  2020-02-26                     9   
3             NaN              35  2020-02-26                     9   
4       São Paulo         3550308  2020-02-27                     9   
...           ...             ...         ...                   ...   
332535        NaN              43  2020-07-08                    28   
332536        NaN              42  2020-07-08                    28   
332537        NaN              28  2020-07-08                    28   
332538        NaN              35  2020-07-08                    28   
332539        NaN              17  2020-07-08                    28   

        estimated_population_2019  is_last  is_repeated  confirmed  \
0                      12252023.0    False        False          1   
1                   

In [111]:
import numpy as np

def retrieve_data_for_all_cities(state):
    """Build one city-level table per date, padded so every city appears on every date.

    Reads the module-level ``df`` and keeps only the rows whose ``place_type``
    is ``"city"``. For each date, every city code that has no row on that date
    is added with ``NaN`` in ``confirmed`` and ``deaths``.

    Parameters
    ----------
    state : bool
        When truthy, only rows whose ``state`` column equals ``"PR"`` are kept;
        otherwise every city row is used.

    Returns
    -------
    list
        ``[date, DataFrame]`` pairs in the order produced by grouping on
        ``date``. Each frame has the columns ``city_ibge_code``, ``date``,
        ``confirmed`` and ``deaths`` and a fresh ``0..n-1`` index: the rows
        present in the data come first, followed by the padded rows.
    """
    columns = ["city_ibge_code", "date", "confirmed", "deaths"]

    # filter out states and imported cases
    cities = df[df.place_type.eq("city")]
    if state:
        cities = cities[cities.state.eq("PR")]
    cities = cities[columns]

    # every city code that occurs anywhere in the selection (loop-invariant)
    all_codes = pd.Series(cities["city_ibge_code"].unique())

    by_dates = []
    for date, items in cities.groupby("date"):
        # codes that have no row on this date
        not_in_list = all_codes[~all_codes.isin(items["city_ibge_code"])]

        if not_in_list.empty:
            # nothing to pad; still renumber the rows like the padded case does
            padded = items.reset_index(drop=True)
        else:
            new_data = pd.DataFrame(
                {
                    "city_ibge_code": not_in_list.to_numpy(),
                    "date": date,
                    "confirmed": np.nan,
                    "deaths": np.nan,
                },
                columns=columns,
            )
            # DataFrame.append was removed in pandas 2.0; concat is the replacement
            padded = pd.concat([items, new_data], ignore_index=True)

        by_dates.append([date, padded])
    return by_dates

In [170]:
def retrieve_data_fixed(state, steps=-1):
    """Return per-date city tables with gaps carried forward from the previous date.

    Starts from ``retrieve_data_for_all_cities(state)``. Missing ``confirmed`` /
    ``deaths`` values on the first date become 0; on every later date a city
    with a missing ``confirmed`` value takes that city's values from the
    previous (already filled) date. Anything still missing afterwards is set
    to 0.

    Parameters
    ----------
    state : bool
        Forwarded unchanged to ``retrieve_data_for_all_cities``.
    steps : int, optional
        Step used when walking the dates from the last one back to the first
        (``range(len - 1, -1, steps)``). The default ``-1`` returns every date
        in reverse order; ``-2`` returns every second date, and so on. A
        positive value selects nothing and ``0`` raises ``ValueError``.

    Returns
    -------
    list
        ``[date, DataFrame]`` pairs in reverse of the input order. Each frame
        has integer ``confirmed`` / ``deaths`` columns and is sorted by
        ``city_ibge_code``.
    """
    value_cols = ["confirmed", "deaths"]
    fixed_data = retrieve_data_for_all_cities(state)

    filled = []
    prev_items = None
    for date, items in fixed_data:
        # work on a copy so the frames handed to us are left untouched
        items = items.copy()

        missing = items["confirmed"].isnull()
        if prev_items is not None and missing.any():
            # Look the previous values up by IBGE code. Row positions differ
            # from one date to the next, so positional/index alignment would
            # copy another city's row (code included) into the gap.
            lookup = (
                prev_items.drop_duplicates("city_ibge_code", keep="last")
                .set_index("city_ibge_code")[value_cols]
            )
            codes = items.loc[missing, "city_ibge_code"].to_numpy()
            items.loc[missing, value_cols] = lookup.reindex(codes).to_numpy()

        # fill remaining with zero (first date, or codes unknown on the previous date)
        items[value_cols] = items[value_cols].fillna(0)

        filled.append([date, items])
        prev_items = items

    smaller_date = []
    for i in range(len(filled) - 1, -1, steps):
        date, items = filled[i]
        items = items.astype({"confirmed": int, "deaths": int})
        items = items.sort_values(by='city_ibge_code', ascending=True)
        smaller_date.append([date, items])

    return smaller_date

In [171]:
def to_csv(pr, name):
    """Write every table returned by ``retrieve_data_fixed(pr)`` into one CSV file.

    Each table is reduced to the columns ``date``, ``z`` (renamed from
    ``city_ibge_code``), ``c`` (``confirmed``) and ``d`` (``deaths``). The
    tables are written one after another in the order they are returned, with
    a single header row at the top of the file.

    Parameters
    ----------
    pr : bool
        Forwarded unchanged to ``retrieve_data_fixed``.
    name : str
        Path of the CSV file to create or overwrite.
    """
    short_names = {"city_ibge_code": "z", "confirmed": "c", "deaths": "d"}

    chunks = []
    for position, (_, frame) in enumerate(retrieve_data_fixed(pr)):
        compact = frame.rename(columns=short_names)[["date", "z", "c", "d"]]
        # only the very first chunk carries the header row
        chunks.append(compact.to_csv(header=(position == 0), index=False))

    with open(name, 'w') as outfile:
        outfile.write("".join(chunks))


In [172]:
def to_heatmap_csv(pr, name):
    """Write the first table returned by ``retrieve_data_fixed(pr)`` as a two-column CSV.

    Only ``z`` (renamed from ``city_ibge_code``) and ``c`` (renamed from
    ``confirmed``) are written, with a header row and without the index.

    Parameters
    ----------
    pr : bool
        Forwarded unchanged to ``retrieve_data_fixed``.
    name : str
        Path of the CSV file to create or overwrite.
    """
    snapshots = retrieve_data_fixed(pr)
    # first entry of the returned list; presumably the most recent date — verify against retrieve_data_fixed
    _, snapshot = snapshots[0]
    heat = snapshot.rename(columns={"city_ibge_code": "z", "confirmed": "c", "deaths": "d"})
    heat[["z", "c"]].to_csv(name, index=False, header=True)


In [173]:
# Write the per-date tables first, then the heatmap snapshots.
# The flag is passed straight through: True restricts to state "PR", False keeps every city.
for only_pr, target in ((True, "../public/data/pr_ndays.csv"), (False, "../public/data/br_ndays.csv")):
    to_csv(only_pr, target)

for only_pr, target in ((True, "../public/data/pr_heatmap.csv"), (False, "../public/data/br_heatmap.csv")):
    to_heatmap_csv(only_pr, target)

In [192]:
# Show the column names of the loaded table (the two renamed count columns appear as "confirmed" and "deaths").
df.columns

Index(['city', 'city_ibge_code', 'date', 'epidemiological_week',
       'estimated_population_2019', 'is_last', 'is_repeated', 'confirmed',
       'last_available_confirmed_per_100k_inhabitants', 'last_available_date',
       'last_available_death_rate', 'deaths', 'order_for_place', 'place_type',
       'state', 'new_confirmed', 'new_deaths'],
      dtype='object')

In [191]:
# pr_df = retrieve_data_fixed(True, -1)
# test = df[df.place_type.eq("city")]
# test = test[test.state.eq("PR")]

# top_pr_cities = test.sort_values('confirmed', ascending=False).drop_duplicates('city_ibge_code').head(8).sort_values('confirmed', ascending=False)['city_ibge_code']

# by_dates = [city for city in test.groupby('date')]

# a = ""
# for i in range(len(by_dates)):
#     date, items = pr_df[i]
#     items = items.rename(columns={"city_ibge_code": "z", "confirmed": "c", "deaths": "d"})
#     items = items[["date", "z", "c", "d"]]
#     items = items[items["z"].isin(top_pr_cities)]

#     a += items.to_csv(header= i==0, index=False)
#     pr_df[i] = [date, items]

# # with open("../public/data/pr_topcities_alldays.csv", 'w') as outfile:
# #     outfile.write(a)


330333    4106902
330309    4104808
330428    4113700
325110    4115200
325270    4127700
325150    4118204
330583    4125506
325160    4119152
Name: city_ibge_code, dtype: int64


In [194]:
# Preview of the CSV for the eight cities with the most confirmed cases in the
# first table returned for state "PR".
# retrieve_data_fixed as defined in this notebook takes only `state`; passing a
# second positional argument raises TypeError, so it is called with one argument.
pr_df = retrieve_data_fixed(True)
top_pr_cities = pr_df[0][1].nlargest(8, 'confirmed')["city_ibge_code"].tolist()
print(top_pr_cities)

chunks = []
for i, (date, items) in enumerate(pr_df):
    items = items.rename(columns={"city_ibge_code": "z", "confirmed": "c", "deaths": "d"})
    items = items[["date", "z", "c", "d"]]
    items = items[items["z"].isin(top_pr_cities)]

    # only the first chunk carries the header row
    chunks.append(items.to_csv(header=(i == 0), index=False))
    pr_df[i] = [date, items]

# join once instead of growing a string inside the loop
a = "".join(chunks)
print(a)
# with open("../public/data/pr_topcities_alldays.csv", 'w') as outfile:
    # outfile.write(a)


4106902,2666,110
2020-06-22,4113700,1108,59
2020-06-22,4115200,763,11
2020-06-22,4118204,162,5
2020-06-22,4119152,255,7
2020-06-22,4125506,283,18
2020-06-22,4127700,442,4
2020-06-21,4104808,1751,31
2020-06-21,4106902,2487,103
2020-06-21,4113700,1095,58
2020-06-21,4115200,722,11
2020-06-21,4118204,162,5
2020-06-21,4119152,238,7
2020-06-21,4125506,281,16
2020-06-21,4127700,430,4
2020-06-20,4104808,1714,28
2020-06-20,4106902,2375,99
2020-06-20,4113700,1057,56
2020-06-20,4115200,712,11
2020-06-20,4118204,161,4
2020-06-20,4119152,227,7
2020-06-20,4125506,267,16
2020-06-20,4127700,397,4
2020-06-19,4104808,1520,26
2020-06-19,4106902,2253,98
2020-06-19,4113700,1026,56
2020-06-19,4115200,673,11
2020-06-19,4118204,160,4
2020-06-19,4119152,224,7
2020-06-19,4125506,260,16
2020-06-19,4127700,365,4
2020-06-18,4104808,1327,25
2020-06-18,4106902,2060,95
2020-06-18,4113700,982,52
2020-06-18,4115200,558,11
2020-06-18,4118204,133,4
2020-06-18,4119152,216,6
2020-06-18,4125506,250,15
2020-06-18,4127700,336