# In [34]:
import os
import pandas as pd
import datetime as dt
from matplotlib import pyplot as plt
import re

def get_update_num_from_filename(filename):
    """
    Extract the update number from a daily-update CSV filename.

    Parses a filename of type `tabula-Actualizacion_36_COVID-19.csv` and
    returns the update number ("36") in integer format.

    :param filename: name of the CSV file to parse
    :return: the update number as an ``int``
    :raises ValueError: if `filename` does not follow the expected pattern
    """
    # Match against the argument itself (not a module-level loop variable), and
    # escape the dot so only a literal ".csv" extension is accepted.
    m = re.match(
        r'.*[a-z]-.*[A-Za-z]_(?P<num>\d+)_.*[A-Z]-19\.csv',
        filename,
    )
    if m is None:
        raise ValueError(
            'Cannot extract update number from filename: {!r}'.format(filename)
        )
    return int(m.group('num'))


# Folder containing the per-update CSV files
data_folder = '../extracted_data'

# The Ministry of Health numbers its updates, but they have not been published
# every day, so each "update number" is mapped explicitly to the date its data
# corresponds to. Every date below falls in March 2020.
update_num_to_date = {
    update_num: dt.date(2020, 3, day_of_march)
    for update_num, day_of_march in (
        (34, 2), (35, 3), (36, 4), (37, 5), (38, 6),
        # Notice: missing data for 03/7 and 03/8
        (39, 9), (40, 10), (41, 11), (42, 12), (43, 13),
        # Notice: missing data for 03/14 and 03/15
        (46, 16), (47, 17), (48, 18), (49, 19), (50, 20), (51, 21), (52, 22),
    )
}
       
# Per-date Series (indexed by region), collected while walking the CSV files
diagnosed, icu, deaths = {}, {}, {}

for csv_file in sorted(os.listdir(data_folder)):
    # Anything in the folder that is not a CSV file is skipped
    if not csv_file.endswith('.csv'):
        continue

    num_update = get_update_num_from_filename(csv_file)
    update_date = update_num_to_date[num_update]

    # Load the update indexed by region ('CCAA') and
    # drop totals (TODO: check sum matches totals)
    daily_update = pd.read_csv(
        os.path.join(data_folder, csv_file),
        header=0,
        index_col='CCAA',
        dtype={'Total casos': int, 'TOTAL conf.': int},
        decimal=',',
        thousands='.',
    ).drop(['TOTAL', 'Total', 'ESPAÑA'], axis=0, errors='ignore')

    # The "Castilla La Mancha" Autonomous Region is sometimes spelled with a
    # hyphen; store it under the unhyphenated label in that case
    if 'Castilla-La Mancha' in daily_update.index:
        daily_update.loc['Castilla La Mancha'] = daily_update.loc['Castilla-La Mancha']
        daily_update = daily_update.drop('Castilla-La Mancha', axis=0)

    # The diagnosed count is mandatory and appears under one of two headers;
    # 'TOTAL conf.' takes precedence when both are present
    diagnosed_column = next(
        (
            candidate
            for candidate in ('TOTAL conf.', 'Total casos')
            if candidate in daily_update.columns
        ),
        None,
    )
    if diagnosed_column is None:
        raise ValueError('Cant find number of diagnosed')
    diagnosed[update_date] = daily_update[diagnosed_column]

    # ICU admissions and deaths are optional: recorded only when the column exists
    for column, series_by_date in (('Ingreso en UCI', icu), ('Fallecidos', deaths)):
        if column in daily_update.columns:
            series_by_date[update_date] = daily_update[column]

# One row per date, one column per region; missing values become 0
diagnosed = pd.DataFrame(diagnosed).T.fillna(0).astype(int)
icu = pd.DataFrame(icu).T.fillna(0).astype(int)
deaths = pd.DataFrame(deaths).T.fillna(0).astype(int)

# Write the consolidated tables to disk
for table, out_path in (
    (diagnosed, '../consolidated/diagnosed.csv'),
    (icu, '../consolidated/icu.csv'),
    (deaths, '../consolidated/deaths.csv'),
):
    table.to_csv(out_path)