<a href="https://colab.research.google.com/github/krldlamini/edm_brazildengue/blob/main/Data_Analysis/notebooks/ZeroesCases_Filtered.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook counts, for each municipality, the number of weeks in which no cases were reported.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Helper functions (pasted in for use by this notebook)

In [None]:
# Processing of cases as a whole, for any table obtained after the initial clean
# covers every year found in the data (capped at 2021) so that the full weekly time series is known
# weeks without a record are filled with zero; both the CASES and IMPORTED columns are carried through
# Code developed by Denise Cammarota

def process_dates(df):
    """Expand weekly counts into a gap-free weekly series.

    For every year between the smallest and largest ``SIN_YEAR`` found in
    ``df`` (the upper bound is capped at 2021), one output row is produced per
    Sunday that falls inside that calendar year. Weeks with no matching record
    in ``df`` get zero cases and zero imported cases.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SIN_YEAR``, ``SIN_WEEK``, ``CASES`` and
        ``IMPORTED``.

    Returns
    -------
    pandas.DataFrame
        Columns ``SIN_YEAR``, ``SIN_WEEK`` (1-based position of the Sunday
        within its year), ``FIRST_DAY`` (that Sunday as ``YYYY-MM-DD``),
        ``CASES`` and ``IMPORTED``.
    """
    first_year = min(df['SIN_YEAR'])
    # Never build weeks beyond 2021, even if later years are present.
    last_year = min(max(df['SIN_YEAR']), 2021)

    records = []
    for year in np.arange(first_year, last_year + 1, 1):
        # All Sundays of the calendar year, i.e. the first day of each week.
        sundays = pd.date_range(
            start=str(year) + '-01-01',
            end=str(year) + '-12-31',
            freq='W-SUN',
            inclusive='left',
        )
        in_year = df['SIN_YEAR'] == year
        for position, sunday in enumerate(sundays):
            # NOTE(review): the lookup uses the 0-based position while the
            # emitted SIN_WEEK is position + 1 -- confirm this matches the
            # numbering convention of the input SIN_WEEK column.
            matches = df[(df['SIN_WEEK'] == position) & in_year]
            if matches.empty:
                cases, imported = 0, 0
            else:
                # Only the first matching record is used.
                first_match = matches.iloc[0]
                cases = first_match['CASES']
                imported = first_match['IMPORTED']
            records.append((
                year,
                position + 1,
                sunday.date().strftime('%Y-%m-%d'),
                cases,
                imported,
            ))

    return pd.DataFrame(
        records,
        columns=['SIN_YEAR', 'SIN_WEEK', 'FIRST_DAY', 'CASES', 'IMPORTED'],
    )

In [None]:
# Processing of cases separating imported cases for each municipality
# years 2007 onwards actually have the TPAUTOCTO field in order to filter this
# Code developed by Denise Cammarota

def process_imported(data_filtered_3, data_filtered_1, year):
    """Attach the weekly number of imported cases to the weekly case totals.

    Parameters
    ----------
    data_filtered_3 : pandas.DataFrame
        Weekly totals with the columns ``SIN_WEEK``, ``SIN_YEAR`` and
        ``CASES``.
    data_filtered_1 : pandas.DataFrame
        The rows the totals were counted from, with the columns ``SIN_WEEK``,
        ``SIN_YEAR`` and ``TPAUTOCTO``. Only read when ``year >= 2007``.
    year : int
        Year being processed. ``TPAUTOCTO`` is only used from 2007 onwards.

    Returns
    -------
    pandas.DataFrame
        For ``year >= 2007``: one row per row of ``data_filtered_3`` with the
        columns ``SIN_WEEK``, ``SIN_YEAR``, ``CASES``, ``IMPORTED``.
        Otherwise: a copy of ``data_filtered_3`` with ``IMPORTED`` set to 0.
    """
    if year < 2007:
        df = data_filtered_3.copy()
        df['IMPORTED'] = 0
        return df

    keys = ['SIN_WEEK', 'SIN_YEAR']

    # Rows without a TPAUTOCTO value cannot be classified; they simply do not
    # contribute to the imported count.
    flags = data_filtered_1[keys + ['TPAUTOCTO']].dropna(subset=['TPAUTOCTO'])
    # Blank entries (' ') are recoded to 3 so the column can be cast to int;
    # only code 2 is counted as an imported case.
    codes = flags['TPAUTOCTO'].replace(' ', 3).astype(int)
    flags = flags.assign(IMPORTED=(codes == 2).astype(int))
    imported = flags.groupby(keys)['IMPORTED'].sum().reset_index()

    # Join on the week/year keys rather than on row position: a week whose
    # rows all lack TPAUTOCTO is absent from `imported`, and a positional
    # assignment would then shift CASES onto the wrong weeks.
    df = data_filtered_3[keys + ['CASES']].merge(imported, on=keys, how='left')
    df['IMPORTED'] = df['IMPORTED'].fillna(0).astype(int)
    return df

In [None]:
# Processing of cases as whole for any municipio, given the IBGE code
# years 2007 to 2021, compatibilized in order to know the total time series
# includes imported and sum of all cases
# Code developed by Denise Cammarota

def process_municipality(id_municip):
    """Build the complete weekly case series for one municipality.

    Reads the yearly files ``dengue_BR_<year>.csv`` for 2007 through 2021,
    keeps the rows whose ``ID_MUNICIP`` equals ``id_municip``, counts them per
    (``SIN_WEEK``, ``SIN_YEAR``), adds the imported-case count through
    ``process_imported`` and finally hands the stacked yearly tables to
    ``process_dates``.

    Parameters
    ----------
    id_municip
        Value compared against the ``ID_MUNICIP`` column (the IBGE code of
        the municipality).

    Returns
    -------
    pandas.DataFrame
        The table returned by ``process_dates``.
    """
    file_path = 'drive/MyDrive/Dengue_BR/DataBR_Processed/dengue_BR_'

    # Collect one table per year and concatenate once at the end;
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the
    # accumulated frame on every call.
    yearly_tables = []
    for year in range(2007, 2022):
        data_test = pd.read_csv(file_path + str(year) + '.csv',
                                delimiter=';',
                                index_col=False,
                                parse_dates=['DT_SIN_PRI', 'SEM_PRI', 'DT_NOTIFIC', 'SEM_NOT'])
        # The first column is an unnamed leftover column and is not needed.
        data_test = data_test.drop(columns=['Unnamed: 0'])

        # Keep only the rows of the requested municipality.
        data_filtered_1 = data_test[data_test['ID_MUNICIP'] == id_municip]
        if data_filtered_1.empty:
            continue

        # One row per (week, year) holding the number of records.
        data_filtered_3 = (
            data_filtered_1.groupby(['SIN_WEEK', 'SIN_YEAR'])
            .size()
            .to_frame(name='CASES')
            .reset_index()
        )
        yearly_tables.append(process_imported(data_filtered_3, data_filtered_1, year))

    # With no matching rows in any year an empty, column-less frame is passed
    # on, so process_dates sees the same input as it would have before.
    data_total = pd.concat(yearly_tables) if yearly_tables else pd.DataFrame()

    # After all years are processed, add the first day of every week.
    return process_dates(data_total)

Read in Municipalities

In [None]:
# Load the municipality list and keep the first ten ID_MUNICIP entries.
id_municipalities = pd.read_csv('filtered_municipalities.csv')
first_10 = id_municipalities['ID_MUNICIP'].head(10)

In [None]:
# For each of the first ten municipalities, tally the weeks in 2007-2021 that
# reported zero cases and the total number of cases over the same window.
# Every call to process_municipality re-reads all yearly CSV files.
num_zeroes, num_cases = [], []

for value in first_10:
    df = process_municipality(value)
    in_window = (df['SIN_YEAR'] >= 2007) & (df['SIN_YEAR'] <= 2021)
    df = df[in_window]
    zero_weeks = df[df['CASES'] == 0]
    num_zeroes.append(len(zero_weeks))
    num_cases.append(sum(df['CASES']))

In [None]:
# Summary table for the first ten municipalities; a copy sorted by the
# number of zero-case weeks is written to disk.
batch_10 = pd.DataFrame({
    'Municipality': first_10,
    'Zeroes': num_zeroes,
    'Cases': num_cases,
})

batch_10_sorted = batch_10.sort_values('Zeroes')
batch_10_sorted.to_csv('first_10_municipalities.csv')

In [None]:
# Display the summary table (batch_10 itself, not the sorted copy).
batch_10

Unnamed: 0,Municipality,Zeroes,Cases
0,310620,20,482394
1,230440,15,239158
2,520870,15,220731
3,330455,16,213607
4,350950,46,176023
5,354980,40,165765
6,530010,32,162327
7,520140,15,147775
8,354340,58,146677
9,355030,39,140222


In [None]:
# Reload the municipality list and keep the first thirty ID_MUNICIP entries.
id_municipalities = pd.read_csv('filtered_municipalities.csv')
first_30 = id_municipalities['ID_MUNICIP'].head(30)

In [None]:
# For each of the first thirty municipalities, tally the weeks in 2007-2021
# that reported zero cases and the total number of cases over that window.
# Every call to process_municipality re-reads all yearly CSV files.
num_zeroes, num_cases = [], []

for value in first_30:
    df = process_municipality(value)
    in_window = (df['SIN_YEAR'] >= 2007) & (df['SIN_YEAR'] <= 2021)
    df = df[in_window]
    zero_weeks = df[df['CASES'] == 0]
    num_zeroes.append(len(zero_weeks))
    num_cases.append(sum(df['CASES']))

In [None]:
# Summary table for the first thirty municipalities; a copy sorted by the
# number of zero-case weeks is written to disk.
batch_30 = pd.DataFrame({
    'Municipality': first_30,
    'Zeroes': num_zeroes,
    'Cases': num_cases,
})

batch_30_sorted = batch_30.sort_values('Zeroes')
batch_30_sorted.to_csv('first_30_municipalities.csv')