In [1]:
import pandas as pd
import numpy as np
import os
from helper_functions import *
from herams_helper_functions import *

In [2]:
# This first pass is only used to read row 0 (the short column codes that the
# second read below uses as the real header), so stop parsing after one row
# instead of loading the whole workbook twice.
df = pd.read_excel('data/HeRAMS_Mali_COVID_20012021.xlsx', nrows=1)

In [3]:
# Row 0 of the first read holds the short codes, while the sheet's header row
# (the index of that row Series) holds the long labels. Turn the pair into a
# two-column lookup table: Long_Name -> Short_Name.
column_names = (
    df.loc[0]
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'Long_Name', 0: 'Short_Name'})
)

In [4]:
# Lookup tables for the three administrative levels.
regions = pd.read_csv('data/regions.csv')
cercle = pd.read_csv('data/cercle.csv')
commune = pd.read_csv('data/commune.csv')

# Re-read the survey export, this time taking the second sheet row (the short
# codes) as the column header.
df = pd.read_excel('data/HeRAMS_Mali_COVID_20012021.xlsx', header=1)

In [5]:
# NOTE(review): get_serial_column comes from a star-imported helper module;
# presumably it adds a serial index column (third argument) based on the
# columns listed in the second argument - confirm against the helper.
regions = get_serial_column(regions, ['REGION'], 'RegionIndex')
cercle = get_serial_column(
    cercle,
    ['RegionIndex', 'CERCLE'],
    'CercleIndex',
)
commune = get_serial_column(
    commune,
    ['RegionIndex', 'CercleIndex', 'COMMUNE'],
    'CommuneIndex',
)

In [6]:
# workspace_id is dropped in the next step without being used, so confirm it
# carries the same values as GEO1 row by row. The previous check compared
# `.all()` of each column, i.e. two reductions, and never compared the values
# themselves. Rows where both columns are missing count as equal.
_same_value = df['workspace_id'] == df['GEO1']
_both_missing = df['workspace_id'].isna() & df['GEO1'].isna()
assert (_same_value | _both_missing).all(), 'workspace_id and GEO1 differ'

# Drop every column that contains no data at all.
_all_null = df.isnull().all()
cols_to_delete = _all_null[_all_null].index.tolist()
df = df.drop(columns=cols_to_delete)

In [7]:
# GEO1 is '/'-separated text; its two parts become the REGION and CERCLE
# columns (the assignment requires the split to yield exactly two columns).
geo1_parts = df['GEO1'].str.split('/', expand=True)
df[['REGION', 'CERCLE']] = geo1_parts

# Remove the source column and the other columns not carried forward.
df = df.drop(columns=['GEO1', 'workspace_id', 'GEO3', 'HFOXY2other'])

In [8]:
# Target column layout for the final output: the first three columns, then the
# last two columns (REGION and CERCLE when the previous cell has just appended
# them), then everything in between.
_current_columns = list(df.columns)
original_column_order = (
    _current_columns[:3] + _current_columns[-2:] + _current_columns[3:-2]
)

# Shape snapshot; later cells assert that df.shape still equals it.
original_shape = df.shape

In [9]:
# Free-text columns passed through the clean_string helper.
# NOTE(review): clean_string is defined in a helper module; later comparisons
# use upper-case values, so it presumably upper-cases the text - confirm.
cols_to_clean_string = [
    'REGION', 'CERCLE', 'GEO6',
    'MoSD2', 'MoSD3', 'MoSD3other', 'MoSD6',
]
df = clean_string(df, cols_to_clean_string)

In [10]:
# Spelling normalisations, one dict per column handed to replace_values in the
# next cell (called with ['REGION', 'CERCLE'], so presumably the first dict is
# for REGION and the second for CERCLE - confirm against the helper).
_region_spelling = {
    'SÉGOU': 'SEGOU',
    'MÉNAKA': 'MENAKA',
}
_cercle_spelling = {
    'DIÉMA': 'DIEMA',
    'KÉNIÉBA': 'KENIEBA',
    'DIOÏLA': 'DIOILA',
    'MÉNAKA': 'MENAKA',
    'KALABANCORO': 'KALABAN CORO',
    'NIAFOUNKÉ': 'NIAFUNKE',
    'TENENKOUN': 'TENENKOU',
    'BAROUÉLI': 'BAROUELI',
    'SÉGOU': 'SEGOU',
    'DIRÉ': 'DIRE',
}
values_to_replace = [_region_spelling, _cercle_spelling]

In [11]:
# Apply the spelling fixes, then attach the region and cercle lookup tables.
# NOTE(review): replace_values and merge_columns are opaque helpers; the last
# merge_columns argument presumably lists the text columns to drop after the
# join (the next cell renames the *Index columns back to REGION/CERCLE) -
# confirm against the helper.
df = replace_values(df, ['REGION', 'CERCLE'], values_to_replace)
df = merge_columns(
    df, regions,
    ['REGION'], ['REGION'],
    ['REGION'],
)
df = merge_columns(
    df, cercle,
    ['RegionIndex', 'CERCLE'], ['RegionIndex', 'CERCLE'],
    ['CERCLE'],
)

In [12]:
# The numeric index columns take over the REGION / CERCLE names.
_index_to_name = {'RegionIndex': 'REGION', 'CercleIndex': 'CERCLE'}
df = df.rename(columns=_index_to_name)

In [13]:
# Build the GEO6 lookup: one row per distinct (REGION, CERCLE, GEO6)
# combination, sorted, numbered from 1 in a leading GEO6Index column.
a = (
    df[['REGION', 'CERCLE', 'GEO6']]
    .drop_duplicates()
    .sort_values(['REGION', 'CERCLE', 'GEO6'])
    .reset_index(drop=True)   # discard the original row labels
    .reset_index()            # expose the new 0-based position as 'index'
    .rename(columns={'index': 'GEO6Index'})
)
a['GEO6Index'] = a['GEO6Index'] + 1
a.to_csv('data/herams_GEO6_localite.csv', index=False)

In [14]:
# Attach the GEO6 lookup built above and let its numeric index take over the
# GEO6 column name.
# NOTE(review): merge_columns presumably drops the text GEO6 column named in
# its last argument before the rename - confirm against the helper.
_geo6_keys = ['REGION', 'CERCLE', 'GEO6']
df = merge_columns(df, a, _geo6_keys, ['REGION', 'CERCLE', 'GEO6'], ['GEO6'])
df = df.rename(columns={'GEO6Index': 'GEO6'})

In [15]:
# MoSD3 category table, extended with two extra laboratory categories.
# NOTE(review): load_category_csv / append_categories are helper functions;
# the dict keys are presumably the IDs given to the new categories - confirm.
mosd3_append = {
    10: 'LABORATOIRE P2',
    11: 'LABORATOIRE P3',
}
mosd3 = append_categories(load_category_csv('MoSD3'), mosd3_append)

In [16]:
# Category tables loaded as-is, in the same order as before.
mosd4, mosd5, mosd7, hffunct = (
    load_category_csv(category_name)
    for category_name in ('MoSD4', 'MoSD5', 'MoSD7', 'HFFUNCT')
)

In [17]:
# Category tables derived from the data itself: one create_category_csv call
# per source column, in the same order as before. Targets and source columns
# are listed in matching order.
(
    imst1,
    scren1,
    triage1,
    hfinp3,
    hfinp1,
    hfinp1y,
    lab1x,
    hfipc3x,
    hfgaps,
) = [
    create_category_csv(df, source_column)
    for source_column in (
        'IMST1',
        'SCREN1',
        'TRIAGE1',
        'HFINP3',
        'HFINP1_SQ001',
        'HFINP1y',
        'LAB1x_1',
        'HFIPC3x_1',
        'HFGAPS1_1',
    )
]

In [18]:
# One-off label harmonisations: (column, label to replace, replacement).
# NOTE(review): for HFIPC3 'Disponible et suffisant' becomes 'Suffisante',
# whereas the later loop over hfinp1x maps the same label to
# 'Disponible et suffisante' - confirm this difference is intended.
label_fixes = [
    ('MoSD3', 'CSREF: CENTRE DE SANTÉ DE RÉFÉRENCE', 'CENTRE DE SANTÉ DE RÉFÉRENCE'),
    ('LAB1', 'Pas prévu', 'Non prévu'),
    ('DIAG1', 'Pas prévu', 'Non prévu'),
    ('DIAG2', 'Pas prévu', 'Non prévu'),
    ('DIAG3', 'Pas prévu', 'Non prévu'),
    ('HFIPC3', 'Disponible et suffisant', 'Suffisante'),
]
for fix_column, old_label, new_label in label_fixes:
    df.loc[df[fix_column] == old_label, fix_column] = new_label

In [19]:
# Extra categories added to three of the derived tables.
# NOTE(review): the dict keys are presumably the IDs assigned by
# append_categories - confirm against the helper.
hfinp1_append = {
    5: 'Insuffisante',
    6: 'Disponible',
    7: 'Disponible et suffisante',
}
hfinp1 = append_categories(hfinp1, hfinp1_append)

hfipc3x_append = {
    7: 'Combustion à ciel ouvert',
    8: 'Autre',
}
hfipc3x = append_categories(hfipc3x, hfipc3x_append)

hfgaps_append = {6: 'Autre'}
hfgaps = append_categories(hfgaps, hfgaps_append)

In [20]:
# Sanity check: df must still have the shape recorded in original_shape.
assert df.shape == original_shape

In [21]:
# Each categorical column paired with the category table that encodes it; the
# two parallel lists used by later cells are derived from these pairs.
_column_table_pairs = [
    ('MoSD3', mosd3),
    ('MoSD4', mosd4),
    ('MoSD5', mosd5),
    ('MoSD7', mosd7),
    ('HFFUNCT', hffunct),
    ('IMST1', imst1),
    ('SCREN1', scren1),
    ('TRIAGE1', triage1),
    ('HFINP3', hfinp3),
    ('HFINP1_SQ001', hfinp1),
    ('HFINP1y', hfinp1y),
    ('LAB1x_1', lab1x),
    ('HFIPC3x_1', hfipc3x),
    ('HFGAPS1_1', hfgaps),
]
list_of_unique_categorical_columns = [name for name, _ in _column_table_pairs]
list_of_unique_categorical_dataframes = [table for _, table in _column_table_pairs]

# Tables that are reused to encode further columns in a later cell.
reduced_list_of_categorical_dataframes = [
    triage1, hfinp3, hfinp1, hfinp1y, lab1x, hfipc3x, hfgaps,
]
assert len(list_of_unique_categorical_dataframes) == len(list_of_unique_categorical_columns)

In [22]:
# Sanity check: df must still have the shape recorded in original_shape.
assert df.shape == original_shape

In [23]:
# Groups of columns that share a category table. The groups are defined in the
# same order as reduced_list_of_categorical_dataframes, which the final list
# must match position by position.
triage1x = ['DIAG2', 'DIAG3']
hfinp3x = [
    'SCREN1x_SQ001', 'SCREN1x_SQ002', 'SCREN1x_SQ003',
    'HFOXY2_SQ001', 'HFOXY2_SQ002',
]
hfinp1x = [
    'HFINP1_SQ002', 'HFINP1_SQ003',
    'LAB1', 'DIAG1',
    'HFEQP1_SQ001', 'HFEQP1_SQ002', 'HFEQP1_SQ003', 'HFEQP1_SQ004', 'HFEQP1_SQ005',
    'HFOXY1',
    'HRCAP1_SQ001', 'HRCAP1_SQ002', 'HRCAP1_SQ003', 'HRCAP1_SQ004', 'HRCAP1_SQ005',
    'HRCAP1_SQ006', 'HRCAP1_SQ007', 'HRCAP1_SQ008', 'HRCAP1_SQ009', 'HRCAP1_SQ010',
    'HFIPC1',
    'HFIPC2_SQ001', 'HFIPC2_SQ002', 'HFIPC2_SQ003', 'HFIPC2_SQ004',
    'HFIPC2_SQ005', 'HFIPC2_SQ006', 'HFIPC2_SQ007', 'HFIPC2_SQ008',
    'HFIPC3', 'MORGUE1', 'MORGUE2',
    'HFIPC2_SQ009',
]
hfinp1yx = ['HFEQP1y', 'HRCAP1y', 'HFIPC2y']
lab1xx = [
    'LAB1x_2', 'LAB1x_3', 'LAB1x_4',
    'DIAG1x_1', 'DIAG1x_2', 'DIAG1x_3', 'DIAG1x_4',
    'DIAG2x_1', 'DIAG2x_2',
    'HFIPC1x_1', 'HFIPC1x_2', 'HFIPC1x_3', 'HFIPC1x_4',
    'MORGUE1x_1', 'MORGUE1x_2', 'MORGUE1x_3', 'MORGUE1x_4', 'MORGUE1x_5',
    'HFFUNCTx_1', 'HFFUNCTx_2',
    'DIAG3x_1', 'DIAG3x_2', 'DIAG3x_3', 'DIAG3x_4',
]
hfipc3xx = ['HFIPC3x_2', 'HFIPC3x_3']
hfgapsx = ['HFGAPS1_2', 'HFGAPS1_3']

list_of_column_lists = [
    triage1x, hfinp3x, hfinp1x, hfinp1yx, lab1xx, hfipc3xx, hfgapsx,
]
assert len(reduced_list_of_categorical_dataframes) == len(list_of_column_lists)

In [24]:
# Sanity check: df must still have the shape recorded in original_shape.
assert df.shape == original_shape

In [25]:
# Rewrite the masculine spellings as their feminine forms in every column of
# the hfinp1x group, so one spelling per label remains.
feminine_forms = {
    'Suffisant': 'Suffisante',
    'Insuffisant': 'Insuffisante',
    'Disponible mais insuffisant': 'Disponible mais insuffisante',
    'Disponible et suffisant': 'Disponible et suffisante',
}
for col in hfinp1x:
    for masculine, feminine in feminine_forms.items():
        df.loc[df[col] == masculine, col] = feminine

In [26]:
# Replace each categorical text column by the ID of the matching CATEGORIES
# row in its table. It is a left join, so labels with no match end up as NaN;
# the re-encoded column is re-created under its old name at the end of df.
for col, category_table in zip(list_of_unique_categorical_columns,
                               list_of_unique_categorical_dataframes):
    df = (
        df.merge(category_table, how='left', left_on=col, right_on='CATEGORIES')
        .drop(columns=['CATEGORIES', col])
        .rename(columns={'ID': col})
    )

# The joins must neither duplicate rows nor change the number of columns.
assert df.shape == original_shape

In [27]:
# Encode every column group with the category table at the same position.
# NOTE(review): convert_to_numerical_category is a helper function; it
# presumably maps labels to IDs in the listed columns - confirm.
for data, list_of_columns in zip(reduced_list_of_categorical_dataframes,
                                 list_of_column_lists):
    df = convert_to_numerical_category(df, data, list_of_columns)

In [28]:
# Parse the two timestamp columns into pandas datetimes.
datetime_columns = ('last_synced', 'date')
for col in datetime_columns:
    df[col] = pd.to_datetime(df[col])

In [29]:
# Names of all float-typed columns ...
_is_float = df.dtypes == float
float_columns = _is_float[_is_float].index.tolist()

# ... minus the two MoSDGPS columns, which stay float (presumably GPS
# coordinates - confirm). remove() raises ValueError if either is absent.
for gps_column in ('MoSDGPS_SQ001', 'MoSDGPS_SQ002'):
    float_columns.remove(gps_column)

In [30]:
# Cast the selected float columns to pandas' nullable integer dtype so that
# missing values survive without forcing a float representation.
df = df.astype({float_column: 'Int64' for float_column in float_columns})

In [31]:
# Final shape check, then put the columns back into the recorded order
# (a missing column name raises KeyError).
assert original_shape == df.shape
df = df.loc[:, original_column_order]

In [32]:
# Persist the normalised table without the row index.
df.to_csv(path_or_buf='data/HeRAMS_Mali_COVID_Normalized.csv', index=False)

In [33]:
# Write one CSV per category table, named after the column it encodes.
# NOTE(review): the ID column is dropped before writing, so only the remaining
# columns are persisted; the IDs are presumably rebuilt from row order when
# the file is loaded again - confirm against load_category_csv.
for col, d in zip(list_of_unique_categorical_columns,
                  list_of_unique_categorical_dataframes):
    d = d.drop(columns='ID')
    d.to_csv(f'data/herams_{col}_categories.csv', index=False)

In [34]:
# Restrict and reorder the long/short name lookup to the columns that are
# actually present in the final table. The right join keeps every df column,
# including ones that have no long name.
df_columns = pd.Series(df.columns).to_frame(name='Short_Name')
column_names = column_names.merge(df_columns, on='Short_Name', how='right')

In [35]:
# Set the long names for the two columns derived in this notebook.
long_name_overrides = (
    ('REGION', 'Region'),
    ('CERCLE', 'District Sanitaire'),
)
for short_name, long_name in long_name_overrides:
    column_names.loc[column_names['Short_Name'] == short_name, 'Long_Name'] = long_name

In [36]:
# Persist the long/short column-name lookup without the row index.
column_names.to_csv(path_or_buf='data/herams_covid_column_names.csv', index=False)

In [37]:
# Spot check: distribution of the encoded MoSD3 IDs.
df['MoSD3'].value_counts()

4     64
9     15
5      3
10     2
11     2
Name: MoSD3, dtype: int64

In [38]:
# Reload the untouched export to compare against the encoded values.
df2 = pd.read_excel(io='data/HeRAMS_Mali_COVID_20012021.xlsx', header=1)

In [39]:
# Spot check: distribution of the raw MoSD3 labels; the counts should line up
# with those of the encoded IDs.
df2['MoSD3'].value_counts()

CSREF: Centre de Santé de Référence    64
Hôpital                                15
Other                                   3
Laboratoire P2                          2
Laboratoire P3                          2
Name: MoSD3, dtype: int64