In [1]:
import numpy as np
import pandas as pd
import os
from helper_functions import *
from herams_helper_functions import *

In [2]:
# First pass over the workbook with the default header row (row 0). The first
# data row of this frame is used in the next cell to build a column-name lookup.
df = pd.read_excel('data/HeRAMS_Mali20012021.xlsx', header=0)

In [3]:
# Build a two-column lookup from the first data row of the raw sheet:
# the header labels become 'Long_Name', the values in row 0 become 'Short_Name'.
column_names = (
    df.loc[0, :]
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'Long_Name', 0: 'Short_Name'})
)

In [4]:
# Second pass over the workbook: header=1 takes the column labels from the
# second spreadsheet row (the values captured as 'Short_Name' above).
df = pd.read_excel('data/HeRAMS_Mali20012021.xlsx', header=1)

# Reference tables for the administrative levels.
regions, cercle, commune = map(
    pd.read_csv,
    ('data/regions.csv', 'data/cercle.csv', 'data/commune.csv'),
)

In [5]:
# Add an index column to each administrative reference table through the project
# helper `get_serial_column` (imported via `*`; its exact behaviour is not visible
# here). Arguments are: the table, a list of columns, and the name of the new
# index column -- presumably the list gives the sort/grouping keys; TODO confirm.
regions = get_serial_column(regions, ['REGION'], 'RegionIndex')
# NOTE(review): assumes cercle.csv already carries a 'RegionIndex' column -- confirm.
cercle = get_serial_column(cercle, ['RegionIndex', 'CERCLE'], 'CercleIndex')
# NOTE(review): assumes commune.csv already carries 'RegionIndex' and 'CercleIndex' -- confirm.
commune = get_serial_column(commune, ['RegionIndex', 'CercleIndex', 'COMMUNE'], 'CommuneIndex')

In [6]:
# Sanity check before `workspace_id` is dropped in the next cell: it must carry
# exactly the same values as GEO1. The previous form,
# `df.workspace_id.all() == df.GEO1.all()`, only compared two scalar truthiness
# reductions and never compared the columns themselves. `Series.equals` compares
# the columns value by value and treats NaNs in the same position as equal.
assert df['workspace_id'].equals(df['GEO1']), \
    'workspace_id and GEO1 are not identical; review before dropping workspace_id'

# Drop every column that contains no data at all.
cols_to_delete = [col for col in df.columns if df[col].isnull().all()]
df = df.drop(columns=cols_to_delete)

In [7]:
# GEO1 holds two '/'-separated parts; store them as REGION and CERCLE.
# The assignment raises if the split yields a number of parts other than two.
geo1_parts = df['GEO1'].str.split('/', expand=True)
df[['REGION', 'CERCLE']] = geo1_parts

# The source columns are no longer needed once REGION/CERCLE exist.
df = df.drop(columns=['GEO1', 'workspace_id', 'GEO3'])

In [8]:
# Target column order for the final output: the first three columns, then the two
# columns currently at the end (REGION and CERCLE, appended by the split above),
# then everything in between.
current_columns = df.columns.tolist()
original_column_order = current_columns[:3] + current_columns[-2:] + current_columns[3:-2]

# Reference shape used by later assertions to detect accidental row/column changes.
original_shape = df.shape

In [9]:
# Text columns handed to the project helper `clean_string`
# (its exact normalisation rules are defined outside this notebook).
cols_to_clean_string = [
    'REGION', 'CERCLE', 'GEO6',
    'MoSD2', 'MoSD3', 'MoSD3other',
    'HFSUP3_SQ001', 'HFSUP3_SQ002', 'HFSUP3_SQ003', 'HFSUP3_SQ004', 'HFSUP3_SQ005',
    'HFSUP3_SQ006', 'HFSUP3_SQ007', 'HFSUP3_SQ008', 'HFSUP3_SQ009', 'HFSUP3_SQ010',
]
df = clean_string(df, cols_to_clean_string)

In [10]:
# Spelling corrections, one mapping per column, in the order the columns are
# passed to `replace_values` in the next cell (REGION first, then CERCLE).
region_replacements = {
    'SÉGOU': 'SEGOU',
    'MÉNAKA': 'MENAKA',
    'TAOUDÉNIT': 'TAOUDENIT',
}
cercle_replacements = {
    'ABEÏBARA': 'ABEIBARA',
    'ALMOUSTRAT': 'ALMOUSTARAT',
    'BAROUÉLI': 'BAROUELI',
    'DIOÏLA': 'DIOILA',
    'DIRÉ': 'DIRE',
    'DIÉMA': 'DIEMA',
    'KALABANCORO': 'KALABAN CORO',
    'KÉNIÉBA': 'KENIEBA',
    'MÉNAKA': 'MENAKA',
    'NIAFOUNKÉ': 'NIAFUNKE',
    'SÉGOU': 'SEGOU',
    'TAOUDÉNIT': 'TAOUDENIT',
    'TENENKOUN': 'TENENKOU',
}
values_to_replace = [region_replacements, cercle_replacements]

In [11]:
# Apply the spelling corrections to REGION and CERCLE, then join the reference
# tables so the two columns can be expressed as indices.
# `replace_values` and `merge_columns` are project helpers imported via `*`; their
# exact semantics are not visible here. The merge_columns arguments look like
# (left, right, left keys, right keys, columns to drop afterwards) -- TODO confirm.
df = replace_values(df, ['REGION', 'CERCLE'], values_to_replace)
df = merge_columns(df, regions, ['REGION'], ['REGION'], ['REGION'])
df = merge_columns(df, cercle, ['RegionIndex', 'CERCLE'], ['RegionIndex', 'CERCLE'], ['CERCLE'])
# Expose the index columns under the original column names.
df = df.rename(columns = {'RegionIndex': 'REGION', 'CercleIndex': 'CERCLE'})

In [12]:
# Lookup table of the distinct (REGION, CERCLE, GEO6) combinations, sorted, with a
# 1-based running number 'GEO6Index' as the first column.
a = (
    df[['REGION', 'CERCLE', 'GEO6']]
    .drop_duplicates()
    .sort_values(['REGION', 'CERCLE', 'GEO6'])
    .reset_index(drop=True)
)
a.insert(0, 'GEO6Index', a.index + 1)

In [13]:
# Attach the GEO6 index to the main frame via the project helper `merge_columns`
# and expose it under the original column name 'GEO6'.
df = merge_columns(df, a, ['REGION', 'CERCLE', 'GEO6'], ['REGION', 'CERCLE', 'GEO6'], ['GEO6'])
df = df.rename(columns={'GEO6Index': 'GEO6'})

# The swap must not add or remove any rows or columns.
assert df.shape == original_shape

# Persist the lookup without the running number; the CSV keeps the sorted row order.
a = a.drop(columns='GEO6Index')
a.to_csv('data/geo6.csv', index=False)

In [14]:
# One category table per column, built by the project helper `create_category_csv`
# (defined outside this notebook). The calls run in the same order as the names
# on the left-hand side.
(
    geo5, mosd3, mosd4, mosd5, mosd7,
    condb, condbx,
    hffunct, hffunctx,
    hfacc, hfaccx,
    hfman, hfsup1,
    bawa1x, basa1x, bawm2x, baen1x, baco1x,
    info1x,
) = [
    create_category_csv(df, column)
    for column in (
        'GEO5', 'MoSD3', 'MoSD4', 'MoSD5', 'MoSD7',
        'CONDB', 'CONDBx_1',
        'HFFUNCT', 'HFFUNCTx_1',
        'HFACC', 'HFACCx_1',
        'HFMAN', 'HFSUP1',
        'BAWA1x_1', 'BASA1x_1', 'BAWM2x_1', 'BAEN1x_1', 'BACO1x_1',
        'INFO1x',
    )
]

# These three tables are loaded through `load_category_csv` rather than derived from df.
triage1, hfinp1, lab1x = [
    load_category_csv(column)
    for column in ('TRIAGE1', 'HFINP1_SQ001', 'LAB1x_1')
]

In [15]:
# Column names from position 140 up to (but excluding) the last three columns.
# NOTE(review): this is a positional slice, so it silently shifts if the sheet
# layout changes -- confirm that 140 still marks the intended first column.
w = df.columns[140:-3].tolist()

In [16]:
# Split the sliced column names by whether they contain an underscore:
# names without '_' go to qherams, names with '_' go to qheramsx.
qherams = [name for name in w if '_' not in name]
qheramsx = [name for name in w if '_' in name]

In [17]:
# Align answer labels in the data with the wording used in the category tables.
# (An earlier, disabled approach edited the hfinp1 category table instead; the
# labels are normalised in the data here.)
column_list = ['INFO1', 'INFO2', 'INFO3'] + qherams
for col in column_list:
    not_available = df[col] == 'Non disponible'
    df.loc[not_available, col] = 'Pas disponible'

# These columns use the masculine form; switch them to the feminine form.
hfinp1_cols = ['BAWA1', 'BASA1', 'BAWM1', 'BAWM2', 'BAWM3', 'BAEN1', 'BACO1', 'BACC1']
feminine_labels = {
    'Disponible et suffisant': 'Disponible et suffisante',
    'Disponible mais insuffisant': 'Disponible mais insuffisante',
}
for col in hfinp1_cols:
    for old_label, new_label in feminine_labels.items():
        df.loc[df[col] == old_label, col] = new_label

In [18]:
# Columns that each have their own category table (same order as the tables
# built with create_category_csv).
list_of_unique_categorical_columns = [
    'GEO5', 'MoSD3', 'MoSD4', 'MoSD5', 'MoSD7',
    'CONDB', 'CONDBx_1',
    'HFFUNCT', 'HFFUNCTx_1',
    'HFACC', 'HFACCx_1',
    'HFMAN', 'HFSUP1',
    'BAWA1x_1', 'BASA1x_1', 'BAWM2x_1', 'BAEN1x_1', 'BACO1x_1',
    'INFO1x',
]

# Additional columns that reuse one of the category tables; the variable name
# indicates which table the columns are paired with further down.
condb_cols = ['CONDE']
condbx_cols = [
    'CONDBx_2', 'CONDBx_3',
    'CONDEx_1', 'CONDEx_2', 'CONDEx_3',
]
hffunctx_cols = ['HFFUNCTx_2', 'HFFUNCTx_3']
hfaccx_cols = ['HFACCx_2', 'HFACCx_3']
bawa1x_cols = ['BAWA1x_2', 'BAWA1x_3']
basa1x_cols = ['BASA1x_2', 'BASA1x_3']
bawm2x_cols = [
    'BAWM2x_2', 'BAWM2x_3',
    'BAWM3x_1', 'BAWM3x_2', 'BAWM3x_3',
]
baen1x_cols = [
    'BAEN1x_2', 'BAEN1x_3',
    'BACC1x_1', 'BACC1x_2', 'BACC1x_3',
]
baco1x_cols = ['BACO1x_2', 'BACO1x_3']
triage1_cols = ['INFO1', 'INFO2', 'INFO3', *qherams]
info1x_cols = ['INFO2x', 'INFO3x']
lab1x_cols = qheramsx

# Order matters: entry i is paired with entry i of
# reduced_list_of_categorical_dataframes.
list_of_column_lists = [
    condb_cols, condbx_cols, hffunctx_cols, hfaccx_cols, hfinp1_cols,
    bawa1x_cols, basa1x_cols, bawm2x_cols, baen1x_cols, baco1x_cols,
    triage1_cols, info1x_cols, lab1x_cols,
]

In [19]:
# All category tables. The first 19 line up with list_of_unique_categorical_columns;
# the last three (triage1, hfinp1, lab1x) have no column of their own in that list.
list_of_unique_categorical_dataframes = [
    geo5, mosd3, mosd4, mosd5, mosd7,
    condb, condbx,
    hffunct, hffunctx,
    hfacc, hfaccx,
    hfman, hfsup1,
    bawa1x, basa1x, bawm2x, baen1x, baco1x,
    info1x,
    triage1, hfinp1, lab1x,
]

# Tables that are reused for additional columns; paired by position with
# list_of_column_lists.
reduced_list_of_categorical_dataframes = [
    condb, condbx, hffunctx, hfaccx, hfinp1,
    bawa1x, basa1x, bawm2x, baen1x, baco1x,
    triage1, info1x, lab1x,
]

# Guard the positional pairings used by the loops below.
assert len(list_of_unique_categorical_dataframes) - 3 == len(list_of_unique_categorical_columns)
assert len(reduced_list_of_categorical_dataframes) == len(list_of_column_lists)

In [20]:
# Replace each categorical column by the 'ID' of its category: left-join the
# category table on the label, drop the label columns, and give the 'ID' column
# the original column name.
for col, categories in zip(list_of_unique_categorical_columns, list_of_unique_categorical_dataframes):
    df = (
        df.merge(categories, left_on=col, right_on='CATEGORIES', how='left')
        .drop(columns=['CATEGORIES', col])
        .rename(columns={'ID': col})
    )

# The joins must neither duplicate rows nor change the number of columns.
assert df.shape == original_shape

In [21]:
# Encode the remaining columns with the shared category tables: each table is
# paired by position with its list of columns and handed to the project helper
# `convert_to_numerical_category` (defined outside this notebook).
for data, list_of_columns in zip(reduced_list_of_categorical_dataframes, list_of_column_lists):
    df = convert_to_numerical_category(df, data, list_of_columns)

In [22]:
# Cast float columns to pandas' nullable integer dtype so that missing values can
# stay missing without forcing a float representation.
column_dtypes = df.dtypes
float_columns = column_dtypes[column_dtypes == float].index.tolist()

# The two MoSDGPS columns stay as floats (presumably coordinates -- confirm).
for gps_column in ('MoSDGPS_SQ001', 'MoSDGPS_SQ002'):
    float_columns.remove(gps_column)

# GEO5 is always converted, whatever dtype it currently has.
float_columns.append('GEO5')

df = df.astype({col: 'Int64' for col in float_columns})

In [23]:
# Final consistency check, then restore the column order recorded earlier.
assert original_shape == df.shape
df = df.loc[:, original_column_order]

In [24]:
# Write the encoded, re-ordered frame to disk; index=False leaves the row index out of the file.
df.to_csv('data/HeRAMS_Mali_Normalized.csv', index=False)

In [25]:
# Write one CSV per column-specific category table. The last three tables
# (triage1, hfinp1, lab1x) are skipped, and the 'ID' column is left out of each file.
updated_list_of_dataframes = list_of_unique_categorical_dataframes[:-3]
for col, d in zip(list_of_unique_categorical_columns, updated_list_of_dataframes):
    d = d.drop(columns='ID')
    d.to_csv(f'data/herams_{col}_categories.csv', index=False)

In [26]:
# Restrict the long/short name lookup to the columns that survive in the final
# frame, in final column order. The right join keeps every final column, including
# ones that have no long name in the lookup.
df_columns = pd.Series(df.columns).to_frame().rename(columns={0: 'Short_Name'})
column_names = pd.merge(column_names, df_columns, how='right', on='Short_Name')

In [27]:
# REGION and CERCLE were created in this notebook, so set their long names by hand.
derived_long_names = {
    'REGION': 'Region',
    'CERCLE': 'District Sanitaire',
}
for short_name, long_name in derived_long_names.items():
    column_names.loc[column_names['Short_Name'] == short_name, 'Long_Name'] = long_name

In [28]:
# Save the long/short column-name lookup; index=False leaves the row index out of the file.
column_names.to_csv('data/herams_column_names.csv', index=False)

In [29]:
# Reload the untouched workbook (same file and header row as the main load above).
# NOTE(review): `df2` is not used in the visible cells -- confirm it is still needed.
df2 = pd.read_excel('data/HeRAMS_Mali20012021.xlsx', header=1)

In [115]:
# Load a CSV whose MoSD2 column is compared against df.MoSD2 in the next cell
# (judging by the file name, presumably names scheduled for removal -- confirm).
q = pd.read_csv('data/mosd2_to_delete.csv')

In [119]:
# Count how many distinct MoSD2 values from `q` also occur in df.MoSD2, and collect
# the ones that do not in `notin`.
# The distinct df values are loop-invariant, so they are gathered once into a set
# instead of being rebuilt and scanned linearly on every iteration.
known_mosd2 = set(df.MoSD2.unique())

count = 0
notin = []
for m in q.MoSD2.unique():
    if m in known_mosd2:
        count += 1
    else:
        notin.append(m)

In [189]:
# Ad-hoc lookup: which distinct MoSD2 values contain the substring `s`?
# Note that `w` is rebound here; it held column names earlier in the notebook.
s = 'SEREFO'
w = list(df.MoSD2.unique())
inst = [name for name in w if s in name]
inst

[]

In [186]:
# Hand-built mapping between facility-name spellings. Its keys are later removed
# from the `notin` list to see which names remain unmatched.
dicti = {
    'CSREF OUELESSEBOUGOU': 'CSREF DE OUELESSEBOUGOU',
    'CSREF KALABANCORO': 'CSREF DE KALABANCORO',
    'CSREF DE KATI': 'CSREF KATI',
    'CSREF MACINA': 'CSREF DE MACINA',
    'CSREF DE BLA': 'CENTRE DE SANTÉ DE RÉFÉRENCE DE BLA',
    'CHU POINT G': "CHU DE L'HOPITAL DE POINT G",
    'HÔPITAL DE DERMATOLOGIE DE BAMAKO': 'HOPITAL DERMATOLOGIE DE BAMAKO',
    'HÔPITAL SOMINO DOLO': 'HÔPITAL SOMINE DOLO DE MOPTI',
    'CSREF DE KIGNAN': 'CSRÉF DE KIGNAN',
    'CSREF DE NIENA': 'CSRÉF NIENA',
    'CSREF DE MOPTI': 'CSRÉF DE MOPTI',
    'HOPITAL DE KATI': 'CHU KATI',
    'GOLDEN LIFE AMERICAN HOSPITAL': 'GOLDEN LIFE',
    'POLYCLINIQUE PASTEUR': 'PASTEUR',
    'HOPITAL DU MALI': 'CHU DE HOPITAL DU MALI',
    'CSREF DE OUSSOUBIDAGNAN': 'OUSSOUBIDIAGNA',
}

In [187]:
# Keys of the spelling map, in insertion order.
y = list(dicti)

In [188]:
# Unmatched names that are not covered by a key of the spelling map either.
t = [name for name in notin if name not in y]
t

['INSTITUT NATIONAL DE SANTÉ PUBLIQUE (INSP)',
 'LABORATOIRE UCRC (EX-SEREFO)',
 "CENTRE D'INFECTIOLOGIE CHARLES MÉRIEUX DU MALI (CICM)",
 'POLYCLINIQUE ROI MOHAMED VI',
 'LABORATOIRE DE BIOLOGIE MOLÉCULAIRE APPPLIQUÉE (LBMA)',
 'CHU BOCAR SIDY SALL KATI',
 'CSREF DE BARAOUELI']