# Data cleaning

In [None]:
import os
import datetime
import string
import random

import pandas as pd
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='paper', style='darkgrid', font_scale=1.2)

## Loading

In [None]:
# Relative path of the raw Excel export that the loading cell reads.
RAW_FILE = 'raw_data_cro.xlsx' 

In [None]:
# Load the raw spreadsheet into a DataFrame using pandas' default read_excel options.
df = pd.read_excel(RAW_FILE)

In [None]:
# (rows, columns) of the raw data
df.shape

In [None]:
# column names, dtypes and non-null counts of the raw data
df.info()

In [None]:
# flag every raw column that holds at least one missing value
df.isna().any()

In [None]:
# only the test-result column appears affected; count its missing entries
df.Rezultat.isna().sum()

## Creating a working dataset

In [None]:
# comorbidity indicator columns are named 'A' through 'Z'
comorbidity_list = [*string.ascii_uppercase]

# work on a deep copy so the raw frame stays untouched; the pre-computed
# 'sumcomorbidity' column is discarded and recomputed from the indicators
df_tidy = df.copy(deep=True)
df_tidy = df_tidy.drop(columns='sumcomorbidity')
comorbidity_total = df_tidy[comorbidity_list].sum(axis=1)

# the individual indicators are no longer needed once the total is known
df_tidy = df_tidy.drop(columns=comorbidity_list)
df_tidy['sum_comorbidity'] = comorbidity_total
df_tidy

In [None]:
# Replace both textual '(null)' markers with a real missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_tidy = df_tidy.replace(['(null)', '""(null)""'], np.nan)

# count missing values per column after the replacement
df_tidy.isnull().sum()

In [None]:
# convert testing date to datetime and drop all tests before 2020-02-25
# (parsed element-wise, exactly as before, so each value is interpreted on its own)
df_tidy['Datum_testiranja'] = df_tidy['Datum_testiranja'].apply(pd.to_datetime)
cutoff_date = datetime.datetime(2020, 2, 25)
df_tidy.drop(df_tidy[df_tidy['Datum_testiranja'] < cutoff_date].index, inplace=True)

# convert testing result to bool and drop rows without a result
df_tidy['Rezultat'] = df_tidy['Rezultat'].replace({1.0: True, 0.0: False})
df_tidy.drop(df_tidy[df_tidy['Rezultat'].isnull()].index, inplace=True)

# convert the 0/1 indicator columns (hospitalization, respirator, deceased) to bool.
# Each column is converted from ITSELF: previously the deceased column
# ('preminuli') was overwritten with the respirator column.
for indicator_column in ('Hospitalizacija', 'Respirator', 'preminuli'):
    df_tidy[indicator_column] = df_tidy[indicator_column].replace({1: True, 0: False})

# convert age to numeric and blank out values outside the accepted 0..105 range
df_tidy['dob'] = pd.to_numeric(df_tidy['dob'])
invalid_age = (df_tidy['dob'] < 0) | (df_tidy['dob'] > 105)
df_tidy.loc[invalid_age, 'dob'] = np.nan

# convert sex codes to letters
df_tidy['spol'] = df_tidy['spol'].replace({1.: 'M', 2.: 'F'})

# tidy up counties: strip the literal word 'ŽUPANIJA' and all spaces
# (regex=False: both patterns are plain text, not regular expressions)
df_tidy['županija'] = (
    df_tidy['županija']
    .str.replace('ŽUPANIJA', '', regex=False)
    .str.replace(' ', '', regex=False)
)

In [None]:
# bar chart of how many records fall into each sex category
sex_counts = df_tidy['spol'].value_counts()
sex_counts.plot.bar()
plt.show()

In [None]:
# histogram (10 bins) of age with a KDE overlay; the KDE uses bandwidth 5
# and is not extended past the observed data range (cut=0)
age_kde_options = {'bw': 5, 'cut': 0}
ax = sns.distplot(df_tidy['dob'], bins=10, kde_kws=age_kde_options)
plt.show()

In [None]:
# Quick visual check of whether age can be described by a Gaussian, as a
# basis for the later random generation of missing values (NaNs):
# compare the empirical age CDF with the CDF of normally distributed samples.
age_values = df_tidy['dob'].values
_ = sns.kdeplot(age_values, cumulative=True, label='age cdf')

# moments of the observed ages parameterise the reference normal distribution
mu = df_tidy['dob'].mean()
sigma = df_tidy['dob'].std()

# NOTE: despite its name this holds random normal SAMPLES (one per record);
# the CDF itself is estimated by kdeplot below
gaussian_cdf = np.random.normal(loc=mu, scale=sigma, size=age_values.size)
_ = sns.kdeplot(gaussian_cdf, cumulative=True, label='normal cdf')
plt.legend(loc='upper left')
plt.show()

In [None]:
# histogram (10 bins) of the per-person comorbidity total with a KDE overlay
comorbidity_kde_options = {'bw': 0.6, 'cut': 0}
sns.distplot(df_tidy['sum_comorbidity'], bins=10, kde_kws=comorbidity_kde_options)
plt.show()

In [None]:
# Impute missing (NaN) values for sex and age.
#
# All draws use NumPy's global random generator so that they honour the
# np.random.seed(42) call at the top of the notebook; the stdlib `random`
# module used previously was never seeded, which made the cleaned dataset
# differ from run to run.


# handling missing (NaN) values for sex
@np.vectorize
def gen_random_sex(x):
    """Return 'M' or 'F' with equal probability; the argument is ignored.

    https://en.wikipedia.org/wiki/Fisher%27s_principle :)
    """
    return np.random.choice(('M', 'F'))


missing_sex = df_tidy['spol'].isnull()
# np.vectorize cannot be called on size-0 input, so skip when nothing is missing
if missing_sex.any():
    df_tidy.loc[missing_sex, 'spol'] = gen_random_sex(df_tidy.loc[missing_sex, 'spol'].values)

# Missing ages are filled with random values drawn from a normal distribution
# fitted to the existing ages. Per the original author's note, all individuals
# with a missing age tested negative, so the exact values are not of the
# greatest importance. This can be checked BEFORE this cell runs with:
#   df_tidy.loc[df_tidy.dob.isnull(), 'Rezultat'].any()
# The moments are computed once here (from the observed ages only) instead of
# on every single draw.
age_mean = df_tidy['dob'].mean()
age_std = df_tidy['dob'].std()


@np.vectorize
def gen_random_age(x):
    """Return a random integer age in [0, 105); the argument is ignored.

    Values are drawn from N(age_mean, age_std), truncated to int, and
    redrawn until they fall inside the accepted range.
    """
    while True:
        candidate = int(np.random.normal(age_mean, age_std))
        if 0 <= candidate < 105:
            return candidate


missing_age = df_tidy['dob'].isnull()
if missing_age.any():
    df_tidy.loc[missing_age, 'dob'] = gen_random_age(df_tidy.loc[missing_age, 'dob'].values)

In [None]:
# Assemble the clean working dataframe with translated (English) column names.
# The mapping below lists each output column with its intended data type.
columns_dtypes_dict = {
    'id': str,
    'age': float,
    'sex': str,
    'location': str,
    'county': str,
    'testing_date': datetime.datetime,
    'testing_result': bool,
    'testing_institute': str,
    'hospitalization': bool,
    'respirator': bool,
    'deceased': bool,
    'total_comorbidities': int,
}

# create an empty dataframe whose columns follow the mapping above
columns_dtypes_list = list(columns_dtypes_dict.items())
dtypes = np.dtype(columns_dtypes_list)
empty_matrix = np.empty(0, dtype=dtypes)
df_w = pd.DataFrame(empty_matrix)

# output column -> source column in the tidy (Croatian-named) frame
source_columns = {
    'id': 'ID_osoba',
    'age': 'dob',
    'sex': 'spol',
    'location': 'Lokacija',
    'county': 'županija',
    'testing_date': 'Datum_testiranja',
    'testing_result': 'Rezultat',
    'testing_institute': 'Ustanova_koja_je_napravila_testiranja',
    'hospitalization': 'Hospitalizacija',
    'respirator': 'Respirator',
    'deceased': 'preminuli',
    'total_comorbidities': 'sum_comorbidity',
}

# fill the working dataframe with clean data
# NOTE(review): assigning a Series appears to replace the placeholder column,
# so the dtypes declared above are presumably not enforced -- confirm with df_w.dtypes
for target_name, source_name in source_columns.items():
    df_w[target_name] = df_tidy[source_name]
df_w

In [None]:
# summary statistics of the cleaned working dataframe (pandas default column selection)
df_w.describe()

In [None]:
# persist the cleaned dataset as CSV; the dataframe index is not written
df_w.to_csv('clean_data_cro.csv', index=False)