In [1]:
import os
if not 'id_0123456789876543210' in locals():
    os.chdir(os.path.split(os.getcwd())[0])
    id_0123456789876543210 = None

In [2]:
from src.python.dataframe import Catalogue
from src.python.util import dict_str_hash
from src.python.dataframe import filter
from os import path
import pandas as pd
import numpy as np
import pickle
import json


import matplotlib.pyplot as plt
plt.style.use('ggplot')


def save_object(obj, file):
    with open(file, "wb") as file:
        pickle.dump(obj, file)


def load_object(file):
    with open(file, "rb") as file:
        loaded_object = pickle.load(file)
    return loaded_object

# Creación de datos de entrenamiento

Para red neuronal de clasificación (versión 1)

### Semanas epidemiológicas

In [4]:
def epidemiological_week(week, year):
    date = pd.Timestamp(f'{year}-01-01')
    while date.dayofweek != 6:
        date -= pd.Timedelta(days=1)

    start = date + pd.Timedelta(days=(week-1)*7)
    end = start + pd.Timedelta(days=6)
    return start, end


weeks = pd.DataFrame({
    'Ola': [1, 2, 3, 4, 5, 6],
    'SE_inicio': [8, 40, 23, 51, 22, 49],
    'SE_fin': [39, 15, 42, 9, 33, 4],
    'año_inicio': [2020, 2020, 2021, 2021, 2022, 2022],
    'año_fin': [2020, 2021, 2021, 2022, 2022, 2023]
}).set_index('Ola')

weeks['fecha_inicio'] = [epidemiological_week(week, year)[0]
                           for week, year in zip(weeks['SE_inicio'], weeks['año_inicio'])]
weeks['fecha_fin'] = [epidemiological_week(week, year)[1]
                           for week, year in zip(weeks['SE_fin'], weeks['año_fin'])]
weeks

Unnamed: 0_level_0,SE_inicio,SE_fin,año_inicio,año_fin,fecha_inicio,fecha_fin
Ola,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,8,39,2020,2020,2020-02-16,2020-09-26
2,40,15,2020,2021,2020-09-27,2021-04-10
3,23,42,2021,2021,2021-05-30,2021-10-16
4,51,9,2021,2022,2021-12-12,2022-02-26
5,22,33,2022,2022,2022-05-22,2022-08-13
6,49,4,2022,2023,2022-11-27,2023-01-28



### Variables de entrenamiento

In [5]:
data = load_object(
    path.join('data', 'covid', 'cleanned', f'positivos.pkl'))

data['indigena'] = data.indigena + '_' + data.lengua_indigena

data['dias'] = [x.days for x
                in data.fecha_ingreso-data.fecha_sintomas]
data['defuncion'] = ~pd.isna(data.fecha_defuncion)
data['grave'] = (data.tipo == 'HOSPITALIZADO') | data['defuncion']

fecha_etapa = [min(data.fecha_ingreso)-pd.Timedelta(days=1),
               *weeks.fecha_fin,
               max(data.fecha_ingreso)]

data['etapa'] = pd.cut(data.fecha_ingreso,
                       bins=fecha_etapa,
                       labels=[i for i in range(len(fecha_etapa)-1)],
                       right=True).astype(int)

catalogue = Catalogue()
catalogue.add('etapa')
catalogue.add('dias', function=lambda x:
              [max(0, min(16, x)) for x in x])
catalogue.add('edad', function=lambda x:
              [max(0, min(100, x)) for x in x])
catalogue.add(column='sexo', name='mujer', function=lambda x:
              [x == 'MUJER' for x in x])
catalogue.add(column='nacionalidad', name='origen',
              category={'MEXICANA': 'MEXICANO', 'EXTRANGERA': 'EXTRANGERO'})

catalogue.add('indigena', function=lambda x:
              ['SI' if 'SI' in x else ('NO' if 'NO_NO' == x else 'NE') for x in x])

catalogue.add('migrante', function=lambda x:
              ['NO' if n == 'MEXICANA' else x
                  for x, n in zip(x, data['nacionalidad'])])

for col in ['embarazo', 'diabetes', 'epoc', 'asma', 'inmunosupresion',
            'hipertension', 'cardiovascular', 'obesidad',
            'renal_cronica', 'tabaquismo', 'otra_comorbilidad']:
    catalogue.add(col)
catalogue.add('grave')
catalogue.add('defuncion')
data = filter(data, catalogue)
###########################
data.loc[data.indigena == 'SI', 'origen'] = 'MEXICANO_INDIGENA'
data.loc[data.migrante == 'SI', 'origen'] = 'EXTRANGERO_MIGRANTE'
data.drop(['indigena',	'migrante'], axis=1, inplace=True)
###########################
basis = path.join('data', 'covid', 'classification', 'dataframe')
os.makedirs(basis, exist_ok=True)
save_object(data, path.join(basis, f'positivos.pkl'))
del data, catalogue, col

### Pre-procesamiento y hashing

In [7]:
def trainingdata_1(data, alpha=5, skip=[], drop=[]):
    skip = skip if isinstance(skip, list) else [skip]
    drop = drop if isinstance(drop, list) else [drop]
    data = data.copy()
    comorb_columns = []
    columns = []
    for col in data.columns:
        if col not in skip:
            counts = data[col].value_counts()
            if 'SI' in counts.index:
                prop = 100 * counts['SI'] / (counts['SI'] + counts['NO'])
                if prop < alpha:
                    comorb_columns.append(col)
                columns.append(col)
    comorb = pd.Series(0, index=data.index)
    comorb_ne = pd.Series(0, index=data.index)
    for column in columns:
        comorb_ne += (data[column] == 'NE').astype(int)
        if column in comorb_columns:
            comorb += (data[column] == 'SI').astype(int)
        else:
            data[column] = data[column] == 'SI'
    data['comorbilidad'] = comorb
    data['comorbilidad_ne'] = comorb_ne
    grave = data.pop('grave') if 'grave' in data else None
    defuncion = data.pop('defuncion') if 'defuncion' in data else None
    if drop:
        for dp in drop:
            if dp in data.columns:
                data.drop(dp, axis=1, inplace=True)
    data.drop(comorb_columns, axis=1, inplace=True)
    if not (grave is None or 'grave' in drop):
        data['grave'] = grave
    if not (defuncion is None or 'defuncion' in drop):
        data['defuncion'] = defuncion
    return data


def hashing(data):
    grave = data.pop('grave') if 'grave' in data else None
    defuncion = data.pop('defuncion') if 'defuncion' in data else None
    data.index = [dict_str_hash(data.iloc[i]).upper()
                  for i in range(len(data))]
    data.index.name = 'hash'
    if grave is not None:
        data['grave'] = grave.tolist()
    if defuncion is not None:
        data['defuncion'] = defuncion.tolist()

In [8]:
basis = path.join('data', 'covid', 'classification', 'dataframe')
data = load_object(path.join(basis, f'positivos.pkl'))

# Positivos
positivos = trainingdata_1(data, drop='defuncion')
col = positivos.comorbilidad.copy()
col[col >= 2] = 2
positivos.comorbilidad = col
col = positivos.comorbilidad_ne.copy()
col[col >= 1] = 1
positivos.comorbilidad_ne = col
col = positivos.origen.copy()
col[col == 'EXTRANGERO_MIGRANTE'] = 'EXTRANGERO'
positivos.origen = col
col = positivos.etapa.copy()
col[col >= 5] = 5
positivos.etapa = col
hashing(positivos)
save_object(positivos, path.join(basis, f'positivos_hash-1.pkl'))

# Graves
graves = trainingdata_1(data[data.grave],
                              drop=['origen', 'comorbilidad_ne', 'grave'])
col = graves.comorbilidad.copy()
col[col >= 1] = 1
graves.comorbilidad = col
col = graves.etapa.copy()
col[col >= 5] = 5
graves.etapa = col
hashing(graves)
save_object(graves, path.join(basis, f'graves_hash-1.pkl'))

del data, positivos, graves, col

### Sets de entrenamiento

In [3]:
def datasets_1(data, column, testprop, minsize=10, seed=555):
    # Calcular probabilidad
    muestra = data.index.value_counts()
    muestra = muestra[muestra >= minsize]
    data = data.loc[muestra.index]
    casos = data[data[column]].index.value_counts()
    probabilidad = pd.Series(0.0, index=muestra.index)
    probabilidad.loc[casos.index] = casos / muestra[casos.index]
    # Estandarizar datos
    data = data.drop(column, axis=1).groupby(level=0).head(1)
    data = pd.get_dummies(data.sample(
        len(data), replace=False, random_state=seed))
    data = data[data.columns.sort_values()]
    etiqueta = data.etapa
    for column in data.columns:
        if data[column].dtype.name != 'bool':
            mx = max(data[column])
            if mx > 1.0:
                data[column] = data[column] / mx
    data = data.astype(float)
    data['probabilidad'] = probabilidad.loc[data.index]
    data['muestra'] = muestra.loc[data.index]
    data['etiqueta'] = etiqueta.loc[data.index]
    seed += 1
    # Creación de sets
    test = []
    for label in etiqueta.unique():
        subdata = data[data.etiqueta == label]
        test.append(subdata.sample(round(testprop * len(subdata)),
                                   replace=False,
                                   weights=subdata.muestra,
                                   random_state=seed))
    seed += 1
    test = pd.concat(test, axis=0)
    test = test.sample(len(test), replace=False, random_state=seed)
    train = data.drop(test.index, axis=0)

    testvar = test[['probabilidad', 'muestra', 'etiqueta']]
    trainvar = train[['probabilidad', 'muestra', 'etiqueta']]

    test.drop(['probabilidad', 'muestra', 'etiqueta'], axis=1, inplace=True)
    train.drop(['probabilidad', 'muestra', 'etiqueta'], axis=1, inplace=True)

    return (dict(x=train.values,
                 y=trainvar.probabilidad.to_numpy(),
                 sample=trainvar.muestra.to_numpy(),
                 label=trainvar.etiqueta.to_numpy(),
                 columns=train.columns.to_numpy(),
                 index=train.index.to_numpy()),
            dict(x=test.values,
                 y=testvar.probabilidad.to_numpy(),
                 sample=testvar.muestra.to_numpy(),
                 label=testvar.etiqueta.to_numpy(),
                 columns=test.columns.to_numpy(),
                 index=test.index.to_numpy()))


basis_ds = path.join('data', 'covid', 'classification', 'dataset')
basis_df = path.join('data', 'covid', 'classification', 'dataframe')

os.makedirs(basis_ds, exist_ok=True)

train, test = datasets_1(load_object(path.join(basis_df, 'graves_hash-1.pkl')),
                         column='defuncion', testprop=0.15)
save_object(train, path.join(basis_ds, 'train.graves-1.pkl'))
save_object(test, path.join(basis_ds, 'test.graves-1.pkl'))

train, test = datasets_1(load_object(path.join(basis_df, f'positivos_hash-1.pkl')),
                         column='grave', testprop=0.2)
save_object(train, path.join(basis_ds, 'train.positivos-1.pkl'))
save_object(test, path.join(basis_ds, 'test.positivos-1.pkl'))

del train, test