In [194]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

## Train EDA

In [195]:
df = pd.read_csv('./data/train.csv')

In [196]:
df.shape

(1617, 85)

In [197]:
df.head()

Unnamed: 0,MDR,ID,NHC,start_neutropenico,start_FN,days_between,days_in_hospital,hospital_stay_w_FN,prev_hospital_stay,birth_year,...,TOBRAMICINA_NEB_.MG.,VANCOMICINA_.MG.,Auto_TP,Alo_TP,room_list,mucositis,cito_group_3,cito_group_1,cito_group_2,Past_positive_result_from
0,0,374-1,404,2007-12-11,2008-01-01,,28,1,3,1941,...,0,8000,1,0,E02403,0,1,0,0,Culture
1,0,398-1,1897,2007-12-28,2008-01-01,,8,1,6,1935,...,0,0,0,0,G06512,0,0,0,0,NEGATIVE
2,0,403-1,556,2008-01-01,2008-01-01,,2,1,1,1980,...,0,0,1,0,E02407,1,0,0,0,NEGATIVE
3,0,407-1,454,2008-01-05,2008-01-05,,1,1,9,1986,...,0,0,0,0,"G06508, U10102, UHE211",0,0,0,0,NEGATIVE
4,0,394-1,1615,2007-12-22,2008-01-06,,17,1,5,1943,...,0,0,0,0,"G06502, G08501",0,0,0,0,Culture


In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1617 entries, 0 to 1616
Data columns (total 85 columns):
MDR                                         1617 non-null int64
ID                                          1617 non-null object
NHC                                         1617 non-null int64
start_neutropenico                          1617 non-null object
start_FN                                    1617 non-null object
days_between                                832 non-null float64
days_in_hospital                            1617 non-null int64
hospital_stay_w_FN                          1617 non-null int64
prev_hospital_stay                          1617 non-null int64
birth_year                                  1617 non-null int64
Gender                                      1617 non-null object
emergency                                   1617 non-null int64
num_movements                               1617 non-null int64
num_consult                                 1617 non-nul

In [199]:
print("Number of unique patients:", df.NHC.nunique())

Number of unique patients: 785


In [200]:
print("Number of observations with MDR:", df.MDR.sum())

Number of observations with MDR: 86


In [201]:
print("Number of distinct patients with MDR:", df[df.MDR==1].NHC.nunique())

Number of distinct patients with MDR: 80


### Missing values

There are 3 columns with missigns:
* `days_between`
* `days_after_anti`
* `room_list`

#### Days between and days after antibiotic

In [202]:
df['d_days_between'] = df['days_between'].apply(lambda x: 0 if np.isnan(x) else 1)
df['d_days_after_anti'] = df['days_after_anti'].apply(lambda x: 0 if np.isnan(x) else 1)

#### Room list

Split room list into dummies

In [203]:
# Write a proxy method to be passed to `pipe`
def agg_assign(gb, fdict):
    data = {
        (cl, nm): gb[cl].agg(fn)
        for cl, d in fdict.items()
        for nm, fn in d.items()
    }
    pd.options.display.float_format = '{:.0f}'.format
    return pd.DataFrame(data)

In [204]:
# Identical dictionary passed to `agg`
funcs = {
    'room_list': {
        'count': 'count',
    },
    'MDR': {
        'sum': 'sum',
    }
}

In [205]:
df.groupby('room_list').pipe(agg_assign, fdict=funcs).reset_index().sort_values(('MDR', 'sum'), ascending=False).head(20)

Unnamed: 0_level_0,room_list,room_list,MDR
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
403,G06514,12,3
71,E02401,50,3
109,E02405,41,2
279,G02407,19,2
294,G02409,25,2
82,E02402,45,2
217,G02207,5,2
150,E02408,41,2
249,G02403,20,2
418,"G07112, G09204, G09205, G7C112",1,1


In [206]:
for idx, row in df.iterrows():
    try:
        num_rooms = len(row['room_list'].split(","))
    except:
        if np.isnan(row['room_list']):
            num_rooms = 0
        else:
            num_rooms = 1
    df.loc[idx, 'num_rooms'] = num_rooms
    
df.drop('room_list', axis=1, inplace=True)

In [207]:
funcs = {
    'num_rooms': {
        'count': 'count',
    },
    'MDR': {
        'sum': 'sum',
    }
}

df_numrooms = df.groupby('num_rooms').pipe(agg_assign, fdict=funcs).reset_index()

In [208]:
df_numrooms['%MDR'] = df_numrooms[('MDR','sum')]/df_numrooms[('num_rooms','count')]*100
df_numrooms

Unnamed: 0_level_0,num_rooms,num_rooms,MDR,%MDR
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,Unnamed: 4_level_1
0,0,333,13,4
1,1,796,36,5
2,2,299,23,8
3,3,120,7,6
4,4,36,5,14
5,5,20,1,5
6,6,8,1,12
7,7,3,0,0
8,8,2,0,0


Bins decided as follows:
* 0 rooms
* 1 room
* 2 rooms
* +2 rooms

In [209]:
df['num_rooms_b'] = df['num_rooms'].apply(lambda x: x if x <= 2 else 3)

In [210]:
funcs = {
    'num_rooms_b': {
        'count': 'count',
    },
    'MDR': {
        'sum': 'sum',
    }
}

df_numrooms = df.groupby('num_rooms_b').pipe(agg_assign, fdict=funcs).reset_index()
df_numrooms['%MDR'] = df_numrooms[('MDR','sum')]/df_numrooms[('num_rooms_b','count')]*100
df_numrooms

Unnamed: 0_level_0,num_rooms_b,num_rooms_b,MDR,%MDR
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,Unnamed: 4_level_1
0,0,333,13,4
1,1,796,36,5
2,2,299,23,8
3,3,189,14,7


In [211]:
df.describe()

Unnamed: 0,MDR,NHC,days_between,days_in_hospital,hospital_stay_w_FN,prev_hospital_stay,birth_year,emergency,num_movements,num_consult,...,Auto_TP,Alo_TP,mucositis,cito_group_3,cito_group_1,cito_group_2,d_days_between,d_days_after_anti,num_rooms,num_rooms_b
count,1617,1617,832,1617,1617,1617,1617,1617,1617,1617,...,1617,1617,1617,1617,1617,1617,1617,1617,1617,1617
mean,0,948,76,13,2,3,1957,0,1,5,...,0,0,0,0,0,0,1,1,1,1
std,0,591,128,17,1,3,16,0,1,9,...,0,0,0,0,0,0,0,0,1,1
min,0,1,3,0,1,0,1919,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25%,0,522,11,1,1,1,1946,0,0,0,...,0,0,0,0,0,0,0,0,1,1
50%,0,810,37,9,1,2,1954,0,0,2,...,0,0,0,0,0,0,1,1,1,1
75%,0,1565,83,16,2,4,1968,1,1,5,...,0,0,0,1,0,0,1,1,2,2
max,1,2016,1307,166,8,20,1993,1,12,91,...,2,2,1,1,1,1,1,1,8,3


In [212]:
df.columns

Index(['MDR', 'ID', 'NHC', 'start_neutropenico', 'start_FN', 'days_between',
       'days_in_hospital', 'hospital_stay_w_FN', 'prev_hospital_stay',
       'birth_year', 'Gender', 'emergency', 'num_movements', 'num_consult',
       'share_room_MDR', 'dummy_LAM', 'dummy_others.LL',
       'dummy_Cancer.linfoproliferativo', 'dummy_SMD', 'dummy_LAL',
       'dummy_EICH', 'dummy_Leucemia.cronica', 'dummy_SMPC',
       'dummy_Cancer.solido', 'dummy_LMC', 'dummy_TLPT', 'dummy_others.LM',
       'dummy_Mieloma.like', 'dummy_LLC', 'antibiotic_count',
       'days_after_anti', 'AMIKACINA_.MG.', 'AMOXICILINA_.MG.',
       'AMPICILINA_.MG.', 'AZITROMICINA_VIAL_.MG.', 'AZTREONAM_.MG.',
       'CEFAZOLINA_.MG.', 'CEFIXIMA_.MG.', 'CEFOTAXIMA_.MG.',
       'CEFOXITINA_.MG.', 'CEFTAROLINA_FOSAMIL_.MG.', 'CEFTAZIDIMA_.MG.',
       'CEFTIBUTENO_.MG.', 'CEFTOLOZANO_.UND.', 'CEFTRIAXONA_.MG.',
       'CEFUROXIMA.AXETILO_.MG.', 'CIPROFLOXACINO_.MG.', 'CLARITROMICINA_.MG.',
       'CLINDAMICINA_.MG.', 'CLOXA

In [214]:
# drop non necessary columns
df.drop('Gender', axis=1, inplace=True)
df.drop('ID', axis=1, inplace=True)
df.drop('NHC ', axis=1, inplace=True)
df.drop('Past_positive_result_from', axis=1, inplace=True)

KeyError: "labels ['Gender'] not contained in axis"