In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, log_loss
from sklearn.metrics import roc_curve, roc_auc_score, auc

## Load data

In [33]:
data_loaded = pd.read_csv("data2classificy_j01ca_n.csv",sep=";")

## Drop columns that must not be used

In [40]:
columns2drop = ['ID_Isol','ID_Episodio','ID_Pedido','ID_Doente',
                'Resistant_concat_Fam','Resistant_Anti','Infeccoes',
                'DT_Admin']
data = data_loaded.drop(columns2drop,axis=1)
data.columns

Index(['Nome_Micro', 'IDADE', 'Genero', 'Latitude', 'Longitude', 'Dt_Colheita',
       'COLHEITA_DIFF', 'Target_J01CA', 'Diagnosticos_new',
       'Nome_Analises_new', 'Produto_Analises_new', 'Sintomas_new',
       'nr_past_visits', 'nr_past_visits_7days', 'nr_past_visits_15days',
       'nr_past_infections', 'nr_past_infections_7days',
       'nr_past_infections_15days', 'fever_7days', 'fever_15days',
       'cough_7days', 'cough_15days', 'diarrhea_7days', 'diarrhea_15days',
       'fatigue_7days', 'fatigue_15days', 'J01CA_past_resistance',
       'J01GB_past_resistance', 'J01DH_past_resistance',
       'J01MA_past_resistance', 'J01DD_past_resistance', 'Hour', 'Weekday',
       'Monthday', 'Month', 'Year', 'DT_Admin_temp', 'Temperature_3avg',
       'Temperature_3std', 'Temperature_3min', 'Humidity_3avg',
       'Humidity_3std', 'Humidity_3min', 'Temperature_5avg',
       'Temperature_5std', 'Temperature_5min', 'Humidity_5avg',
       'Humidity_5std', 'Humidity_5min', 'Temperature_7av

## Missing data

In [36]:
data.isnull().sum()[data.isnull().sum()>0]

Nome_Micro                  4
Latitude                  721
Longitude                 721
Diagnosticos_new        12674
Nome_Analises_new        3743
Produto_Analises_new     3743
Sintomas_new            17352
Temperature_3avg           23
Temperature_3std           23
Temperature_3min           23
Humidity_3avg              23
Humidity_3std              23
Humidity_3min              23
Temperature_5avg           45
Temperature_5std           45
Temperature_5min           45
Humidity_5avg              45
Humidity_5std              45
Humidity_5min              45
Temperature_7avg           56
Temperature_7std           56
Temperature_7min           56
Humidity_7avg              56
Humidity_7std              56
Humidity_7min              56
dtype: int64

So we see that are a bunch of variables that seem to have quite a amount of missing data. Looking to these variables we can divide them in 3 groups:
* Resistant_concat_Fam, Resistant_Anti, Diagnosticos_new, Nome_Analises_new, Produto_Analises_new, Sintomas_new, Infeccoes: When these variables are nan in fact means that missing is not random it means that no information was placed. The solution is just to replace these values by a value like 'Nenhum' to tell that no values were annotated by the clinical staff.
* Latitude/Longitude: No information was provided to these features solution (replace by a specific value and creat another feature indicating if these variable is missing)
* Weather features: These values are missing because we do not have data for the year 2013.

In [52]:
data['Diagnosticos_new'].fillna("Nenhum",inplace=True)
data['Produto_Analises_new'].fillna("Nenhum",inplace=True)
data['Nome_Analises_new'].fillna("Nenhum",inplace=True)
data['Sintomas_new'].fillna("Nenhum",inplace=True)

In [53]:
# Fill the nan values with the localization of one of the clinicals
data['Longitude'].fillna(-9.1628837,inplace=True) 
data['Latitude'].fillna(38.748496,inplace=True)

In [49]:
# select variables of wather:
weather_features = data.filter(regex='Temperature|Humidity').columns
# Input values based on month and monthday
for feat in weather_features:
    data[feat] = data_loaded.groupby(['Month', 'Monthday'])[feat].transform(lambda x: x.fillna(x.mean()))

In [54]:
data.isnull().sum()[data.isnull().sum()>0]

Nome_Micro    4
dtype: int64

## Transform features

In [None]:
def dumerize_bagofwords(dataset,feature,separator='--',drop_old=True):
    dummies = dataset[feature].str.get_dummies(sep=separator)
    for col_dummie_i in a.columns:
        dummies = dummies.rename(columns = {col_dummie_i:str(feature)+ '_' + str(col_dummie_i)})
        dummies = dummies.astype('uint8')
    data = pd.concat([dataset,a],axis=1)
    data = data.drop(feature,axis=1)
    return data

In [None]:
def dummerize_categorical(dataset,feature,drop_old=True):
    dummies = pd.get_dummies(data_final[feature],drop_first=True,prefix=feature)
    dummies = dummies.astype('uint8')
    data = pd.concat([dataset,dummies],axis=1)
    data = data.drop(feature,axis=1)
    return data

In [None]:
def remove_badfeatues(dataset,unique_thr=100):
    print(" ... Removing dummies that appear less than {} times:".format(unique_thr),end=" ")
    # Copy dataset to avoid probels
    dataset_out = dataset.copy()
    # Select categorical variables
    dataset_categorical = dataset_out.select_dtypes(include=['uint8'])
    # List to store the names of features to keep
    removidas = []
    # Ltst to store the names of features to drop
    mantidas = []
    # Iterate thourgh the uint columns
    for col in dataset_categorical.columns:
        if dataset_categorical[col].sum() < unique_thr:
            removidas.append(col)
            dataset_out.drop([col],axis=1,inplace=True)
        else:
            mantidas.append(col)
    # Print the number of columns to removed
    print("{} removed.".format(len(removidas)))
    return dataset_out,removidas

In [83]:
numerical_features = data.filter(regex='nr_|IDADE|Latitude|Longitude|Humidity|Temperature').columns.tolist()
categorical_features = list(set(data.columns.tolist()) - set(numerical_features))
categorical_features
for feat_cat in categorical_features:
    
 #   if 
 #       data = remove_badfeatues(data,feat_cat,drop_old=True)
 #   else:
 #        data = dummerize_categorical(data,feat_cat,drop_old=True)
            
 #   data = remove_badfeatues(data,unique_thr=2)
    

['Target_J01CA',
 'value',
 'fatigue_7days',
 'Diagnosticos_new',
 'J01GB_past_resistance',
 'Sintomas_new',
 'Nome_Micro',
 'J01MA_past_resistance',
 'J01DD_past_resistance',
 'J01CA_past_resistance',
 'J01DH_past_resistance',
 'Nome_Analises_new',
 'COLHEITA_DIFF',
 'Monthday',
 'Year',
 'cough_15days',
 'Weekday',
 'cough_7days',
 'Dt_Colheita',
 'fever_7days',
 'Produto_Analises_new',
 'fever_15days',
 'diarrhea_15days',
 'DT_Admin_temp',
 'Month',
 'Genero',
 'diarrhea_7days',
 'Hour',
 'fatigue_15days']

In [84]:
## Transform cyclic variable
#http://blog.davidkaleko.com/feature-engineering-cyclical-features.html

## Train model -  Random Forest

In [None]:
rfc = RandomForestClassifier(njobs=-1,max_features='sqrt',n_estimators=10,oob_score=False)
optimize_parameters = True

if optimize_parameters:
    # The scorers can be either be one of the predefined metric strings or a scorer
    # callable, like the one returned by make_scorer
    scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

    param_grid = {
        'n_estimators':[1,2,5,10,20,50,100,200,500],
        'max_features':['auto','sqrt','log2',0.2,0.5,0.8]
    }
    CV_rfc = GridSearchCV(estimator=rfc,param_grid=param_grid,cv=5,scoring='roc_curve')
else:
    
    

## Test model - Random Forest

In [24]:
def build_roc_cnf(y_true,y_score,y_predicted):
    fpr,tpr,thresholds = roc_curve(y_true,y_score[:,1])
    roc_auc = auc(fpr,tpr)
    
    plt.figure(figsize=(10,5))
    
    plt.subplot(1,2,1)
    plt.plot(fpr,tpr,color='darkorange',lw=2,label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0,1],[0,1],color='navy',lw=lw,linestyle='--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate')
    plt.xlabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc='lower right')
    
    plt.subplot(1,2,2)
    mat = confusion_matrix(y_true,y_predicted)
    sns.heatmap(mat.T, square = True, annot=True, fmt='d', char=False, cmap = 'Blues')
    plt.xlabel('true label')
    plt.ylabel('predicted label')
    plt.tight_layout()
    plt.show()
    