In [1]:
import pandas as pd
# Workbook read by default; kept as the default so existing calls behave as before.
DEFAULT_FLU_WORKBOOK = "com_Nas_por_campo_Todas_Tabelas_Flu_de_2010_a_2019_com_os_63_campos_da_Intersecao_de_Todos_e 244774_Amostras.xlsx"


def get_dataframes(path=DEFAULT_FLU_WORKBOOK):
    """Read the notification workbook and add the notification year.

    Parameters
    ----------
    path : str or path-like, optional
        Excel file to read. Defaults to the workbook this notebook was
        written against, so ``get_dataframes()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The sheet as read by ``pd.read_excel`` plus an ``NT_DT`` column
        holding the year of ``DT_NOTIFIC``. Values that cannot be parsed as
        dates become NaN (``errors='coerce'``).
    """
    dataframes = pd.read_excel(path)
    notified_on = pd.to_datetime(dataframes['DT_NOTIFIC'], errors='coerce')
    # Vectorised year extraction; NaT rows yield NaN just like the per-row lambda did.
    dataframes["NT_DT"] = notified_on.dt.year
    return dataframes

In [2]:
# Numeric code returned for each two-letter UF abbreviation.
_UF_CODES = {
    "RO": 11, "AC": 12, "AM": 13, "RR": 14, "PA": 15, "AP": 16, "TO": 17,
    "MA": 21, "PI": 22, "CE": 23, "RN": 24, "PB": 25, "PE": 26, "AL": 27,
    "SE": 28, "BA": 29,
    "MG": 31, "ES": 32, "RJ": 33, "SP": 35,
    "PR": 41, "SC": 42, "RS": 43,
    "MS": 50, "MT": 51, "GO": 52, "DF": 53,
}


def convert_uf(uf):
    """Translate a UF value into its integer code.

    Known two-letter abbreviations are looked up in ``_UF_CODES``; anything
    else is passed through ``int()``, so values that are already numeric
    (``"35"``, ``35.0``) come back as plain ints and unconvertible values
    raise whatever ``int()`` raises.
    """
    known_code = _UF_CODES.get(uf)
    if known_code is not None:
        return known_code
    return int(uf)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
def label_regions(df):
    """Add one boolean column per region, derived from ``UF_NOT``.

    The frame is modified in place (callers such as ``clean_df`` rely on
    that) and is also returned for convenience.

    ``UF_NOT`` is compared numerically: ``convert_uf`` produces integer
    codes, and matching them against string codes such as ``"53"`` yields
    all-False columns on current pandas. Numeric strings in ``UF_NOT`` still
    match; values that are missing or not numeric are labelled False in
    every region.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``UF_NOT`` column.

    Returns
    -------
    pandas.DataFrame
        The same object, with the five ``region-*`` columns added.
    """
    region_codes = {
        'region-centro-oeste': [53, 52, 50, 51],
        'region-norte': [16, 12, 13, 15, 11, 14, 17],
        'region-nordeste': [27, 29, 23, 21, 25, 22, 26, 24, 28],
        'region-sudeste': [32, 31, 33, 35],
        'region-sul': [43, 42, 41],
    }

    # Normalise once so ints, floats and numeric strings all compare equal.
    uf_codes = pd.to_numeric(df["UF_NOT"], errors="coerce")
    for column, codes in region_codes.items():
        df[column] = uf_codes.isin(codes)
    return df

In [66]:
def fill_nan_values(df, variable):
    """Return a copy of ``df`` with NaNs in ``variable`` replaced by its mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used. If the column has no non-missing values there
    is no mode to fill with, so the copy is returned unchanged instead of
    raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    variable : str
        Name of the column to impute.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed column.
    """
    result_df = df.copy()
    modes = result_df[variable].mode()
    if modes.empty:
        # All values missing (or empty frame): nothing sensible to impute.
        return result_df
    result_df[variable] = result_df[variable].fillna(modes.iloc[0])
    return result_df

In [151]:
def shuffle_rows(df, variable, random_state=None):
    """Return a copy of ``df`` with ``variable`` permuted and rows reordered.

    The values of ``variable`` are randomly permuted relative to the other
    columns, then the rows of the whole frame are shuffled. The input frame
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    variable : str
        Column whose values are permuted.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for reproducible shuffles. With the default
        ``None`` pandas draws from its default random source, exactly as the
        two-argument call always did.

    Returns
    -------
    pandas.DataFrame
        A new, shuffled frame (original index labels are kept).
    """
    import numpy as np

    if random_state is None:
        rng = None
    elif isinstance(random_state, (int, np.integer)):
        # One generator shared by both draws, so the column permutation and
        # the row shuffle are different but reproducible.
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    result_df = df.copy()
    result_df[variable] = result_df[variable].sample(frac=1.0, random_state=rng).values
    # sample() already returns a new object, so no extra copy is needed.
    return result_df.sample(frac=1.0, random_state=rng)

In [50]:

from datetime import timedelta
def clean_df(df):
    """Impute key columns and derive the modelling features.

    Missing values in the outcome, state, vaccine, antiviral and comorbidity
    columns are filled with each column's mode (via ``fill_nan_values``),
    then these columns are added or rewritten on a copy:

    * ``MORTE``: 1 when ``EVOLUCAO`` is 2 or 4 (as number or string), else 0.
    * ``IDADE``: ``DT_NOTIFIC - DT_NASC`` expressed in 365-day years.
    * ``UF_NOT``: ``SG_UF_NOT`` run through ``convert_uf``.
    * ``CS_SEXO``: 1 for ``"F"``, 2 for anything else.
    * ``region-*``: boolean flags added in place by ``label_regions``.

    NOTE(review): every ``CS_SEXO`` value other than "F", including missing
    ones, is coded as 2. Confirm this is intended.
    """
    columns_to_impute = ('EVOLUCAO', 'SG_UF_NOT', 'VACINA', 'ANTIVIRAL', 'PNEUMOPATI', 'CARDIOPATI')
    for column in columns_to_impute:
        df = fill_nan_values(df, column)
    df_clean = df.copy()

    # The outcome may arrive as text or as a float depending on the sheet.
    death_codes = ("2", "4", 2.0, 4.0)
    df_clean['MORTE'] = df_clean['EVOLUCAO'].map(lambda outcome: 1 if outcome in death_codes else 0)

    notified_on = pd.to_datetime(df_clean['DT_NOTIFIC'], errors='coerce')
    born_on = pd.to_datetime(df_clean['DT_NASC'], errors='coerce')
    df_clean['IDADE'] = (notified_on - born_on) / timedelta(days=365)

    df_clean['UF_NOT'] = df_clean['SG_UF_NOT'].map(convert_uf)
    df_clean['CS_SEXO'] = df_clean['CS_SEXO'].map(lambda sex: 1 if sex == "F" else 2)

    # label_regions mutates df_clean in place; its return value is not needed.
    label_regions(df_clean)
    return df_clean


In [51]:
def load_dataframes(df):
    """Clean ``df`` and split it into one feature frame per notification year.

    Returns a list of ten DataFrames, positions 0-9 standing for the years
    2010-2019 of ``NT_DT``, each restricted to the modelling columns.

    NOTE(review): position 0 is overwritten with the 2011 frame, so the 2010
    slice is never returned. Confirm this is deliberate.
    """
    sensitive_vars = [
        "CS_SEXO", "IDADE",
        "region-centro-oeste", "region-norte", "region-nordeste", "region-sudeste", "region-sul",
        "VACINA", "MORTE", "ANTIVIRAL", "PNEUMOPATI", "CARDIOPATI",
    ]
    cleaned = clean_df(df)

    per_year = [
        cleaned[cleaned["NT_DT"] == float(year)][sensitive_vars]
        for year in range(2010, 2020)
    ]
    per_year[0] = per_year[1]
    return per_year

In [40]:
# Read the Excel workbook once; later cells reuse raw_df instead of reloading it.
raw_df = get_dataframes()


In [52]:
# Clean the raw frame and split it into one feature frame per year (2010-2019).
df_processed = load_dataframes(raw_df)
# Position 5 is the 2015 slice; rows with any missing value are dropped before modelling.
df = df_processed[5].dropna() #TODO check with 5 only and use complete

In [159]:
def generate_rf(df, variable, n_estimators=1000, random_state=42):
    """Fit a random forest on ``df`` and report the importance of ``variable``.

    ``MORTE`` is the target; every other column is a feature. The data is
    split 80/20 twice with the same seed and the forest is trained on the
    inner training part only, so the fitted rows match the module-level
    model built later in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing a ``MORTE`` column.
    variable : str
        Feature whose importance is reported.
    n_estimators : int, optional
        Number of trees (default 1000, the value previously hard-coded).
    random_state : int, optional
        Seed used for both splits and for the forest (default 42).

    Returns
    -------
    float or None
        The unrounded ``feature_importances_`` entry for ``variable``, so
        repeated runs can be averaged; ``None`` when ``variable`` is not a
        feature column. The rounded value is also printed as before.

    NOTE(review): this is the forest's built-in feature importance measured
    after retraining on a shuffled column, not a permutation importance
    scored on held-out data. The recorded runs show IDADE staying near 0.7
    even when shuffled. Confirm the metric is the intended one.
    """
    # Use numpy to convert to arrays
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    # mortes are the values we want to predict
    mortes = np.array(df['MORTE'])
    # Remove the mortes from the features (axis 1 refers to the columns)
    features_df = df.drop('MORTE', axis=1)
    # Saving feature names for later use
    feature_list = list(features_df.columns)
    features = np.array(features_df)

    # Both splits are kept so the training rows stay the same; the held-out
    # parts are not used here.
    train_features, _, train_mortes, _ = train_test_split(
        features, mortes, test_size=0.20, random_state=random_state)
    train_f_features, _, train_f_mortes, _ = train_test_split(
        train_features, train_mortes, test_size=0.20, random_state=random_state)

    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    # Train the model on training data
    rf.fit(train_f_features, train_f_mortes)

    # Author notes (translated):
    # - Problems with the feature importance: generating it from the same
    #   random forest that was fitted causes a problem; use an improvement.
    # - (1) Improve missing-data handling: impute per column (most common value).
    # - It tends to be biased by some variables, e.g. there are more men than
    #   women (gives more importance to one) [reduces the binary].
    # - Permutation: take each one out and compute the feature importance,
    #   then do another permutation and add it.
    # - Do a permutation and compute, only for the variable's column.
    # - Do the random shuffling and average over all arrangements.
    # - (2) Find the "random permutation" feature importance.
    importances = dict(zip(feature_list, rf.feature_importances_))
    if variable not in importances:
        return None

    importance = importances[variable]
    print("shuffle {} {}".format(variable, round(importance, 2)))
    return float(importance)

    

In [165]:
# Every feature column (all modelling columns except the MORTE target).
variables_for_shuffeling = ["IDADE", "VACINA", "ANTIVIRAL", "CARDIOPATI", "CS_SEXO", "PNEUMOPATI", "region-centro-oeste", "region-sudeste", "region-sul", "region-norte", "region-nordeste"]



# For each feature: shuffle that column 7 times, refit a forest each time and
# print the importance the refitted forest assigns to the shuffled column.
for i in variables_for_shuffeling:#TODO check
    for _ in range(7):
        df_shuffeled = shuffle_rows(df, i).copy()
        #print(df_shuffeled[i].head()) yes, it is doing the shuffle
        generate_rf(df_shuffeled, i)

shuffle IDADE 0.7
shuffle IDADE 0.71
shuffle IDADE 0.69
shuffle IDADE 0.7
shuffle IDADE 0.69
shuffle IDADE 0.7
shuffle IDADE 0.69
shuffle VACINA 0.06
shuffle VACINA 0.06
shuffle VACINA 0.06
shuffle VACINA 0.06
shuffle VACINA 0.06
shuffle VACINA 0.06
shuffle VACINA 0.07
shuffle ANTIVIRAL 0.05
shuffle ANTIVIRAL 0.05
shuffle ANTIVIRAL 0.05
shuffle ANTIVIRAL 0.05
shuffle ANTIVIRAL 0.05
shuffle ANTIVIRAL 0.05
shuffle ANTIVIRAL 0.05
shuffle CARDIOPATI 0.04
shuffle CARDIOPATI 0.04
shuffle CARDIOPATI 0.04
shuffle CARDIOPATI 0.04
shuffle CARDIOPATI 0.04
shuffle CARDIOPATI 0.04
shuffle CARDIOPATI 0.04
shuffle CS_SEXO 0.04
shuffle CS_SEXO 0.04
shuffle CS_SEXO 0.04
shuffle CS_SEXO 0.04
shuffle CS_SEXO 0.04
shuffle CS_SEXO 0.04
shuffle CS_SEXO 0.04
shuffle PNEUMOPATI 0.04
shuffle PNEUMOPATI 0.04
shuffle PNEUMOPATI 0.05
shuffle PNEUMOPATI 0.04
shuffle PNEUMOPATI 0.05
shuffle PNEUMOPATI 0.05
shuffle PNEUMOPATI 0.04
shuffle region-centro-oeste 0.02
shuffle region-centro-oeste 0.02
shuffle region-centr

In [53]:
# Module-level version of the data preparation done inside generate_rf,
# applied to the unshuffled df; the names defined here are reused by later cells.
# Use numpy to convert to arrays
import numpy as np
# mortes are the values we want to predict
mortes = np.array(df['MORTE'])
# Remove the mortes from the features
# axis 1 refers to the columns
df_drop= df.drop('MORTE', axis = 1)
# Saving feature names for later use
feature_list = list(df_drop.columns)
# Convert to numpy array
df_np = np.array(df_drop)

In [54]:
# First split: hold out 20% of the rows as the test set.
train_features, test_features, train_mortes, test_mortes = train_test_split(df_np, mortes, test_size = 0.20, random_state = 42)

# Second split: hold out 20% of the remaining rows as a validation ("cross") set;
# train_f_* is what the model is actually fitted on.
train_f_features, cross_features, train_f_mortes, cross_mortes = train_test_split(train_features, train_mortes, test_size = 0.20, random_state = 42)



In [55]:
# Peek at the held-out test labels.
test_mortes

array([0, 0, 1, ..., 0, 0, 0])

In [56]:
# Rows and feature columns left after the first (test) split.
train_features.shape

(9230, 11)

In [57]:
cross_features.shape #keep the validation set fixed
#(3). cross-validation with 5 splits

(1846, 11)

In [58]:
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
# (only the inner training split train_f_*; the trailing ';' suppresses the cell output)
rf.fit(train_f_features, train_f_mortes);

In [153]:
# Get numerical feature importances
importances = list(rf.feature_importances_) # problems with the feature importance
# Author notes (translated):
# - Generating the importance from the same random forest that was fitted
#   causes a problem; use an improvement.
# - (1) Improve missing-data handling: impute per column (most common value).
# - It tends to be biased by some variables, e.g. there are more men than
#   women (gives more importance to one) [reduces the binary].
# - Permutation: take each one out and compute the feature importance,
#   then do another permutation and add it.
# - Do a permutation and compute, only for the variable's column.
# - Do the random shuffling and average over all arrangements.
# - (2) Find the "random permutation" feature importance.

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
# (a plain loop: the previous list comprehension built a throwaway list just to call print)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))


#!!!!!!! save this in an important folder !!!!!!!!!!!!

Variable: IDADE                Importance: 0.7
Variable: VACINA               Importance: 0.05
Variable: ANTIVIRAL            Importance: 0.05
Variable: CARDIOPATI           Importance: 0.05
Variable: CS_SEXO              Importance: 0.04
Variable: PNEUMOPATI           Importance: 0.04
Variable: region-centro-oeste  Importance: 0.02
Variable: region-sudeste       Importance: 0.02
Variable: region-sul           Importance: 0.02
Variable: region-norte         Importance: 0.01
Variable: region-nordeste      Importance: 0.01
IDADE


In [60]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = np.abs(predictions - test_mortes)
# Print out the mean absolute error (mae).
# MORTE is a 0/1 label, so the error has no unit; the old 'degrees.' suffix was wrong.
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 0.2 degrees.


In [61]:
from sklearn.model_selection import cross_val_score

In [65]:
# Mean of the 10-fold cross-validation scores on the validation split only.
# The scorer is the negated MAE, so values closer to 0 are better.
np.mean(cross_val_score(rf, cross_features, cross_mortes,scoring='neg_mean_absolute_error', cv=10))

-0.21220268801410108

In [64]:
# Same 10-fold score, computed on everything left after the first (test) split.
np.mean(cross_val_score(rf, train_features, train_mortes,scoring='neg_mean_absolute_error', cv=10))

#tutorial on how to do cross-validation
# run tests to do it in a partitioned way

-0.20690970807496947