# Inicialização

In [6]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

from plotly import graph_objects as go
import plotly as py

from datetime import datetime
from datetime import timedelta

import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, accuracy_score, f1_score, confusion_matrix, recall_score, precision_score

from sklearn.utils import resample

# Funções

In [7]:
def upsampleData(X, label):
    # Separate true and false
    false_label = X[X[label]==0].copy()
    true_label = X[X[label]==1].copy()
    
    # Upsample true values
    label_upsampled = resample(true_label,
                            replace=True, # sample with replacement
                            n_samples=len(false_label), # match number in majority class
                            random_state=378) # reproducible results
    upsampled = pd.concat([false_label, label_upsampled])
    
    # Separate x and y
    x = upsampled[[c for c in X.columns if label not in c]]
    y = upsampled[label]
    
    return x, y

In [8]:
def trainXGB(df, cols_rem, label, verbose=True):
    xgb = xgboost.XGBClassifier()

    # Separate x and y and remove unnecessary columns
    x = df[[c for c in df.columns if c not in cols_rem]]
    y = df[label]
    
    # Split training and test data
    x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state = 378, stratify=y)
    
    # Upsample true values
    X = pd.concat([x_treino, y_treino], axis=1)
    x_treino, y_treino = upsampleData(X, label)

    # XGBClassifier parameters
    param = {'max_depth':50, 'eta':1, 'objective':'binary:logistic', 'min_child_weight': 1, 'lambda': 1, 'alpha': 0, 'gamma': 0}

    # Generate DMatrices with training and test data
    df_train = xgboost.DMatrix(data=x_treino, label=y_treino)
    df_test = xgboost.DMatrix(data=x_teste, label=y_teste)

    # Train model and predict on training and test data
    bst = xgboost.train(param, df_train, 2, feval=f1_score)
    y_teste_pred = bst.predict(xgboost.DMatrix(data=x_teste, label=y_teste))
    y_teste_pred = [1 if i>0.5 else 0 for i in y_teste_pred]
    y_treino_pred = bst.predict(xgboost.DMatrix(data=x_treino, label=y_treino))
    y_treino_pred = [1 if i>0.5 else 0 for i in y_treino_pred]
    
    # Print results if verbose is true
    if verbose:
        print(f"Treino: {accuracy_score(y_treino, y_treino_pred)}")
        print(f"Teste: {accuracy_score(y_teste, y_teste_pred)}")
        print(f"Precisão: {precision_score(y_teste, y_teste_pred)}")
        print(f"Recall: {recall_score(y_teste, y_teste_pred)}")
        print(f"F1: {f1_score(y_teste, y_teste_pred)}")
        display(confusion_matrix(y_teste, y_teste_pred, normalize='true'))
        display(confusion_matrix(y_teste, y_teste_pred,))
        
    # Store results in a dict
    results = {
        'Features': list(x.columns),
        'Train_Acc': accuracy_score(y_treino, y_treino_pred),
        'Test_Acc': accuracy_score(y_teste, y_teste_pred),
        'Precision': precision_score(y_teste, y_teste_pred),
        'Recall': recall_score(y_teste, y_teste_pred),
        'F1': f1_score(y_teste, y_teste_pred),
        'Ver_Pos': confusion_matrix(y_teste, y_teste_pred, normalize='true')[1,1]
    }
    
    return bst, results, y_treino_pred, y_teste_pred

# Prepped Data

In [9]:
df_p = pd.read_csv('../../../data/cleandata/Info pluviometricas/Merged Data/prepped_data.csv', sep=';')
df_p.groupby('Label').count()

Unnamed: 0_level_0,Data_Hora,Mes,Dia,Local,Precipitacao,PrecSum
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,359227,359227,359227,359227,359227,359227
1.0,6798,6798,6798,6798,6798,6798


In [10]:
df_p = df_p.sort_values(['Data_Hora', 'Local'])
df_p['Label'] = df_p['Label'].shift(-5*6, fill_value = 0)

In [11]:
# Parameters
label = 'Label'
cols_rem = ['LocalMax', 'Label', 'Label_Old', 'Cluster', 'Data', 'Hora', 'Data_Hora', 'Ordens', 'Minuto'] + [c for c in df_p.columns if 'Hora_' in c]
# Result set
prepped_models = {}

for l in range(6):
    if l != 0:
        df_train = df_p[df_p['Local'] == l]
    else:
        df_train = df_p.copy()
        
    print(f'----- LOCAL {l} -----')
    model, training_res, y_treino_pred, y_teste_pred = trainXGB(df_train, cols_rem, label)
    
    prepped_models[l] = {
        'model': model,
        'results': training_res,
        'y_treino': y_treino_pred,
        'y_teste': y_teste_pred
    }

----- LOCAL 0 -----
Treino: 0.9813408203358016
Teste: 0.9623797901792219
Precisão: 0.3252005347593583
Recall: 0.9543894065718489
F1: 0.4851053221986788


array([[0.96253097, 0.03746903],
       [0.04561059, 0.95438941]])

array([[103731,   4038],
       [    93,   1946]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9802352660177942
Teste: 0.9577451962480649
Precisão: 0.3102661596958175
Recall: 0.951048951048951
F1: 0.4678899082568807


array([[0.9578786 , 0.0421214 ],
       [0.04895105, 0.95104895]])

array([[20626,   907],
       [   21,   408]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9836959761549926
Teste: 0.9668973681814043
Precisão: 0.3473877176901925
Recall: 0.9619289340101523
F1: 0.5104377104377106


array([[0.96698813, 0.03301187],
       [0.03807107, 0.96192893]])

array([[20856,   712],
       [   15,   379]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9804565756823821
Teste: 0.9616610509061105
Precisão: 0.3028523489932886
Recall: 0.9704301075268817
F1: 0.46163682864450134


array([[0.96150996, 0.03849004],
       [0.02956989, 0.97043011]])

array([[20759,   831],
       [   11,   361]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9791069152326323
Teste: 0.9572443311173846
Precisão: 0.3181484202792065
Recall: 0.9752252252252253
F1: 0.47977839335180056


array([[0.95687332, 0.04312668],
       [0.02477477, 0.97522523]])

array([[20590,   928],
       [   11,   433]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.982269221594975
Teste: 0.9608414534195429
Precisão: 0.31295843520782396
Recall: 0.9576059850374065
F1: 0.47174447174447176


array([[0.96090163, 0.03909837],
       [0.04239401, 0.95760599]])

array([[20718,   843],
       [   17,   384]], dtype=int64)

In [12]:
prepped_models[0]['results']

{'Features': ['Mes', 'Dia', 'Local', 'Precipitacao', 'PrecSum'],
 'Train_Acc': 0.9813408203358016,
 'Test_Acc': 0.9623797901792219,
 'Precision': 0.3252005347593583,
 'Recall': 0.9543894065718489,
 'F1': 0.4851053221986788,
 'Ver_Pos': 0.9543894065718489}

# Full Data

In [13]:
df_f = pd.read_csv('../../../data/cleandata/Info pluviometricas/Merged Data/full_data.csv', sep=';')
display(df_f.head())
df_f.shape

Unnamed: 0,Data_Hora,LocalMax_d_All,LocalMax_d_ow,Local_d_Null,LocalMax_h_All,LocalMax_h_ow,Local_h_Null,Local,UmidadeRelativa,PressaoAtmosferica,...,TemperaturaInterna,PontoDeOrvalho,RadiacaoSolar,DirecaoDoVento,VelocidadeDoVento,Precipitacao,LocalMax_d,LocalMax_h,Local_d,Local_h
0,2011-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,RM,86.6,926.7,...,23.3,17.8,0.0,131.0,2.5,0.0,0.0,0.0,0.0,0.0
1,2011-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,Erasmo,88.8,920.5,...,22.9,17.7,0.0,133.0,0.7,0.0,0.0,0.0,0.0,0.0
2,2011-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,Camilopolis,84.8,920.6,...,22.8,17.5,0.0,137.0,1.5,0.0,0.0,0.0,0.0,0.0
3,2011-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,Paraiso,92.3,919.1,...,22.7,18.0,0.0,101.0,2.6,0.0,0.0,0.0,0.0,0.0
4,2011-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,Vitoria,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(1464030, 21)

In [14]:
df_f['Data_Hora'] = pd.to_datetime(df_f['Data_Hora'], yearfirst=True)
df_f = df_f[df_f['Data_Hora'].dt.minute == 0]
df_f = df_f.drop(columns = ['LocalMax_d_ow', 'LocalMax_h_All', 'LocalMax_h', 'LocalMax_h_ow', 'LocalMax_d'] + [c for c in df_f.columns if 'Local_' in c])
df_f = df_f.rename(columns = {'LocalMax_d_All': 'Label'})
df_f['Dia'] = df_f['Data_Hora'].dt.day
df_f['Mes'] = df_f['Data_Hora'].dt.month
df_f['Data'] = df_f['Data_Hora'].dt.strftime('%Y-%m-%d')
df_f['Local'] = df_f['Local'].replace({'Camilopolis': 1, 'Erasmo': 2, 'Paraiso': 3, 'RM': 4, 'Vitoria': 5})

In [15]:
df_prec_sum = df_f.groupby(['Data', 'Local']).sum().reset_index()[['Data', 'Local', 'Precipitacao']]
df_prec_sum.columns = ['Data', 'Local', 'PrecSum']
df_f = df_f.merge(df_prec_sum, on=['Data', 'Local'])
df_f.loc[(df_f['Label'] == 1) & (df_f['PrecSum'] <= 10), 'Label'] = 0

In [16]:
cols_dummies = ['Local', 'Mes',]# 'Dia']

df_f_ohe = df_f.copy()

for c in cols_dummies:
    df_f_ohe = pd.concat([df_f_ohe, pd.get_dummies(df_f[c], prefix=c)], axis=1)
    
df_f_ohe = df_f_ohe.sort_values(['Data', 'Local'])

df_f_ohe['Label_Old'] = df_f_ohe['Label']
df_f_ohe['Label'] = df_f_ohe['Label'].shift(-5*6, fill_value = 0)

In [17]:
df_f_ohe.columns

Index(['Data_Hora', 'Label', 'Local', 'UmidadeRelativa', 'PressaoAtmosferica',
       'TemperaturaDoAr', 'TemperaturaInterna', 'PontoDeOrvalho',
       'RadiacaoSolar', 'DirecaoDoVento', 'VelocidadeDoVento', 'Precipitacao',
       'Dia', 'Mes', 'Data', 'PrecSum', 'Local_1', 'Local_2', 'Local_3',
       'Local_4', 'Local_5', 'Mes_1', 'Mes_2', 'Mes_3', 'Mes_4', 'Mes_5',
       'Mes_6', 'Mes_7', 'Mes_8', 'Mes_9', 'Mes_10', 'Mes_11', 'Mes_12',
       'Label_Old'],
      dtype='object')

In [18]:
test_cases = [
    [],
    ['DirecaoDoVento', 'VelocidadeDoVento'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr', 'Local'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr', 'PrecSum'],
    ['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr', 'Precipitacao']
]

df_training_result = pd.DataFrame(columns = ['Removed_Cols', 'Local', 'Features', 'Train_Acc', 'Test_Acc', 'Precision', 'Recall', 'F1', 'Ver_Pos'])
label = 'Label'

for case in test_cases:
    print(f'---------- CASE ----------')
    print(case)
    print(f'--------------------------')
    for l in range(6):
        if l != 0:
            df_train = df_f_ohe[df_f_ohe['Local'] == l].drop(columns = [c for c in df_f_ohe.columns if 'Local' in c])
        else:
            df_train = df_f_ohe.copy()

        cols_rem = ['Label', 'Label_Old', 'Data', 'Data_Hora'] + cols_dummies
        cols_rem = cols_rem + case
            
        print(f'----- LOCAL {l} -----')
        model, training_res, y_treino_pred, y_teste_pred = trainXGB(df_train, cols_rem, label)
        
        df_training_result = df_training_result.append(
            {**{'Model': model, 'Removed_Cols': case, 'Local': l}, **training_res},
            ignore_index=True
        )

---------- CASE ----------
[]
--------------------------
----- LOCAL 0 -----
Treino: 0.9965700037381988
Teste: 0.9868862013696634
Precisão: 0.6057183198023297
Recall: 0.8415890142226582
F1: 0.7044334975369458


array([[0.98963524, 0.01036476],
       [0.15841099, 0.84158901]])

array([[106652,   1117],
       [   323,   1716]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9982319168802273
Teste: 0.9911210272288499
Precisão: 0.7031578947368421
Recall: 0.8608247422680413
F1: 0.7740440324449596


array([[0.99346436, 0.00653564],
       [0.13917526, 0.86082474]])

array([[21433,   141],
       [   54,   334]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9965230867042836
Teste: 0.9910754940351516
Precisão: 0.6837121212121212
Recall: 0.9256410256410257
F1: 0.7864923747276689


array([[0.99225848, 0.00774152],
       [0.07435897, 0.92564103]])

array([[21405,   167],
       [   29,   361]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9987060556595134
Teste: 0.9924414898460978
Precisão: 0.7763975155279503
Recall: 0.8660508083140878
F1: 0.8187772925764193


array([[0.99498351, 0.00501649],
       [0.13394919, 0.86605081]])

array([[21421,   108],
       [   58,   375]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9956357490804255
Teste: 0.9815590565522265
Precisão: 0.5016556291390728
Recall: 0.7444717444717445
F1: 0.5994065281899109


array([[0.98603572, 0.01396428],
       [0.25552826, 0.74447174]])

array([[21254,   301],
       [  104,   303]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.995970870888796
Teste: 0.9858391767598579
Precisão: 0.5982300884955752
Recall: 0.8009478672985783
F1: 0.684903748733536


array([[0.98946147, 0.01053853],
       [0.19905213, 0.80094787]])

array([[21313,   227],
       [   84,   338]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento']
--------------------------
----- LOCAL 0 -----
Treino: 0.9968066237701724
Teste: 0.9888077371411919
Precisão: 0.6486784140969163
Recall: 0.8666012751348701
F1: 0.7419693470501785


array([[0.9911199 , 0.0088801 ],
       [0.13339872, 0.86660128]])

array([[106812,    957],
       [   272,   1767]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9984206448536862
Teste: 0.9926236226208907
Precisão: 0.7456521739130435
Recall: 0.884020618556701
F1: 0.8089622641509434


array([[0.99457681, 0.00542319],
       [0.11597938, 0.88402062]])

array([[21457,   117],
       [   45,   343]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9964734165143447
Teste: 0.9914852927784354
Precisão: 0.6948176583493282
Recall: 0.9282051282051282
F1: 0.7947310647639957


array([[0.99262933, 0.00737067],
       [0.07179487, 0.92820513]])

array([[21413,   159],
       [   28,   362]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9988254966755584
Teste: 0.9947181495310081
Precisão: 0.8408602150537634
Recall: 0.9030023094688222
F1: 0.8708240534521159


array([[0.99656278, 0.00343722],
       [0.09699769, 0.90300231]])

array([[21455,    74],
       [   42,   391]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.995615866388309
Teste: 0.9824697204261907
Precisão: 0.5180921052631579
Recall: 0.773955773955774
F1: 0.6206896551724138


array([[0.98640687, 0.01359313],
       [0.22604423, 0.77395577]])

array([[21262,   293],
       [   92,   315]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9960803040251497
Teste: 0.9862034423094436
Precisão: 0.6053097345132743
Recall: 0.8104265402843602
F1: 0.6930091185410334


array([[0.98964717, 0.01035283],
       [0.18957346, 0.81042654]])

array([[21317,   223],
       [   80,   342]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna']
--------------------------
----- LOCAL 0 -----
Treino: 0.9969259279879741
Teste: 0.9894269998542912
Precisão: 0.6647897897897898
Recall: 0.868563021088769
F1: 0.7531362959812885


array([[0.99171376, 0.00828624],
       [0.13143698, 0.86856302]])

array([[106876,    893],
       [   268,   1771]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9983312473925741
Teste: 0.9928968217830799
Precisão: 0.75
Recall: 0.8969072164948454
F1: 0.8169014084507044


array([[0.99462316, 0.00537684],
       [0.10309278, 0.89690722]])

array([[21458,   116],
       [   40,   348]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9965330207422713
Teste: 0.9914397595847373
Precisão: 0.6927480916030534
Recall: 0.9307692307692308
F1: 0.7943107221006565


array([[0.99253662, 0.00746338],
       [0.06923077, 0.93076923]])

array([[21411,   161],
       [   27,   363]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9988752637655771
Teste: 0.9949913486931974
Precisão: 0.84
Recall: 0.9214780600461894
F1: 0.8788546255506609


array([[0.99646988, 0.00353012],
       [0.07852194, 0.92147806]])

array([[21453,    76],
       [   34,   399]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.99526791927627
Teste: 0.9815135233585284
Precisão: 0.5008130081300813
Recall: 0.7567567567567568
F1: 0.6027397260273973


array([[0.98575736, 0.01424264],
       [0.24324324, 0.75675676]])

array([[21248,   307],
       [   99,   308]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9961101494259734
Teste: 0.9859302431472543
Precisão: 0.6
Recall: 0.8033175355450237
F1: 0.6869300911854103


array([[0.98950789, 0.01049211],
       [0.19668246, 0.80331754]])

array([[21314,   226],
       [   83,   339]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho']
--------------------------
----- LOCAL 0 -----
Treino: 0.9969756380787249
Teste: 0.9896546699694011
Precisão: 0.6703130894002264
Recall: 0.8715056400196175
F1: 0.7577825159914714


array([[0.99189006, 0.00810994],
       [0.12849436, 0.87150564]])

array([[106895,    874],
       [   262,   1777]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9984405109561555
Teste: 0.994035151625535
Precisão: 0.7811816192560175
Recall: 0.9201030927835051
F1: 0.8449704142011834


array([[0.99536479, 0.00463521],
       [0.07989691, 0.92010309]])

array([[21474,   100],
       [   31,   357]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9967416355400143
Teste: 0.9916674255532283
Precisão: 0.6963946869070209
Recall: 0.941025641025641
F1: 0.8004362050163577


array([[0.99258298, 0.00741702],
       [0.05897436, 0.94102564]])

array([[21412,   160],
       [   23,   367]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9986761954055022
Teste: 0.9944449503688189
Precisão: 0.8128772635814889
Recall: 0.9330254041570438
F1: 0.8688172043010752


array([[0.99568025, 0.00431975],
       [0.0669746 , 0.9330254 ]])

array([[21436,    93],
       [   29,   404]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9961328163833383
Teste: 0.983653583462344
Precisão: 0.54
Recall: 0.7960687960687961
F1: 0.6434955312810328


array([[0.98719555, 0.01280445],
       [0.2039312 , 0.7960688 ]])

array([[21279,   276],
       [   83,   324]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9965876758391532
Teste: 0.9885256351880521
Precisão: 0.6591760299625468
Recall: 0.8341232227488151
F1: 0.7364016736401672


array([[0.9915506 , 0.0084494 ],
       [0.16587678, 0.83412322]])

array([[21358,   182],
       [   70,   352]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa']
--------------------------
----- LOCAL 0 -----
Treino: 0.996983591693245
Teste: 0.9899369809121376
Precisão: 0.6739940387481371
Recall: 0.8871996076508092
F1: 0.7660385348295575


array([[0.99188078, 0.00811922],
       [0.11280039, 0.88719961]])

array([[106894,    875],
       [   230,   1809]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9983411804438087
Teste: 0.9934432201074583
Precisão: 0.7618025751072961
Recall: 0.9149484536082474
F1: 0.8313817330210772


array([[0.99485492, 0.00514508],
       [0.08505155, 0.91494845]])

array([[21463,   111],
       [   33,   355]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9968012397679409
Teste: 0.9932155541389673
Precisão: 0.7330754352030948
Recall: 0.9717948717948718
F1: 0.8357221609702316


array([[0.99360282, 0.00639718],
       [0.02820513, 0.97179487]])

array([[21434,   138],
       [   11,   379]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9983278257753713
Teste: 0.9939896184318369
Precisão: 0.796844181459566
Recall: 0.9330254041570438
F1: 0.8595744680851064


array([[0.99521576, 0.00478424],
       [0.0669746 , 0.9330254 ]])

array([[21426,   103],
       [   29,   404]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9966100009941347
Teste: 0.9869319734086148
Precisão: 0.6140684410646388
Recall: 0.7936117936117936
F1: 0.6923901393354769


array([[0.99058223, 0.00941777],
       [0.20638821, 0.79361179]])

array([[21352,   203],
       [   84,   323]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.995234684335144
Teste: 0.9862034423094436
Precisão: 0.5993322203672788
Recall: 0.8507109004739336
F1: 0.703232125367287


array([[0.98885794, 0.01114206],
       [0.1492891 , 0.8507109 ]])

array([[21300,   240],
       [   63,   359]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica']
--------------------------
----- LOCAL 0 -----
Treino: 0.9965362008764883
Teste: 0.9900189421535771
Precisão: 0.6678533285866857
Recall: 0.920058852378617
F1: 0.7739273927392739


array([[0.99134259, 0.00865741],
       [0.07994115, 0.92005885]])

array([[106836,    933],
       [   163,   1876]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9983610465462781
Teste: 0.9945815499499134
Precisão: 0.7880085653104925
Recall: 0.9484536082474226
F1: 0.8608187134502924


array([[0.99541114, 0.00458886],
       [0.05154639, 0.94845361]])

array([[21475,    99],
       [   20,   368]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9958078359691648
Teste: 0.9917584919406247
Precisão: 0.6882882882882883
Recall: 0.9794871794871794
F1: 0.8084656084656084


array([[0.99198034, 0.00801966],
       [0.02051282, 0.97948718]])

array([[21399,   173],
       [    8,   382]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9982183381773301
Teste: 0.9947636827247063
Precisão: 0.8117647058823529
Recall: 0.9561200923787528
F1: 0.878048780487805


array([[0.9955409 , 0.0044591 ],
       [0.04387991, 0.95612009]])

array([[21433,    96],
       [   19,   414]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9957451038870663
Teste: 0.9853383116291776
Precisão: 0.5704809286898839
Recall: 0.8452088452088452
F1: 0.6811881188118812


array([[0.98798423, 0.01201577],
       [0.15479115, 0.84520885]])

array([[21296,   259],
       [   63,   344]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9947571579219642
Teste: 0.9874783717329934
Precisão: 0.6218905472636815
Recall: 0.8886255924170616
F1: 0.7317073170731707


array([[0.98941504, 0.01058496],
       [0.11137441, 0.88862559]])

array([[21312,   228],
       [   47,   375]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar']
--------------------------
----- LOCAL 0 -----
Treino: 0.9961504505722626
Teste: 0.989581815532566
Precisão: 0.6556521739130434
Recall: 0.9244727807748897
F1: 0.7671957671957671


array([[0.99081368, 0.00918632],
       [0.07552722, 0.92447278]])

array([[106779,    990],
       [   154,   1885]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9982219838289926
Teste: 0.9943538839814224
Precisão: 0.7832618025751072
Recall: 0.9407216494845361
F1: 0.8548009367681498


array([[0.99531844, 0.00468156],
       [0.05927835, 0.94072165]])

array([[21473,   101],
       [   23,   365]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9957084955892871
Teste: 0.9917584919406247
Precisão: 0.6882882882882883
Recall: 0.9794871794871794
F1: 0.8084656084656084


array([[0.99198034, 0.00801966],
       [0.02051282, 0.97948718]])

array([[21399,   173],
       [    8,   382]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9981188039972927
Teste: 0.9951734814679901
Precisão: 0.8237623762376237
Recall: 0.9607390300230947
F1: 0.8869936034115138


array([[0.99586604, 0.00413396],
       [0.03926097, 0.96073903]])

array([[21440,    89],
       [   17,   416]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.994532259667959
Teste: 0.9838357162371368
Precisão: 0.5416666666666666
Recall: 0.8304668304668305
F1: 0.6556741028128031


array([[0.98673162, 0.01326838],
       [0.16953317, 0.83046683]])

array([[21269,   286],
       [   69,   338]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9933643725501901
Teste: 0.9841999817867225
Precisão: 0.5547445255474452
Recall: 0.9004739336492891
F1: 0.6865401987353207


array([[0.9858403 , 0.0141597 ],
       [0.09952607, 0.90047393]])

array([[21235,   305],
       [   42,   380]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr']
--------------------------
----- LOCAL 0 -----
Treino: 0.9868944316744744
Teste: 0.9740638204866676
Precisão: 0.41678666940958653
Recall: 0.9936243256498284
F1: 0.5872463768115942


array([[0.97369373, 0.02630627],
       [0.00637567, 0.99362433]])

array([[104934,   2835],
       [    13,   2026]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9944772235135189
Teste: 0.9885256351880521
Precisão: 0.60828025477707
Recall: 0.9845360824742269
F1: 0.7519685039370079


array([[0.98859739, 0.01140261],
       [0.01546392, 0.98453608]])

array([[21328,   246],
       [    6,   382]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.988555988238099
Teste: 0.9762772060832346
Precisão: 0.42778390297684676
Recall: 0.9948717948717949
F1: 0.5983037779491134


array([[0.97594103, 0.02405897],
       [0.00512821, 0.99487179]])

array([[21053,   519],
       [    2,   388]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9925349364971932
Teste: 0.9834714506875513
Precisão: 0.5441919191919192
Recall: 0.9953810623556582
F1: 0.7036734693877551


array([[0.98323192, 0.01676808],
       [0.00461894, 0.99538106]])

array([[21168,   361],
       [    2,   431]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9783278655930013
Teste: 0.954694472270285
Precisão: 0.28664731494920176
Recall: 0.9705159705159705
F1: 0.44257703081232497


array([[0.95439573, 0.04560427],
       [0.02948403, 0.97051597]])

array([[20572,   983],
       [   12,   395]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9818241508983465
Teste: 0.9645296421090975
Precisão: 0.3511259382819016
Recall: 0.9976303317535545
F1: 0.5194324491054905


array([[0.96388115, 0.03611885],
       [0.00236967, 0.99763033]])

array([[20762,   778],
       [    1,   421]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr', 'Local']
--------------------------
----- LOCAL 0 -----
Treino: 0.9868944316744744
Teste: 0.9740638204866676
Precisão: 0.41678666940958653
Recall: 0.9936243256498284
F1: 0.5872463768115942


array([[0.97369373, 0.02630627],
       [0.00637567, 0.99362433]])

array([[104934,   2835],
       [    13,   2026]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9944772235135189
Teste: 0.9885256351880521
Precisão: 0.60828025477707
Recall: 0.9845360824742269
F1: 0.7519685039370079


array([[0.98859739, 0.01140261],
       [0.01546392, 0.98453608]])

array([[21328,   246],
       [    6,   382]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.988555988238099
Teste: 0.9762772060832346
Precisão: 0.42778390297684676
Recall: 0.9948717948717949
F1: 0.5983037779491134


array([[0.97594103, 0.02405897],
       [0.00512821, 0.99487179]])

array([[21053,   519],
       [    2,   388]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9925349364971932
Teste: 0.9834714506875513
Precisão: 0.5441919191919192
Recall: 0.9953810623556582
F1: 0.7036734693877551


array([[0.98323192, 0.01676808],
       [0.00461894, 0.99538106]])

array([[21168,   361],
       [    2,   431]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9783278655930013
Teste: 0.954694472270285
Precisão: 0.28664731494920176
Recall: 0.9705159705159705
F1: 0.44257703081232497


array([[0.95439573, 0.04560427],
       [0.02948403, 0.97051597]])

array([[20572,   983],
       [   12,   395]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9818241508983465
Teste: 0.9645296421090975
Precisão: 0.3511259382819016
Recall: 0.9976303317535545
F1: 0.5194324491054905


array([[0.96388115, 0.03611885],
       [0.00236967, 0.99763033]])

array([[20762,   778],
       [    1,   421]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr', 'PrecSum']
--------------------------
----- LOCAL 0 -----
Treino: 0.9192767778316856
Teste: 0.8459128660935451
Precisão: 0.1000806234883096
Recall: 0.9131927415399705
F1: 0.18039139701608214


array([[0.84463992, 0.15536008],
       [0.08680726, 0.91319274]])

array([[91026, 16743],
       [  177,  1862]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.923743965671375
Teste: 0.8551133776523085
Precisão: 0.09717416378316032
Recall: 0.8685567010309279
F1: 0.17479253112033194


array([[0.8548716, 0.1451284],
       [0.1314433, 0.8685567]])

array([[18443,  3131],
       [   51,   337]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9309584359850592
Teste: 0.8634914852927784
Precisão: 0.10699216395418927
Recall: 0.9102564102564102
F1: 0.19147788565264295


array([[0.86264602, 0.13735398],
       [0.08974359, 0.91025641]])

array([[18609,  2963],
       [   35,   355]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9236075168212764
Teste: 0.8514251889627539
Precisão: 0.10650723025583982
Recall: 0.8845265588914549
F1: 0.1901216182675602


array([[0.85075944, 0.14924056],
       [0.11547344, 0.88452656]])

array([[18316,  3213],
       [   50,   383]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.8957152798488915
Teste: 0.7953738275202623
Precisão: 0.07559709241952232
Recall: 0.8943488943488943
F1: 0.1394101876675603


array([[0.79350499, 0.20649501],
       [0.10565111, 0.89434889]])

array([[17104,  4451],
       [   43,   364]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9258043335521996
Teste: 0.8593934978599399
Precisão: 0.11317469529889727
Recall: 0.9241706161137441
F1: 0.20165460186142709


array([[0.85812442, 0.14187558],
       [0.07582938, 0.92417062]])

array([[18484,  3056],
       [   32,   390]], dtype=int64)

---------- CASE ----------
['DirecaoDoVento', 'VelocidadeDoVento', 'TemperaturaInterna', 'PontoDeOrvalho', 'UmidadeRelativa', 'PressaoAtmosferica', 'RadiacaoSolar', 'TemperaturaDoAr', 'Precipitacao']
--------------------------
----- LOCAL 0 -----
Treino: 0.9866717304679111
Teste: 0.9738361503715576
Precisão: 0.4151058631921824
Recall: 1.0
F1: 0.5866781758020428


array([[0.97334113, 0.02665887],
       [0.        , 1.        ]])

array([[104896,   2873],
       [     0,   2039]], dtype=int64)

----- LOCAL 1 -----
Treino: 0.9938613743369689
Teste: 0.98807030325107
Precisão: 0.5969230769230769
Recall: 1.0
F1: 0.7475915221579961


array([[0.98785575, 0.01214425],
       [0.        , 1.        ]])

array([[21312,   262],
       [    0,   388]], dtype=int64)

----- LOCAL 2 -----
Treino: 0.9882778351744417
Teste: 0.9757763409525544
Precisão: 0.4229934924078091
Recall: 1.0
F1: 0.5945121951219512


array([[0.9753384, 0.0246616],
       [0.       , 1.       ]])

array([[21040,   532],
       [    0,   390]], dtype=int64)

----- LOCAL 3 -----
Treino: 0.9925150296611857
Teste: 0.9837901830434387
Precisão: 0.5487959442332065
Recall: 1.0
F1: 0.7086743044189853


array([[0.98346416, 0.01653584],
       [0.        , 1.        ]])

array([[21173,   356],
       [    0,   433]], dtype=int64)

----- LOCAL 4 -----
Treino: 0.9776021473307486
Teste: 0.9537838083963209
Precisão: 0.28285714285714286
Recall: 0.972972972972973
F1: 0.438295517432208


array([[0.95342148, 0.04657852],
       [0.02702703, 0.97297297]])

array([[20551,  1004],
       [   11,   396]], dtype=int64)

----- LOCAL 5 -----
Treino: 0.9817147177619929
Teste: 0.9644385757217011
Precisão: 0.3507896924355777
Recall: 1.0
F1: 0.5193846153846153


array([[0.96374188, 0.03625812],
       [0.        , 1.        ]])

array([[20759,   781],
       [    0,   422]], dtype=int64)

In [19]:
#df_training_result.to_csv('../../../data/analysis/training_test_shift.csv', index=False, sep=';', decimal=',')
df_training_result

Unnamed: 0,Removed_Cols,Local,Features,Train_Acc,Test_Acc,Precision,Recall,F1,Ver_Pos,Model
0,[],0,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.996570,0.986886,0.605718,0.841589,0.704433,0.841589,<xgboost.core.Booster object at 0x0000015C4D70...
1,[],1,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.998232,0.991121,0.703158,0.860825,0.774044,0.860825,<xgboost.core.Booster object at 0x0000015C4D70...
2,[],2,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.996523,0.991075,0.683712,0.925641,0.786492,0.925641,<xgboost.core.Booster object at 0x0000015C4D70...
3,[],3,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.998706,0.992441,0.776398,0.866051,0.818777,0.866051,<xgboost.core.Booster object at 0x0000015C4D70...
4,[],4,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.995636,0.981559,0.501656,0.744472,0.599407,0.744472,<xgboost.core.Booster object at 0x0000015C4D70...
...,...,...,...,...,...,...,...,...,...,...
61,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",1,"[Dia, PrecSum, Mes_1, Mes_2, Mes_3, Mes_4, Mes...",0.993861,0.988070,0.596923,1.000000,0.747592,1.000000,<xgboost.core.Booster object at 0x0000015C7081...
62,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",2,"[Dia, PrecSum, Mes_1, Mes_2, Mes_3, Mes_4, Mes...",0.988278,0.975776,0.422993,1.000000,0.594512,1.000000,<xgboost.core.Booster object at 0x0000015C7081...
63,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",3,"[Dia, PrecSum, Mes_1, Mes_2, Mes_3, Mes_4, Mes...",0.992515,0.983790,0.548796,1.000000,0.708674,1.000000,<xgboost.core.Booster object at 0x0000015C7081...
64,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",4,"[Dia, PrecSum, Mes_1, Mes_2, Mes_3, Mes_4, Mes...",0.977602,0.953784,0.282857,0.972973,0.438296,0.972973,<xgboost.core.Booster object at 0x0000015C7081...


In [20]:
df_best_local = pd.DataFrame(columns = df_training_result.columns)

for l in range(6):
    df_best_local = df_best_local.append(df_training_result[(df_training_result['Local'] == l)].sort_values('F1', ascending=False).reset_index(drop=True).loc[0])
    
df_best_local

Unnamed: 0,Removed_Cols,Local,Features,Train_Acc,Test_Acc,Precision,Recall,F1,Ver_Pos,Model
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",0,"[TemperaturaDoAr, RadiacaoSolar, Precipitacao,...",0.996536,0.990019,0.667853,0.920059,0.773927,0.920059,<xgboost.core.Booster object at 0x0000015C4D6F...
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",1,"[TemperaturaDoAr, RadiacaoSolar, Precipitacao,...",0.998361,0.994582,0.788009,0.948454,0.860819,0.948454,<xgboost.core.Booster object at 0x0000015C4D6F...
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",2,"[PressaoAtmosferica, TemperaturaDoAr, Radiacao...",0.996801,0.993216,0.733075,0.971795,0.835722,0.971795,<xgboost.core.Booster object at 0x0000015C4D6F...
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",3,"[TemperaturaDoAr, Precipitacao, Dia, PrecSum, ...",0.998119,0.995173,0.823762,0.960739,0.886994,0.960739,<xgboost.core.Booster object at 0x0000015C4D6F...
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",4,"[PressaoAtmosferica, TemperaturaDoAr, Radiacao...",0.99661,0.986932,0.614068,0.793612,0.69239,0.793612,<xgboost.core.Booster object at 0x0000015C4D6F...
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",5,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.996588,0.988526,0.659176,0.834123,0.736402,0.834123,<xgboost.core.Booster object at 0x0000015C4D6F...


In [21]:
#df_best_local.to_csv('../../../data/analysis/best_shift_local.csv', index=False, sep=';', decimal=',')

# OWM

In [22]:
df_owm = pd.read_csv('../../../data/cleandata/Info pluviometricas/Merged Data/OpenWeatherMapSantoAndre.csv')

In [23]:
df_owm['Data_Hora'] = pd.to_datetime(df_owm['dt_iso'].str[:-10])
df_owm['Data_Hora'] = df_owm.apply(lambda x: x['Data_Hora'] + pd.Timedelta(hours = x['timezone'] / 3600), axis = 1)
df_owm = df_owm[(datetime.strptime('2019-08-30', '%Y-%m-%d') >= df_owm['Data_Hora']) & (df_owm['Data_Hora'] >= datetime.strptime('2010-01-01', '%Y-%m-%d'))]
df_owm = df_owm.drop(columns = ['sea_level', 'grnd_level', 'rain_3h', 'snow_1h', 'snow_3h'])
df_owm = df_owm.fillna(0)
df_owm = df_owm.drop_duplicates(subset='Data_Hora')

In [24]:
df_loc = pd.read_csv('../../../data/cleandata/Ordens de serviço/labels_day.csv', sep=';')
df_loc['Data'] = pd.to_datetime(df_loc['Data'], yearfirst=True)
df_loc = df_loc[['Data', 'LocalMax']]
df_loc.columns = ['Data', 'Label']
df_loc.head()

Unnamed: 0,Data,Label
0,2010-01-07,1
1,2010-01-08,0
2,2010-01-11,0
3,2010-01-19,0
4,2010-01-20,1


In [25]:
df_owm['Data'] = pd.to_datetime(df_owm['Data_Hora'].dt.strftime('%Y-%m-%d'), yearfirst=True)
df = df_owm.merge(df_loc, on='Data', how='left')
df = df.fillna(0)

In [26]:
df_g = df.groupby('Data').sum().reset_index()[['Data', 'rain_1h']]
df_g.columns = ['Data', 'rain_sum']
df = df.merge(df_g, on='Data')

In [27]:
df['Mes'] = df['Data_Hora'].dt.month
df['Dia'] = df['Data_Hora'].dt.day
# df = df.drop(columns = ['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'weather_icon', 'weather_id', 'weather_main',
#                         'Vitoria', 'Erasmo', 'Paraiso', 'RM', 'Null', 'Camilopolis'])
df = df.drop(columns = ['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'weather_icon', 'weather_id', 'weather_main'])
# df = df.drop(columns = ['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'weather_icon', 'weather_id', 'weather_main',
#                         'Data_Hora', 'Data'])
# df['weather_description'] = df['weather_description'].rank(method='dense', ascending=False).astype(int)

In [28]:
cols_dummies = ['Mes', 'weather_description', 'Dia']

df_ohe = df.copy()

for c in cols_dummies:
    df_ohe = pd.concat([df_ohe, pd.get_dummies(df_ohe[c], prefix=c)], axis=1)
    
df_ohe = df_ohe.sort_values(['Data'])

df_ohe['Label_Old'] = df_ohe['Label']
df_ohe['Label'] = df_ohe['Label'].shift(-1*6, fill_value = 0)

In [29]:
df_ohe.columns

Index(['temp', 'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity',
       'wind_speed', 'wind_deg', 'rain_1h', 'clouds_all',
       'weather_description', 'Data_Hora', 'Data', 'Label', 'rain_sum', 'Mes',
       'Dia', 'Mes_1', 'Mes_2', 'Mes_3', 'Mes_4', 'Mes_5', 'Mes_6', 'Mes_7',
       'Mes_8', 'Mes_9', 'Mes_10', 'Mes_11', 'Mes_12',
       'weather_description_broken clouds', 'weather_description_drizzle',
       'weather_description_few clouds', 'weather_description_fog',
       'weather_description_haze',
       'weather_description_heavy intensity drizzle',
       'weather_description_heavy intensity rain',
       'weather_description_heavy intesity shower rain',
       'weather_description_light intensity drizzle',
       'weather_description_light intensity drizzle rain',
       'weather_description_light intensity shower rain',
       'weather_description_light rain', 'weather_description_mist',
       'weather_description_moderate rain',
       'weather_description_ov

In [30]:
test_cases = [
    [],
    ['feels_like'],
    ['feels_like', 'temp_min', 'temp_max'],
    ['feels_like', 'temp_min', 'temp_max', 'pressure'],
    ['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity'],
    ['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg'],
    ['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all'],
    ['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all'] + [c for c in df_ohe.columns if 'weather_description' in c],
    ['temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all'] + [c for c in df_ohe.columns if 'weather_description' in c],
]

df_training_result_owm = pd.DataFrame(columns = ['Removed_Cols', 'Features', 'Train_Acc', 'Test_Acc', 'Precision', 'Recall', 'F1', 'Ver_Pos'])
label = 'Label'

for case in test_cases:
    print(f'---------- CASE ----------')
    print(case)
    print(f'--------------------------')
    
    df_train = df_ohe.copy()

    cols_rem = ['Label', 'Label_Old', 'Data', 'Data_Hora'] + cols_dummies
    cols_rem = cols_rem + case

    model, training_res, y_treino_pred, y_teste_pred = trainXGB(df_train, cols_rem, label)

    df_training_result_owm = df_training_result_owm.append(
        {**{'Removed_Cols': case}, **training_res},
        ignore_index=True
    )

---------- CASE ----------
[]
--------------------------
Treino: 0.9929923158314686
Teste: 0.9678346456692913
Precisão: 0.4446397188049209
Recall: 0.7322720694645442
F1: 0.5533078184800437


array([[0.97442228, 0.02557772],
       [0.26772793, 0.73227207]])

array([[24077,   632],
       [  185,   506]], dtype=int64)

---------- CASE ----------
['feels_like']
--------------------------
Treino: 0.9924892889975889
Teste: 0.9660629921259842
Precisão: 0.42493415276558383
Recall: 0.7004341534008683
F1: 0.5289617486338798


array([[0.97349144, 0.02650856],
       [0.29956585, 0.70043415]])

array([[24054,   655],
       [  207,   484]], dtype=int64)

---------- CASE ----------
['feels_like', 'temp_min', 'temp_max']
--------------------------
Treino: 0.9928275311789908
Teste: 0.9682283464566929
Precisão: 0.44876325088339225
Recall: 0.7351664254703328
F1: 0.5573230938014263


array([[0.97474604, 0.02525396],
       [0.26483357, 0.73516643]])

array([[24085,   624],
       [  183,   508]], dtype=int64)

---------- CASE ----------
['feels_like', 'temp_min', 'temp_max', 'pressure']
--------------------------
Treino: 0.9931744462368389
Teste: 0.9705511811023622
Precisão: 0.473972602739726
Recall: 0.7510853835021708
F1: 0.5811870100783875


array([[0.97668866, 0.02331134],
       [0.24891462, 0.75108538]])

array([[24133,   576],
       [  172,   519]], dtype=int64)

---------- CASE ----------
['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity']
--------------------------
Treino: 0.993477996912456
Teste: 0.9711811023622047
Precisão: 0.4806054872280038
Recall: 0.7351664254703328
F1: 0.5812356979405034


array([[0.97778138, 0.02221862],
       [0.26483357, 0.73516643]])

array([[24160,   549],
       [  183,   508]], dtype=int64)

---------- CASE ----------
['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg']
--------------------------
Treino: 0.992619382144282
Teste: 0.9719291338582677
Precisão: 0.48996350364963503
Recall: 0.7771345875542692
F1: 0.6010072747621713


array([[0.97737666, 0.02262334],
       [0.22286541, 0.77713459]])

array([[24150,   559],
       [  154,   537]], dtype=int64)

---------- CASE ----------
['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all']
--------------------------
Treino: 0.9926714194029592
Teste: 0.9721653543307086
Precisão: 0.49280575539568344
Recall: 0.7930535455861071
F1: 0.6078757626178591


array([[0.97717431, 0.02282569],
       [0.20694645, 0.79305355]])

array([[24145,   564],
       [  143,   548]], dtype=int64)

---------- CASE ----------
['feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_description', 'weather_description_broken clouds', 'weather_description_drizzle', 'weather_description_few clouds', 'weather_description_fog', 'weather_description_haze', 'weather_description_heavy intensity drizzle', 'weather_description_heavy intensity rain', 'weather_description_heavy intesity shower rain', 'weather_description_light intensity drizzle', 'weather_description_light intensity drizzle rain', 'weather_description_light intensity shower rain', 'weather_description_light rain', 'weather_description_mist', 'weather_description_moderate rain', 'weather_description_overcast clouds', 'weather_description_proximity shower rain', 'weather_description_proximity thunderstorm', 'weather_description_rain and drizzle', 'weather_description_scattered clouds', 'weather_description_shower rain', 'weather_description_sky is clear', 'weather_descriptio

array([[0.97551499, 0.02448501],
       [0.17800289, 0.82199711]])

array([[24104,   605],
       [  123,   568]], dtype=int64)

---------- CASE ----------
['temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_description', 'weather_description_broken clouds', 'weather_description_drizzle', 'weather_description_few clouds', 'weather_description_fog', 'weather_description_haze', 'weather_description_heavy intensity drizzle', 'weather_description_heavy intensity rain', 'weather_description_heavy intesity shower rain', 'weather_description_light intensity drizzle', 'weather_description_light intensity drizzle rain', 'weather_description_light intensity shower rain', 'weather_description_light rain', 'weather_description_mist', 'weather_description_moderate rain', 'weather_description_overcast clouds', 'weather_description_proximity shower rain', 'weather_description_proximity thunderstorm', 'weather_description_rain and drizzle', 'weather_description_scattered clouds', 'weather_description_shower rain', 'weather_description_sky is clear', 'weather_description_thunderstorm

array([[0.97770043, 0.02229957],
       [0.21997106, 0.78002894]])

array([[24158,   551],
       [  152,   539]], dtype=int64)

In [31]:
df_training_result_owm

Unnamed: 0,Removed_Cols,Features,Train_Acc,Test_Acc,Precision,Recall,F1,Ver_Pos
0,[],"[temp, feels_like, temp_min, temp_max, pressur...",0.992992,0.967835,0.44464,0.732272,0.553308,0.732272
1,[feels_like],"[temp, temp_min, temp_max, pressure, humidity,...",0.992489,0.966063,0.424934,0.700434,0.528962,0.700434
2,"[feels_like, temp_min, temp_max]","[temp, pressure, humidity, wind_speed, wind_de...",0.992828,0.968228,0.448763,0.735166,0.557323,0.735166
3,"[feels_like, temp_min, temp_max, pressure]","[temp, humidity, wind_speed, wind_deg, rain_1h...",0.993174,0.970551,0.473973,0.751085,0.581187,0.751085
4,"[feels_like, temp_min, temp_max, pressure, hum...","[temp, wind_speed, wind_deg, rain_1h, clouds_a...",0.993478,0.971181,0.480605,0.735166,0.581236,0.735166
5,"[feels_like, temp_min, temp_max, pressure, hum...","[temp, rain_1h, clouds_all, rain_sum, Mes_1, M...",0.992619,0.971929,0.489964,0.777135,0.601007,0.777135
6,"[feels_like, temp_min, temp_max, pressure, hum...","[temp, rain_1h, rain_sum, Mes_1, Mes_2, Mes_3,...",0.992671,0.972165,0.492806,0.793054,0.607876,0.793054
7,"[feels_like, temp_min, temp_max, pressure, hum...","[temp, rain_1h, rain_sum, Mes_1, Mes_2, Mes_3,...",0.991145,0.971339,0.484228,0.821997,0.609442,0.821997
8,"[temp_min, temp_max, pressure, humidity, wind_...","[temp, feels_like, rain_1h, rain_sum, Mes_1, M...",0.992862,0.972323,0.494495,0.780029,0.605278,0.780029


# Teste "real"

In [32]:
df_best_local = df_best_local.reset_index(drop=True)
df_best_local

Unnamed: 0,Removed_Cols,Local,Features,Train_Acc,Test_Acc,Precision,Recall,F1,Ver_Pos,Model
0,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",0,"[TemperaturaDoAr, RadiacaoSolar, Precipitacao,...",0.996536,0.990019,0.667853,0.920059,0.773927,0.920059,<xgboost.core.Booster object at 0x0000015C4D6F...
1,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",1,"[TemperaturaDoAr, RadiacaoSolar, Precipitacao,...",0.998361,0.994582,0.788009,0.948454,0.860819,0.948454,<xgboost.core.Booster object at 0x0000015C4D6F...
2,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",2,"[PressaoAtmosferica, TemperaturaDoAr, Radiacao...",0.996801,0.993216,0.733075,0.971795,0.835722,0.971795,<xgboost.core.Booster object at 0x0000015C4D6F...
3,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",3,"[TemperaturaDoAr, Precipitacao, Dia, PrecSum, ...",0.998119,0.995173,0.823762,0.960739,0.886994,0.960739,<xgboost.core.Booster object at 0x0000015C4D6F...
4,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",4,"[PressaoAtmosferica, TemperaturaDoAr, Radiacao...",0.99661,0.986932,0.614068,0.793612,0.69239,0.793612,<xgboost.core.Booster object at 0x0000015C4D6F...
5,"[DirecaoDoVento, VelocidadeDoVento, Temperatur...",5,"[UmidadeRelativa, PressaoAtmosferica, Temperat...",0.996588,0.988526,0.659176,0.834123,0.736402,0.834123,<xgboost.core.Booster object at 0x0000015C4D6F...


In [33]:
df_m = df_f_ohe[(df_f_ohe['Label'] == 1) | (df_f_ohe['Label_Old'] == 1)].copy()
df_m['Data'] = df_m['Data_Hora'].dt.strftime('%Y-%m-%d')

In [34]:
def getPrecMomento(row):
    prec_momento = df_m.loc[(df_m['Data_Hora'] <= row['Data_Hora']) & (df_m['Local'] == row['Local']) & (df_m['Data'] == row['Data']), 'Precipitacao'].sum()
    return prec_momento

df_m['PrecMomento'] = df_m.apply(getPrecMomento, axis=1)

df_m = df_m.rename(columns = {'PrecSum': 'PrecSumOld', 'PrecMomento': 'PrecSum'})

In [35]:
df_m_2 = df_m.copy()
df_m_2['Label_Pred'] = 0

for l in range(6):
    label_pred = df_best_local.loc[l,'Model'].predict(xgboost.DMatrix(data=df_m_2.loc[df_m_2['Local'] == l, df_best_local.loc[l, 'Features']]))
    df_m_2.loc[df_m_2['Local'] == l, 'Label_Pred'] = [1 if i>0.5 else 0 for i in label_pred]

In [36]:
print(df_m_2[df_m_2['Label_Pred'] == 1].shape)
print(df_m_2.shape)

(2834, 36)
(9848, 36)


In [37]:
df_m_2[['Local', 'Data_Hora', 'Precipitacao', 'PrecSum', 'PrecSumOld', 'Label', 'Label_Pred']].sort_values(by=['Local', 'Data_Hora']
).to_csv('../../../data/analysis/labels_prediction_shift.csv', index=False, sep=';', decimal=',')

In [38]:
pd.set_option("display.max_rows", 200)
df_f_ohe[(df_f_ohe['Data_Hora'] >= datetime(2018,12,23)) & (df_f_ohe['Data_Hora'] <= datetime(2018,12,30)) & (df_f_ohe['Local'] == 4)][['Data_Hora', 'Precipitacao', 'PrecSum']]

Unnamed: 0,Data_Hora,Precipitacao,PrecSum
343418,2018-12-23 00:00:00,0.0,78.4
343419,2018-12-23 01:00:00,0.0,78.4
343420,2018-12-23 02:00:00,0.0,78.4
343421,2018-12-23 03:00:00,1.0,78.4
343422,2018-12-23 04:00:00,0.6,78.4
343423,2018-12-23 05:00:00,0.0,78.4
343424,2018-12-23 06:00:00,0.0,78.4
343425,2018-12-23 07:00:00,0.0,78.4
343426,2018-12-23 08:00:00,0.0,78.4
343427,2018-12-23 09:00:00,0.0,78.4


In [39]:
df_f_ohe['Local']

48        1
49        1
50        1
51        1
52        1
         ..
365972    5
365973    5
365974    5
365975    5
365976    5
Name: Local, Length: 366025, dtype: int64