# 0 - Inicialização

In [152]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

from plotly import graph_objects as go
import plotly as py

from datetime import datetime
from datetime import timedelta

import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, accuracy_score, f1_score, confusion_matrix, recall_score, precision_score

from sklearn.utils import resample

# 1 - Classificar o dataset clusterizado por dia

In [153]:
# Load the clustered 15-minute station readings, parse the timestamp column and
# derive a 'YYYY-MM-DD' string key ('Data') used for day-level grouping and merges.
df_cluster = pd.read_csv(
    '../../../data/cleandata/Info pluviometricas/Merged Data/clustered_label.csv',
    sep=';',
).assign(
    Data_Hora=lambda d: pd.to_datetime(d['Data_Hora'], yearfirst=True),
    Data=lambda d: d['Data_Hora'].dt.strftime('%Y-%m-%d'),
)
df_cluster

Unnamed: 0,Data_Hora,Local,UmidadeRelativa,PressaoAtmosferica,TemperaturaDoAr,TemperaturaInterna,PontoDeOrvalho,RadiacaoSolar,DirecaoDoVento,VelocidadeDoVento,Precipitacao,LocalMax_Dia,LocalMax_Hora,Data
0,2011-01-01 00:00:00,Erasmo,88.8,920.5,10.243265,22.9,17.700000,0.000000,133.0,0.7,0.0,0,0,2011-01-01
1,2011-01-01 00:00:00,Camilopolis,84.8,920.6,20.200000,22.8,17.500000,0.000000,137.0,1.5,0.0,0,0,2011-01-01
2,2011-01-01 00:00:00,Paraiso,92.3,919.1,19.300000,22.7,18.000000,0.000000,101.0,2.6,0.0,0,0,2011-01-01
3,2011-01-01 00:00:00,Vitoria,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0,0,2011-01-01
4,2011-01-01 00:00:00,RM,86.6,926.7,20.100000,23.3,17.800000,0.000000,131.0,2.5,0.0,0,0,2011-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463565,2019-06-30 23:45:00,Camilopolis,54.4,924.0,20.800000,23.0,12.200000,0.073893,332.0,4.3,0.0,0,0,2019-06-30
1463566,2019-06-30 23:45:00,RM,66.9,931.9,18.800000,22.5,13.300000,0.294461,331.0,0.7,0.0,0,0,2019-06-30
1463567,2019-06-30 23:45:00,Vitoria,38.4,925.8,20.300000,27.5,6.300000,0.000000,0.0,0.0,0.0,0,0,2019-06-30
1463568,2019-06-30 23:45:00,Paraiso,55.3,924.2,20.800000,22.1,11.790525,0.000000,25.0,3.6,0.0,0,0,2019-06-30


In [154]:
# Collapse the 15-minute readings of all stations into one row per day.
df_leituras_dia = df_cluster.drop(columns=['Data_Hora', 'LocalMax_Hora']).rename(columns={'LocalMax_Dia': 'LocalMax'})

# Daily rainfall: total per station first, then averaged across the stations.
# Selecting the column before aggregating keeps the string column 'Local' out of
# mean(), which raises TypeError on pandas >= 2.0.
s_prec_p = (
    df_leituras_dia.groupby(['Data', 'Local'])['Precipitacao'].sum()
    .groupby(level='Data').mean()
    .reset_index()
)

# Daily label: maximum of LocalMax over every reading of the day (the max of the
# per-station maxima is the same value).
s_prec_o = df_leituras_dia.groupby('Data')['LocalMax'].max().reset_index()

# Every other numeric feature is the plain daily mean over all readings.
df_cluster_dia = df_leituras_dia.groupby('Data').mean(numeric_only=True).reset_index()

# All three frames come from a groupby on 'Data' (sorted keys) followed by
# reset_index(), so their positional indexes line up row by row.
df_cluster_dia['Precipitacao'] = s_prec_p['Precipitacao']
df_cluster_dia['LocalMax'] = s_prec_o['LocalMax']
df_cluster_dia

Unnamed: 0,Data,UmidadeRelativa,PressaoAtmosferica,TemperaturaDoAr,TemperaturaInterna,PontoDeOrvalho,RadiacaoSolar,DirecaoDoVento,VelocidadeDoVento,Precipitacao,LocalMax
0,2011-01-01,65.679216,735.558902,16.414147,19.501318,13.894332,116.433333,114.439583,2.144792,1.52,0
1,2011-01-02,76.017708,734.376875,15.638204,18.355709,14.988855,40.879167,162.916667,1.103542,139.12,0
2,2011-01-03,73.900711,733.959619,16.060072,18.713147,15.064426,59.225559,102.732787,1.328760,68.92,0
3,2011-01-04,73.177223,734.321458,17.005510,19.832342,15.797179,82.889583,85.015766,1.597480,62.36,0
4,2011-01-05,64.092344,736.860417,18.614403,21.592730,15.512999,161.429167,150.189583,2.222917,11.96,0
...,...,...,...,...,...,...,...,...,...,...,...
3065,2019-06-26,58.462548,924.237189,20.782350,24.296560,11.613997,86.694988,154.352083,8.483779,0.00,1
3066,2019-06-27,78.492677,928.165211,19.225816,23.262681,15.223413,58.906011,125.716667,4.999583,0.32,0
3067,2019-06-28,66.947345,928.851826,21.216116,24.902792,14.681483,82.298725,89.068750,7.688742,0.00,0
3068,2019-06-29,54.981241,927.484876,21.871250,25.403125,11.656711,92.199144,143.372917,6.376733,0.00,0


In [155]:
# Daily experiment: predict the day-level LocalMax label from the daily aggregates.

# Not used below (training goes through the low-level xgboost.train API); kept so
# the name stays defined for other cells.
xgb = xgboost.XGBClassifier()

# Label and date key are excluded from the feature matrix.
cols_rem = ['LocalMax', 'Data']
cols_x = [c for c in df_cluster_dia.columns if c not in cols_rem]

x = df_cluster_dia[cols_x]
y = df_cluster_dia['LocalMax']

# NOTE(review): the split is not stratified although positives are rare (16 in the
# test fold of the recorded run); consider stratify=y.
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state=378)

# Re-attach the label so both classes can be resampled as whole rows.
X = pd.concat([x_treino, y_treino], axis=1)

# Majority (0) and minority (1) rows of the training fold only; the test fold is
# left untouched.
not_ordem = X[X['LocalMax'] == 0].copy()
ordem = X[X['LocalMax'] == 1].copy()

# Upsample the minority class with replacement until it matches the majority.
ordem_upsampled = resample(ordem,
                           replace=True,
                           n_samples=len(not_ordem),
                           random_state=378)

upsampled = pd.concat([not_ordem, ordem_upsampled])

x_treino = upsampled[cols_x]
y_treino = upsampled['LocalMax']

display(y_treino.value_counts())

# NOTE(review): eta=2 is above the [0, 1] range documented for the learning rate;
# left unchanged so the recorded metrics remain reproducible.
param = {'max_depth':10, 'eta':2, 'objective':'binary:logistic', 'min_child_weight': 1, 'lambda': 1, 'alpha': 0, 'gamma': 0}

df_train = xgboost.DMatrix(data=x_treino, label=y_treino)
dm_teste = xgboost.DMatrix(data=x_teste)

# Two boosting rounds. The former feval=f1_score argument was dropped: XGBoost calls
# a custom metric as f(preds, dmatrix) -> (name, value), which f1_score is not, it
# was never invoked because no evals are passed, and the parameter is deprecated.
bst = xgboost.train(param, df_train, 2)

# binary:logistic yields probabilities; threshold at 0.5 to obtain class labels.
y_teste_pred = (bst.predict(dm_teste) > 0.5).astype(int).tolist()
y_treino_pred = (bst.predict(df_train) > 0.5).astype(int).tolist()

print(f"Treino: {accuracy_score(y_treino, y_treino_pred)}")
print(f"Teste: {accuracy_score(y_teste, y_teste_pred)}")
print(f"Precisão: {precision_score(y_teste, y_teste_pred)}")
print(f"Recall: {recall_score(y_teste, y_teste_pred)}")
print(f"F1: {f1_score(y_teste, y_teste_pred)}")
display(confusion_matrix(y_teste, y_teste_pred, normalize='true'))
display(confusion_matrix(y_teste, y_teste_pred,))

1    2082
0    2082
Name: LocalMax, dtype: int64

Treino: 0.9704610951008645
Teste: 0.9554831704668838
Precisão: 0.06896551724137931
Recall: 0.125
F1: 0.08888888888888889


array([[0.97016575, 0.02983425],
       [0.875     , 0.125     ]])

array([[878,  27],
       [ 14,   2]], dtype=int64)

# 2 - Classificar o dataset clusterizado por 15 mins

In [156]:
# Day-level labels from the service-order file, attached to every 15-minute reading
# of the same day through the 'Data' key.
df_label_d = pd.read_csv('../../../data/cleandata/Ordens de serviço/labels_day.csv', sep=';')
df_cluster_hora = pd.merge(
    df_cluster,
    df_label_d.loc[:, ['Data', 'LocalMax']],
    how='left',
    on='Data',
)
# Days missing from the label file get LocalMax = 0; note that fillna(0) is applied
# to every column of the merged frame, not only to the label.
df_cluster_hora = df_cluster_hora.fillna(0)
# The labels that came with the clustered file are replaced by the merged one.
df_cluster_hora = df_cluster_hora.drop(['LocalMax_Hora', 'LocalMax_Dia'], axis=1)

In [157]:
# Order the readings chronologically, breaking ties by station.
# (A disabled earlier variant took the label from df_cluster's own LocalMax_Dia
# column instead of the merged day labels.)
df_cluster_hora = df_cluster_hora.sort_values(['Data_Hora', 'Local'])

In [158]:
# Class balance: non-null row count of each column per LocalMax value.
df_cluster_hora.groupby(by='LocalMax').count()

Unnamed: 0_level_0,Data_Hora,Local,UmidadeRelativa,PressaoAtmosferica,TemperaturaDoAr,TemperaturaInterna,PontoDeOrvalho,RadiacaoSolar,DirecaoDoVento,VelocidadeDoVento,Precipitacao,Data
LocalMax,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0.0,1418705,1418705,1418705,1418705,1418705,1418705,1418705,1418705,1418705,1418705,1418705,1418705
1.0,44865,44865,44865,44865,44865,44865,44865,44865,44865,44865,44865,44865


In [159]:
# Total rainfall per (day, station), broadcast back onto every 15-minute row as
# 'PrecSum'. Only 'Precipitacao' is aggregated: summing the whole frame would also
# try to sum the datetime column 'Data_Hora', which fails on pandas >= 2.0.
df_prec_sum = (
    df_cluster_hora.groupby(['Data', 'Local'], as_index=False)['Precipitacao'].sum()
    .rename(columns={'Precipitacao': 'PrecSum'})
)

# Drop a PrecSum left over from a previous run of this cell; otherwise the merge
# would produce PrecSum_x / PrecSum_y instead of a single column.
df_cluster_hora = (
    df_cluster_hora.drop(columns='PrecSum', errors='ignore')
    .merge(df_prec_sum, on=['Data', 'Local'])
)
df_cluster_hora.head(10)

Unnamed: 0,Data_Hora,Local,UmidadeRelativa,PressaoAtmosferica,TemperaturaDoAr,TemperaturaInterna,PontoDeOrvalho,RadiacaoSolar,DirecaoDoVento,VelocidadeDoVento,Precipitacao,Data,LocalMax,PrecSum
0,2011-01-01 00:00:00,Camilopolis,84.8,920.6,20.2,22.8,17.5,0.0,137.0,1.5,0.0,2011-01-01,0.0,0.6
1,2011-01-01 00:15:00,Camilopolis,84.1,920.3,20.1,22.8,17.4,0.0,76.0,1.9,0.0,2011-01-01,0.0,0.6
2,2011-01-01 00:30:00,Camilopolis,83.9,920.2,20.1,22.9,17.2,0.0,92.0,2.9,0.0,2011-01-01,0.0,0.6
3,2011-01-01 00:45:00,Camilopolis,83.5,919.9,20.2,22.9,17.3,0.0,133.0,1.4,0.0,2011-01-01,0.0,0.6
4,2011-01-01 01:00:00,Camilopolis,84.0,919.9,20.1,22.8,17.3,0.0,151.0,1.8,0.0,2011-01-01,0.0,0.6
5,2011-01-01 01:15:00,Camilopolis,84.5,919.5,20.0,23.143578,17.3,0.0,124.0,1.8,0.0,2011-01-01,0.0,0.6
6,2011-01-01 01:30:00,Camilopolis,84.6,919.4,20.0,23.083792,17.3,0.0,137.0,1.8,0.0,2011-01-01,0.0,0.6
7,2011-01-01 01:45:00,Camilopolis,84.5,919.4,20.0,23.005875,17.3,0.0,115.0,1.9,0.0,2011-01-01,0.0,0.6
8,2011-01-01 02:00:00,Camilopolis,85.8,919.2,19.8,22.935631,17.4,0.0,155.0,1.8,0.0,2011-01-01,0.0,0.6
9,2011-01-01 02:15:00,Camilopolis,86.5,919.0,19.7,22.786449,17.4,0.0,140.0,1.7,0.0,2011-01-01,0.0,0.6


In [160]:
# df_clustered_total['Hora'] = pd.to_datetime(df_clustered_total['Data_Hora'], yearfirst=True).dt.hour

# df_ohe = df_clustered_total.groupby(['Data', 'Local', 'Hora']).sum().reset_index()[['Data', 'Local', 'Hora', 'Precipitacao']]
# s_ohe = df_ohe['Hora']
# df_ohe = pd.get_dummies(df_ohe, columns = ['Hora'])
# df_ohe['Hora'] = s_ohe

# for i in range(24):
#     df_ohe.loc[df_ohe['Hora_' + str(i)] == 1, 'Hora_' + str(i)] = df_ohe.loc[df_ohe['Hora_' + str(i)] == 1, 'Precipitacao']

# df_clustered_total = df_clustered_total.merge(df_ohe[['Data', 'Local'] + [c for c in df_ohe.columns if 'Hora' in c]], on=['Data', 'Local', 'Hora'])

In [161]:
# Snapshot of the frame before 'Hora' is added and 'Local' is integer-encoded.
df_cluster_hora_a = df_cluster_hora.copy(deep=True)

In [162]:
# Add the hour of day and replace station names by integer codes 1..5.
# (Disabled earlier variants in this cell kept only the on-the-hour rows, i.e.
# minute == 0, and experimented with hourly precipitation sums.)
codigo_local = {'Camilopolis': 1, 'Erasmo': 2, 'Paraiso': 3, 'RM': 4, 'Vitoria': 5}
df_cluster_hora = df_cluster_hora.assign(
    Hora=df_cluster_hora['Data_Hora'].dt.hour,
    Local=df_cluster_hora['Local'].replace(codigo_local),
)

In [163]:
# Working copy used for modelling. (Disabled earlier variants filtered rows by
# cluster, by PrecSum > 10, by hourly precipitation, or by a single station.)
df_slice = df_cluster_hora.copy()

# A positive label is kept only when the station accumulated more than 10 units of
# rain on that day; the remaining positives are reset to 0.
positivo_sem_chuva = df_slice['LocalMax'].eq(1) & df_slice['PrecSum'].le(10)
df_slice['LocalMax'] = df_slice['LocalMax'].mask(positivo_sem_chuva, 0)

In [164]:
# Class balance after relabelling: non-null row count per LocalMax value.
df_slice.groupby(by='LocalMax').count()

Unnamed: 0_level_0,Data_Hora,Local,UmidadeRelativa,PressaoAtmosferica,TemperaturaDoAr,TemperaturaInterna,PontoDeOrvalho,RadiacaoSolar,DirecaoDoVento,VelocidadeDoVento,Precipitacao,Data,PrecSum,Hora
LocalMax,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.0,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796,1435796
1.0,27774,27774,27774,27774,27774,27774,27774,27774,27774,27774,27774,27774,27774,27774


In [165]:
# (rows, columns) of the modelling frame; relabelling above changes no row count.
df_slice.shape

(1463570, 15)

In [166]:
# 15-minute experiment: one model on all stations (l == 0) and one per station
# code (l == 1..5), each trained on a class-balanced (upsampled) training fold.
#
# NOTE(review): LocalMax is a day-level label and PrecSum is constant for a given
# (day, station), yet the split below is made row by row. Readings of the same day
# therefore land in both the training and the test fold, so the test metrics are
# probably optimistic; a split grouped by 'Data' would avoid that.

# Not used below (training goes through the low-level xgboost.train API); kept so
# the name stays defined for other cells.
xgb = xgboost.XGBClassifier()

# Columns never used as features. 'Cluster', 'Ordens' and 'Hora_*' do not exist in
# df_slice; they are listed defensively.
cols_rem = ['LocalMax', 'Cluster', 'Data', 'Hora', 'Data_Hora', 'Ordens'] + [c for c in df_slice.columns if 'Hora_' in c]
cols_x = [c for c in df_slice.columns if c not in cols_rem]

param = {'max_depth':50, 'eta':1, 'objective':'binary:logistic', 'min_child_weight': 1, 'lambda': 1, 'alpha': 0, 'gamma': 0}

for l in range(6):
    # The frame is only read from, so no defensive copy is needed for l == 0.
    df_local = df_slice if l == 0 else df_slice[df_slice['Local'] == l]

    print(f'----- LOCAL {l} -----')

    x = df_local[cols_x]
    y = df_local['LocalMax']

    x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state = 378, stratify=y)

    # Re-attach the label so both classes can be resampled as whole rows.
    X = pd.concat([x_treino, y_treino], axis=1)

    # Majority (0) and minority (1) rows of the training fold only.
    not_ordem = X[X['LocalMax']==0].copy()
    ordem = X[X['LocalMax']==1].copy()

    # Upsample the minority class with replacement until it matches the majority.
    ordem_upsampled = resample(ordem,
                               replace=True,
                               n_samples=len(not_ordem),
                               random_state=378)

    upsampled = pd.concat([not_ordem, ordem_upsampled])

    x_treino = upsampled[cols_x]
    y_treino = upsampled['LocalMax']

    display(y_treino.value_counts())

    df_train = xgboost.DMatrix(data=x_treino, label=y_treino)
    df_test = xgboost.DMatrix(data=x_teste, label=y_teste)

    # Two boosting rounds. The former feval=f1_score argument was dropped: XGBoost
    # calls a custom metric as f(preds, dmatrix) -> (name, value), which f1_score is
    # not, it was never invoked because no evals are passed, and it is deprecated.
    bst = xgboost.train(param, df_train, 2)

    # binary:logistic yields probabilities; threshold at 0.5 to obtain class labels.
    y_teste_pred = (bst.predict(df_test) > 0.5).astype(int).tolist()
    y_treino_pred = (bst.predict(df_train) > 0.5).astype(int).tolist()

    print(f"Treino: {accuracy_score(y_treino, y_treino_pred)}")
    print(f"Teste: {accuracy_score(y_teste, y_teste_pred)}")
    print(f"Precisão: {precision_score(y_teste, y_teste_pred)}")
    print(f"Recall: {recall_score(y_teste, y_teste_pred)}")
    print(f"F1: {f1_score(y_teste, y_teste_pred)}")
    display(confusion_matrix(y_teste, y_teste_pred, normalize='true'))
    display(confusion_matrix(y_teste, y_teste_pred,))

----- LOCAL 0 -----


1.0    1005057
0.0    1005057
Name: LocalMax, dtype: int64

Treino: 0.9992393466241218
Teste: 0.9961600743387744
Precisão: 0.8711190529372348
Recall: 0.9361497839654345
F1: 0.9024644220756682


array([[0.99732088, 0.00267912],
       [0.06385022, 0.93614978]])

array([[429585,   1154],
       [   532,   7800]], dtype=int64)

----- LOCAL 1 -----


1.0    200542
0.0    200542
Name: LocalMax, dtype: int64

Treino: 0.998873054023596
Teste: 0.994602288902807
Precisão: 0.84033203125
Recall: 0.921306209850107
F1: 0.8789581205311542


array([[0.99619533, 0.00380467],
       [0.07869379, 0.92130621]])

array([[85620,   327],
       [  147,  1721]], dtype=int64)

----- LOCAL 2 -----


1.0    201208
0.0    201208
Name: LocalMax, dtype: int64

Treino: 0.9994905769154308
Teste: 0.9968342538290725
Precisão: 0.8871733966745843
Recall: 0.9443742098609356
F1: 0.9148805878750765


array([[0.99779667, 0.00220333],
       [0.05562579, 0.94437421]])

array([[86043,   190],
       [   88,  1494]], dtype=int64)

----- LOCAL 3 -----


1.0    201148
0.0    201148
Name: LocalMax, dtype: int64

Treino: 0.9996321116789627
Teste: 0.9979046859875875
Precisão: 0.9289156626506024
Recall: 0.9589552238805971
F1: 0.9436964504283966


array([[0.9986312 , 0.0013688 ],
       [0.04104478, 0.95895522]])

array([[86089,   118],
       [   66,  1542]], dtype=int64)

----- LOCAL 4 -----


1.0    200878
0.0    200878
Name: LocalMax, dtype: int64

Treino: 0.999350351954918
Teste: 0.9967317656436827
Precisão: 0.9047887323943662
Recall: 0.931554524361949
F1: 0.9179765647327808


array([[0.99803696, 0.00196304],
       [0.06844548, 0.93155452]])

array([[85922,   169],
       [  118,  1606]], dtype=int64)

----- LOCAL 5 -----


1.0    201279
0.0    201279
Name: LocalMax, dtype: int64

Treino: 0.9995801847187238
Teste: 0.9984854523714627
Precisão: 0.9376927822331894
Recall: 0.979381443298969
F1: 0.9580838323353293


array([[0.99882916, 0.00117084],
       [0.02061856, 0.97938144]])

array([[86162,   101],
       [   32,  1520]], dtype=int64)