# Inicialização

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

from plotly import graph_objects as go
import plotly as py

from datetime import datetime
from datetime import timedelta

import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, accuracy_score, f1_score, confusion_matrix, recall_score, precision_score

from sklearn.utils import resample

# Funções

In [3]:
def upsampleData(X, label):
    """Balance a binary-labelled frame by oversampling the positive class.

    Rows where ``X[label] == 1`` are drawn with replacement (fixed seed, so the
    result is reproducible) until there are as many of them as there are rows
    where ``X[label] == 0``. Rows whose label is neither 0 nor 1 are left out
    of the result.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns and the label column together.
    label : str
        Name of the binary label column in ``X``.

    Returns
    -------
    x : pandas.DataFrame
        Every column of the balanced frame except ``label``.
    y : pandas.Series
        The ``label`` column of the balanced frame.
    """
    # Split the two classes
    negatives = X[X[label] == 0].copy()
    positives = X[X[label] == 1].copy()

    # Oversample the positive class up to the size of the negative class
    positives_upsampled = resample(positives,
                                   replace=True,  # sample with replacement
                                   n_samples=len(negatives),  # match the majority class
                                   random_state=378)  # reproducible results
    balanced = pd.concat([negatives, positives_upsampled])

    # Exclude the label by exact name. A substring test would also discard
    # any feature whose name merely contains the label text (e.g. "Label_Old").
    feature_cols = [c for c in X.columns if c != label]
    x = balanced[feature_cols]
    y = balanced[label]

    return x, y

In [4]:
def trainXGB(df, cols_rem, label, verbose=True):
    """Train a binary XGBoost booster on ``df`` and evaluate it on a hold-out split.

    The data is split 70/30 (stratified on the label, fixed seed). Only the
    training part is balanced with ``upsampleData``; the test part keeps its
    original class distribution. Predictions above 0.5 count as class 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, containing the feature columns and ``label``.
    cols_rem : list of str
        Column names that must not be used as features.
    label : str
        Name of the binary target column.
    verbose : bool, default True
        When true, print the metrics and display both confusion matrices
        (row-normalised and raw counts). ``display`` is not imported in this
        file; presumably the IPython builtin available in a notebook session.

    Returns
    -------
    bst : xgboost.Booster
        The trained model.
    results : dict
        Keys ``Features``, ``Train_Acc``, ``Test_Acc``, ``Precision``,
        ``Recall``, ``F1`` and ``Ver_Pos`` (row-normalised true-positive rate).
        All metrics except ``Train_Acc`` are computed on the test split.
    y_treino_pred : list of int
        0/1 predictions for the (upsampled) training rows.
    y_teste_pred : list of int
        0/1 predictions for the test rows.
    """
    # Split x and y. The label is always excluded from the features, even if
    # the caller did not list it in cols_rem, so the target can never leak.
    x = df[[c for c in df.columns if c not in cols_rem and c != label]]
    y = df[label]

    # Train / test split
    x_treino, x_teste, y_treino, y_teste = train_test_split(
        x, y, test_size=0.3, random_state=378, stratify=y)

    # Upsample the training split only
    x_treino, y_treino = upsampleData(pd.concat([x_treino, y_treino], axis=1), label)

    # Booster parameters
    param = {'max_depth':50, 'eta':1, 'objective':'binary:logistic', 'min_child_weight': 1, 'lambda': 1, 'alpha': 0, 'gamma': 0}

    # Build each DMatrix once and reuse it for both training and prediction
    dm_treino = xgboost.DMatrix(data=x_treino, label=y_treino)
    dm_teste = xgboost.DMatrix(data=x_teste, label=y_teste)

    # Two boosting rounds. No custom metric is passed: sklearn's f1_score does
    # not have the (predictions, DMatrix) signature XGBoost expects, and with
    # no evaluation sets it was never invoked anyway.
    bst = xgboost.train(param, dm_treino, 2)
    y_teste_pred = [1 if p > 0.5 else 0 for p in bst.predict(dm_teste)]
    y_treino_pred = [1 if p > 0.5 else 0 for p in bst.predict(dm_treino)]

    # Compute every metric once; the printout below reuses these values
    cm_norm = confusion_matrix(y_teste, y_teste_pred, normalize='true')
    results = {
        'Features': list(x.columns),
        'Train_Acc': accuracy_score(y_treino, y_treino_pred),
        'Test_Acc': accuracy_score(y_teste, y_teste_pred),
        'Precision': precision_score(y_teste, y_teste_pred),
        'Recall': recall_score(y_teste, y_teste_pred),
        'F1': f1_score(y_teste, y_teste_pred),
        'Ver_Pos': cm_norm[1,1]
    }

    # Show the results when verbose is true
    if verbose:
        print(f"Treino: {results['Train_Acc']}")
        print(f"Teste: {results['Test_Acc']}")
        print(f"Precisão: {results['Precision']}")
        print(f"Recall: {results['Recall']}")
        print(f"F1: {results['F1']}")
        display(cm_norm)
        display(confusion_matrix(y_teste, y_teste_pred))

    return bst, results, y_treino_pred, y_teste_pred

# Carregar dados e gerar modelo

In [8]:
# Load the prepared dataset (semicolon-separated) and parse the timestamp column.
df = pd.read_csv('../../../data/cleandata/Info pluviometricas/Merged Data/prepped_data.csv', sep=';')
df['Data_Hora'] = pd.to_datetime(df['Data_Hora'], yearfirst=True)
# Non-null count of every column per label value, to show the class balance.
df.groupby('Label').count()

Unnamed: 0_level_0,Data_Hora,Mes,Dia,Local,Precipitacao,PrecSum
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,376037,376037,376037,376037,376037,376037
1.0,7368,7368,7368,7368,7368,7368


In [9]:
# Order rows by timestamp, then by location within each timestamp.
df = df.sort_values(['Data_Hora', 'Local'])

In [10]:
# Target column, and the columns trainXGB must not use as features:
# the fixed names below plus every column whose name contains 'Hora_'.
label = 'Label'
cols_rem = ['LocalMax', 'Label', 'Label_Old', 'Cluster', 'Data', 'Hora', 'Data_Hora', 'Ordens', 'Minuto'] + [c for c in df.columns if 'Hora_' in c]

In [11]:
# Train on the full dataset; verbose defaults to True, so metrics and confusion matrices are shown.
model, training_res, y_train_pred, y_test_pred = trainXGB(df, cols_rem, label)

Treino: 0.9998670339063539
Teste: 0.999626158474031
Precisão: 0.9813416259440249
Recall: 0.9995475113122172
F1: 0.9903609056265411


array([[9.99627699e-01, 3.72300819e-04],
       [4.52488688e-04, 9.99547511e-01]])

array([[112770,     42],
       [     1,   2209]], dtype=int64)

## Obter precipitação até aquele momento do dia

In [12]:
# Keep only the flagged rows (Label == 1) and derive a calendar-day key.
df_m = df[df['Label'] == 1].copy()
df_m['Data'] = df_m['Data_Hora'].dt.strftime("%Y-%m-%d")

def getPrecMomento(row):
    """Reference definition of the running daily precipitation for one row.

    Sums 'Precipitacao' over the rows of ``df_m`` that share ``row``'s 'Local'
    and 'Data' and whose 'Data_Hora' is not later than ``row``'s. It scans the
    whole frame on every call, so it is only used below to spot-check the
    vectorised computation.
    """
    prec_momento = df_m.loc[(df_m['Data_Hora'] <= row['Data_Hora']) & (df_m['Local'] == row['Local']) & (df_m['Data'] == row['Data']), 'Precipitacao'].sum()
    return prec_momento

# Vectorised equivalent of applying getPrecMomento to every row: a running sum
# per (Local, Data) in chronological order. Missing precipitation counts as 0,
# as it does in Series.sum().
df_ord = df_m.sort_values('Data_Hora', kind='mergesort')  # stable sort
prec_acum = df_ord['Precipitacao'].fillna(0).groupby([df_ord['Local'], df_ord['Data']]).cumsum()
# Rows sharing a timestamp must all include each other (the reference uses <=),
# so every tied row takes the running total of the last one.
prec_acum = prec_acum.groupby([df_ord['Local'], df_ord['Data'], df_ord['Data_Hora']]).transform('last')
# Rows with a missing key match nothing in the reference definition, hence 0.
# Alignment is by index; df_m inherits the unique default index of df.
df_m['PrecMomento'] = prec_acum.reindex(df_m.index).fillna(0)

# Spot-check the vectorised result against the reference on a few rows.
amostra = df_m.head(50)
if not amostra.empty:
    assert np.allclose(amostra['PrecMomento'], amostra.apply(getPrecMomento, axis=1)), \
        "PrecMomento diverges from the row-wise reference definition"

# From here on 'PrecSum' is the running total; the original value is kept as 'PrecSumOld'.
df_m = df_m.rename(columns = {'PrecSum': 'PrecSumOld', 'PrecMomento': 'PrecSum'})

## Prever com acúmulo do dia

In [15]:
# Score the flagged rows using the feature columns recorded at training time.
# 'PrecSum' in df_m now holds the running total up to each row (renamed in the cell above).
label_pred = model.predict(xgboost.DMatrix(data=df_m[training_res['Features']]))
# Same 0.5 cut-off used inside trainXGB.
df_m['Label_Pred'] = [1 if i>0.5 else 0 for i in label_pred]

In [16]:
# Inspect the predicted labels.
df_m['Label_Pred']

816       0
744       0
817       0
745       0
818       0
         ..
372743    1
372767    1
372791    1
372839    1
372815    1
Name: Label_Pred, Length: 7368, dtype: int64

In [17]:
# Column-wise maximum per (day, location). Comparing the maxima of 'Label' and
# 'Label_Pred' counts the groups where the two agree; the second print is the total.
df_g = df_m.groupby(['Data', 'Local']).max()
print(df_g[df_g['Label'] == df_g['Label_Pred']].shape)
print(df_g.shape)

(307, 8)
(307, 8)


In [18]:
# List the columns currently in df_m.
df_m.columns

Index(['Data_Hora', 'Mes', 'Dia', 'Local', 'Precipitacao', 'PrecSumOld',
       'Label', 'Data', 'PrecSum', 'Label_Pred'],
      dtype='object')

In [19]:
# Column-wise minimum per (day, location, predicted label); df_g is rebound here.
df_g = df_m.groupby(['Data', 'Local', 'Label_Pred']).min().reset_index()
# Keep only the predicted-positive groups: their minimum 'Data_Hora' is the
# earliest timestamp predicted as 1 for that day and location.

df_g = df_g.loc[df_g['Label_Pred'] == 1, ['Data', 'Local', 'Data_Hora']].rename(columns={'Data_Hora':'Min_Hora'})
# Reduce that timestamp to its hour of day.
df_g['Min_Hora'] = df_g['Min_Hora'].dt.hour

In [20]:
# Inspect the first predicted-positive hour per day and location.
df_g

Unnamed: 0,Data,Local,Min_Hora
1,2011-01-07,1,22
3,2011-01-07,2,18
5,2011-01-13,1,21
7,2011-01-13,2,20
9,2011-01-13,3,20
...,...,...,...
602,2019-07-04,1,18
604,2019-07-04,2,18
606,2019-07-04,3,18
608,2019-07-04,4,18


# Gerar Label nova

In [21]:
# Attach the first predicted-positive hour to every reading of the same day and location.
df_new = df.copy()
df_new['Data'] = df_new['Data_Hora'].dt.strftime('%Y-%m-%d')
# many_to_one: df_g must hold at most one row per (Local, Data), so the merge
# cannot silently duplicate readings.
df_new = df_new.merge(df_g, on=['Local', 'Data'], how='left', validate='many_to_one')
# 24 is a sentinel no hour of day can reach, so days without a predicted
# positive never satisfy "hour >= Min_Hora". Only Min_Hora is filled: a
# frame-wide fillna would overwrite genuine missing values in other columns with 24.
df_new['Min_Hora'] = df_new['Min_Hora'].fillna(24)

In [22]:
# A reading gets the new label when it was originally flagged and its hour of
# day is at or after the first predicted-positive hour of that day and location.
was_flagged = df_new['Label'].eq(1)
at_or_after_onset = df_new['Data_Hora'].dt.hour.ge(df_new['Min_Hora'])
df_new['Label_New'] = (was_flagged & at_or_after_onset).astype('int64')
# The original label is kept as 'Label_Old'; the new one takes over the name 'Label'.
df_new = df_new.rename(columns={'Label': 'Label_Old', 'Label_New': 'Label'})

In [23]:
# Class balance of the new label.
df_new['Label'].value_counts()

0    381244
1      2161
Name: Label, dtype: int64

In [24]:
# Preview the columns that are exported below.
df_new[['Data_Hora', 'Local', 'Label']]

Unnamed: 0,Data_Hora,Local,Label
0,2011-01-01,1,0
1,2011-01-01,2,0
2,2011-01-01,3,0
3,2011-01-01,4,0
4,2011-01-01,5,0
...,...,...,...
383400,2019-10-01,1,0
383401,2019-10-01,2,0
383402,2019-10-01,3,0
383403,2019-10-01,4,0


# Salvar resultados

In [29]:
# Export the new labels as a semicolon-separated file; the index is written as the first column.
df_new[['Data_Hora', 'Local', 'Label']].to_csv('../../../data/cleandata/Ordens de serviço/labels_predict.csv', sep=';')

In [30]:
# Persist the trained booster.
model.save_model('../../../data/model/Identificacao_0H.json')