In [2]:
import sys 
import os 
#sys.path.append('../../')
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM,Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.cluster import KMeans
import warnings
import os
warnings.filterwarnings("ignore")

from utils import utilidades as myutils

In [3]:
def clean_group_data(dataset, parameter, range_upper = 1,range_lower = 1):
    
    q1, q3 = np.percentile(dataset[parameter],[25,75])
    iqr = q3 - q1
    lower_bound = q3 - range_upper*(1.5 * iqr)  
    upper_bound = q3 + range_upper*(1.5 * iqr)  

    # Deleting lower bound and upper bound from the dataset LinkTT2
    dataset = DFmerged.loc[(dataset[parameter] >= lower_bound) & 
                                              (dataset[parameter] <= upper_bound)]
    
    return dataset

def resultado(df,tecnica,parameter,amostras,metrica):
    
    df.loc[len(df)]= [tecnica,parameter,amostras,round(metrica,2)]
    
    
    return df

In [4]:
parameters = ['coliformes', 'od', 'turbidez', 'fosforo', 'solido', 'dbo', 'temperatura', 'ph']

df_resultado = pd.DataFrame(columns=['tecnica','parametro','amostras','mape'])

DFmerge = pd.DataFrame()
for parameter in parameters:
    # Concatenating all the parameter files
    df = pd.read_csv('../data/CETESB/'+ parameter + '.csv',encoding='utf-8',sep=';')
    DFmerge = pd.concat([DFmerge, df])


for parameter in DFmerge['parametro'].unique():    
    DFmerged = DFmerge[(DFmerge['codigo_ponto'] == 'TIET02050') &
                      (DFmerge['parametro'] == parameter)]  

    

    # DFmerge.groupby([pd.Grouper(freq='1M'), 'codigo_ponto']).mean().unstack()
    # DFmerge.groupby(['codigo_ponto', 'UGRHI'])['valor'].count().unstack()

    DFmerged = DFmerged[DFmerged['codigo_ponto'] == 'TIET02050']    
    DFmerged = clean_group_data(DFmerged, 'valor')
    
    DFmerged['data_coleta'] = pd.to_datetime(DFmerged['data_coleta'])
    DFmerged.index = DFmerged['data_coleta']    
    DFmerged = DFmerged.groupby([pd.Grouper(freq='1M'), 'parametro'])['valor'].mean().unstack()
    
    scaler = MinMaxScaler()
#     DFmerged = scaler.fit_transform()

    # #Mostra a quantidade de NaN no dataframe
    # for field in DFmerge.columns:
    #     print(field, 'NaN:', DFmerge[field].isnull().sum())

    X  = DFmerged[:-1]
    y  = DFmerged[1:]


    dataset = pd.DataFrame(np.concatenate([X,y], axis=1))
    dataset.columns = ['X','y']
    dataset.dropna(inplace=True)

    train_X = np.array(dataset['X']).reshape(len(dataset), 1)
    train_y = np.log(np.array(dataset['y']).reshape(len(dataset), 1) + 0.000000001)
    '''
    print('tamanho x:',len(train_X))
    print('tamanho y:',len(train_y))
    print('train_X[:-40]',len(train_X[:-40]))
    print('train_y[:-40]',len(train_y[:-40]))
    '''

    #Regressão Linear
    model = LinearRegression(normalize=False)


    model.fit(train_X[:-40],train_y[:-40])
    score = model.score(train_X,train_y)
#     print(score)

    #Dados de teste
    previsoes = model.predict(train_X[-40:])
    
    df_result = resultado(df_resultado,'Regressão Linear',parameter,len(DFmerged),myutils.mean_absolute_percentage_error(train_y[-40:], previsoes))
    
    #print(parameter, len(DFmerged),'MAPE',str(myutils.mean_absolute_percentage_error(train_y[-40:], previsoes)))
    
    #Random Forest
    model = RandomForestRegressor()

    model.fit(train_X[:-40],train_y[:-40])
    score = model.score(train_X,train_y)
#     print(score)

    #Dados de teste
    previsoes = model.predict(train_X[-40:])
    
    df_result = resultado(df_resultado,'Random Forest',parameter,len(DFmerged),myutils.mean_absolute_percentage_error(train_y[-40:], previsoes))
    
    #MLP
    model = Sequential()
    model.add(Dense(units = 10, activation = 'relu', input_dim = train_X[-40:].shape[1]))
    model.add(Dense(units = 21, activation = 'relu'))
    model.add(Dense(units = 1, activation = 'sigmoid'))
    model.compile(loss = 'mean_absolute_error', optimizer = 'adam',metrics = ['mean_absolute_error'])

    es = EarlyStopping(monitor='val_loss', patience = 3, verbose=0)

    #Treina o modelo
    history = model.fit(train_X[:-40],train_y[:-40], validation_data = (train_X[:-40],train_y[:-40]),  batch_size = 32, epochs = 2000, callbacks=[es], verbose=0)
    
    #Dados de teste
    media_previsoes = []
    for r in range(0,6):
        previsoes = model.predict(train_X[-40:])
        media_previsoes.append(myutils.mean_absolute_percentage_error(train_y[-40:], previsoes))  
    
    df_result = resultado(df_resultado,'MLP',parameter,len(DFmerged),np.mean(media_previsoes))
    
    
    #LSTM
    
    #train_X = np.array(dataset['X']).reshape(len(dataset), (dataset.shape[0],dataset.shape[1],1))
    #train_y = np.log(np.array(dataset['y']).reshape(len(dataset), (dataset.shape[0],dataset.shape[1],1)) + 0.000000001)
    
    train_X = np.reshape(train_X, (train_X.shape[0], train_X.shape[1], 1))
    train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1], 1))

    
    
    model = Sequential()
    model.add(LSTM(units = 10, input_shape = (train_X[-40:].shape[1], 1)))
    model.add(Dense(21, activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss = 'mean_absolute_error', optimizer = 'adam',metrics = ['mean_absolute_error'])

    es = EarlyStopping(monitor='val_loss', patience = 3, verbose=0)

    #Treina o modelo
    history = model.fit(train_X[:-40],train_y[:-40],validation_data = (train_X[:-40],train_y[:-40]), batch_size = 32, epochs = 2000, callbacks=[es], verbose=0)
    
    #Dados de teste
    media_previsoes = []
    for r in range(0,6):
        previsoes = model.predict(train_X[-40:])
        media_previsoes.append(myutils.mean_absolute_percentage_error(train_y[-40:], previsoes))
    
    df_result = resultado(df_resultado,'LSTM',parameter,len(DFmerged),np.mean(media_previsoes))
    

    
    
df_result

Unnamed: 0,tecnica,parametro,amostras,mape
0,Regressão Linear,Coliformes Termotolerantes,152,38.85
1,Random Forest,Coliformes Termotolerantes,152,44.74
2,MLP,Coliformes Termotolerantes,152,70.32
3,LSTM,Coliformes Termotolerantes,152,70.32
4,Regressão Linear,Oxigênio Dissolvido,205,26.3
5,Random Forest,Oxigênio Dissolvido,205,32.29
6,MLP,Oxigênio Dissolvido,205,36.77
7,LSTM,Oxigênio Dissolvido,205,36.77
8,Regressão Linear,Turbidez,190,36.27
9,Random Forest,Turbidez,190,104.28


In [27]:
df_result.to_csv (r'/home/anderson/Downloads/predicaoagua/src/temporal.csv', index = True, header=True)