In [1]:
## Build the dataframe by querying the SQL database
# Load IPython's autoreload extension and set it to mode 2 so that edits to
# imported project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sqlalchemy import select

from data_base.models import models
from data_base.connection import session

# Columns selected for every Variables row, together with the river id of its
# coordinate and the Reservoir readings that share the same date.
selected_columns = [
    models.Variables.date,
    models.Variables.precipitation.label('precipitation'),
    models.Variables.temperature.label('temperature'),
    models.Variables.evaporation.label('evaporation'),
    models.Variables.surface_runoff.label('surface_runoff'),
    models.Coordinate.river_id.label('river'),
    models.Reservoir.level,
    models.Reservoir.streamflow,
]

query = (
    select(*selected_columns)
    # Join through Variables.coordinate (presumably the relationship to Coordinate — confirm in models).
    .join(models.Variables.coordinate)
    # Reservoir rows are matched on date only.
    .join(models.Reservoir, models.Variables.date == models.Reservoir.date)
)




In [3]:
import pandas

# Execute the query on the session's bound connectable and load all rows into a DataFrame.
RawDataFrame = pandas.read_sql(sql=query, con=session.bind)


In [4]:
# Consolidated DataFrame: one row per (date, level, streamflow), with the
# attributes of each river laid out in separate columns.
group_keys = ['date', 'river', 'level', 'streamflow']
aggregations = {
    'precipitation': 'sum',
    'evaporation': 'sum',
    'temperature': 'mean',
    'surface_runoff': 'mean',
}

aggregated = RawDataFrame.groupby(group_keys).agg(aggregations).reset_index()
ConsolidatedDataFrame = aggregated.pivot(index=['date', 'level', 'streamflow'], columns='river')

# Prepend the previous row's target values as lag features. 'streamflow' is
# inserted first so that 'previous_level' ends up as the leading column.
for lag_source in ('streamflow', 'level'):
    lagged = pandas.Series(ConsolidatedDataFrame.index.get_level_values(lag_source)).shift(1)
    ConsolidatedDataFrame.insert(0, f'previous_{lag_source}', lagged.to_numpy())

# The first row has no predecessor, so its lag values are NaN; dropna() removes
# it together with any other row that contains a missing value.
ConsolidatedDataFrame = ConsolidatedDataFrame.dropna()
ConsolidatedDataFrame


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,previous_level,previous_streamflow,precipitation,precipitation,precipitation,precipitation,precipitation,precipitation,precipitation,precipitation,...,surface_runoff,surface_runoff,surface_runoff,surface_runoff,surface_runoff,surface_runoff,surface_runoff,surface_runoff,surface_runoff,surface_runoff
Unnamed: 0_level_1,Unnamed: 1_level_1,river,Unnamed: 3_level_1,Unnamed: 4_level_1,1,2,3,4,5,6,7,8,...,2,3,4,5,6,7,8,9,10,11
date,level,streamflow,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
1999-01-02,560.28,1128.00,560.19,854.00,793.719631,489.695138,110.695258,85.585158,212.667287,156.652986,442.018330,712.320556,...,0.247559,0.415066,4.208309e-01,3.955342e-01,0.252776,1.099707,1.121611e+00,1.531772,1.227036,0.095943
1999-01-03,560.35,1021.00,560.28,1128.00,1098.730582,1073.648047,191.857469,135.788901,301.326610,218.532570,296.882969,630.266191,...,0.461799,0.569603,4.967474e-01,4.353504e-01,0.326826,0.538661,8.398713e-01,1.719842,2.360424,0.322681
1999-01-04,560.47,1307.00,560.35,1021.00,966.199458,925.685374,111.636958,87.936838,219.241251,183.210818,134.095318,239.354739,...,0.449394,0.244052,2.483944e-01,2.444939e-01,0.221106,0.120044,1.128541e-01,0.830939,2.724089,0.328260
1999-01-05,560.62,1481.00,560.47,1307.00,273.113279,359.294498,2.801905,7.419221,27.256007,21.757371,153.764811,180.616363,...,0.086567,0.001172,4.046416e-03,5.847266e-03,0.005015,0.301938,2.548458e-01,0.473589,0.404729,0.008492
1999-01-06,560.78,1555.00,560.62,1481.00,820.311822,720.483393,141.758409,94.296583,189.727925,140.160450,183.347353,351.402533,...,0.621405,0.688863,6.904144e-01,6.253536e-01,0.570796,0.266499,4.545572e-01,0.479431,1.042385,0.598677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-26,568.51,1068.11,568.55,1227.00,62.192756,272.506781,0.128756,0.728183,4.973414,26.787834,0.000000,0.005358,...,0.132323,0.000012,2.241300e-04,5.918848e-03,0.041042,0.000000,1.959131e-07,0.009844,0.348761,0.128595
2005-12-27,568.44,1027.44,568.51,1068.11,140.071616,134.625157,28.822960,26.483711,67.049912,55.686758,62.191904,120.094629,...,0.003189,0.007080,7.337448e-03,7.120000e-03,0.011981,0.008833,1.499046e-02,0.318010,0.523005,0.017385
2005-12-28,568.40,902.97,568.44,1027.44,683.323431,1267.992157,80.176759,55.567427,139.098593,136.490363,127.424603,299.923742,...,0.421945,0.086313,8.438702e-02,9.693098e-02,0.125528,0.043880,1.938419e-01,0.440160,1.408806,0.223083
2005-12-29,568.38,1106.56,568.40,902.97,1589.087376,2039.549887,270.658218,204.444199,501.566561,414.706602,521.939386,980.966684,...,0.954975,1.278921,1.223015e+00,1.259939e+00,1.102783,1.389652,1.590770e+00,0.590228,0.848934,0.897973


In [5]:
from sklearn import svm
from sklearn.svm import SVR

from sklearn.compose import TransformedTargetRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer, mean_squared_error

from custom_transfomers.date_window import TimeWindowTransformer

In [6]:
# Fixed seed so the shuffled folds and the seeded estimators are reproducible.
seed = 0

# The prediction targets (reservoir level / streamflow) are continuous, so a
# classification metric such as accuracy_score cannot score them. Use negated
# mean squared error instead (greater_is_better=False flips the sign so that
# higher is better), in line with the 'neg_mean_squared_error' scoring of the
# grid searches. TODO: try other regression metrics.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Shuffled K-fold splitters with 10 and 3 folds.
# NOTE(review): the rows are ordered by date and carry lagged features;
# shuffled folds may leak future information into training — confirm whether
# a time-ordered splitter (e.g. TimeSeriesSplit) is more appropriate.
cross_validation_10 = KFold(n_splits=10, shuffle=True, random_state=seed)
cross_validation_3 = KFold(n_splits=3, shuffle=True, random_state=seed)



In [7]:
from project_utils.data_manipulation import generate_aggregation

rivers = session.query(models.River).all()

# One aggregation spec per variable, built in a fixed order; every call gets
# its own freshly built list of river ids.
# NOTE(review): surface_runoff is aggregated with 'sum' here, whereas the
# groupby that builds ConsolidatedDataFrame uses 'mean' — confirm which one is intended.
precipitation_agg, evaporation_agg, temperature_agg, runoff_agg = (
    generate_aggregation(function, variable, [river.id for river in rivers])
    for function, variable in (
        ('sum', 'precipitation'),
        ('sum', 'evaporation'),
        ('mean', 'temperature'),
        ('sum', 'surface_runoff'),
    )
)

# Variable names handed to TimeWindowTransformer(columns=...).
cols = ['precipitation', 'evaporation', 'temperature', 'surface_runoff']


In [8]:
import multiprocessing

# Leave one core free for the notebook, but never ask for zero workers:
# cpu_count() - 1 evaluates to 0 on a single-core machine and joblib rejects
# n_jobs=0. The same value is used by every search (the third one previously
# used n_jobs=-1); n_jobs only affects parallelism, not the scores.
N_JOBS = max(1, multiprocessing.cpu_count() - 1)


def _windowing_grid():
    """Return a fresh copy of the TimeWindowTransformer grid shared by every search."""
    return {
        'windowing__aggregate': [precipitation_agg | evaporation_agg | temperature_agg | runoff_agg],
        'windowing__rolling': range(1, 30, 5),
        'windowing__dropna': [False],
    }


def _scaled_steps(regressor):
    """Return the pipeline steps that min-max scale both the features and the target.

    The regressor is wrapped in a TransformedTargetRegressor so that the target
    is scaled to [0, 1] for fitting and predictions are mapped back afterwards.
    """
    return [
        ('scaler', MinMaxScaler(feature_range=(0, 1))),
        ('transformer', TransformedTargetRegressor(
            transformer=MinMaxScaler(feature_range=(0, 1)),
            regressor=regressor,
        )),
    ]


def _grid_search(model_steps, model_grid):
    """Build a GridSearchCV over windowing + mean imputation + the given model steps.

    Parameters
    ----------
    model_steps : list of (name, estimator) tuples
        Pipeline steps appended after the 'windowing' and 'imputer' steps.
    model_grid : dict
        Hyper-parameter grid of the model steps; it is merged after the shared
        windowing grid.

    Returns
    -------
    GridSearchCV
        Unfitted search scored with negated MSE over 5 folds; any failing fit
        raises (error_score='raise').
    """
    pipeline = Pipeline([
        ('windowing', TimeWindowTransformer(columns=cols)),
        ('imputer', SimpleImputer(strategy='mean')),
        *model_steps,
    ])
    return GridSearchCV(
        pipeline,
        param_grid={**_windowing_grid(), **model_grid},
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=N_JOBS,
        verbose=10,
        error_score='raise',
    )


algorithms = {
    # TODO: try without hyperparameters
    'SVR': _grid_search(
        _scaled_steps(
            SVR(kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=0.001, epsilon=0.1,
                shrinking=True, cache_size=200, verbose=False, max_iter=-1)
        ),
        {'transformer__regressor__C': range(1, 15, 3)},
    ),
    'RandomForest': _grid_search(
        [('random_forest', RandomForestRegressor(random_state=seed))],
        {'random_forest__max_depth': range(1, 20, 5)},
    ),
    'NormalizedRandomForest': _grid_search(
        _scaled_steps(RandomForestRegressor(random_state=seed)),
        {'transformer__regressor__max_depth': range(1, 20, 5)},
    ),
}

In [23]:
from sklearn.model_selection import cross_val_score

# Outer cross-validation of every grid search against the reservoir level,
# which is read from the 'level' level of ConsolidatedDataFrame's index.
level_target = ConsolidatedDataFrame.index.get_level_values('level')

result_level = {}
for name, search in algorithms.items():
    fold_scores = cross_val_score(search, ConsolidatedDataFrame, level_target)
    # The 'target' label is (re)written next to each algorithm's scores, so it
    # is present even if the loop is interrupted after the first algorithm.
    result_level.update({name: fold_scores, 'target': 'level'})




Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

In [22]:
# Outer cross-validation of every grid search against the streamflow, which is
# read from the 'streamflow' level of ConsolidatedDataFrame's index.
streamflow_target = ConsolidatedDataFrame.index.get_level_values('streamflow')

result_streamflow = {}
for name, search in algorithms.items():
    fold_scores = cross_val_score(search, ConsolidatedDataFrame, streamflow_target)
    # The 'target' label is (re)written next to each algorithm's scores, so it
    # is present even if the loop is interrupted after the first algorithm.
    result_streamflow.update({name: fold_scores, 'target': 'streamflow'})

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits


KeyboardInterrupt: 

In [20]:
# Print the streamflow cross-validation scores, one column per algorithm.
# The table is built directly from result_streamflow so this cell does not
# depend on a `result` variable that is only assigned further down the notebook.
print(pandas.DataFrame.from_dict(result_streamflow))

             SVR      target  RandomForest  NormalizedRandomForest
0 -123482.446509  streamflow -94240.089630           -95087.644191
1  -50242.874798  streamflow -18509.350270           -18434.589592
2  -76314.411517  streamflow -56275.677320           -55936.723376
3  -67203.543898  streamflow -52154.755205           -51612.944528
4  -96751.841735  streamflow -69579.171091           -70359.833510


In [19]:
# Tabulate the streamflow cross-validation scores. The frame is built from
# result_streamflow rather than from `result` itself, so the cell works on a
# fresh kernel and produces the same table when it is re-run.
result = pandas.DataFrame.from_dict(result_streamflow)
# Move the 'target' label into the header: columns become (algorithm, target) pairs.
result.pivot(columns='target')

Unnamed: 0_level_0,SVR,RandomForest,NormalizedRandomForest
target,streamflow,streamflow,streamflow
0,-123482.446509,-94240.08963,-95087.644191
1,-50242.874798,-18509.35027,-18434.589592
2,-76314.411517,-56275.67732,-55936.723376
3,-67203.543898,-52154.755205,-51612.944528
4,-96751.841735,-69579.171091,-70359.83351
