In [1]:
import pandas
from IPython import get_ipython
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                             precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import select

from custom_transfomers.date_window import TimeWindowTransformer
from data_base.connection import session
from data_base.models import models
from project_utils.data_manipulation import generate_aggregation

In [2]:
## Build the dataframe by querying the SQL database
# Load IPython's autoreload extension in mode 2 so that imported modules are
# reloaded automatically before each cell execution.
%load_ext autoreload
%autoreload 2

In [3]:
# Climate variables joined to their coordinate (which carries the river id)
# and to the reservoir record that shares the same date.
query = (
    select(
        models.Variables.date,
        models.Variables.precipitation.label('precipitation'),
        models.Variables.temperature.label('temperature'),
        models.Variables.evaporation.label('evaporation'),
        models.Variables.surface_runoff.label('surface_runoff'),
        models.Coordinate.river_id.label('river'),
        models.Reservoir.level,
        models.Reservoir.streamflow,
    )
    .join(models.Variables.coordinate)
    .join(models.Reservoir, models.Variables.date == models.Reservoir.date)
)

# Run the query on the session's bound connection and load the rows into pandas.
RawDataFrame = pandas.read_sql(sql=query, con=session.bind)




In [4]:
# Consolidated DataFrame: rows are indexed by (date, level, streamflow) and the
# attributes of each river are placed in separate columns.
ConsolidatedDataFrame = (
    RawDataFrame
    .groupby(['date', 'river', 'level', 'streamflow'])
    .agg(
        {
            'precipitation': 'sum',
            'evaporation': 'sum',
            'temperature': 'mean',
            'surface_runoff': 'mean',
        }
    )
    .reset_index()
    .pivot(index=['date', 'level', 'streamflow'], columns='river')
)

# Lag features: the streamflow / level value of the preceding row, taken from
# the index levels and shifted down by one position (the first row gets NaN).
# NOTE(review): this treats the preceding row as the previous time step, which
# assumes the index is ordered by date without gaps — confirm.
ConsolidatedDataFrame.insert(
    0,
    'previous_streamflow',
    ConsolidatedDataFrame.index.get_level_values('streamflow').to_series().shift(1).to_numpy(),
)
ConsolidatedDataFrame.insert(
    0,
    'previous_level',
    ConsolidatedDataFrame.index.get_level_values('level').to_series().shift(1).to_numpy(),
)

# Discard every row that still holds a missing value (including the first row,
# whose lag features are NaN).
ConsolidatedDataFrame = ConsolidatedDataFrame.dropna()


In [31]:
# Seed reused by the estimators' random_state and by the train/test split.
seed = 0
# NOTE(review): accuracy_score is a classification metric and `scorer` is not
# referenced by the visible cells (the searches score with
# 'neg_root_mean_squared_error') — confirm whether it is still needed.
scorer = make_scorer(accuracy_score)

rivers = session.query(models.River).all()
# Build the id list once instead of repeating the comprehension per variable.
river_ids = [river.id for river in rivers]

# One aggregation mapping per variable, produced by the project helper for
# every river id.
precipitation_agg = generate_aggregation('sum', 'precipitation', river_ids)
evaporation_agg = generate_aggregation('sum', 'evaporation', river_ids)
temperature_agg = generate_aggregation('mean', 'temperature', river_ids)
runoff_agg = generate_aggregation('mean', 'surface_runoff', river_ids)

# Variables handed to the windowing transformer.
cols = ['precipitation', 'evaporation', 'temperature', 'surface_runoff']

# Merge the per-variable mappings into a NEW dict. The previous
# `agg = precipitation_agg; agg.update(...)` aliased the precipitation mapping
# and silently turned it into the full merged mapping as well.
agg = {
    **precipitation_agg,
    **evaporation_agg,
    **temperature_agg,
    **runoff_agg,
}


In [42]:
# Windowing settings common to every candidate family below.
# range(1, 32, 10) yields the rolling values 1, 11, 21 and 31.
_windowing_grid = {
    'windowing__aggregate': [agg],
    'windowing__rolling': range(1, 32, 10),
    'windowing__dropna': [False],
}

# Keyword arguments for GridSearchCV. The pipeline's 'clf' step starts as a
# DummyRegressor placeholder and is swapped for a real regressor by each
# entry of the parameter grid.
grid_search_params = {
    'estimator': Pipeline(steps=[
        ('windowing', TimeWindowTransformer(columns=cols)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler(feature_range=(0, 1))),
        ('clf', DummyRegressor()),
    ]),
    'param_grid': [
        # RBF support-vector regression on a min-max scaled target.
        {
            **_windowing_grid,
            'clf': [
                TransformedTargetRegressor(
                    transformer=MinMaxScaler(feature_range=(0, 1)),
                    regressor=SVR(cache_size=1000),
                )
            ],
            'clf__regressor__C': range(1, 15, 3),
            'clf__regressor__gamma': ['auto', 'scale'],
            'clf__regressor__kernel': ['rbf'],
        },
        # Random forest with a fixed number of trees.
        {
            **_windowing_grid,
            'clf': (RandomForestRegressor(),),
            'clf__random_state': [seed],
            'clf__n_estimators': [200],
        },
        # Single decision tree.
        {
            **_windowing_grid,
            'clf': (DecisionTreeRegressor(),),
            'clf__random_state': [seed],
        },
        # Random forest and SVR stacked under a ridge meta-regressor.
        {
            **_windowing_grid,
            'clf': (
                StackingRegressor(
                    estimators=[
                        ('RandomForest', RandomForestRegressor()),
                        ('SVR', SVR()),
                    ],
                    final_estimator=Ridge(),
                ),
            ),
            'clf__RandomForest__random_state': [seed],
            'clf__RandomForest__n_estimators': [200],
        },
    ],
    'scoring': 'neg_root_mean_squared_error',
    'cv': 10,
    'n_jobs': -1,
    'verbose': 1,
}

In [43]:
targets = ['level', 'streamflow']

# One grid search per target, each created from the same keyword arguments.
clf_search = dict(
    (target, GridSearchCV(**grid_search_params)) for target in targets
)

level_estimator_search, streamflow_estimator_search = (
    clf_search[target] for target in targets
)

# The targets are read from the frame's index levels.
# NOTE(review): both searches are tuned on the complete frame, before any
# train/test split is taken, so rows later used as the test set also influence
# model selection — confirm this is intended.
level_estimator_search.fit(
    X=ConsolidatedDataFrame,
    y=ConsolidatedDataFrame.index.get_level_values('level'),
)
streamflow_estimator_search.fit(
    X=ConsolidatedDataFrame,
    y=ConsolidatedDataFrame.index.get_level_values('streamflow'),
)

Fitting 10 folds for each of 52 candidates, totalling 520 fits
Fitting 10 folds for each of 52 candidates, totalling 520 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('windowing',
                                        TimeWindowTransformer(columns=['precipitation',
                                                                       'evaporation',
                                                                       'temperature',
                                                                       'surface_runoff'])),
                                       ('imputer', SimpleImputer()),
                                       ('scaler', MinMaxScaler()),
                                       ('clf', DummyRegressor())]),
             n_jobs=-1,
             param_grid=[{'clf': [TransformedTargetRegressor(regressor=SVR(cache_size=1000),
                                                             transformer=MinMaxScaler())]...
                                                    ('precipitation', 11): 'sum',
                                                    ('surface_runoff', 1): 'mea

In [44]:
# Hold-out split for the level target (features are the whole consolidated
# frame, the target comes from its 'level' index level).
(
    level_X_train,
    level_X_test,
    level_y_train,
    level_y_test,
) = train_test_split(
    ConsolidatedDataFrame,
    ConsolidatedDataFrame.index.get_level_values('level'),
    random_state=seed,
)

# Same split for the streamflow target; it reuses the same frame and seed.
(
    streamflow_X_train,
    streamflow_X_test,
    streamflow_y_train,
    streamflow_y_test,
) = train_test_split(
    ConsolidatedDataFrame,
    ConsolidatedDataFrame.index.get_level_values('streamflow'),
    random_state=seed,
)

In [45]:
# Take the best pipeline found by each search and refit it on the training
# split only. The names below refer to the very objects stored on the search
# instances, so the refit also replaces the fit held by `best_estimator_`.
level_estimator = level_estimator_search.best_estimator_
level_estimator.fit(X=level_X_train, y=level_y_train)

streamflow_estimator = streamflow_estimator_search.best_estimator_
streamflow_estimator.fit(X=streamflow_X_train, y=streamflow_y_train)

Pipeline(steps=[('windowing',
                 TimeWindowTransformer(aggregate={('evaporation', 1): 'sum',
                                                  ('evaporation', 2): 'sum',
                                                  ('evaporation', 3): 'sum',
                                                  ('evaporation', 4): 'sum',
                                                  ('evaporation', 5): 'sum',
                                                  ('evaporation', 6): 'sum',
                                                  ('evaporation', 7): 'sum',
                                                  ('evaporation', 8): 'sum',
                                                  ('evaporation', 9): 'sum',
                                                  ('evaporation', 10): 'sum',
                                                  ('evaporation', 11): 'sum',
                                                  ('precipitation', 1): 's...
                                           

In [46]:
# 10-fold cross-validated negative RMSE of the level pipeline, computed on the
# held-out split; any failing fold raises instead of being scored as NaN.
# NOTE(review): cross_val_score refits clones of the pipeline on folds of the
# test split rather than scoring the estimator fitted above — confirm this is
# the intended evaluation.
cross_val_score(
    estimator=level_estimator,
    X=level_X_test,
    y=level_y_test,
    cv=10,
    scoring='neg_root_mean_squared_error',
    error_score='raise',
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    2.1s remaining:    5.1s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.1s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.1s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.2s finished


array([-0.1213866 , -0.08497091, -0.08873553, -0.11335988, -0.10027779,
       -0.07876949, -0.11725681, -0.09109943, -0.07495576, -0.09863412])

In [47]:
# 10-fold cross-validated negative RMSE of the streamflow pipeline, computed on
# the held-out split; any failing fold raises instead of being scored as NaN.
# NOTE(review): cross_val_score refits clones of the pipeline on folds of the
# test split rather than scoring the estimator fitted above — confirm this is
# the intended evaluation.
cross_val_score(
    estimator=streamflow_estimator,
    X=streamflow_X_test,
    y=streamflow_y_test,
    cv=10,
    scoring='neg_root_mean_squared_error',
    error_score='raise',
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   11.8s remaining:   27.6s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.8s remaining:   11.8s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   11.9s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.9s finished


array([-256.32621209, -180.26168467, -269.47444049, -345.59480365,
       -181.4729543 , -173.52005328, -194.35490686, -176.77016017,
       -163.78047923, -284.31228502])

In [48]:
# Observed test targets next to the predictions of the refitted pipelines
# ('p_' prefix = predicted).
df = pandas.DataFrame(
    {
        'level': level_y_test,
        'p_level': level_estimator.predict(level_X_test),
        'streamflow': streamflow_y_test,
        'p_streamflow': streamflow_estimator.predict(streamflow_X_test),
    }
)


In [49]:
# Show observed vs. predicted values (the notebook renders long frames truncated).
df

Unnamed: 0,level,p_level,streamflow,p_streamflow
0,557.06,557.08795,115.78,114.291984
1,561.52,561.41170,1881.78,2253.167742
2,560.47,560.50315,179.39,168.495383
3,559.26,559.11960,2123.00,901.570480
4,564.53,564.16450,2540.00,2189.446958
...,...,...,...,...
622,568.56,568.55825,351.00,335.441393
623,560.23,560.22980,239.00,326.239235
624,557.55,557.57775,100.00,98.276586
625,562.87,562.83485,1851.35,2161.692424
