In [19]:
import pandas
from IPython import get_ipython
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                             precision_score, recall_score)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import select

from custom_transfomers.date_window import TimeWindowTransformer
from data_base.connection import session
from data_base.models import models
from project_utils.data_manipulation import generate_aggregation

In [20]:
## Building the dataframe from SQL database queries (performed in the following cells).
# Autoreload is enabled first so that edits to imported modules are reloaded
# automatically before each cell execution.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Variables columns that are selected under a label equal to their own name.
labelled_variables = [
    getattr(models.Variables, attribute).label(attribute)
    for attribute in ('precipitation', 'temperature', 'evaporation', 'surface_runoff')
]

# Each Variables row is joined to its coordinate (which supplies the river id)
# and to the Reservoir row that has the same date.
query = (
    select(
        models.Variables.date,
        *labelled_variables,
        models.Coordinate.river_id.label('river'),
        models.Reservoir.level,
        models.Reservoir.streamflow,
    )
    .join(models.Variables.coordinate)
    .join(models.Reservoir, models.Variables.date == models.Reservoir.date)
)

# Run the query on the session's bound engine and load the result into pandas.
RawDataFrame = pandas.read_sql(query, session.bind)




In [22]:
# Consolidated DataFrame: rows are keyed by (date, level, streamflow) and the
# aggregated attributes of every river are placed in separate columns.
river_aggregations = {
    'precipitation': 'sum',
    'evaporation': 'sum',
    'temperature': 'mean',
    'surface_runoff': 'mean',
}

ConsolidatedDataFrame = (
    RawDataFrame
    .groupby(['date', 'river', 'level', 'streamflow'])
    .agg(river_aggregations)
    .reset_index()
    .pivot(index=['date', 'level', 'streamflow'], columns='river')
)

# Lag-1 features read from the index levels: every row receives the value of
# the row immediately before it, so the first row gets NaN.
consolidated_index = ConsolidatedDataFrame.index
ConsolidatedDataFrame.insert(
    0,
    'previous_streamflow',
    pandas.Series(consolidated_index.get_level_values('streamflow')).shift(1).values,
)
ConsolidatedDataFrame.insert(
    0,
    'previous_level',
    pandas.Series(consolidated_index.get_level_values('level')).shift(1).values,
)

# Discard every row that still holds a NaN in any column (this includes the
# first row, which has no preceding values).
ConsolidatedDataFrame = ConsolidatedDataFrame.dropna()


In [23]:
# Random seed shared by the estimators and the train/test split.
seed = 0
# NOTE(review): accuracy_score is a classification metric and this scorer is
# not passed to the grid search (which uses 'neg_root_mean_squared_error').
# It is kept only so any code that still references `scorer` keeps working.
scorer = make_scorer(accuracy_score)

rivers = session.query(models.River).all()
# Ids of every river, computed once and reused for each aggregation mapping.
river_ids = [river.id for river in rivers]

precipitation_agg = generate_aggregation('sum', 'precipitation', river_ids)
evaporation_agg = generate_aggregation('sum', 'evaporation', river_ids)
temperature_agg = generate_aggregation('mean', 'temperature', river_ids)
runoff_agg = generate_aggregation('mean', 'surface_runoff', river_ids)

# Column groups handed to the windowing transformer.
cols = ['precipitation', 'evaporation', 'temperature', 'surface_runoff']

# Merge the per-variable mappings into a NEW dict. Previously `agg` was an
# alias of `precipitation_agg`, so the update() calls silently turned
# `precipitation_agg` into the full mapping as well.
agg = {
    **precipitation_agg,
    **evaporation_agg,
    **temperature_agg,
    **runoff_agg,
}


In [24]:
# Windowing search space shared by every candidate model
# (rolling takes the values 1, 11, 21 and 31).
windowing_space = {
    'windowing__aggregate': [agg],
    'windowing__rolling': range(1, 32, 10),
    'windowing__dropna': [False],
}

# Base pipeline; the DummyRegressor in the 'clf' step is only a placeholder
# that each grid below replaces with a real model.
base_pipeline = Pipeline([
    ('windowing', TimeWindowTransformer(columns=cols)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('clf', DummyRegressor()),
])

# SVR fitted on a min-max scaled target.
scaled_target_svr = TransformedTargetRegressor(
    transformer=MinMaxScaler(feature_range=(0, 1)),
    regressor=SVR(cache_size=1000),
)

# Random forest + SVR whose predictions are combined by a Ridge model.
forest_svr_stack = StackingRegressor(
    estimators=[('RandomForest', RandomForestRegressor()), ('SVR', SVR())],
    final_estimator=Ridge(),
)

# Keyword arguments for GridSearchCV; one search is created per target later.
grid_search_params = dict(
    estimator=base_pipeline,
    param_grid=[
        {
            **windowing_space,
            'clf': (scaled_target_svr,),
            'clf__regressor__C': range(1, 15, 3),
            'clf__regressor__gamma': ['auto', 'scale'],
            'clf__regressor__kernel': ['rbf'],
        },
        {
            **windowing_space,
            'clf': (RandomForestRegressor(),),
            'clf__random_state': [seed],
            'clf__n_estimators': [200],
        },
        {
            **windowing_space,
            'clf': (DecisionTreeRegressor(),),
            'clf__random_state': [seed],
        },
        {
            **windowing_space,
            'clf': (forest_svr_stack,),
            'clf__RandomForest__random_state': [seed],
            'clf__RandomForest__n_estimators': [200],
        },
    ],
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=10,
    # Fail loudly instead of recording a NaN score when a fit raises.
    error_score='raise',
)

In [25]:
# Reservoir quantities to predict.
targets = ['level', 'streamflow']

# One independent grid search per target, all built from the same settings.
clf_search = dict(
    (target_name, GridSearchCV(**grid_search_params))
    for target_name in targets
)

In [26]:
# Split features and both targets in a single call. The feature frame is
# passed twice so that the streamflow and level variables remain separate
# objects; the same random_state and sample count give both the same split.
# NOTE(review): train_test_split shuffles rows by default, while the pipeline
# starts with a time-window step - confirm that shuffled rows are acceptable.
(
    streamflow_X_train, streamflow_X_test,
    streamflow_y_train, streamflow_y_test,
    level_X_train, level_X_test,
    level_y_train, level_y_test,
) = train_test_split(
    ConsolidatedDataFrame,
    ConsolidatedDataFrame.index.get_level_values('streamflow'),
    ConsolidatedDataFrame,
    ConsolidatedDataFrame.index.get_level_values('level'),
    random_state=seed,
)

In [27]:
# Fetch the two grid searches (the names say "classifier", but both wrap regressors).
streamflow_classifier, level_classifier = (
    clf_search[target_name] for target_name in ('streamflow', 'level')
)

# Streamflow is fitted first, then level; the last expression is left bare so
# the fitted level search is displayed as the cell output.
streamflow_classifier.fit(streamflow_X_train, streamflow_y_train)
level_classifier.fit(level_X_train, level_y_train)

Fitting 10 folds for each of 52 candidates, totalling 520 fits
Fitting 10 folds for each of 52 candidates, totalling 520 fits


GridSearchCV(cv=10, error_score='raise',
             estimator=Pipeline(steps=[('windowing',
                                        TimeWindowTransformer(columns=['precipitation',
                                                                       'evaporation',
                                                                       'temperature',
                                                                       'surface_runoff'])),
                                       ('imputer', SimpleImputer()),
                                       ('scaler', MinMaxScaler()),
                                       ('clf', DummyRegressor())]),
             n_jobs=-1,
             param_grid=[{'clf': (TransformedTargetRegressor(regressor=SVR(cache_size=1000),
                                                             transfor...
                                                    ('precipitation', 11): 'sum',
                                                    ('surface_runoff', 1): 'me

In [28]:
# Held-out true values next to the predictions of the fitted searches
# (columns prefixed with "p_" hold the predictions).
df = pandas.DataFrame({
    'level': level_y_test,
    'p_level': level_classifier.predict(level_X_test),
    'streamflow': streamflow_y_test,
    'p_streamflow': streamflow_classifier.predict(streamflow_X_test),
})


In [29]:
# Cross-validation results for the level target, ordered from worst rank to best.
level_results = (
    pandas.DataFrame(level_classifier.cv_results_)
    .sort_values('rank_test_score', ascending=False)
)

# Map each estimator to (mean, std) of its test score. All candidates of one
# grid share the same estimator object as key, so later rows overwrite earlier
# ones and the best-ranked candidate (iterated last) is the one that remains.
{
    row.param_clf: (row.mean_test_score, row.std_test_score)
    for row in level_results.itertuples(index=False)
}

{TransformedTargetRegressor(regressor=SVR(cache_size=1000),
                            transformer=MinMaxScaler()): (-0.8119226320006565,
  0.2517460605099128),
 DecisionTreeRegressor(): (-0.0945240360557121, 0.011127606616660493),
 RandomForestRegressor(): (-0.07131128271845194, 0.011784351816806203),
 StackingRegressor(estimators=[('RandomForest',
                                RandomForestRegressor(n_estimators=200,
                                                      random_state=0)),
                               ('SVR', SVR())],
                   final_estimator=Ridge()): (-0.07000870211637965,
  0.011455767344408707)}

In [30]:
# Cross-validation results for the streamflow target, ordered from worst rank to best.
streamflow_results = (
    pandas.DataFrame(streamflow_classifier.cv_results_)
    .sort_values('rank_test_score', ascending=False)
)

# Map each estimator to (mean, std) of its test score. All candidates of one
# grid share the same estimator object as key, so later rows overwrite earlier
# ones and the best-ranked candidate (iterated last) is the one that remains.
{
    row.param_clf: (row.mean_test_score, row.std_test_score)
    for row in streamflow_results.itertuples(index=False)
}

{TransformedTargetRegressor(regressor=SVR(cache_size=1000),
                            transformer=MinMaxScaler()): (-273.61219949197346,
  42.00820998338783),
 DecisionTreeRegressor(): (-321.6893662727692, 45.29511261095428),
 RandomForestRegressor(): (-239.50962151368032, 49.860474503472595),
 StackingRegressor(estimators=[('RandomForest',
                                RandomForestRegressor(n_estimators=200,
                                                      random_state=0)),
                               ('SVR', SVR())],
                   final_estimator=Ridge()): (-238.8529314528661,
  49.800020142710984)}

In [31]:
# Display the test-split true values next to the predicted level and streamflow.
df

Unnamed: 0,level,p_level,streamflow,p_streamflow
0,557.06,557.076007,115.78,114.291984
1,561.52,561.423564,1881.78,2253.167742
2,560.47,560.525260,179.39,168.495383
3,559.26,559.117543,2123.00,901.570480
4,564.53,564.174552,2540.00,2189.446958
...,...,...,...,...
622,568.56,568.589757,351.00,335.441393
623,560.23,560.218557,239.00,326.239235
624,557.55,557.565049,100.00,98.276586
625,562.87,562.917312,1851.35,2161.692424
