In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.decomposition import PCA
from sklearn.compose import TransformedTargetRegressor
from custom_transfomers.date_window import TimeWindowTransformer
from sklearn.dummy import DummyRegressor
from project_utils.data_manipulation import generate_aggregation
from sklearn.metrics import make_scorer
import pandas
from data_base.connection import session
from data_base.models import models
from sqlalchemy import select
from IPython import get_ipython

In [2]:
## Construção do dataframe utilizando buscas no banco de dados sql
%load_ext autoreload
%autoreload 2

In [3]:
query = select(
    models.Variables.date,
    models.Variables.precipitation.label('precipitation'),
    models.Variables.temperature.label('temperature'),
    models.Variables.evaporation.label('evaporation'),
    models.Variables.surface_runoff.label('surface_runoff'),
    models.Coordinate.river_id.label('river'),    
    models.Reservoir.level,
    models.Reservoir.streamflow
).\
    join(models.Variables.coordinate).\
    join(models.Reservoir, models.Variables.date == models.Reservoir.date)

RawDataFrame = pandas.read_sql(query, session.bind)




In [4]:
# DataFrame consolidado porém com os atributos para cada rio posicionados em uma diferente coluna
ConsolidatedDataFrame = (
    RawDataFrame.
    groupby(['date', 'river', 'level', 'streamflow']).
    agg({
        'precipitation': 'sum',
        'evaporation': 'sum',
        'temperature': 'mean',
        'surface_runoff':'mean',
    }).
    reset_index().
    pivot(index=["date", 'level', 'streamflow'], columns="river")
)

ConsolidatedDataFrame.insert(0,'previous_streamflow', pandas.DataFrame(ConsolidatedDataFrame.index.get_level_values('streamflow')).shift(1).values)
ConsolidatedDataFrame.insert(0,'previous_level', pandas.DataFrame(ConsolidatedDataFrame.index.get_level_values('level')).shift(1).values)

ConsolidatedDataFrame = ConsolidatedDataFrame.dropna()


In [5]:
seed = 0
scorer = make_scorer(accuracy_score)

rivers = session.query(models.River).all()

precipitation_agg = generate_aggregation('sum', 'precipitation', [river.id for river in rivers])
evaporation_agg = generate_aggregation('sum', 'evaporation', [river.id for river in rivers])
temperature_agg = generate_aggregation('mean', 'temperature', [river.id for river in rivers])
runoff_agg = generate_aggregation('mean', 'surface_runoff', [river.id for river in rivers])

cols = ['precipitation', 'evaporation', 'temperature', 'surface_runoff']

agg = precipitation_agg
agg.update(evaporation_agg)
agg.update(temperature_agg)
agg.update(runoff_agg)


In [19]:
StackingRegressor(estimators=[('RandomForest', RandomForestRegressor()), ('SVR', SVR())], final_estimator=('Ridge', RidgeCV())).get_params()

{'cv': None,
 'estimators': [('RandomForest', RandomForestRegressor()), ('SVC', SVR())],
 'final_estimator': ('Ridge', RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
 'n_jobs': None,
 'passthrough': False,
 'verbose': 0,
 'RandomForest': RandomForestRegressor(),
 'SVC': SVR(),
 'RandomForest__bootstrap': True,
 'RandomForest__ccp_alpha': 0.0,
 'RandomForest__criterion': 'mse',
 'RandomForest__max_depth': None,
 'RandomForest__max_features': 'auto',
 'RandomForest__max_leaf_nodes': None,
 'RandomForest__max_samples': None,
 'RandomForest__min_impurity_decrease': 0.0,
 'RandomForest__min_impurity_split': None,
 'RandomForest__min_samples_leaf': 1,
 'RandomForest__min_samples_split': 2,
 'RandomForest__min_weight_fraction_leaf': 0.0,
 'RandomForest__n_estimators': 100,
 'RandomForest__n_jobs': None,
 'RandomForest__oob_score': False,
 'RandomForest__random_state': None,
 'RandomForest__verbose': 0,
 'RandomForest__warm_start': False,
 'SVC__C': 1.0,
 'SVC__cache_size': 200,
 'SVC__coef0': 0.

In [22]:
clf_search = GridSearchCV(
            Pipeline([
                ('windowing', TimeWindowTransformer(columns=cols)),
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', MinMaxScaler(feature_range=(0, 1))),
                ('clf', DummyRegressor())
            ]), 
            param_grid=[
                {
                    'windowing__aggregate': [agg],
                    'windowing__rolling': range(1, 32, 10),
                    'windowing__dropna': [False],
                    'clf': (
                        TransformedTargetRegressor(
                        transformer=MinMaxScaler(feature_range=(0, 1)), 
                        regressor=SVR(cache_size=1000)
                    ),),
                    'clf__regressor__C': range(1, 15, 3),
                    'clf__regressor__gamma': ['auto', 'scale'],
                    'clf__regressor__kernel': ['rbf']
                },
                {
                    'windowing__aggregate': [agg],
                    'windowing__rolling': range(1, 32, 10),
                    'windowing__dropna': [False],
                    'clf': (RandomForestRegressor(), ),
                    'clf__random_state': [seed],
                    'clf__n_estimators': [200]
                },
                {
                    'windowing__aggregate': [agg],
                    'windowing__rolling': range(1, 32, 10),
                    'windowing__dropna': [False],
                    'clf': (DecisionTreeRegressor(), ),
                    'clf__random_state': [seed]
                },
                {
                    'windowing__aggregate': [agg],
                    'windowing__rolling': range(1, 32, 10),
                    'windowing__dropna': [False],
                    'clf': (StackingRegressor(
                        estimators=[('RandomForest', RandomForestRegressor()), ('SVR', SVR())], 
                        final_estimator=RidgeCV()
                    ),),
                    'clf__RandomForest__random_state': [seed]
                }
            ],
            scoring='neg_root_mean_squared_error',
            cv=10,
            n_jobs=-1,
            verbose=10,
            error_score='raise'
        )

In [23]:
classifier = clf_search

df = pandas.DataFrame()
df['level'] = ConsolidatedDataFrame.index.get_level_values('level')
classifier.fit(ConsolidatedDataFrame, ConsolidatedDataFrame.index.get_level_values('level'))
df['p_level'] = classifier.predict(ConsolidatedDataFrame)
classifier.fit(ConsolidatedDataFrame, ConsolidatedDataFrame.index.get_level_values('streamflow')) 
df['streamflow'] = ConsolidatedDataFrame.index.get_level_values('streamflow')
df['p_streamflow'] = classifier.predict(ConsolidatedDataFrame)
df


Fitting 10 folds for each of 6 candidates, totalling 60 fits
Fitting 10 folds for each of 6 candidates, totalling 60 fits


Unnamed: 0,level,p_level,streamflow,p_streamflow
0,560.28,560.214981,1128.00,1272.337750
1,560.35,560.292758,1021.00,1000.722692
2,560.47,560.375647,1307.00,1172.341818
3,560.62,560.485660,1481.00,1434.546704
4,560.78,560.674334,1555.00,1567.032489
...,...,...,...,...
2502,568.51,568.537682,1068.11,1079.529270
2503,568.44,568.501531,1027.44,1011.371752
2504,568.40,568.422610,902.97,981.796578
2505,568.38,568.346431,1106.56,1053.074736
