In [24]:
import pandas
from IPython import get_ipython
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                             precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import select

from custom_transfomers.date_window import TimeWindowTransformer, Debug
from data_base.connection import session
from data_base.models import models
from project_utils.data_manipulation import generate_aggregation

In [8]:
## Construção do dataframe utilizando buscas no banco de dados sql
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
query = select(
    models.Variables.date,
    models.Variables.precipitation.label('precipitation'),
    models.Variables.temperature.label('temperature'),
    models.Variables.evaporation.label('evaporation'),
    models.Variables.surface_runoff.label('surface_runoff'),
    models.Coordinate.river_id.label('river'),    
    models.Reservoir.level,
    models.Reservoir.streamflow
).\
    join(models.Variables.coordinate).\
    join(models.Reservoir, models.Variables.date == models.Reservoir.date)

RawDataFrame = pandas.read_sql(query, session.bind)




In [10]:
# DataFrame consolidado porém com os atributos para cada rio posicionados em uma diferente coluna
ConsolidatedDataFrame = (
    RawDataFrame.
    groupby(['date', 'level', 'river', 'streamflow']).
    agg({
        'precipitation': 'sum',
        'evaporation': 'sum',
        'temperature': 'mean',
        'surface_runoff':'mean',
    }).
    reset_index().
    pivot(index=["date", 'level', 'streamflow'], columns="river")
)

ConsolidatedDataFrame.insert(0,'previous_streamflow', pandas.DataFrame(ConsolidatedDataFrame.index.get_level_values('streamflow')).shift(1).values)
ConsolidatedDataFrame.insert(0,'previous_level', pandas.DataFrame(ConsolidatedDataFrame.index.get_level_values('level')).shift(1).values)

ConsolidatedDataFrame = ConsolidatedDataFrame.dropna()




In [11]:
seed = 0
scorer = make_scorer(accuracy_score)

rivers = session.query(models.River).all()

precipitation_agg = generate_aggregation('sum', 'precipitation', [river.id for river in rivers])
evaporation_agg = generate_aggregation('sum', 'evaporation', [river.id for river in rivers])
temperature_agg = generate_aggregation('mean', 'temperature', [river.id for river in rivers])
runoff_agg = generate_aggregation('mean', 'surface_runoff', [river.id for river in rivers])

cols = ['precipitation', 'evaporation', 'temperature', 'surface_runoff']

agg = precipitation_agg
agg.update(evaporation_agg)
agg.update(temperature_agg)
agg.update(runoff_agg)


In [30]:
cv = KFold(n_splits=10, shuffle=False)

grid_search_params = dict(
    estimator=Pipeline(
        [
            ('windowing', TimeWindowTransformer(columns=cols)),
            ('debug', Debug()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', MinMaxScaler(feature_range=(0, 1))),
            ('clf', DummyRegressor())
        ],
    ), 
    param_grid=[
        {
            'windowing__aggregate': [agg],
            'windowing__rolling': range(1, 32, 10),
            'windowing__dropna': [False],
            'clf': [TransformedTargetRegressor(
                        transformer=MinMaxScaler(feature_range=(0, 1)), 
                        regressor=SVR(cache_size=1000)
                    )],
            'clf__regressor__C': range(1, 15, 3),
            'clf__regressor__gamma': ['auto', 'scale'],
            'clf__regressor__kernel': ['rbf']
        },
        {
            'windowing__aggregate': [agg],
            'windowing__rolling': range(1, 32, 10),
            'windowing__dropna': [False],
            'clf': (RandomForestRegressor(),),
            'clf__random_state': [seed],
            'clf__n_estimators': [200]
        },
        {
            'windowing__aggregate': [agg],
            'windowing__rolling': range(1, 32, 10),
            'windowing__dropna': [False],
            'clf': (DecisionTreeRegressor(), ),
            'clf__random_state': [seed]
        },
        {
            'windowing__aggregate': [agg],
            'windowing__rolling': range(1, 32, 10),
            'windowing__dropna': [False],
            'clf': (StackingRegressor(
                estimators=[('RandomForest', RandomForestRegressor()), ('SVR', SVR())], 
                final_estimator=Ridge()
            ),),
            'clf__RandomForest__random_state': [seed],
            'clf__RandomForest__n_estimators': [200],
        }
    ],
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

In [31]:
targets = ['level', 'streamflow']

clf_search = {target: GridSearchCV(**grid_search_params) for target in targets}

level_estimator_search = clf_search['level']
streamflow_estimator_search = clf_search['streamflow']

level_estimator_search.fit(ConsolidatedDataFrame, ConsolidatedDataFrame.index.get_level_values('level'))
streamflow_estimator_search.fit(ConsolidatedDataFrame, ConsolidatedDataFrame.index.get_level_values('streamflow')) 

Fitting 10 folds for each of 52 candidates, totalling 520 fits


In [15]:
level_X_train, level_X_test, level_y_train, level_y_test = train_test_split(
     ConsolidatedDataFrame,
     ConsolidatedDataFrame.index.get_level_values('level'),
     random_state=seed,
     shuffle=False
)

streamflow_X_train, streamflow_X_test, streamflow_y_train, streamflow_y_test = train_test_split(
     ConsolidatedDataFrame,
     ConsolidatedDataFrame.index.get_level_values('streamflow'),
     random_state=seed,
     shuffle=False
)

In [16]:
level_estimator = level_estimator_search.best_estimator_
streamflow_estimator = streamflow_estimator_search.best_estimator_

level_estimator.fit(level_X_train, level_y_train)
streamflow_estimator.fit(streamflow_X_train, streamflow_y_train)

Pipeline(steps=[('windowing',
                 TimeWindowTransformer(aggregate={('evaporation', 1): 'sum',
                                                  ('evaporation', 2): 'sum',
                                                  ('evaporation', 3): 'sum',
                                                  ('evaporation', 4): 'sum',
                                                  ('evaporation', 5): 'sum',
                                                  ('evaporation', 6): 'sum',
                                                  ('evaporation', 7): 'sum',
                                                  ('evaporation', 8): 'sum',
                                                  ('evaporation', 9): 'sum',
                                                  ('evaporation', 10): 'sum',
                                                  ('evaporation', 11): 'sum',
                                                  ('precipitation', 1): 's...
                                           

In [17]:
level_cv_score = cross_val_score(
    level_estimator,
    level_X_test, level_y_test,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
    error_score='raise'
)

print(f"level score: {level_cv_score.mean()} +- {level_cv_score.std()}")

level score: -0.09811071491078249 +- 0.08143622559082732


In [18]:
streamflow_cv_score = cross_val_score(
    streamflow_estimator,
    streamflow_X_test, streamflow_y_test,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
    error_score='raise'
)

print(f"streamflow score: {streamflow_cv_score.mean()} +- {streamflow_cv_score.std()}")

streamflow score: -228.1500149107802 +- 127.43607064430101


In [19]:
test_df = pandas.DataFrame()
test_df['level'] = level_y_test
test_df['p_level'] = level_estimator.predict(level_X_test)
test_df['streamflow'] = streamflow_y_test
test_df['p_streamflow'] = streamflow_estimator.predict(streamflow_X_test)
test_df

Unnamed: 0,level,p_level,streamflow,p_streamflow
0,569.73,569.38105,977.11,979.145425
1,569.78,569.38105,943.69,940.215285
2,569.83,569.38105,927.69,928.356708
3,569.88,569.38105,927.69,893.812914
4,569.92,569.38105,821.26,979.130355
...,...,...,...,...
622,568.51,568.63855,1068.11,1095.624956
623,568.44,568.62080,1027.44,1007.062057
624,568.40,568.51360,902.97,1110.739947
625,568.38,568.50185,1106.56,1025.132982


In [20]:
full_df = pandas.DataFrame()
full_df['level'] = ConsolidatedDataFrame.index.get_level_values('level')
full_df['p_level'] = level_estimator.predict(ConsolidatedDataFrame)
full_df['streamflow'] = ConsolidatedDataFrame.index.get_level_values('streamflow')
full_df['p_streamflow'] = streamflow_estimator.predict(ConsolidatedDataFrame)
full_df

Unnamed: 0,level,p_level,streamflow,p_streamflow
0,560.28,560.24625,1128.00,1050.115575
1,560.35,560.39680,1021.00,992.265916
2,560.47,560.44490,1307.00,1189.442065
3,560.62,560.56920,1481.00,1343.516031
4,560.78,560.75635,1555.00,1562.833802
...,...,...,...,...
2502,568.51,568.63855,1068.11,1095.624956
2503,568.44,568.62080,1027.44,1007.062057
2504,568.40,568.51360,902.97,1110.739947
2505,568.38,568.50185,1106.56,1025.132982
