In [None]:
## Build the dataframe from queries against the SQL database
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from sqlalchemy import select

from data_base.models import models
from data_base.connection import session

# Columns pulled for every Variables row: the date, the four climate
# variables, the river id of the joined coordinate, and the reservoir
# level/streamflow recorded on the same date.
selected_columns = [
    models.Variables.date,
    models.Variables.precipitation.label('precipitation'),
    models.Variables.temperature.label('temperature'),
    models.Variables.evaporation.label('evaporation'),
    models.Variables.surface_runoff.label('surface_runoff'),
    models.Coordinate.river_id.label('river'),
    models.Reservoir.level,
    models.Reservoir.streamflow,
]

query = (
    select(*selected_columns)
    # Follow the Variables -> coordinate relationship.
    .join(models.Variables.coordinate)
    # Reservoir rows are matched purely on date.
    .join(models.Reservoir, models.Variables.date == models.Reservoir.date)
)




In [None]:
import pandas
# Run the query through the session's bound engine and load the rows as-is.
RawDataFrame = pandas.read_sql(sql=query, con=session.bind)


In [None]:
# Consolidated DataFrame: one row per (date, level, streamflow), with the
# aggregated attributes of each river placed in their own columns.
river_aggregations = {
    'precipitation': 'sum',
    'evaporation': 'sum',
    'temperature': 'mean',
    'surface_runoff': 'mean',
}

per_river_totals = (
    RawDataFrame
    .groupby(['date', 'river', 'level', 'streamflow'])
    .agg(river_aggregations)
    .reset_index()
)
ConsolidatedDataFrame = per_river_totals.pivot(
    index=['date', 'level', 'streamflow'],
    columns='river',
)

# Lag features: the reservoir values of the preceding row, taken from the
# index levels. Each is inserted at position 0, so the last one inserted
# ('previous_level') ends up as the first column.
for lag_column, index_level in (
    ('previous_streamflow', 'streamflow'),
    ('previous_level', 'level'),
):
    current_values = pandas.DataFrame(ConsolidatedDataFrame.index.get_level_values(index_level))
    ConsolidatedDataFrame.insert(0, lag_column, current_values.shift(1).values)

# The first row has no predecessor (NaN lags); rows with any NaN are dropped.
ConsolidatedDataFrame = ConsolidatedDataFrame.dropna()
ConsolidatedDataFrame


In [None]:
# Preview of the frame with every column rescaled by its own minimum and
# maximum. The result is only displayed; ConsolidatedDataFrame is unchanged.
column_minimum = ConsolidatedDataFrame.min()
column_span = ConsolidatedDataFrame.max() - column_minimum
ConsolidatedDataFrame.sub(column_minimum).div(column_span)

In [None]:
from sklearn import svm
from sklearn.svm import SVR

from sklearn.compose import TransformedTargetRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer

from custom_transfomers.date_window import TimeWindowTransformer

In [None]:
from sklearn.metrics import mean_squared_error

# Single seed shared by the splitters here and the random forests below.
seed = 0

# The estimators in this notebook are regressors, so score with an error
# metric: accuracy_score is a classification metric and rejects continuous
# targets. greater_is_better=False negates the MSE so that a higher score is
# still better, matching the 'neg_mean_squared_error' used by the grid searches.
scorer = make_scorer(mean_squared_error, greater_is_better=False)  # TODO: try other metrics

# Shuffled K-fold splitters with a fixed seed so the folds are reproducible.
cross_validation_10 = KFold(n_splits=10, shuffle=True, random_state=seed)

cross_validation_3 = KFold(n_splits=3, shuffle=True, random_state=seed)



In [None]:
from project_utils.data_manipulation import generate_aggregation
# Every river known to the database; only the ids are needed below.
rivers = session.query(models.River).all()
river_ids = [river.id for river in rivers]

# One aggregation spec per variable. Each call receives its own copy of the
# id list, exactly as when the list was rebuilt inline for every call.
precipitation_agg = generate_aggregation('sum', 'precipitation', list(river_ids))
evaporation_agg = generate_aggregation('sum', 'evaporation', list(river_ids))
temperature_agg = generate_aggregation('mean', 'temperature', list(river_ids))
# NOTE(review): runoff is summed here but averaged in the consolidation
# groupby; confirm the two are meant to differ.
runoff_agg = generate_aggregation('sum', 'surface_runoff', list(river_ids))

# Variable names handed to the windowing transformer.
cols = ['precipitation', 'evaporation', 'temperature', 'surface_runoff']


In [None]:
# Merge the per-variable aggregation specs into one mapping. Start from a new
# dict: binding `agg` to precipitation_agg and updating it in place would
# silently add every other variable's entries to precipitation_agg as well.
agg = {}
for partial_agg in (precipitation_agg, evaporation_agg, temperature_agg, runoff_agg):
    agg.update(partial_agg)

In [None]:
import multiprocessing

# Leave one core free, but never drop below one worker: `cpu_count() - 1` is 0
# on a single-core machine and joblib rejects n_jobs=0.
search_n_jobs = max(1, multiprocessing.cpu_count() - 1)


def _windowing_steps():
    """Return fresh windowing and mean-imputation steps for a new pipeline."""
    return [
        ('windowing', TimeWindowTransformer(columns=cols)),
        ('imputer', SimpleImputer(strategy='mean')),
    ]


def _scaled_regressor_steps(regressor):
    """Return steps that min-max scale the features and wrap ``regressor`` in a
    TransformedTargetRegressor whose target transformer is also min-max."""
    return [
        ('scaler', MinMaxScaler(feature_range=(0, 1))),
        ('transformer', TransformedTargetRegressor(
            transformer=MinMaxScaler(feature_range=(0, 1)),
            regressor=regressor,
        )),
    ]


def _grid_search(steps, estimator_grid):
    """Build the grid search configuration shared by every algorithm.

    steps: list of (name, estimator) pipeline steps.
    estimator_grid: algorithm-specific entries added to the common
        windowing grid.
    Returns an unfitted GridSearchCV scored by negated MSE with 5-fold CV.
    """
    return GridSearchCV(
        Pipeline(steps),
        param_grid={
            'windowing__aggregate': [agg],
            'windowing__rolling': range(1, 30, 5),
            'windowing__dropna': [False],
            **estimator_grid,
        },
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=search_n_jobs,
        verbose=10,
        # Fail loudly instead of recording NaN for a broken parameter combination.
        error_score='raise',
    )


algorithms = {
    # TODO: also try the estimators without hyperparameter tuning
    'SVR': _grid_search(
        _windowing_steps() + _scaled_regressor_steps(
            SVR(kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=0.001, epsilon=0.1,
                shrinking=True, cache_size=200, verbose=False, max_iter=-1)
        ),
        {'transformer__regressor__C': range(1, 15, 3)},
    ),
    'RandomForest': _grid_search(
        _windowing_steps() + [('random_forest', RandomForestRegressor(random_state=seed))],
        {'random_forest__max_depth': range(1, 20, 5)},
    ),
    'NormalizedRandomForest': _grid_search(
        _windowing_steps() + _scaled_regressor_steps(RandomForestRegressor(random_state=seed)),
        {'transformer__regressor__max_depth': range(1, 20, 5)},
    ),
}

In [None]:
from sklearn.model_selection import cross_val_score

# Outer cross-validation of every grid search against the reservoir level,
# read from the 'level' level of the consolidated frame's index.
level_target = ConsolidatedDataFrame.index.get_level_values('level')

result_level = {}
for name, search in algorithms.items():
    fold_scores = cross_val_score(search, ConsolidatedDataFrame, level_target)
    # 'target' is rewritten on every pass and keeps its first-pass position.
    result_level.update({name: fold_scores, 'target': 'level'})




In [None]:
# Outer cross-validation of every grid search against the streamflow, read
# from the 'streamflow' level of the consolidated frame's index.
streamflow_target = ConsolidatedDataFrame.index.get_level_values('streamflow')

result_streamflow = {}
for name, search in algorithms.items():
    fold_scores = cross_val_score(search, ConsolidatedDataFrame, streamflow_target)
    # 'target' is rewritten on every pass and keeps its first-pass position.
    result_streamflow.update({name: fold_scores, 'target': 'streamflow'})

In [None]:
# Dump the raw streamflow results: one array of cross_val_score outputs per
# algorithm name, plus the 'target' label.
print(result_streamflow)

In [None]:
# Stack the streamflow and level scores. DataFrame.append was removed in
# pandas 2.0; concat is its direct replacement. The default ignore_index=False
# keeps each frame's own row index, which the pivot uses as its index.
result = pandas.concat([
    pandas.DataFrame.from_dict(result_streamflow),
    pandas.DataFrame.from_dict(result_level),
])
# Spread the scores into one column per (algorithm, target) pair.
result.pivot(columns='target')