# Comparative Analysis

## Importando bibliotecas

In [1]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder,
    LabelEncoder, SplineTransformer, OrdinalEncoder
)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import set_config
from sklearn.model_selection import RandomizedSearchCV, cross_validate, ShuffleSplit
from sklearn import metrics
from sklearn.metrics import make_scorer 

## Tratamento dos Dados

#### Como nosso objetivo é predizer a precipitação mensal para as Regiões Hidrográficas do Estado do Ceará, precisamos que nosso modelo consiga fazer a predição da precipitação futura. <br><br> Para tal, foi criada uma função que cria uma janela e para cada ponto (lat,lon), adiciona a respectiva precipitação 2 meses à frente.  <br><br>  Além disso, avaliamos a influência das variáveis preditoras (índices oceânicos e variáveis atmosféricas) nos 4 meses anteriores, a fim de buscar a melhor forma de predição. 

In [2]:
def get_future(df, columns, janela):
    """
    A função pega a base de dados, e para cada ponto (lat,lon), adiciona a respectiva precipitação 2 meses à frente e algumas variáveis nos últimos 4 meses
    
    """
    suffix = 'mais' if janela > 0 else 'menos'
    df_out = df.copy()
    new_columns = [f'{variavel}_{suffix}_{abs(janela)}' for variavel in columns]
    for posicao in df.posicao.unique():
        criteria = "posicao == @posicao"
        df_out.loc[df_out.eval(criteria), new_columns] = (
            df_out
            .query(criteria)
            .shift(periods=-janela)[columns].values
        )
    return df_out

important_columns = [
    #'divergencia', 'umidade', 'vento_vertical', 'vorticidade', 'fluxo_energia', 
    'EMI', 'nino3', 'atl3' #, 'atn', 'ats', 'atlgrad',  'seta', 'nesta'
]
df_original = (
    pd
    .read_csv("../data/raw/data_regiao_hidro.csv")
    .pipe(get_future, ['pr'], 2)
    .pipe(get_future, important_columns, -1)
    .pipe(get_future, important_columns, -2)
    .pipe(get_future, important_columns, -3)
    .pipe(get_future, important_columns, -4)
    .sample(1000, random_state=42) #remover depois
)


In [3]:
df = df_original.assign(
    lat = df_original.posicao.apply(lambda x: eval(x)[0]),
    lon = df_original.posicao.apply(lambda x: eval(x)[1]),
    ano = df_original.data.apply(lambda x: int(x[:4])),
    mes = df_original.data.apply(lambda x: int(x[5:7]))
).drop(columns=["data","posicao","regiao_hidro"], axis=1).dropna()

In [4]:
df

Unnamed: 0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
51892,266.61,-2.170750e-07,88.00,0.30,-0.000003,185.95,0.92,0.25,-0.26,0.34,...,0.36,1.14,0.38,0.24,1.25,-0.02,-4.25,-41.00,2003,3
56734,289.18,3.376360e-06,81.33,-0.19,0.000007,10.95,0.81,0.04,0.75,0.14,...,0.84,0.65,0.16,0.80,0.59,0.23,-5.75,-40.50,2005,3
62896,4.05,-2.306960e-08,63.55,-0.08,0.000010,72.78,-0.50,-1.61,-0.19,0.01,...,0.08,-1.15,0.05,0.09,-0.92,0.27,-5.00,-39.00,2007,11
15323,19.93,2.969190e-06,73.13,0.31,0.000014,67.04,0.08,1.36,0.52,0.39,...,-0.23,0.97,0.14,0.16,1.13,0.22,-3.75,-41.00,1987,7
81862,9.36,4.516920e-06,67.78,-0.15,0.000026,51.28,0.36,2.81,0.13,0.33,...,0.03,2.47,-0.13,0.19,2.05,-0.39,-5.75,-39.00,2015,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10078,255.55,5.928090e-06,91.19,-0.08,0.000007,-57.31,-0.86,-0.72,-0.66,0.23,...,-0.25,-1.18,0.26,-0.58,-1.09,0.48,-4.00,-39.50,1985,4
39010,3.68,-1.399030e-06,56.23,0.01,0.000017,1.80,-0.68,2.80,-0.04,-0.20,...,-0.47,1.60,-1.17,0.25,0.73,-0.54,-4.25,-38.50,1997,9
79307,3.89,-4.737290e-06,68.78,-0.05,0.000016,-115.34,0.66,0.77,-0.05,-0.37,...,0.11,0.17,-0.32,0.03,0.28,-0.48,-3.75,-39.25,2014,11
88069,1.70,-5.498480e-06,67.76,-0.04,0.000013,148.34,0.67,-0.06,-0.09,0.33,...,0.27,-0.35,0.32,0.21,-0.64,0.01,-3.00,-40.50,2018,8


In [5]:
df.isnull().sum().sort_values(ascending=False)

pr                0
nino3_menos_1     0
ano               0
lon               0
lat               0
atl3_menos_4      0
nino3_menos_4     0
EMI_menos_4       0
atl3_menos_3      0
nino3_menos_3     0
EMI_menos_3       0
atl3_menos_2      0
nino3_menos_2     0
EMI_menos_2       0
atl3_menos_1      0
EMI_menos_1       0
divergencia       0
pr_mais_2         0
nesta             0
seta              0
atl3              0
atlgrad           0
ats               0
atn               0
nino3             0
EMI               0
fluxo_energia     0
vorticidade       0
vento_vertical    0
umidade           0
mes               0
dtype: int64

In [6]:
df['range_pr'] = pd.cut(df.pr_mais_2, bins=[0,50,100,200,300,400,800], labels=['0 a 50','50.1 a 100','100.1 a 200','200.1 a 300','300.1 a 400','Acima de 400'])

In [7]:
df['range_pr'].value_counts().sort_index()

0 a 50          612
50.1 a 100      131
100.1 a 200     161
200.1 a 300      56
300.1 a 400      23
Acima de 400      6
Name: range_pr, dtype: int64

In [8]:
target_column = 'range_pr'
#nominal_columns = [column for column in list(df.select_dtypes(object)) if column != target_column]
quantitative_columns = [column for column in list(df.select_dtypes(np.number)) if column != 'pr_mais_2'] 

In [9]:
X = (
    df
    .query(f'{target_column}.notna()')
    .drop([target_column,'pr_mais_2'], axis=1)
)
target_transformer = LabelEncoder()
y = target_transformer.fit_transform(
    df
    .query(f'{target_column}.notna()')[[target_column]]
    .values.ravel()
)#[np.newaxis].T

In [10]:
print(X.shape)
print(y.shape)

(989, 30)
(989,)


In [11]:
X

Unnamed: 0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
51892,266.61,-2.170750e-07,88.00,0.30,-0.000003,185.95,0.92,0.25,-0.26,0.34,...,0.36,1.14,0.38,0.24,1.25,-0.02,-4.25,-41.00,2003,3
56734,289.18,3.376360e-06,81.33,-0.19,0.000007,10.95,0.81,0.04,0.75,0.14,...,0.84,0.65,0.16,0.80,0.59,0.23,-5.75,-40.50,2005,3
62896,4.05,-2.306960e-08,63.55,-0.08,0.000010,72.78,-0.50,-1.61,-0.19,0.01,...,0.08,-1.15,0.05,0.09,-0.92,0.27,-5.00,-39.00,2007,11
15323,19.93,2.969190e-06,73.13,0.31,0.000014,67.04,0.08,1.36,0.52,0.39,...,-0.23,0.97,0.14,0.16,1.13,0.22,-3.75,-41.00,1987,7
81862,9.36,4.516920e-06,67.78,-0.15,0.000026,51.28,0.36,2.81,0.13,0.33,...,0.03,2.47,-0.13,0.19,2.05,-0.39,-5.75,-39.00,2015,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10078,255.55,5.928090e-06,91.19,-0.08,0.000007,-57.31,-0.86,-0.72,-0.66,0.23,...,-0.25,-1.18,0.26,-0.58,-1.09,0.48,-4.00,-39.50,1985,4
39010,3.68,-1.399030e-06,56.23,0.01,0.000017,1.80,-0.68,2.80,-0.04,-0.20,...,-0.47,1.60,-1.17,0.25,0.73,-0.54,-4.25,-38.50,1997,9
79307,3.89,-4.737290e-06,68.78,-0.05,0.000016,-115.34,0.66,0.77,-0.05,-0.37,...,0.11,0.17,-0.32,0.03,0.28,-0.48,-3.75,-39.25,2014,11
88069,1.70,-5.498480e-06,67.76,-0.04,0.000013,148.34,0.67,-0.06,-0.09,0.33,...,0.27,-0.35,0.32,0.21,-0.64,0.01,-3.00,-40.50,2018,8


In [12]:
# nominal_preprocessing = Pipeline([    
#     ("missing", SimpleImputer(strategy='most_frequent')),
#     ("encoder", OneHotEncoder(sparse=False)),
#     ("scaler", StandardScaler())
# ])
quantitative_preprocessing = Pipeline([
    ("missing", SimpleImputer()),
    ("scaler", StandardScaler())
])
preprocessing = ColumnTransformer([
    ("ohe", OneHotEncoder(), ["mes"]),
    #("nominal", nominal_preprocessing, nominal_columns),
    ("quantitative", quantitative_preprocessing, quantitative_columns)
])

In [13]:
models = [{
    'name': 'knn',
    'model': KNeighborsClassifier(),
    'parameters': {
        'n_neighbors': np.arange(3, 17, 2),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
}
    ,{
    'name': 'LR',
    'model': LogisticRegression(max_iter=3000, solver='saga', multi_class='ovr'),
    'parameters': {
        'penalty': ['l1', 'l2'],
        'C' : np.logspace(-4, 4, 10),
    }
}
,{
    'name': 'SVC',
    'model': SVC(max_iter=10000, gamma='auto'),
    'parameters': {
        "C": [1, 10, 100, 1e3]
    }
},
    {
    'name': 'GB',
    'model': GradientBoostingClassifier(loss="deviance"),
    'parameters': {
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "min_samples_split": np.linspace(0.1, 0.5, 6),
        "min_samples_leaf": np.linspace(0.1, 0.5, 6),
        "max_depth": [3, 5, 8],
        "max_features": ["log2", "sqrt"],
        "criterion": ["friedman_mse", "squared_error"],
        "subsample": [0.5, 0.8, 0.9, 1.0]
    }
}
]

In [14]:
def concatenate(*args):
    final_dict = {key: [] for key in args[0].keys()}
    for dictionary in args:
        for key, value in dictionary.items():
            final_dict[key].extend(value)
    return final_dict

In [15]:
n_splits_cv = 2
n_splits_cv_gs = 3
sc = []
for model in models:
    print(f"running {model['name']}")
    param_grid = {
        'preprocessing__quantitative__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        #'preprocessing__nominal__encoder': [OneHotEncoder(sparse=False), OrdinalEncoder()],
        #'preprocessing__nominal__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        'preprocessing__quantitative__missing__strategy': ['mean', 'median'],
        **{f"model__{key}": value for key, value in model['parameters'].items()}
    }
    approach = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model['model'])
    ])
    gs = RandomizedSearchCV(
        estimator=approach,
        param_distributions=param_grid,
        scoring='accuracy',
        cv=n_splits_cv_gs,
        random_state=42
    )
    scores = cross_validate(
        estimator = gs,
        X=X,
        y=y,
        cv = ShuffleSplit(n_splits=n_splits_cv, test_size=.2, random_state=42),
        n_jobs = -1,
        scoring = {
           'accuracy': make_scorer(metrics.accuracy_score, average='weighted'),
           'precision': make_scorer(metrics.precision_score, average='weighted'),
           'recall': make_scorer(metrics.recall_score, average='weighted'),
           'f1': make_scorer(metrics.f1_score, average='weighted')
           }
    )
    scores['model'] = [model['name']] * n_splits_cv
    sc.append(scores)
scores = concatenate(*sc)

running knn
running LR
running SVC
running GB


In [16]:
def highlight_max(s, props=''):
    values = [float(value.split()[0]) for value in s.values[1:]]
    result = [''] * len(s.values)
    if s.values[0].endswith('time'):
        result[np.argmin(values)+1] = props
    else:
        result[np.argmax(values)+1] = props
    return result

def get_winner(s):
    metric = s.values[0]
    values = [float(value.split()[0]) for value in s.values[1:]]
    models = results.columns[1:]
    
    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    else:
        return models[np.argmax(values)]

results = (
    pd
    .DataFrame(scores)
    .groupby(['model'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
    # .set_index('score')
)
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
print(f'O melhor modelo é o {winner}')

score,GB,LR,SVC,knn
fit_time,7.865 ± 0.025,17.645 ± 0.398,0.712 ± 0.015,0.466 ± 0.005
score_time,0.005 ± 0.000,0.003 ± 0.000,0.007 ± 0.000,0.007 ± 0.000
test_accuracy,nan ± nan,nan ± nan,nan ± nan,nan ± nan
test_precision,nan ± nan,nan ± nan,nan ± nan,nan ± nan
test_recall,nan ± nan,nan ± nan,nan ± nan,nan ± nan
test_f1,nan ± nan,nan ± nan,nan ± nan,nan ± nan


O melhor modelo é o GB


In [17]:
# def highlight_max(s, props=''):
#     values = [float(value.split()[0]) for value in s.values[1:]]
#     result = [''] * len(s.values)
#     if s.values[0].endswith('time'):
#         result[np.argmin(values)+1] = props
#     else:
#         result[np.argmax(values)+1] = props
#     return result

# def get_winner(s):
#     metric = s.values[0]
#     values = [float(value.split()[0]) for value in s.values[1:]]
#     models = results.columns[1:]
    
#     if s.values[0].endswith('time'):
#         return models[np.argmin(values)]
#     else:
#         return models[np.argmax(values)]

# results = (
#     pd
#     .DataFrame(scores)
#     .groupby(['model'])
#     .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
#     .transpose()
#     .reset_index()
#     .rename(columns={"level_0": "score"})
#     .drop(columns="level_1")
#     # .set_index('score')
# )
# time_scores = ['fit_time', 'score_time']
# winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
# results.columns.name = ''
# results = (
#     results
#     .style
#     .hide(axis='index')
#     .apply(highlight_max, props='color:white;background-color:gray', axis=1)
# )
# display(results)
# print(f'O melhor modelo é o {winner}')

In [18]:
# best_model = next(item for item in models if item["name"] == winner)

# param_grid = {
#     'preprocessing__quantitative__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
#     'preprocessing__nominal__encoder': [OneHotEncoder(sparse=False), OrdinalEncoder()],
#     'preprocessing__nominal__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
#     'preprocessing__quantitative__missing__strategy': ['mean', 'median'],
#     **{f"model__{key}": value for key, value in best_model['parameters'].items()}
# }

# approach = Pipeline([
#     ('preprocessing', preprocessing),
#     ('model', best_model['model'])
# ])

# gs = RandomizedSearchCV(
#     estimator=approach,
#     param_distributions=param_grid,
#     scoring='accuracy',
#     cv=n_splits_cv_gs,
#     random_state=42
# )

# gs.fit(X, y)

# model = gs.best_estimator_
# joblib.dump(model, '../models/best_model.joblib')

Traceback (most recent call last):
  File "/home/bcnishi/.cache/pypoetry/virtualenvs/src-n1MWWP9q-py3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/bcnishi/.cache/pypoetry/virtualenvs/src-n1MWWP9q-py3.8/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/bcnishi/.cache/pypoetry/virtualenvs/src-n1MWWP9q-py3.8/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
TypeError: accuracy_score() got an unexpected keyword argument 'average'

Traceback (most recent call last):
  File "/home/bcnishi/.cache/pypoetry/virtualenvs/src-n1MWWP9q-py3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/hom