In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyod.models.hbos import HBOS
from sklearn.cluster import DBSCAN

from tqdm import tqdm

In [16]:
def dbscan(serie: pd.DataFrame):
    """Cluster the observations with DBSCAN and return the distinct labels.

    Parameters
    ----------
    serie : pd.DataFrame
        Observations to cluster, one row per sample. ``serie.values`` is
        passed to scikit-learn unchanged, so it has to be 2-D; the notebook
        calls this with a single-column frame.

    Returns
    -------
    np.ndarray
        Sorted unique labels returned by ``DBSCAN.fit_predict``. scikit-learn
        labels noise samples ``-1``, so that label is part of the result
        whenever noise is present.
    """
    # Only the label set is needed by the caller, so no labelled copy of the
    # input frame is built here.
    clusterizacao = DBSCAN(eps=0.4, min_samples=15).fit_predict(serie.values)
    return np.unique(clusterizacao)


def hbos(serie: pd.Series, clusters):
    """Drop the rows that an HBOS detector labels as outliers.

    Parameters
    ----------
    serie
        Observations to filter. The body relies on ``.assign`` and on
        ``serie.index.month`` / ``serie.index.year``, so in practice this is a
        single-column frame with a datetime-like index.
    clusters
        Array whose first dimension size is used as the histogram bin count.

    Returns
    -------
    The input rows whose predicted label is not ``1``, without the helper
    columns added for fitting.
    """
    positivos = serie.loc[serie.values > 0]
    limiar_dispersao = positivos.median() * 10

    # A standard deviation above ten times the median of the positive values
    # selects the larger contamination rate; otherwise the smaller one is used.
    contamination = 0.25 if serie.std().item() > limiar_dispersao.item() else 0.01

    # Month and year are fed to the detector as extra features next to the value.
    atributos = serie.assign(mes=serie.index.month, ano=serie.index.year)

    detector = HBOS(
        n_bins=clusters.shape[0],
        contamination=contamination,
        alpha=0.1,
        tol=0.9,
    )
    detector.fit(atributos)

    marcada = atributos.assign(outlier=detector.predict(atributos))
    mantidas = marcada.loc[marcada["outlier"] != 1]
    return mantidas.drop(columns=["mes", "ano", "outlier"])


def remover_quatil_inferior(serie: pd.Series):
    """Set to zero every positive value at or below the 10% quantile.

    The quantile is computed over the strictly positive values only. Works on
    a Series or on a single-column DataFrame (``.item()`` requires exactly one
    quantile value).

    Parameters
    ----------
    serie
        Values to filter; the input object is not modified.

    Returns
    -------
    A new object of the same type where positive values ``<=`` the quantile
    are replaced by ``0``. Zeros and negative values are left untouched. When
    there is no positive value the quantile is NaN and the data is returned
    unchanged.
    """
    # Non-positive entries become NaN, which quantile() skips.
    quantil_inferior = serie.where(serie > 0).quantile(0.1).item()

    # Compare against the threshold instead of replacing exact matches: an
    # interpolated quantile frequently is not an observed value, and values
    # strictly below it also belong to the lower tail.
    cauda_inferior = (serie > 0) & (serie <= quantil_inferior)
    return serie.mask(cauda_inferior, 0)

In [17]:
# Load the concatenated station series; the first CSV column holds the dates.
diretorio = Path("arquivos", "series-concatenadas", "Iguaçú.csv")
postos = pd.read_csv(diretorio, index_col=0)
postos.index = pd.to_datetime(postos.index, format="%Y-%m-%d")

series_sem_outliers = []

# Every column (one station code each) is cleaned on its own: missing rows are
# dropped, the lower quantile is zeroed, then HBOS outliers are removed using
# the DBSCAN label count as bin count.
for codigo_posto in tqdm(postos.columns):
    serie = postos[[codigo_posto]].dropna()
    serie = remover_quatil_inferior(serie)
    serie = hbos(serie, dbscan(serie))
    series_sem_outliers.append(serie)

# Re-align the cleaned stations on the date index and write them next to the input.
diretorio_saida = diretorio.with_name("Iguaçú_sout.csv")
pd.concat(series_sem_outliers, axis=1).to_csv(diretorio_saida)

100%|██████████| 33/33 [00:11<00:00,  2.78it/s]
