# Isolation Forest (iForest)

In [8]:
# Biblioteca para desligar os avisos do Python
import warnings
# Standard-library filesystem paths (portable across operating systems)
from pathlib import Path

# Bibliotecas de manipulação de dados
import numpy as np
import pandas as pd

# Biblioteca para a aplicação do iForest
from sklearn.ensemble import IsolationForest

# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")
# Remove the column-count limit so wide DataFrames are displayed in full.
pd.options.display.max_columns = None

In [9]:
# Load the dataset.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated path only resolved on Windows.
df = pd.read_csv('../../datasets/df_norm_most_common_ncm.csv', encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nf_timestamp     76457 non-null  float64
 1   emit_lat         76457 non-null  float64
 2   emit_long        76457 non-null  float64
 3   prod_quant       76457 non-null  float64
 4   prod_valor_unit  76457 non-null  float64
dtypes: float64(5)
memory usage: 2.9 MB


In [10]:
# Create an IsolationForest with a contamination rate of 1%
# (random_state is fixed so the fitted forest is reproducible).
model = IsolationForest(
    random_state=26,
    verbose=True,
    contamination=0.01,
    n_estimators=100
)

# Feature matrix used for both fitting and scoring. 'anomaly_score' is added
# to df further down, so it is excluded here: without this, re-running the
# cell would train the model on its own previous output.
features = df.drop(columns=['anomaly_score'], errors='ignore')

# Train the model
model.fit(features)

# predict() labels each record (-1 = outlier); decision_function() gives the
# anomaly score of each record.
anomaly_predictions = model.predict(features)
df['anomaly_score'] = model.decision_function(features)

# Keep only the outliers. .copy() makes df_outliers an independent frame so
# later cells can add columns to it without writing into a slice of df.
df_outliers = df[anomaly_predictions == -1].copy()
df_outliers.head(5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score
40617,0.428321,-34.017456,-3.577181,8.006701,0.916291,-0.00547
40618,0.428321,-34.017456,-3.577181,7.601402,0.57098,-0.005622
40619,0.638342,-34.017456,-3.577181,8.006701,0.463734,-0.007349
40620,0.638342,-34.017456,-3.577181,8.006701,0.500775,-0.007349
40625,0.357084,-34.017456,-3.577181,9.615872,0.470004,-0.029144


In [11]:
# Collect the row labels of the outliers into a NumPy array.
index_array = df_outliers.index.to_numpy()
index_array

array([40617, 40618, 40619, 40620, 40625, 46587, 46588, 46589, 46590,
       46591, 46592, 46593, 46594, 46596, 46597, 46598, 46599, 46600,
       46601, 46602, 46603, 46604, 46605, 46606, 47297, 47533, 47540,
       47557, 47558, 47568, 47569, 47581, 47588, 47594, 47595, 47604,
       47607, 47614, 47616, 47617, 47628, 47629, 47635, 47647, 47651,
       47657, 47665, 47667, 47675, 47678, 47680, 47682, 47686, 47690,
       47694, 47696, 50887, 55969, 55970, 55974, 55975, 55976, 55978,
       55979, 56028, 56029, 56033, 56034, 56035, 56037, 56038, 56123,
       56131, 56136, 56143, 56144, 56149, 56150, 56151, 56236, 56278,
       56288, 56313, 56342, 56355, 56357, 56397, 56450, 56452, 56479,
       56504, 56511, 56513, 56516, 56519, 56520, 56593, 56594, 56614,
       56671, 56964, 56965, 56967, 56968, 56969, 56971, 56973, 56974,
       56976, 56977, 56978, 56981, 56982, 56983, 56984, 56989, 56992,
       56993, 56995, 56998, 57000, 62143, 62147, 62149, 62153, 62154,
       62157, 62159,

In [12]:
# Rank the outliers by anomaly score in ascending order. Per scikit-learn's
# decision_function contract, lower scores are more abnormal, so the most
# anomalous records come first.
# .assign() returns a new frame, so the original row labels are stored in
# 'idx' without writing into a slice of df.
df_outliers = (
    df_outliers
    .assign(idx=df_outliers.index)
    .sort_values(by='anomaly_score')
)

# Save the full, sorted outlier list to CSV.
# The path is built with pathlib: the previous 'output\outliers.csv' literal
# relied on the invalid escape sequence '\o' and only worked on Windows.
# The output directory is created if it does not exist yet.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)
df_outliers.to_csv(output_dir / 'outliers.csv', index=False)
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score,idx
71964,0.144375,-54.642650,-7.315482,10.747229,0.182322,-0.083333,71964
71971,0.136962,-54.642650,-7.315482,11.695255,0.270027,-0.081833,71971
71970,0.908121,-54.642650,-7.315482,11.002117,0.122218,-0.081319,71970
71966,0.144375,-54.642650,-7.315482,9.105091,0.215111,-0.070608,71966
76339,0.118189,-42.341793,-4.920552,10.338447,1.202972,-0.068100,76339
...,...,...,...,...,...,...,...
74025,0.467469,-40.751167,-4.852574,6.658011,4.183576,-0.000060,74025
73171,0.578011,-39.020332,-4.738681,7.273093,0.398776,-0.000052,73171
67416,0.174810,-37.272421,-5.371197,0.693147,0.916291,-0.000051,67416
70838,0.759769,-64.463802,-6.744972,1.098612,2.765690,-0.000047,70838


In [13]:
# Save the outlier row labels to CSV.
index_df = pd.DataFrame(index_array, columns=['idx'])

# The path is built with pathlib: the previous 'output\outlier_indexes.csv'
# literal relied on the invalid escape sequence '\o' and only worked on
# Windows. The output directory is created if it does not exist yet.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)
# index=True also writes index_df's own row index as the first CSV column.
index_df.to_csv(output_dir / 'outlier_indexes.csv', index=True)