In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import warnings

# Notebook-wide configuration.
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")
# Never elide columns when a DataFrame is rendered.
pd.options.display.max_columns = None

In [2]:
# Load the dataset CSV (path is relative to the notebook's working directory)
# and print a structural summary: column names, non-null counts and dtypes.
ncm_csv_path = r'..\..\datasets\df_norm_most_common_ncm.csv'
df = pd.read_csv(ncm_csv_path, encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nf_timestamp     76457 non-null  float64
 1   emit_lat         76457 non-null  float64
 2   emit_long        76457 non-null  float64
 3   prod_quant       76457 non-null  float64
 4   prod_valor_unit  76457 non-null  float64
dtypes: float64(5)
memory usage: 2.9 MB


In [3]:
# Fit an Isolation Forest and flag outlier rows.
#
# The training matrix explicitly excludes 'anomaly_score'. This cell writes that
# column back into `df` below, so without the exclusion a second run of the cell
# would train on the previous run's score as an extra feature and silently
# change the results. On a fresh kernel the column does not exist yet and
# errors='ignore' makes the drop a no-op.
features = df.drop(columns=['anomaly_score'], errors='ignore')

model = IsolationForest(
    random_state=26,      # fixed seed so the fitted forest is reproducible
    verbose=True,
    contamination=0.01,   # expected share of outliers; determines the decision threshold
    n_estimators=100
)

model.fit(features)

# predict() labels outliers as -1; decision_function() returns the
# corresponding score, which is negative for rows labelled as outliers.
anomaly_predictions = model.predict(features)
df['anomaly_score'] = model.decision_function(features)

# .copy() makes df_outliers an independent frame, so later cells can add
# columns to it without writing into a slice of `df`.
df_outliers = df[anomaly_predictions == -1].copy()
df_outliers.head(5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s finished


Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score
40457,0.989462,-41.864126,-4.148677,5.192957,3.048799,-0.006586
40617,0.428321,-34.017456,-3.577181,8.006701,0.916291,-0.000949
40619,0.638342,-34.017456,-3.577181,8.006701,0.463734,-0.008528
40620,0.638342,-34.017456,-3.577181,8.006701,0.500775,-0.008348
40625,0.357084,-34.017456,-3.577181,9.615872,0.470004,-0.022492


In [4]:
# Capture the row labels of the flagged outliers as a NumPy array.
# This is taken before df_outliers is re-sorted, so the labels keep the
# order in which the rows appear in the source frame.
index_array = df_outliers.index.to_numpy()
index_array

array([40457, 40617, 40619, 40620, 40625, 46587, 46588, 46589, 46590,
       46591, 46592, 46593, 46594, 46596, 46597, 46598, 46599, 46600,
       46601, 46602, 46603, 46604, 46605, 46606, 47533, 47540, 47557,
       47558, 47567, 47568, 47569, 47581, 47588, 47594, 47595, 47604,
       47607, 47614, 47616, 47617, 47628, 47629, 47635, 47647, 47667,
       47672, 47675, 47678, 47680, 47682, 47686, 47690, 47692, 47693,
       47694, 47696, 50887, 55969, 55970, 55974, 55975, 55976, 55978,
       55979, 56028, 56029, 56033, 56034, 56035, 56037, 56038, 56136,
       56143, 56144, 56149, 56150, 56151, 56236, 56257, 56278, 56288,
       56313, 56342, 56355, 56357, 56370, 56390, 56397, 56418, 56450,
       56452, 56479, 56511, 56516, 56519, 56520, 56526, 56593, 56594,
       56614, 56671, 56964, 56965, 56967, 56968, 56969, 56971, 56973,
       56974, 56976, 56977, 56978, 56981, 56982, 56983, 56984, 56989,
       56992, 56993, 56995, 56998, 57000, 62143, 62147, 62149, 62152,
       62153, 62154,

In [5]:
# Keep each row's original label as an explicit 'idx' column (the CSV is
# written with index=False, so the label would otherwise be lost), then order
# the outliers by ascending anomaly_score (most negative first).
# .assign() builds a new frame rather than inserting a column into an object
# that may be a slice of `df`.
df_outliers = (
    df_outliers
    .assign(idx=df_outliers.index)
    .sort_values(by='anomaly_score')
)
# Raw string: '\o' is not a valid escape sequence, so the non-raw literal
# raised an invalid-escape warning. The resulting path is unchanged.
df_outliers.to_csv(r'output\outliers.csv', index=False)
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score,idx
71971,0.136962,-54.642650,-7.315482,11.695255,0.270027,-0.085583,71971
71964,0.144375,-54.642650,-7.315482,10.747229,0.182322,-0.083142,71964
71970,0.908121,-54.642650,-7.315482,11.002117,0.122218,-0.081285,71970
71968,0.453199,-54.642650,-7.315482,11.918397,0.113329,-0.071458,71968
71966,0.144375,-54.642650,-7.315482,9.105091,0.215111,-0.070326,71966
...,...,...,...,...,...,...,...
70817,0.246525,-64.463802,-6.744972,1.386294,3.258097,-0.000063,70817
70816,0.246525,-64.463802,-6.744972,1.386294,3.259250,-0.000063,70816
73188,0.668846,-39.020332,-4.738681,6.167516,0.039221,-0.000040,73188
72301,0.511450,-60.249002,-7.338564,1.609438,2.550226,-0.000030,72301


In [6]:
# Persist the outlier row labels held in index_array as a one-column table.
index_df = pd.DataFrame(index_array, columns=['idx'])
# Raw string: '\o' is not a valid escape sequence, so the non-raw literal
# raised an invalid-escape warning. The resulting path is unchanged.
# index=True also writes the positional 0..n-1 index as the first CSV column.
index_df.to_csv(r'output\outlier_indexes.csv', index=True)

In [7]:
# Read the exported outliers back, restore 'idx' as the index and order the
# rows by ascending anomaly_score; display the first five.
dft = (
    pd.read_csv(r'output\outliers.csv', encoding='latin-1')
    .set_index('idx')
    .sort_values(by='anomaly_score', ascending=True)
)
dft.head(5)

Unnamed: 0_level_0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
71971,0.136962,-54.64265,-7.315482,11.695255,0.270027,-0.085583
71964,0.144375,-54.64265,-7.315482,10.747229,0.182322,-0.083142
71970,0.908121,-54.64265,-7.315482,11.002117,0.122218,-0.081285
71968,0.453199,-54.64265,-7.315482,11.918397,0.113329,-0.071458
71966,0.144375,-54.64265,-7.315482,9.105091,0.215111,-0.070326
