In [56]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [57]:
df = pd.read_csv(r'..\..\datasets\df_minmax_norm.csv', encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2065547 entries, 0 to 2065546
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   nf_timestamp      float64
 1   emit_lat          float64
 2   emit_long         float64
 3   prod_ncm          float64
 4   prod_quant        float64
 5   prod_valor_unit   float64
 6   prod_valor_total  float64
dtypes: float64(7)
memory usage: 110.3 MB


In [58]:
model = IsolationForest(
    random_state=26,
    verbose=True,
    contamination=0.0001
)

model.fit(df)

anomaly_predictions = model.predict(df)
df['anomaly_scores'] = model.decision_function(df)

df_outliers = df[anomaly_predictions == -1]
df_outliers.head(5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s finished


Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_scores
617058,0.986435,0.71257,0.997562,0.90185,1.5e-07,0.000254,0.003044,-0.000644
668827,0.994641,0.71257,0.997562,0.90185,1.5e-07,0.000254,0.003044,-0.005267
790673,0.973785,0.71257,0.997562,0.902751,1.25e-08,0.00268,0.00268,-0.002192
794659,0.966331,0.71257,0.997562,0.902781,1.25e-08,0.001535,0.001535,-0.002192
806356,0.968863,0.71257,0.997562,0.902751,1.25e-08,0.00336,0.00336,-0.002192


In [59]:
index_array = outliers_df.index.values
index_array

array([2065189, 2065190, 2065186, 2065182, 2064097, 2065192, 2065188,
       2059323, 2001226, 2065193, 2060086, 1988389, 2058943, 2059288,
       1631699, 2055077, 2055069, 2055085, 2055047, 2055055, 2065185,
       2065196, 2065184, 2001164, 1089316, 1988875, 2054297, 2054302,
       1631760, 1634603, 1558514, 1634888, 1557854, 2055093, 2055033,
       2055038, 2055049, 2055057, 1633053, 1558290, 1557936, 1633231,
       1919610, 1919439, 1919516, 1919384, 2003866, 1919377, 1919517,
       1919440, 1633913, 1634196, 1632159, 1634602, 1631567, 1557852,
       1558513, 2003932, 2003964, 2061138, 1634506, 2059328, 1988385,
       1919512, 1919435, 1631872, 1632877, 1632039, 1631614, 1988388,
       1980293, 1983663, 1984041, 2060002, 2052052, 2052171, 2051426,
       2051042, 2050974, 1633054, 1633632, 2064664, 1991158, 1633583,
       1633911, 1632407, 2059297, 1632622, 1632822, 1632463, 1981105,
       1982427, 1632876, 1634300, 2001179, 2001168, 2001163, 2061129,
       1632761, 1631

In [61]:
df_outliers['idx'] = df_outliers.index
df_outliers.to_csv('output\outliers.csv', index=False)
df_outliers = df_outliers.sort_values(by='anomaly_scores')
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_scores,idx
2065189,0.995813,0.326848,0.318027,0.960910,6.387125e-04,4.729287e-08,0.002417,-0.021934,2065189
2064097,0.298928,0.053435,0.436355,0.000000,2.500000e-08,1.791397e-03,0.003583,-0.019398,2064097
2065186,0.995813,0.326848,0.318027,0.960820,3.703625e-04,6.449028e-08,0.001911,-0.019289,2065186
2065190,0.995813,0.326848,0.318027,0.960910,2.963675e-03,3.439481e-09,0.000815,-0.018761,2065190
1988875,0.038332,0.237667,0.582565,0.902751,1.250000e-08,5.991419e-03,0.005991,-0.015075,1988875
...,...,...,...,...,...,...,...,...,...
1633390,0.911731,0.619720,0.366796,0.490110,1.491587e-03,6.433263e-07,0.076766,-0.000268,1633390
1633391,0.911731,0.619720,0.366796,0.490110,1.491587e-03,6.823071e-07,0.081418,-0.000268,1633391
2059327,0.991056,0.261073,0.568905,0.901840,9.000000e-04,4.299352e-09,0.000306,-0.000098,2059327
2059038,0.996200,0.258691,0.718174,0.854371,2.300000e-06,1.604518e-04,0.029523,-0.000085,2059038


In [62]:
index_df = pd.DataFrame(index_array, columns=['idx'])
index_df.to_csv('output\outlier_indexes.csv', index=True)

In [63]:
dft = pd.read_csv(r'output\outliers.csv', encoding='latin-1')
dft.set_index('idx', inplace=True)
dft = dft.sort_values(by='anomaly_scores', ascending=True)
dft.head(5)

Unnamed: 0_level_0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_scores
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2065189,0.995813,0.326848,0.318027,0.96091,0.0006387125,4.729287e-08,0.002417,-0.021934
2064097,0.298928,0.053435,0.436355,0.0,2.5e-08,0.001791397,0.003583,-0.019398
2065186,0.995813,0.326848,0.318027,0.96082,0.0003703625,6.449028e-08,0.001911,-0.019289
2065190,0.995813,0.326848,0.318027,0.96091,0.002963675,3.439481e-09,0.000815,-0.018761
1988875,0.038332,0.237667,0.582565,0.902751,1.25e-08,0.005991419,0.005991,-0.015075
