In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Notebook-wide settings.
# NOTE(review): this silences every warning category for the whole kernel,
# including pandas' SettingWithCopyWarning -- consider narrowing it to the
# specific categories that are actually noisy.
warnings.simplefilter("ignore")
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

In [2]:
# Load the feature table used for outlier detection.
# Forward slashes are used so the relative path resolves on Windows and on
# POSIX systems alike (a backslash-separated path only works on Windows).
df = pd.read_csv('../../datasets/df_rob_norm_most_common_ncm.csv', encoding='latin-1')
# Quick schema check: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nf_timestamp     76457 non-null  float64
 1   emit_lat         76457 non-null  float64
 2   emit_long        76457 non-null  float64
 3   prod_quant       76457 non-null  float64
 4   prod_valor_unit  76457 non-null  float64
dtypes: float64(5)
memory usage: 2.9 MB


In [3]:
# Isolation Forest outlier detection.
# - random_state pins the randomised tree construction so reruns are reproducible.
# - contamination is the expected proportion of outliers; it sets the score
#   threshold used by predict() (here 0.1% of the training rows).
model = IsolationForest(
    random_state=26,
    verbose=True,
    contamination=0.001
)

# Fit and score on the feature columns only. This cell writes `anomaly_score`
# back into `df`, so without the exclusion a second run of the cell would feed
# the previous run's score to the model as an extra feature.
feature_cols = df.columns.drop('anomaly_score', errors='ignore')
features = df[feature_cols]

model.fit(features)

# predict() labels each row: -1 for outliers, 1 for inliers.
anomaly_predictions = model.predict(features)
# decision_function(): the lower the value, the more abnormal; negative means outlier.
df['anomaly_score'] = model.decision_function(features)

# .copy() gives an independent frame, so later cells can add columns to
# df_outliers without assigning onto a filtered slice of df.
df_outliers = df[anomaly_predictions == -1].copy()
df_outliers.head(5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score
4364,0.753283,0.0,0.0,202.648649,-0.330606,-0.008322
4695,0.218742,0.0,0.0,337.783784,-0.331328,-0.001635
4757,-0.664935,0.0,0.0,540.486486,-0.330847,-0.041941
4761,-0.414766,0.0,0.0,364.810811,-0.328681,-0.022004
4793,-1.066231,0.0,0.0,156.128378,-0.331088,-0.012491


In [4]:
# Row labels of the flagged outliers, as a NumPy array (in df_outliers' current row order).
index_array = df_outliers.index.to_numpy()
index_array

array([ 4364,  4695,  4757,  4761,  4793,  4904,  5492,  5493,  5621,
        5859,  6010,  6839,  6872,  6876,  6897,  7266,  7309,  7310,
        7315,  7316,  7320,  8013,  8440,  9505,  9862, 11941, 11974,
       11980, 13094, 13179, 14271, 14548, 14767, 14768, 14770, 14771,
       14911, 15765, 16760, 17248, 17250, 17251, 18134, 18237, 18238,
       18309, 18312, 18425, 19538, 19540, 19544, 19558, 19994, 20031,
       22647, 22648, 22686, 23662, 24535, 24923, 25055, 25474, 26011,
       26280, 26281, 26283, 26284, 27928, 27944, 27976, 28014, 28629,
       28964, 29033, 29034, 29477, 30507, 30509, 30510, 30512, 31568,
       31569, 31573, 31759, 32618, 32619, 32742, 33823, 33824, 33883,
       33926, 35339, 37551, 37552, 37554, 37555, 37758, 39373, 39732,
       40189, 40364, 40367, 40621, 40625, 41179, 41412, 41809, 41995,
       43868, 43972, 44040, 44089, 44090, 44094, 44265, 44269, 44319,
       44404, 44428, 44430, 44432, 44440, 44567, 44569, 44570, 44571,
       44685, 44687,

In [12]:
# Export the flagged rows, most anomalous first (lowest anomaly_score on top).
outliers_path = Path('output/outliers.csv')
# to_csv does not create missing folders, so make sure the target folder exists.
outliers_path.parent.mkdir(parents=True, exist_ok=True)

# The CSV is written with index=False, so the row label is preserved as an
# explicit `idx` column. assign() returns a new frame instead of setting a
# column on a frame that was produced by filtering another one (which is what
# raises pandas' SettingWithCopyWarning).
df_outliers = (
    df_outliers
    .assign(idx=df_outliers.index)
    .sort_values(by='anomaly_score')
)
df_outliers.to_csv(outliers_path, index=False)
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score,idx
2065189,0.999397,-39.841271,-7.351677,1.317446,1064.312500,-0.127148,504.192274,-0.026280,2065189
2065190,0.999397,-39.841271,-7.351677,1.317446,4939.250000,-0.232302,169.918794,-0.025215,2065190
2065186,0.999397,-39.841271,-7.351677,1.317258,617.062500,-0.085911,398.601179,-0.024152,2065186
2065182,0.999397,-39.841271,-7.351677,1.025874,2583.083333,-0.226804,148.066186,-0.022559,2065182
2064097,-0.508692,-68.082198,-6.003544,-0.690973,-0.166667,4295.292096,747.691571,-0.022559,2064097
...,...,...,...,...,...,...,...,...,...
1632821,-0.502856,-9.590436,-6.796041,-0.690973,0.458333,305.601375,851.817719,-0.000142,1632821
1558003,-0.679891,-64.688044,-6.171945,-0.607116,400.645833,-0.183849,94.652015,-0.000083,1558003
1557932,-0.550051,-64.688044,-6.171945,-0.607116,514.791667,-0.183849,121.701924,-0.000083,1557932
1557850,-0.450926,-64.688044,-6.171945,-0.607116,482.604167,-0.183849,114.074235,-0.000083,1557850


In [13]:
# Export only the outlier row labels, one per line in an `idx` column.
indexes_path = Path('output/outlier_indexes.csv')
# to_csv does not create missing folders, so make sure the target folder exists.
indexes_path.parent.mkdir(parents=True, exist_ok=True)

index_df = pd.DataFrame(index_array, columns=['idx'])
# index=True also writes the positional 0..n-1 index as a leading unnamed column.
index_df.to_csv(indexes_path, index=True)

In [14]:
# Reload the exported outliers from disk, restore `idx` as the index and
# order the rows from most to least anomalous (ascending anomaly_score).
# Forward slashes keep the path portable; the chain builds the frame in one
# expression instead of mutating it with inplace=True.
dft = (
    pd.read_csv('output/outliers.csv', encoding='latin-1')
    .set_index('idx')
    .sort_values(by='anomaly_score', ascending=True)
)
dft.head(5)

Unnamed: 0_level_0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2065189,0.999397,-39.841271,-7.351677,1.317446,1064.3125,-0.127148,504.192274,-0.02628
2065190,0.999397,-39.841271,-7.351677,1.317446,4939.25,-0.232302,169.918794,-0.025215
2065186,0.999397,-39.841271,-7.351677,1.317258,617.0625,-0.085911,398.601179,-0.024152
2065182,0.999397,-39.841271,-7.351677,1.025874,2583.083333,-0.226804,148.066186,-0.022559
2064097,-0.508692,-68.082198,-6.003544,-0.690973,-0.166667,4295.292096,747.691571,-0.022559
