In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import warnings

# Notebook-wide configuration.
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/sklearn diagnostics — consider narrowing.
warnings.simplefilter("ignore")
# Show all columns when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [9]:
# Read the input CSV (Windows-style relative path, latin-1 encoded) and print
# a schema summary. The recorded run shows 76457 rows and five float64 columns.
dataset_path = r'..\..\datasets\df_rob_norm_most_common_ncm.csv'
df = pd.read_csv(dataset_path, encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nf_timestamp     76457 non-null  float64
 1   emit_lat         76457 non-null  float64
 2   emit_long        76457 non-null  float64
 3   prod_quant       76457 non-null  float64
 4   prod_valor_unit  76457 non-null  float64
dtypes: float64(5)
memory usage: 2.9 MB


In [10]:
# Fit an Isolation Forest on the feature columns and flag the most anomalous rows.
#
# The score column is written back into `df` below, so it must be excluded from
# the training matrix: otherwise re-running this cell would fit the model on the
# previous run's `anomaly_score` (non-idempotent, and leaks the score into the
# features). On a fresh kernel `feature_columns` is simply every column of `df`.
feature_columns = [col for col in df.columns if col != 'anomaly_score']
features = df[feature_columns]

model = IsolationForest(
    random_state=26,      # fixed seed so the flagged rows are reproducible
    verbose=True,
    contamination=0.001   # expected share of outliers; sets the decision threshold
)

model.fit(features)

# predict() labels each row 1 (inlier) or -1 (outlier).
anomaly_predictions = model.predict(features)
# decision_function(): the lower the value, the more anomalous the row.
df['anomaly_score'] = model.decision_function(features)

# .copy() makes the outlier table independent of `df`, so later cells can add
# columns to it without chained-assignment problems.
df_outliers = df[anomaly_predictions == -1].copy()
df_outliers.head(5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score
46587,-0.867215,-42.023423,-3.25096,160.081081,-0.316169,-0.018173
46597,0.020243,-42.023423,-3.25096,623.662162,-0.315688,-0.017186
46598,0.020243,-42.023423,-3.25096,407.175676,-0.28128,-0.023762
46601,-0.82201,-42.023423,-3.25096,0.108108,37.606112,-0.001546
56992,-0.625443,-43.514492,-4.722266,135.081081,-0.299326,-0.017464


In [11]:
# Capture the row labels of the flagged outliers as a NumPy array and display it.
index_array = df_outliers.index.to_numpy()
index_array

array([46587, 46597, 46598, 46601, 56992, 56993, 56995, 62149, 62162,
       62178, 62184, 62195, 62209, 62223, 62237, 63839, 65203, 71964,
       71968, 71970, 71971, 72074, 72075, 72076, 72078, 72080, 72081,
       72083, 72102, 72103, 72104, 72123, 72144, 72167, 72184, 72186,
       72197, 72198, 72199, 72220, 72221, 72222, 72223, 72224, 72227,
       72241, 72242, 72260, 72263, 72264, 73141, 73152, 73169, 73211,
       73222, 73243, 73248, 73274, 73281, 73286, 73298, 73301, 73372,
       73373, 73375, 76298, 76336, 76339, 76439, 76440, 76441, 76443,
       76444, 76447, 76448, 76450], dtype=int64)

In [12]:
# Keep the original row label as an explicit `idx` column (the CSV is written
# without its index), order rows from most to least anomalous (ascending
# score), and persist the result.
# .assign() builds a new frame instead of mutating a possibly-sliced one.
df_outliers = (
    df_outliers
    .assign(idx=df_outliers.index)
    .sort_values(by='anomaly_score')
)
# Raw string: '\o' is not a valid escape sequence in a normal string literal.
df_outliers.to_csv(r'output\outliers.csv', index=False)
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score,idx
71971,-0.829796,-54.642650,-7.315482,810.756757,-0.324591,-0.058297,71971
71970,0.836828,-54.642650,-7.315482,405.351351,-0.328922,-0.054192,71970
71964,-0.813774,-54.642650,-7.315482,314.135135,-0.327238,-0.053558,71964
71968,-0.146346,-54.642650,-7.315482,1013.459459,-0.329163,-0.050569,71968
72083,1.021847,-25.333011,-5.930694,299.135135,-0.265881,-0.042203,72083
...,...,...,...,...,...,...,...
73248,0.192927,-39.020332,-4.738681,116.837838,-0.286574,-0.003685,73248
73301,0.308581,-39.020332,-4.738681,162.108108,-0.325794,-0.003254,73301
73169,-0.075835,-39.020332,-4.738681,168.864865,-0.328200,-0.001664,73169
46601,-0.822010,-42.023423,-3.250960,0.108108,37.606112,-0.001546,46601


In [13]:
# Persist the outlier row labels as a single-column table (`idx`); the
# positional index is written as the first, unnamed CSV column.
index_df = pd.DataFrame(index_array, columns=['idx'])
# Raw string: '\o' is not a valid escape sequence in a normal string literal.
index_df.to_csv(r'output\outlier_indexes.csv', index=True)

In [14]:
# Round-trip check: read the saved outliers back, restore `idx` as the index,
# and list the rows with the lowest anomaly scores first.
dft = (
    pd.read_csv(r'output\outliers.csv', encoding='latin-1')
    .set_index('idx')
    .sort_values(by='anomaly_score', ascending=True)
)
dft.head(5)

Unnamed: 0_level_0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit,anomaly_score
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
71971,-0.829796,-54.64265,-7.315482,810.756757,-0.324591,-0.058297
71970,0.836828,-54.64265,-7.315482,405.351351,-0.328922,-0.054192
71964,-0.813774,-54.64265,-7.315482,314.135135,-0.327238,-0.053558
71968,-0.146346,-54.64265,-7.315482,1013.459459,-0.329163,-0.050569
72083,1.021847,-25.333011,-5.930694,299.135135,-0.265881,-0.042203
