In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import warnings

# Notebook-wide settings: silence every warning category and let pandas
# render all columns of a frame instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [9]:
# Read the input CSV (path is relative to the notebook's working directory)
# and print a summary of its columns, dtypes and memory footprint.
dataset_path = r'..\..\datasets\df_rob_norm.csv'
df = pd.read_csv(dataset_path, encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2065547 entries, 0 to 2065546
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   nf_timestamp      float64
 1   emit_lat          float64
 2   emit_long         float64
 3   prod_ncm          float64
 4   prod_quant        float64
 5   prod_valor_unit   float64
 6   prod_valor_total  float64
dtypes: float64(7)
memory usage: 110.3 MB


In [10]:
# Train only on the input feature columns. `anomaly_score` is written back
# into `df` below, so it is excluded here; otherwise re-running this cell
# would fit the model on its own previous output.
feature_columns = [col for col in df.columns if col != 'anomaly_score']
features = df[feature_columns]

model = IsolationForest(
    random_state=26,
    verbose=True,
    contamination=0.0001
)

model.fit(features)

# Score the rows once. `predict` labels a row -1 exactly when its
# `decision_function` value is negative, so the labels are derived from the
# scores instead of running a second full pass over the data.
anomaly_scores = model.decision_function(features)
anomaly_predictions = np.where(anomaly_scores < 0, -1, 1)
df['anomaly_score'] = anomaly_scores

# Take an independent copy so later cells can add columns to the outlier
# frame without writing into a slice of `df`.
df_outliers = df[anomaly_predictions == -1].copy()
df_outliers.head(5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s finished


Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score
1089316,-0.881439,-47.14815,-2.876557,0.333602,20.625,115.093814,10041.937704,-0.012745
1089341,-0.798843,-47.14815,-2.876557,1.195886,-0.145833,2794.456014,729.669998,-0.003137
1091482,-1.078579,-47.14815,-2.876557,-0.063684,971.041667,0.385567,2541.223781,-0.004671
1557777,0.59075,-64.688044,-6.171945,-0.607116,477.916667,-0.179038,122.577122,-0.001782
1557847,-0.679895,-64.688044,-6.171945,-0.607116,408.791667,-0.183849,96.582388,-8.3e-05


In [11]:
# Collect the row labels of the flagged outliers as a NumPy array.
outlier_index = df_outliers.index
index_array = outlier_index.values
index_array

array([1089316, 1089341, 1091482, 1557777, 1557847, 1557848, 1557850,
       1557852, 1557854, 1557929, 1557932, 1557936, 1558003, 1558088,
       1558177, 1558290, 1558513, 1558514, 1625835, 1626246, 1626310,
       1626311, 1627751, 1628482, 1629222, 1631567, 1631614, 1631699,
       1631760, 1631818, 1631872, 1631882, 1631887, 1631888, 1631890,
       1632039, 1632062, 1632112, 1632114, 1632159, 1632166, 1632275,
       1632276, 1632407, 1632463, 1632562, 1632620, 1632621, 1632622,
       1632624, 1632761, 1632821, 1632822, 1632876, 1632877, 1632878,
       1633053, 1633054, 1633169, 1633231, 1633390, 1633391, 1633417,
       1633583, 1633590, 1633632, 1633634, 1633911, 1633913, 1634196,
       1634197, 1634300, 1634506, 1634525, 1634559, 1634602, 1634603,
       1634888, 1684931, 1919377, 1919384, 1919435, 1919439, 1919440,
       1919512, 1919516, 1919517, 1919610, 1942898, 1947004, 1947126,
       1947127, 1957887, 1957921, 1979471, 1980076, 1980293, 1980692,
       1981091, 1981

In [12]:
# The CSV is written with index=False, so the original row label is kept in an
# explicit `idx` column. `.assign` builds a new frame rather than setting a
# column on what may be a slice of `df`.
df_outliers = (
    df_outliers
    .assign(idx=df_outliers.index)
    .sort_values(by='anomaly_score')  # ascending: lowest scores first
)
# Raw string: `\o` is not a valid escape sequence in a normal string literal.
df_outliers.to_csv(r'output\outliers.csv', index=False)
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score,idx
2065189,0.999397,-39.841271,-7.351677,1.317446,1064.312500,-0.127148,504.192274,-0.026280,2065189
2065190,0.999397,-39.841271,-7.351677,1.317446,4939.250000,-0.232302,169.918794,-0.025215,2065190
2065186,0.999397,-39.841271,-7.351677,1.317258,617.062500,-0.085911,398.601179,-0.024152,2065186
2065182,0.999397,-39.841271,-7.351677,1.025874,2583.083333,-0.226804,148.066186,-0.022559,2065182
2064097,-0.508692,-68.082198,-6.003544,-0.690973,-0.166667,4295.292096,747.691571,-0.022559,2064097
...,...,...,...,...,...,...,...,...,...
1632821,-0.502856,-9.590436,-6.796041,-0.690973,0.458333,305.601375,851.817719,-0.000142,1632821
1558003,-0.679891,-64.688044,-6.171945,-0.607116,400.645833,-0.183849,94.652015,-0.000083,1558003
1557932,-0.550051,-64.688044,-6.171945,-0.607116,514.791667,-0.183849,121.701924,-0.000083,1557932
1557850,-0.450926,-64.688044,-6.171945,-0.607116,482.604167,-0.183849,114.074235,-0.000083,1557850


In [13]:
# Persist the outlier row labels as a one-column table (`idx`); the frame's own
# positional index is written as well (index=True).
index_df = pd.DataFrame(index_array, columns=['idx'])
# Raw string: `\o` is not a valid escape sequence in a normal string literal.
index_df.to_csv(r'output\outlier_indexes.csv', index=True)

In [14]:
# Read the saved outliers back, restore the original row labels from the `idx`
# column as the index, and order the rows by ascending anomaly score.
dft = (
    pd.read_csv(r'output\outliers.csv', encoding='latin-1')
    .set_index('idx')
    .sort_values(by='anomaly_score', ascending=True)
)
dft.head(5)

Unnamed: 0_level_0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2065189,0.999397,-39.841271,-7.351677,1.317446,1064.3125,-0.127148,504.192274,-0.02628
2065190,0.999397,-39.841271,-7.351677,1.317446,4939.25,-0.232302,169.918794,-0.025215
2065186,0.999397,-39.841271,-7.351677,1.317258,617.0625,-0.085911,398.601179,-0.024152
2065182,0.999397,-39.841271,-7.351677,1.025874,2583.083333,-0.226804,148.066186,-0.022559
2064097,-0.508692,-68.082198,-6.003544,-0.690973,-0.166667,4295.292096,747.691571,-0.022559
