In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from minisom import MiniSom

# Notebook-wide settings: silence every warning category and let pandas
# render all columns of a frame instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [2]:
# Load the input dataset. Forward slashes are used so the relative path
# resolves on Windows, Linux and macOS alike (backslashes are only treated
# as separators on Windows).
df = pd.read_csv('../../datasets/df_rob_norm.csv', encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2065547 entries, 0 to 2065546
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   nf_timestamp      float64
 1   emit_lat          float64
 2   emit_long         float64
 3   prod_ncm          float64
 4   prod_quant        float64
 5   prod_valor_unit   float64
 6   prod_valor_total  float64
dtypes: float64(7)
memory usage: 110.3 MB


In [3]:
# Feature matrix fed to the SOM. 'anomaly_score' is excluded explicitly:
# this cell adds that column to `df` further down, so without the exclusion
# a second run of the cell would feed the previous scores back in as an
# extra input feature.
df_subset = df.drop(columns='anomaly_score', errors='ignore').values

input_size = df_subset.shape[1]

# Fraction of rows flagged as outliers (0.0001 -> the top 0.01% of scores).
outliers_percentage = 0.0001

som = MiniSom(35, 35, input_size, sigma=5, learning_rate=0.5,
              neighborhood_function='triangle', random_seed=10)

# NOTE(review): with random_order=False and 10000 iterations, training
# presumably only visits the first 10000 rows of df_subset in file order --
# confirm against the MiniSom `train` documentation; if so, consider training
# on a random sample of rows instead.
som.train(df_subset, 10000, random_order=False, verbose=True)

# Score each row by the Euclidean distance between the row and the vector
# `som.quantization` returns for it (presumably the weights of its
# best-matching unit -- see MiniSom docs).
df['anomaly_score'] = np.linalg.norm(som.quantization(df_subset) - df_subset, axis=1)

# Rows whose score lies above the (1 - outliers_percentage) percentile are
# treated as outliers.
error_treshold = np.percentile(df['anomaly_score'], 100 * (1 - outliers_percentage))

print('Error treshold:', error_treshold)

# .copy() gives an independent frame, so adding columns to df_outliers later
# does not operate on a slice of `df` (SettingWithCopyWarning).
df_outliers = df[df['anomaly_score'] > error_treshold].copy()

 [     0 / 10000 ]   0% - ? it/s [     0 / 10000 ]   0% - ? it/s [     1 / 10000 ]   0% - 0:01:21 left  [     2 / 10000 ]   0% - 0:00:45 left  [     3 / 10000 ]   0% - 0:00:30 left  [     4 / 10000 ]   0% - 0:00:22 left  [     5 / 10000 ]   0% - 0:00:18 left  [     6 / 10000 ]   0% - 0:00:15 left  [     7 / 10000 ]   0% - 0:00:14 left  [     8 / 10000 ]   0% - 0:00:12 left  [     9 / 10000 ]   0% - 0:00:11 left  [    10 / 10000 ]   0% - 0:00:10 left  [    11 / 10000 ]   0% - 0:00:09 left  [    12 / 10000 ]   0% - 0:00:09 left  [    13 / 10000 ]   0% - 0:00:08 left  [    14 / 10000 ]   0% - 0:00:07 left  [    15 / 10000 ]   0% - 0:00:07 left  [    16 / 10000 ]   0% - 0:00:07 left  [    17 / 10000 ]   0% - 0:00:07 left  [    18 / 10000 ]   0% - 0:00:06 left  [    19 / 10000 ]   0% - 0:00:06 left  [    20 / 10000 ]   0% - 0:00:05 left  [    21 / 10000 ]   0% - 0:00:06 left  [    22 / 10000 ]   0% - 0:00:05 left  [    23 / 10000 ]   0% - 0:00:05 left  [    24 / 10

 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.13858017969679104
Error treshold: 0.8719983808669666


In [4]:
# Keep the original row label as an explicit 'idx' column so it survives the
# CSV round-trip (the file is written with index=False). `.assign` returns a
# new frame instead of mutating what may be a slice of `df`.
df_outliers = (
    df_outliers
    .assign(idx=df_outliers.index)
    .sort_values(by='anomaly_score', ascending=False)
)

# Build the path from parts: the previous 'output\outliers.csv' literal
# contained the invalid escape sequence '\o' and only worked on Windows.
outliers_path = Path('output') / 'outliers.csv'
# Create the target directory on a fresh checkout so to_csv does not fail.
outliers_path.parent.mkdir(parents=True, exist_ok=True)
df_outliers.to_csv(outliers_path, index=False)
df_outliers

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score,idx
2000736,0.008419,0.691933,0.996733,0.490199,1.250000e-08,1.000000e+00,1.000000e+00,1.420805,2000736
1828069,0.012655,0.691630,0.984758,0.300490,1.000000e+00,0.000000e+00,3.152858e-04,1.006988,1828069
2065369,0.016619,0.056931,0.277179,0.841452,1.250000e-08,5.015910e-06,5.015910e-06,0.890233,2065369
2065379,0.068564,0.056931,0.277179,0.871200,1.250000e-08,4.643300e-06,4.643300e-06,0.890048,2065379
2065375,0.090386,0.056931,0.277179,0.732111,1.250000e-08,6.434697e-06,6.434697e-06,0.889804,2065375
...,...,...,...,...,...,...,...,...,...
1755185,0.454858,0.686213,0.000000,0.271019,8.547625e-06,4.886930e-08,3.341729e-05,0.871999,1755185
1753902,0.454822,0.686213,0.000000,0.271019,6.492625e-06,4.757949e-08,2.471310e-05,0.871998,1753902
1756531,0.131841,0.686213,0.000000,0.401590,2.500000e-08,7.165586e-08,1.433117e-07,0.871998,1756531
1753630,0.454800,0.686213,0.000000,0.271019,7.089000e-06,4.757949e-08,2.698316e-05,0.871998,1753630


In [5]:
# Read the exported outliers back, restore 'idx' as the index and order the
# rows from highest to lowest anomaly score. The path uses forward slashes so
# it resolves on every platform; chaining avoids in-place mutation.
dft = (
    pd.read_csv('output/outliers.csv', encoding='latin-1')
    .set_index('idx')
    .sort_values(by='anomaly_score', ascending=False)
)
dft.head(5)

Unnamed: 0_level_0,nf_timestamp,emit_lat,emit_long,prod_ncm,prod_quant,prod_valor_unit,prod_valor_total,anomaly_score
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000736,0.008419,0.691933,0.996733,0.490199,1.25e-08,1.0,1.0,1.420805
1828069,0.012655,0.69163,0.984758,0.30049,1.0,0.0,0.000315,1.006988
2065369,0.016619,0.056931,0.277179,0.841452,1.25e-08,5e-06,5e-06,0.890233
2065379,0.068564,0.056931,0.277179,0.8712,1.25e-08,5e-06,5e-06,0.890048
2065375,0.090386,0.056931,0.277179,0.732111,1.25e-08,6e-06,6e-06,0.889804
