In [124]:
import pandas as pd
import matplotlib.pyplot as plt 
import matplotlib as mpl
import seaborn as sns
from pycaret.datasets import get_data 
from pycaret.anomaly import * 
mpl.rcParams['figure.dpi'] = 300
# Loading Dataset
numeric = ['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicassen']
categorical = ['Channel','Region']
replace_dict = { "Channel": {1: "Horeca", 2: "Retail"},
                 "Region":  {1: "Lisbon", 2: "Oporto", 3: "Other"} }
data = get_data('wholesale', verbose = False)
data.replace(replace_dict, inplace = True)
# PyCaret environment setup
anomaly = setup(data, session_id = 8477)

Unnamed: 0,Description,Value
0,Session id,8477
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 10)"
3,Ordinal features,1
4,Numeric features,6
5,Categorical features,2
6,Preprocess,True
7,Imputation type,simple
8,Numeric imputation,mean
9,Categorical imputation,mode


In [125]:
data.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,Retail,Other,12669,9656,7561,214,2674,1338
1,Retail,Other,7057,9810,9568,1762,3293,1776
2,Retail,Other,6353,8808,7684,2405,3516,7844
3,Horeca,Other,13265,1196,4221,6404,507,1788
4,Retail,Other,22615,5410,7198,3915,1777,5185


In [126]:
# Create Local Outlier Factor model with an outlier proportion of 10% and assign it to model variable
model = create_model('lof', fraction = 0.10)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [127]:
# Assign anomaly labels and scores to the data_assigned variable
data_assigned = assign_model(model)
data_assigned.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,1.107687
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,1.027102
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,1.398439
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,1.200384
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,1.164052
5,Retail,Other,9413,8259,5126,666,1795,1451,0,1.184313
6,Retail,Other,12126,3199,6975,480,3140,545,0,1.130491
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,1.013751
8,Horeca,Other,5963,3648,6192,425,1716,750,0,1.201904
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,1.053333


In [128]:
# Create new dataset of outliers named data_outliers
data_inliers = data_assigned.query('Anomaly == 0')
data_outliers = data_assigned.query('Anomaly == 1')

In [129]:
# Print first 5 instances of the outlier dataset
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
17,Horeca,Other,5876,6157,2933,839,370,4478,1,1.458161
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,2.249573
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,3.224197
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,1.529652
61,Retail,Other,35942,38369,59598,3254,26701,2017,1,2.799411


In [130]:
plot_model(model,plot='umap')

In [86]:
# SVM, ABOD, IForest, Bonus point: KNN

In [136]:
#SVM
svm_model = create_model('svm', fraction = 0.1)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [137]:
#Assign anomaly labels and scores to the data_assigned variable
data_assigned_svm = assign_model(svm_model)
data_assigned_svm.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-0.0
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-0.0
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-0.0
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-0.0
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-0.0
5,Retail,Other,9413,8259,5126,666,1795,1451,0,-0.0
6,Retail,Other,12126,3199,6975,480,3140,545,0,-0.0
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,-0.0
8,Horeca,Other,5963,3648,6192,425,1716,750,0,-0.0
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,-0.0


In [138]:
# Create new dataset of outliers named data_outliers
data_inliers_svm = data_assigned_svm.query('Anomaly == 0')
data_outliers_svm = data_assigned_svm.query('Anomaly == 1')

In [139]:
# Print first 5 instances of the outlier dataset
data_outliers_svm.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score


In [140]:
plot_model(svm_model,plot='umap')

In [141]:
#ABOD
abod_model = create_model('abod', fraction = 0.1)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [142]:
#Assign anomaly labels and scores to the data_assigned variable
data_assigned_abod = assign_model(abod_model)
data_assigned_abod.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-5.66432e-16
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-1.723476e-15
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-3.949763e-17
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-1.465539e-15
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-2.898443e-16
5,Retail,Other,9413,8259,5126,666,1795,1451,0,-4.686617e-16
6,Retail,Other,12126,3199,6975,480,3140,545,0,-1.068663e-15
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,-2.60408e-15
8,Horeca,Other,5963,3648,6192,425,1716,750,0,-5.329998e-15
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,-1.170232e-15


In [143]:
# Create new dataset of outliers named data_outliers
data_inliers_abod = data_assigned_abod.query('Anomaly == 0')
data_outliers_abod = data_assigned_abod.query('Anomaly == 1')


In [144]:
# Print first 5 instances of the outlier dataset
data_outliers_abod.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,-4.124037e-20
39,Horeca,Other,56159,555,902,10002,212,2916,1,-2.620307e-17
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,-3.469083e-17
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,-7.848227e-20
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,-1.1340130000000001e-17


In [145]:
plot_model(abod_model,plot='umap')

In [146]:
#IFOREST
iforest_model = create_model('iforest', fraction = 0.1)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [147]:
#Assign anomaly labels and scores to the data_assigned variable
data_assigned_iforest= assign_model(iforest_model)
data_assigned_iforest.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-0.142231
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-0.146928
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-0.068953
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-0.166442
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-0.062139
5,Retail,Other,9413,8259,5126,666,1795,1451,0,-0.135892
6,Retail,Other,12126,3199,6975,480,3140,545,0,-0.136629
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,-0.135296
8,Horeca,Other,5963,3648,6192,425,1716,750,0,-0.173082
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,-0.132392


In [148]:
# Create new dataset of outliers named data_outliers
data_inliers_if = data_assigned_iforest.query('Anomaly == 0')
data_outliers_if = data_assigned_iforest.query('Anomaly == 1')

In [149]:
# Print first 5 instances of the outlier dataset
data_outliers_if.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,0.069319
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,0.166455
61,Retail,Other,35942,38369,59598,3254,26701,2017,1,0.11634
65,Retail,Other,85,20959,45828,36,24231,1423,1,0.043972
71,Horeca,Other,18291,1266,21042,5373,4173,14472,1,0.02156


In [150]:
plot_model(iforest_model,plot='umap')

In [151]:
#knn
knn_model = create_model('knn', fraction = 0.1)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [152]:
#Assign anomaly labels and scores to the data_assigned variable
data_assigned_knn = assign_model(knn_model)
data_assigned_knn.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,4412.718437
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,3893.80829
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,6837.430877
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,3621.922694
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,5769.670874
5,Retail,Other,9413,8259,5126,666,1795,1451,0,4412.718437
6,Retail,Other,12126,3199,6975,480,3140,545,0,3802.832365
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,3465.169693
8,Horeca,Other,5963,3648,6192,425,1716,750,0,2814.278238
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,5012.273137


In [153]:
# Create new dataset of outliers named data_outliers
data_inliers_knn = data_assigned_knn.query('Anomaly == 0')
data_outliers_knn = data_assigned_knn.query('Anomaly == 1')

In [154]:
# Print first 5 instances of the outlier dataset
data_outliers_knn.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,29286.316498
39,Horeca,Other,56159,555,902,10002,212,2916,1,11788.374358
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,9341.904035
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,50651.461104
49,Retail,Other,4967,21412,28921,1798,13583,1163,1,9557.43407


In [155]:
plot_model(knn_model,plot='umap')

In [156]:
# Please run other algorithms  - svm, abod, Iforest
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pycaret.internal.patches.pyod.COFPatched
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD
