In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import matplotlib as mpl
import seaborn as sns
from pycaret.datasets import get_data 
from pycaret.anomaly import * 
mpl.rcParams['figure.dpi'] = 300
# Loading Dataset
numeric = ['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicassen']
categorical = ['Channel','Region']
replace_dict = { "Channel": {1: "Horeca", 2: "Retail"},
                 "Region":  {1: "Lisbon", 2: "Oporto", 3: "Other"} }
data = get_data('wholesale', verbose = False)
data.replace(replace_dict, inplace = True)
# PyCaret environment setup
anomaly = setup(data, session_id = 8477)

Unnamed: 0,Description,Value
0,Session id,8477
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 10)"
3,Ordinal features,1
4,Numeric features,6
5,Categorical features,2
6,Preprocess,True
7,Imputation type,simple
8,Numeric imputation,mean
9,Categorical imputation,mode


In [4]:
# Create Local Outlier Factor model with an outlier proportion of 10% and assign it to model variable
model = create_model('lof', fraction = 0.10)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Assign anomaly labels and scores to the data_assigned variable
data_assigned = assign_model(model)
data_assigned.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,1.107687
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,1.027102
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,1.398439
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,1.200384
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,1.164052


In [7]:
#Create new dataset of outliers named data_outliers
data_outliers = data_assigned.query('Anomaly == 1')

In [8]:
# Print first 5 instances of the outlier dataset
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
17,Horeca,Other,5876,6157,2933,839,370,4478,1,1.458161
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,2.249573
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,3.224197
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,1.529652
61,Retail,Other,35942,38369,59598,3254,26701,2017,1,2.799411


#SVM

In [9]:
# Create SVM model with an outlier proportion of 10% and assign it to model variable
svm = create_model('svm', fraction = 0.10)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
 #Assign anomaly labels and scores to the data_assigned variable
svm_data_assigned = assign_model(svm)
svm_data_assigned.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-0.0
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-0.0
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-0.0
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-0.0
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-0.0


In [11]:
# Create new dataset of outliers named data_outliers
svm_data_outliers = svm_data_assigned.query('Anomaly == 1')

In [13]:
# Print first 5 instances of the outlier dataset
svm_data_outliers.head(5)
#svm does not pick up any outliers

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score


#Iforest

In [14]:
# Create iForest model with an outlier proportion of 10% and assign it to model variable
iforest = create_model('iforest', fraction = 0.10)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
 #Assign anomaly labels and scores to the data_assigned variable
iforest_data_assigned = assign_model(iforest)
iforest_data_assigned.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-0.142231
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-0.146928
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-0.068953
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-0.166442
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-0.062139


In [16]:
# Create new dataset of outliers named data_outliers
iforest_data_outliers = iforest_data_assigned.query('Anomaly == 1')

In [17]:
# Print first 5 instances of the outlier dataset
iforest_data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,0.069319
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,0.166455
61,Retail,Other,35942,38369,59598,3254,26701,2017,1,0.11634
65,Retail,Other,85,20959,45828,36,24231,1423,1,0.043972
71,Horeca,Other,18291,1266,21042,5373,4173,14472,1,0.02156


#ABOD

In [18]:
# Create ABOD model with an outlier proportion of 10% and assign it to model variable
abod = create_model('abod', fraction = 0.10)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
 #Assign anomaly labels and scores to the data_assigned variable
abod_data_assigned = assign_model(abod)
abod_data_assigned.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-5.66432e-16
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-1.723476e-15
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-3.949763e-17
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-1.465539e-15
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-2.898443e-16


In [20]:
# Create new dataset of outliers named data_outliers
abod_data_outliers =abod_data_assigned.query('Anomaly == 1')

In [21]:
# Print first 5 instances of the outlier dataset
abod_data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,-4.124037e-20
39,Horeca,Other,56159,555,902,10002,212,2916,1,-2.620307e-17
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,-3.469083e-17
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,-7.848227e-20
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,-1.1340130000000001e-17


#KNN

In [22]:
# Create KNN model with an outlier proportion of 10% and assign it to model variable
knn= create_model('knn', fraction = 0.10)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
 #Assign anomaly labels and scores to the data_assigned variable
knn_data_assigned = assign_model(knn)
knn_data_assigned.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,4412.718437
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,3893.80829
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,6837.430877
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,3621.922694
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,5769.670874


In [24]:
# Create new dataset of outliers named data_outliers
knn_data_outliers =abod_data_assigned.query('Anomaly == 1')

In [25]:
# Print first 5 instances of the outlier dataset
knn_data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,-4.124037e-20
39,Horeca,Other,56159,555,902,10002,212,2916,1,-2.620307e-17
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,-3.469083e-17
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,-7.848227e-20
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,-1.1340130000000001e-17
