**Importing necessary libraries**

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from pycaret.datasets import get_data
from pycaret.anomaly import *
mpl.rcParams['figure.dpi'] = 300

**Loading the Dataset**

In [2]:
numeric = ['Fresh','Milk','Grocery','Frozen','Detergents_Paper','Delicassen']
categorical = ['Channel','Region']

replace_dict = { "Channel": {1: "Horeca", 2: "Retail"},
                 "Region":  {1: "Lisbon", 2: "Oporto", 3: "Other"} }

data = get_data('wholesale', verbose = False)
data.replace(replace_dict, inplace = True) 
data.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,Retail,Other,12669,9656,7561,214,2674,1338
1,Retail,Other,7057,9810,9568,1762,3293,1776
2,Retail,Other,6353,8808,7684,2405,3516,7844
3,Horeca,Other,13265,1196,4221,6404,507,1788
4,Retail,Other,22615,5410,7198,3915,1777,5185
5,Retail,Other,9413,8259,5126,666,1795,1451
6,Retail,Other,12126,3199,6975,480,3140,545
7,Retail,Other,7579,4956,9426,1669,3321,2566
8,Horeca,Other,5963,3648,6192,425,1716,750
9,Retail,Other,6006,11093,18881,1159,7425,2098


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel           440 non-null    object
 1   Region            440 non-null    object
 2   Fresh             440 non-null    int64 
 3   Milk              440 non-null    int64 
 4   Grocery           440 non-null    int64 
 5   Frozen            440 non-null    int64 
 6   Detergents_Paper  440 non-null    int64 
 7   Delicassen        440 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 27.6+ KB


**Initializing the Pycaret Environment**

In [4]:
anomaly = setup(data, silent = True , session_id = 8477)

Unnamed: 0,Description,Value
0,session_id,8477
1,Original Data,"(440, 8)"
2,Missing Values,False
3,Numeric Features,6
4,Categorical Features,2
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(440, 11)"
9,CPU Jobs,-1


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=False, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=[],
                                      target='UNSUPERVISED_DUMMY_TARGET',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='most frequent',
                                fill_value_categorical=None,
                                fill_value_numerical=Non...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('cluster_all', 'passthrough'),
        

**MODEL I - Create Local Outlier Factor model with an outlier proportion of 10% and assign it to model variable**

In [5]:
model = create_model('lof', fraction = 0.10)

INFO:logs:create_model_container: 1
INFO:logs:master_model_container: 1
INFO:logs:display_container: 2
INFO:logs:LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=-1, n_neighbors=20, novelty=True, p=2)
INFO:logs:create_model() succesfully completed......................................


**Assign anomaly labels and scores to the data_assigned variable**

In [6]:
data_assigned = assign_model(model)
data_assigned.head(10)

INFO:logs:Initializing assign_model()
INFO:logs:assign_model(model=LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=-1, n_neighbors=20, novelty=True, p=2), transformation=False, score=True, verbose=True)
INFO:logs:Checking exceptions
INFO:logs:Determining Trained Model
INFO:logs:Trained Model : Local Outlier Factor
INFO:logs:Copying data
INFO:logs:(440, 10)
INFO:logs:assign_model() succesfully completed......................................


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,1.107687
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,1.027102
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,1.398439
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,1.200384
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,1.164052
5,Retail,Other,9413,8259,5126,666,1795,1451,0,1.184313
6,Retail,Other,12126,3199,6975,480,3140,545,0,1.130491
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,1.013751
8,Horeca,Other,5963,3648,6192,425,1716,750,0,1.201904
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,1.053333


**Create new dataset of outliers named data_outliers**

In [7]:
data_inliers = data_assigned.query('Anomaly == 0')
data_outliers = data_assigned.query('Anomaly == 1')

**Print first 5 instances of the outlier dataset**

In [8]:
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
17,Horeca,Other,5876,6157,2933,839,370,4478,1,1.458161
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,2.249573
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,3.224197
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,1.529652
61,Retail,Other,35942,38369,59598,3254,26701,2017,1,2.799411


****MODEL II - Create Angle Based Outlier Detection Model with an outlier proportion of 10% and assign it to model variable****

In [9]:
abod = create_model('abod', fraction = 0.1)

INFO:logs:create_model_container: 2
INFO:logs:master_model_container: 2
INFO:logs:display_container: 3
INFO:logs:ABOD(contamination=0.1, method='fast', n_neighbors=5)
INFO:logs:create_model() succesfully completed......................................


**Assign anomaly labels and scores to the data_assigned variable**

In [10]:
data_assigned = assign_model(abod)
data_assigned.head(10)

INFO:logs:Initializing assign_model()
INFO:logs:assign_model(model=ABOD(contamination=0.1, method='fast', n_neighbors=5), transformation=False, score=True, verbose=True)
INFO:logs:Checking exceptions
INFO:logs:Determining Trained Model
INFO:logs:Trained Model : Angle-base Outlier Detection
INFO:logs:Copying data
INFO:logs:(440, 10)
INFO:logs:assign_model() succesfully completed......................................


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-5.664321e-16
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-1.723476e-15
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-3.949764e-17
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-1.465539e-15
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-2.898443e-16
5,Retail,Other,9413,8259,5126,666,1795,1451,0,-4.686618e-16
6,Retail,Other,12126,3199,6975,480,3140,545,0,-1.068663e-15
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,-2.60408e-15
8,Horeca,Other,5963,3648,6192,425,1716,750,0,-5.329998e-15
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,-1.170233e-15


**Create new dataset of outliers named data_outliers**

In [11]:
data_inliers = data_assigned.query('Anomaly == 0')
data_outliers = data_assigned.query('Anomaly == 1')

**Print first 5 instances of the outlier dataset**

In [12]:
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,-4.124038e-20
39,Horeca,Other,56159,555,902,10002,212,2916,1,-2.620307e-17
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,-3.469083e-17
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,-7.848229e-20
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,-1.1340130000000001e-17


**MODEL III - Create iForest Model with an outlier proportion of 10% and assign it to model variable**

In [13]:
iforest = create_model('iforest', fraction = 0.1)

INFO:logs:create_model_container: 3
INFO:logs:master_model_container: 3
INFO:logs:display_container: 4
INFO:logs:IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0,
    max_samples='auto', n_estimators=100, n_jobs=-1, random_state=8477,
    verbose=0)
INFO:logs:create_model() succesfully completed......................................


**Assign anomaly labels and scores to the data_assigned variable**

In [22]:
data_assigned = assign_model(iforest)
data_assigned.head(10)

INFO:logs:Initializing assign_model()
INFO:logs:assign_model(model=IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0,
    max_samples='auto', n_estimators=100, n_jobs=-1, random_state=8477,
    verbose=0), transformation=False, score=True, verbose=True)
INFO:logs:Checking exceptions
INFO:logs:Determining Trained Model
INFO:logs:Trained Model : Isolation Forest
INFO:logs:Copying data
INFO:logs:(440, 10)
INFO:logs:assign_model() succesfully completed......................................


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-0.124265
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-0.130061
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-0.059521
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-0.1544
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-0.046611
5,Retail,Other,9413,8259,5126,666,1795,1451,0,-0.121751
6,Retail,Other,12126,3199,6975,480,3140,545,0,-0.115792
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,-0.127756
8,Horeca,Other,5963,3648,6192,425,1716,750,0,-0.161993
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,-0.125618


**Create new dataset of outliers named data_outliers**

In [14]:
data_inliers = data_assigned.query('Anomaly == 0')
data_outliers = data_assigned.query('Anomaly == 1')

**Print first 5 instances of the outlier dataset**

In [15]:
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,-4.124038e-20
39,Horeca,Other,56159,555,902,10002,212,2916,1,-2.620307e-17
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,-3.469083e-17
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,-7.848229e-20
56,Retail,Other,4098,29892,26866,2616,17740,1340,1,-1.1340130000000001e-17


**MODEL IV - Create SVM with an outlier proportion of 10% and assign it to model variable**

In [16]:
svm = create_model('svm',)

INFO:logs:create_model_container: 4
INFO:logs:master_model_container: 4
INFO:logs:display_container: 5
INFO:logs:OCSVM(cache_size=200, coef0=0.0, contamination=0.05, degree=3, gamma='auto',
   kernel='rbf', max_iter=-1, nu=0.5, shrinking=True, tol=0.001,
   verbose=False)
INFO:logs:create_model() succesfully completed......................................


**Assign anomaly labels and scores to the data_assigned variable**

In [17]:
data_assigned = assign_model(svm)
data_assigned.head(10)

INFO:logs:Initializing assign_model()
INFO:logs:assign_model(model=OCSVM(cache_size=200, coef0=0.0, contamination=0.05, degree=3, gamma='auto',
   kernel='rbf', max_iter=-1, nu=0.5, shrinking=True, tol=0.001,
   verbose=False), transformation=False, score=True, verbose=True)
INFO:logs:Checking exceptions
INFO:logs:Determining Trained Model
INFO:logs:Trained Model : One-class SVM detector
INFO:logs:Copying data
INFO:logs:(440, 10)
INFO:logs:assign_model() succesfully completed......................................


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,-0.0
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,-0.0
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,-0.0
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,-0.0
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,-0.0
5,Retail,Other,9413,8259,5126,666,1795,1451,0,-0.0
6,Retail,Other,12126,3199,6975,480,3140,545,0,-0.0
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,-0.0
8,Horeca,Other,5963,3648,6192,425,1716,750,0,-0.0
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,-0.0


**Create new dataset of outliers named data_outliers**

In [18]:
data_inliers = data_assigned.query('Anomaly == 0')
data_outliers = data_assigned.query('Anomaly == 1')

**Print first 5 instances of the outlier dataset**

In [19]:
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score


**MODEL V - Create KNN with an outlier proportion of 10% and assign it to model variable**

In [20]:
knn = create_model('knn', fraction = 0.1)

INFO:logs:create_model_container: 5
INFO:logs:master_model_container: 5
INFO:logs:display_container: 6
INFO:logs:KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
  radius=1.0)
INFO:logs:create_model() succesfully completed......................................


**Assign anomaly labels and scores to the data_assigned variable**

In [21]:
data_assigned = assign_model(knn)
data_assigned.head(10)

INFO:logs:Initializing assign_model()
INFO:logs:assign_model(model=KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
  radius=1.0), transformation=False, score=True, verbose=True)
INFO:logs:Checking exceptions
INFO:logs:Determining Trained Model
INFO:logs:Trained Model : K-Nearest Neighbors Detector
INFO:logs:Copying data
INFO:logs:(440, 10)
INFO:logs:assign_model() succesfully completed......................................


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
0,Retail,Other,12669,9656,7561,214,2674,1338,0,4412.718437
1,Retail,Other,7057,9810,9568,1762,3293,1776,0,3893.808419
2,Retail,Other,6353,8808,7684,2405,3516,7844,0,6837.430877
3,Horeca,Other,13265,1196,4221,6404,507,1788,0,3621.922694
4,Retail,Other,22615,5410,7198,3915,1777,5185,0,5769.670874
5,Retail,Other,9413,8259,5126,666,1795,1451,0,4412.718437
6,Retail,Other,12126,3199,6975,480,3140,545,0,3802.832365
7,Retail,Other,7579,4956,9426,1669,3321,2566,0,3465.169837
8,Horeca,Other,5963,3648,6192,425,1716,750,0,2814.278238
9,Retail,Other,6006,11093,18881,1159,7425,2098,0,5012.273137


**Create new dataset of outliers named data_outliers**

In [22]:
data_inliers = data_assigned.query('Anomaly == 0')
data_outliers = data_assigned.query('Anomaly == 1')

**Print first 5 instances of the outlier dataset**

In [23]:
data_outliers.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Anomaly,Anomaly_Score
23,Retail,Other,26373,36423,22019,5154,4337,16523,1,29286.316515
39,Horeca,Other,56159,555,902,10002,212,2916,1,11788.374358
45,Retail,Other,5181,22044,21531,1740,7353,4985,1,9341.904035
47,Retail,Other,44466,54259,55571,7782,24171,6465,1,50651.461104
49,Retail,Other,4967,21412,28921,1798,13583,1163,1,9557.43407
