Import necessary libraries. 

In [34]:
import numpy as np
from pyod.models.lof import LOF
from pyod.models.cof import COF
from pycaret.anomaly import *

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

In [2]:
input_df = pd.read_csv('cars.csv')

In [3]:
input_df.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


Let's drop the car_name

In [4]:
input_df.drop('car_name', axis=1, inplace=True)

In [5]:
input_df.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [6]:
input_df.drop(input_df[input_df['hp'] == '?'].index, axis=0, inplace=True)

In [7]:
print('The shape of the input data:',input_df.shape)

The shape of the input data: (392, 8)


In [8]:
#lets apply zscore to compress the data between zero and one
minmaxscaler = MinMaxScaler()

for i in input_df:
    input_df[i] = minmaxscaler.fit_transform(np.array(input_df[i]).reshape(-1,1))

In [9]:
input_df.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin
0,0.239362,1.0,0.617571,0.456522,0.53615,0.238095,0.0,0.0
1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,0.0
2,0.239362,1.0,0.645995,0.565217,0.51687,0.178571,0.0,0.0
3,0.18617,1.0,0.609819,0.565217,0.516019,0.238095,0.0,0.0
4,0.212766,1.0,0.604651,0.51087,0.520556,0.14881,0.0,0.0


In [10]:
lof = LOF()

lof.fit(input_df)

LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=20, p=2)

In [11]:
lof.decision_function(input_df)[:10]

array([1.20590199, 1.10882749, 1.21273742, 1.16145826, 1.23931199,
       1.12018135, 1.21147674, 1.19303774, 1.19485838, 1.19958153])

In [12]:
predict_lof = lof.predict(input_df)

In [13]:
#let's count the total number of outlier
np.sum(predict_lof)

37

In [14]:
cof = COF(n_neighbors=20)

cof.fit(input_df)

COF(contamination=0.1, n_neighbors=None)

In [15]:
predict_cof = cof.predict(input_df)

In [16]:
np.sum(predict_cof)

40

In [17]:
outliers = setup(input_df)

Unnamed: 0,Description,Value
0,session_id,6558
1,Original Data,"(392, 8)"
2,Missing Values,False
3,Numeric Features,8
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(392, 8)"
9,CPU Jobs,-1


In [18]:
lof = create_model('lof')

In [19]:
print(lof)

LOF(algorithm='auto', contamination=0.05, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=-1, n_neighbors=20, p=2)


In [23]:
output = assign_model(lof)

In [28]:
output[output['Anomaly'] == 1].count()[0]

20

In [29]:
c = create_model('cof')

In [30]:
print(cof)

COF(contamination=0.1, n_neighbors=None)


In [32]:
ouput = assign_model(cof)

In [33]:
output[output['Anomaly'] == 1].count()[0]

20

In [37]:
model = DBSCAN(eps = 0.4, min_samples = 10).fit(input_df)

In [38]:
model

DBSCAN(algorithm='auto', eps=0.4, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=10, n_jobs=None, p=None)

In [43]:
labels = model.labels_

In [45]:
labels

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  2,
        2,  1,  3,  3,  3,  3,  3,  2,  0,  0,  0,  0,  1,  4,  1,  2,  2,
        2,  2,  2,  0,  0,  0,  0,  0,  0,  0,  2,  4,  2,  2,  4,  3,  3,
        3,  1,  1,  3,  4,  1,  4,  3,  4,  4,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  1,  0,  0,  0,  0,  3,  3,  3,  3,  4,  1,  1,  4,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  2,  2,  2,  3,
        0,  0,  0,  0,  2,  1,  4,  1,  1,  4,  2,  3,  0,  0,  3,  3,  3,
        3,  0,  3, -1,  0,  2,  2,  2,  1,  4,  1,  4,  2,  2,  2,  0,  0,
        0,  0,  0,  3,  3,  3,  1,  1,  4,  3,  3,  1,  1,  3,  2,  2,  2,
        2,  0,  0,  0,  0,  2,  2,  2,  2,  2,  0,  0,  1,  4,  2,  4,  1,
        3,  1,  2,  3,  2,  3,  3,  3,  3,  1,  3,  3,  4,  4,  3,  0,  0,
        0,  0,  2,  2,  2,  2,  4,  4,  3,  1,  2,  2,  2,  2,  3,  1,  1,
        4,  3,  0,  3, -1, -1,  0,  0,  0,  0,  1,  4,  3,  4,  1,  0,  0,
        0,  0,  2,  2,  2

In [44]:
labels[labels == -1]

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
      dtype=int64)

In [54]:
labels_df =  pd.DataFrame(labels, columns=['cluster'])

In [62]:
labels_df['cluster'].value_counts()

 0    103
 1     73
 2     72
 4     69
 3     60
-1     15
Name: cluster, dtype: int64