In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.cluster import KMeans

from typing import *

In [3]:
def make_binary(labels: List, positive: List = []):
  '''convert multiple classification into binary classification'''
  return [int(l in positive) for l in labels]

def make_positive(data: List[List], labels: List):
  '''filter out positive samples'''
  new_data = [row for i, row in enumerate(data) if labels[i] == 1]
  new_label = np.array([1]*len(new_data))
  return new_data, new_label

def make_one_class(data: List[List], labels: List):
  '''filter multiple classes into separate one classes'''
  uniq_labels = np.unique(labels)
  return [[row for i, row in enumerate(data) if labels[i] == l] for l in uniq_labels]

def multi_one_class_predict(data, oneK_classifiers):
  '''Ensemble classifier. return 1 when atleast one of the classifiers labels as positive. else 0'''
  preds = []
  for row in data:
    res = [okc.predict([row])[0] for okc in oneK_classifiers]
    preds.append(int(1 in res))
  return preds

In [15]:
# prepare data
# save 20% for testing
# rest is used for tarining one-class and two-class classifiers

data = datasets.load_iris( )
X = data.data
y = make_binary(data.target, [0, 1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_1c, y_train_1c = make_positive(X_train, y_train)
X_train_2c, y_train_2c = X_train, y_train

In [16]:
def without_clustering():
  ocsvm = OneClassSVM(kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1)
  ocrf = IsolationForest(n_estimators=10, max_samples='auto', contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None, random_state=None, verbose=0, warm_start=False)
  tcrf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
  tcnb = GaussianNB(priors=None, var_smoothing=1e-09)

  ocsvm.fit(X_train_1c)
  ocrf.fit(X_train_1c)
  tcrf.fit(X_train_2c, y_train_2c)
  tcnb.fit(X_train_2c, y_train_2c)

  y_pred_ocsvm = ocsvm.predict(X_test)
  y_pred_ocrf = ocrf.predict(X_test)
  y_pred_tcrf = tcrf.predict(X_test)
  y_pred_tcnb = tcnb.predict(X_test)

  acc_ocsvm = accuracy_score(y_test, y_pred_ocsvm)
  acc_ocrf = accuracy_score(y_test, y_pred_ocrf)
  acc_tcrf = accuracy_score(y_test, y_pred_tcrf)
  acc_tcnb = accuracy_score(y_test, y_pred_tcnb)

  return acc_ocsvm, acc_ocrf, acc_tcrf, acc_tcnb

# test with 100 repetitions
# consider mean of all test results for final accuracy
np.mean([without_clustering() for _ in range(0,100)], axis=0)

array([0.3       , 0.46333333, 0.99933333, 0.93333333])

In [17]:
def with_clustering(n: int):
  # use k means clustering first
  km = KMeans(n_clusters=n, init='k-means++', n_init=10, max_iter=300, tol=0.0001, random_state=None)
  clusters = km.fit_predict(X_train_1c)
  oneKs = make_one_class(X_train_1c, clusters)
  # on each cluster, train a one class classifier
  ocsvms = [OneClassSVM(kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1).fit(oneK) for oneK in oneKs]
  ocrfs = [IsolationForest(n_estimators=10, max_samples='auto', contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None, random_state=None, verbose=0, warm_start=False).fit(oneK) for oneK in oneKs]
  # get predictions
  y_pred_mocsvm = multi_one_class_predict(X_test, ocsvms)
  y_pred_mocrf= multi_one_class_predict(X_test, ocrfs)
  # compute accuracy
  acc_mocsvm = accuracy_score(y_test, y_pred_mocsvm)
  acc_mocrf = accuracy_score(y_test, y_pred_mocrf)
  return acc_mocsvm, acc_mocrf

# test with 100 repetitions
# consider mean of all test results for final accuracy
for n in range(1,10):
  res = np.mean([with_clustering(n) for _ in range(0,100)], axis=0)
  out = "\t".join([str(acc) for acc in res])
  print(f'{n}\t{out}')

1	0.6666666666666667	0.8433333333333334
2	0.7000000000000001	0.8233333333333335
3	0.7000000000000001	0.8566666666666667
4	0.7366666666666666	0.79
5	0.7066666666666668	0.7666666666666666
6	0.5900000000000001	0.8066666666666666
7	0.5999999999999999	0.7566666666666666
8	0.5633333333333334	0.8600000000000001
9	0.5833333333333333	0.6966666666666665
