In [1]:
import pandas as pd
import numpy as np
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier

# 1. Data Preprocessing

In [2]:
## load the raw table; it carries the "vip_index" label column used to split VIP / Non-VIP
data = pd.read_csv(filepath_or_buffer = "data/svdd_data.csv")

In [3]:
## Partition the rows by label: vip_index == 1 (VIP) and vip_index == 0 (Non-VIP)
vip = data[data["vip_index"] == 1]
nvip = data[data["vip_index"] == 0]
## print() with a single pre-joined string emits the same text on Python 2 and 3
## (the bare `print a, b` statement form is a SyntaxError on Python 3)
print("VIP:" + str(vip.shape) + " Non-VIP:" + str(nvip.shape))

VIP:(2000, 26) Non-VIP:(10000, 26)


In [4]:
## Sampling (number of Non-VIP rows = number of VIP rows), drawn without replacement.
## A fixed random_state makes the draw repeatable on every Restart & Run All;
## 2017 is the same seed the notebook passes to train_test_split.
nvip_sample = nvip.sample(n = vip.shape[0], replace = False, random_state = 2017)

In [5]:
## Split each group into a label vector (Y) and a feature matrix (X).
## Y is the "vip_index" column; X is every other column, as a NumPy array.
vip_y = vip["vip_index"].values
vip_x = vip.drop(["vip_index"], axis = 1).values
nvip_y = nvip_sample["vip_index"].values
nvip_x = nvip_sample.drop(["vip_index"], axis = 1).values

In [6]:
## Divide the VIP rows in two with a fixed seed:
## vip1_* is later used as training data, vip2_* as testing data.
vip1_x, vip2_x, vip1_y, vip2_y = train_test_split(
    vip_x,
    vip_y,
    train_size = 0.5,
    random_state = 2017
)

# 2. Binary Class SVM

In [7]:
## Binary-SVM setup.
## Training rows: the VIP training half stacked on top of the sampled Non-VIP rows.
train_x = np.concatenate([vip1_x, nvip_x])
train_y = np.concatenate([vip1_y, nvip_y])
## Testing rows: the held-out VIP half only, so every test label is 1 and
## accuracy on this set is the share of held-out VIPs predicted as VIP.
test_x, test_y = vip2_x, vip2_y

In [8]:
## Fit a single SVC (gamma fixed at 0.005, every other setting left at its default).
## The fit call stays last so the fitted estimator is echoed as the cell output.
binary_model = svm.SVC(gamma = 0.005)
binary_model.fit(X = train_x, y = train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
## Evaluate the single SVC: predict first, then report accuracy on the
## training rows and on the testing rows.
train_pred = binary_model.predict(train_x)
test_pred = binary_model.predict(test_x)
print("training data accuracy: " + str(metrics.accuracy_score(train_y, train_pred)))
print("testing data accuracy: " + str(metrics.accuracy_score(test_y, test_pred)))

training data accuracy: 0.840666666667
testing data accuracy: 0.165


In [10]:
## Build a bagging ensemble whose base estimator is an SVC with gamma = 0.005.
## reference: http://stackoverflow.com/questions/31681373/making-svm-run-faster-in-python
base_svc = svm.SVC(gamma = 0.005)
bagging_model = BaggingClassifier(base_svc, random_state = 2017)
bagging_model.fit(X = train_x, y = train_y)

BaggingClassifier(base_estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=2017, verbose=0, warm_start=False)

In [11]:
## Evaluate the bagging ensemble: predict first, then report accuracy on the
## training rows and on the testing rows.
train_pred = bagging_model.predict(train_x)
test_pred = bagging_model.predict(test_x)
print("training data accuracy: " + str(metrics.accuracy_score(train_y, train_pred)))
print("testing data accuracy: " + str(metrics.accuracy_score(test_y, test_pred)))

training data accuracy: 0.819666666667
testing data accuracy: 0.167


# 3. One Class SVM

In [12]:
## One-class setup: both sets contain VIP rows only.
## Train on the first VIP half, test on the second VIP half.
train_x, train_y = vip1_x, vip1_y
test_x, test_y = vip2_x, vip2_y

In [13]:
## Fit a one-class SVM on the training features only (no labels are passed to fit).
## reference: https://thisdata.com/blog/unsupervised-machine-learning-with-one-class-support-vector-machines/
one_model = svm.OneClassSVM(gamma = 0.0005, nu = 0.1)
one_model.fit(X = train_x)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.0005, kernel='rbf',
      max_iter=-1, nu=0.1, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [14]:
## Evaluate the one-class model. The labels compared against come from VIP rows
## (vip_index == 1), so each score is the share of rows the model predicts as 1.
train_pred = one_model.predict(train_x)
test_pred = one_model.predict(test_x)
print("training data accuracy: " + str(metrics.accuracy_score(train_y, train_pred)))
print("testing data accuracy: " + str(metrics.accuracy_score(test_y, test_pred)))

training data accuracy: 0.897
testing data accuracy: 0.882


In [15]:
## Predict potential VIPs: score every Non-VIP row with the one-class model and
## count the rows it predicts as 1, next to the total number of Non-VIP rows.
nvip_data = nvip.drop("vip_index", axis = 1).values
potential_vip = one_model.predict(nvip_data)
## Sum the boolean mask directly instead of building an index array and taking
## its length; print() (function form) works on both Python 2 and Python 3.
print(int(np.sum(potential_vip == 1)))
print(len(nvip_data))

9089
10000
