## Running O^2

In [12]:
import numpy as np 
import pandas as pd
import o2
import random 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [13]:
# Load the train/test CSV files from the Wine data directory and convert them
# to NumPy arrays.
# Normalizing the features (e.g. with sklearn's StandardScaler) is encouraged
# whenever possible.

d = "Data/Wine/"

X_train = pd.read_csv(d + "X_train.csv")
y_train = pd.read_csv(d + "y_train.csv")
X_test = pd.read_csv(d + "X_test.csv")
y_test = pd.read_csv(d + "y_test.csv")

# Record the column names and the training-set dimensions while the data are
# still DataFrames; the conversion to arrays below discards them.
col_namesX = X_train.columns
col_namesy = y_train.columns
n_cur, p = X_train.shape

# Features become 2-D arrays; labels are flattened to 1-D vectors.
X_train = X_train.values
y_train = y_train.values.reshape((n_cur,))
X_test = X_test.values
y_test = y_test.values.reshape((len(y_test),))

In [14]:
# Label of the minority class (0 or 1).
min_class = 1

# Number of new points to create.
points = 300

# Handling of categorical features:
#   "cat"     - the simplest approach
#   "cat_tab" - uses a neural network to add new categorical points
method = "cat"

# Not needed; leave as None.
clf = None

# Method used to create new points: "lr", "svm" or "tree".
# Without a Gurobi license, use "lr" or "tree".
ovs_m = "lr"

# Epochs for the neural network of the "cat_tab" method (the second entry, 20).
# The first entry (10) is not important here; tune only the 20 if needed.
eps = [10, 20]

In [15]:
# Oversample the training set with O2, driven by the settings from the
# configuration cell (min_class, points, method, ovs_m, eps).
#
# `eps` is now taken from the configuration cell. It used to be hard-coded as
# [10, 10] here, so editing `eps` in the configuration had no effect on the call.
# Per the configuration notes, eps concerns the "cat_tab" method, so results for
# method="cat" are presumably unchanged - TODO confirm against o2.
#
# `clf` is deliberately passed as the literal None: the name `clf` is rebound to
# fitted estimators in later cells, and re-running this cell afterwards must not
# hand one of those to o2.ovs.
#
# NOTE(review): l1, l2, l3, optimizer and init_loras are forwarded unchanged;
# their meaning is defined by o2.ovs and is not visible in this notebook.
X_train_new, y_train_new = o2.ovs(
    X_train,
    y_train,
    min_class,
    points,
    method=method,
    clf=None,
    ovs_m=ovs_m,
    eps=eps,
    l1=0.5,
    l2=0.5,
    l3=0.5,
    optimizer="lbfgs",
    init_loras=False,
)

(299, 11)


In [16]:
# Decision tree tuned by grid search: compare a model trained on the
# O2-augmented training set with a baseline trained on the original one.
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 6, 10, 20, 50, 100],
}
clf = GridSearchCV(DecisionTreeClassifier(random_state=36), tree_para, cv=3)

# --- O2: search + fit on the oversampled data, score on the test set ---
clf.fit(X_train_new, y_train_new)
preds_ovs = clf.predict(X_test)
# second predict_proba column (class 1 when the labels are 0/1)
proba_ovs = clf.predict_proba(X_test)[:, 1]
auc_ovs = roc_auc_score(y_test, proba_ovs)
f1_ovs = f1_score(y_test, preds_ovs, average='binary')

# --- Baseline: the same search object is refit on the original data ---
# The O2 predictions above were captured before this refit, so they are unaffected.
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, preds, average='binary')

# Report, one metric per line.
print(
    "Baseline AUC " + str(auc),
    "O2 AUC " + str(auc_ovs),
    "Baseline F1 " + str(f1),
    "O2 F1 " + str(f1_ovs),
    sep="\n",
)

Baseline AUC 0.6535502533129799
O2 AUC 0.6811409039623018
Baseline F1 0.4397163120567376
O2 F1 0.5227722772277228


In [17]:
# Logistic regression (default settings): compare a model trained on the
# O2-augmented training set with a baseline trained on the original one.
clf = LogisticRegression()

# --- O2: fit on the oversampled data, score on the test set ---
clf.fit(X_train_new, y_train_new)
preds_ovs = clf.predict(X_test)
# second predict_proba column (class 1 when the labels are 0/1)
proba_ovs = clf.predict_proba(X_test)[:, 1]
auc_ovs = roc_auc_score(y_test, proba_ovs)
f1_ovs = f1_score(y_test, preds_ovs, average='binary')

# --- Baseline: the same estimator is refit on the original data ---
# The O2 predictions above were captured before this refit, so they are unaffected.
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, preds, average='binary')

# Report, one metric per line.
print(
    "Baseline AUC " + str(auc),
    "O2 AUC " + str(auc_ovs),
    "Baseline F1 " + str(f1),
    "O2 F1 " + str(f1_ovs),
    sep="\n",
)

Baseline AUC 0.7331917435454969
O2 AUC 0.7309794031105507
Baseline F1 0.30625
O2 F1 0.37500000000000006
