In [41]:
from cvxopt import solvers, matrix
import numpy as np
import sklearn.svm

class SVM453X ():
    def __init__ (self):
        pass

    # Expects each *row* to be an m-dimensional row vector. X should
    # contain n rows, where n is the number of examples.
    # y should correspondingly be an n-vector of labels (-1 or +1).
    def fit (self, X, y):
        
        Xtilde = np.append(X, np.ones((X.shape[0], 1)), axis=1)
        m, n = Xtilde.shape
        
        G = -1 * y.reshape(1, y.shape[0]).T * Xtilde
        P = np.eye(n)
        q = np.zeros(n)
        h = np.full((m, 1), -1)

        # Solve -- if the variables above are defined correctly, you can call this as-is:
        sol = solvers.qp(matrix(P, tc='d'), matrix(q, tc='d'), matrix(G, tc='d'), matrix(h, tc='d'))

        # Fetch the learned hyperplane and bias parameters out of sol['x']
        results = np.array(sol['x'])
        self.w = results[:-1].reshape((results[:-1].shape[0]))
        self.b = results[-1]

    # Given a 2-D matrix of examples X, output a vector of predicted class labels
    def predict (self, x):
        return np.sign(np.dot(x, self.w) + self.b)

def test1 ():
    # Set up toy problem
    X = np.array([ [1,1], [2,1], [1,2], [2,3], [1,4], [2,4] ])
    y = np.array([-1,-1,-1,1,1,1])

    # Train your model
    svm453X = SVM453X()
    svm453X.fit(X, y)
    print(svm453X.w, svm453X.b)

    # Compare with sklearn
    svm = sklearn.svm.SVC(kernel='linear', C=1e15)  # 1e15 -- approximate hard-margin
    svm.fit(X, y)
    print(svm.coef_, svm.intercept_)

    acc = np.mean(svm453X.predict(X) == svm.predict(X))
    print("Acc={}".format(acc))

def test2 (seed):
    np.random.seed(seed)

    # Generate random data
    X = np.random.rand(20,3)
    # Generate random labels based on a random "ground-truth" hyperplane
    while True:
        w = np.random.rand(3)
        y = 2*(X.dot(w) > 0.5) - 1
        # Keep generating ground-truth hyperplanes until we find one
        # that results in 2 classes
        if len(np.unique(y)) > 1:
            break

    svm453X = SVM453X()
    svm453X.fit(X, y)

    # Compare with sklearn
    svm = sklearn.svm.SVC(kernel='linear', C=1e15)  # 1e15 -- approximate hard margin
    svm.fit(X, y)
    diff = np.linalg.norm(svm.coef_ - svm453X.w) + np.abs(svm.intercept_ - svm453X.b)
    print(diff)

    acc = np.mean(svm453X.predict(X) == svm.predict(X))
    print("Acc={}".format(acc))

    if acc == 1 and diff < 1e-1:
        print("Passed")

if __name__ == "__main__": 
    test1()
    for seed in range(5):
        test2(seed)


     pcost       dcost       gap    pres   dres
 0:  4.5713e-01  2.3378e+00  4e+00  2e+00  6e-16
 1:  9.8313e-01  3.9366e+00  7e-01  7e-01  3e-15
 2:  3.8561e+00  7.8776e+00  1e+00  4e-01  1e-13
 3:  8.5531e+00  8.9940e+00  3e-01  4e-02  3e-14
 4:  8.9951e+00  9.0000e+00  3e-03  5e-04  6e-14
 5:  9.0000e+00  9.0000e+00  3e-05  5e-06  2e-14
 6:  9.0000e+00  9.0000e+00  3e-07  5e-08  3e-14
Optimal solution found.
[0.99999996 0.99999998] [-3.99999989]
[[1. 1.]] [-4.]
Acc=1.0
     pcost       dcost       gap    pres   dres
 0:  1.4564e+00  1.1836e+01  2e+01  2e+00  4e-15
 1:  2.7889e+00  2.1466e+01  6e+00  1e+00  3e-15
 2:  1.1208e+01  1.0202e+02  1e+01  9e-01  9e-14
 3:  3.2657e+01  1.9223e+02  3e+01  8e-01  6e-13
 4:  1.1376e+02  3.1957e+02  7e+01  6e-01  1e-12
 5:  3.4044e+02  4.2149e+02  6e+01  2e-01  2e-12
 6:  4.2867e+02  4.3311e+02  3e+00  9e-03  4e-12
 7:  4.3393e+02  4.3418e+02  3e-01  6e-04  1e-11
 8:  4.3424e+02  4.3424e+02  3e-03  6e-06  2e-11
 9:  4.3424e+02  4.3424e+02  3e-05

In [114]:
# auc1: 0.8557191054102763
# auc2: 0.8334530188299514
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import mode
import numpy as np
import pandas

def bag_data(train, test, n):
    return np.split(train, n), np.split(test, n)

def bag_predictions(model, X_bags, y_bags, x_test):
    predictions = np.empty((0, x_test.shape[0]))
    for x_bag, y_bag in  zip(X_bags, y_bags):
        temp_model = model
        temp_model.fit(x_bag, y_bag)
        prediction = temp_model.decision_function(x_test)
        predictions = np.append(predictions, [prediction], axis = 0)
    average_predictions = np.mean(predictions, axis = 0)
    return average_predictions
        

# Load data
d = pandas.read_csv('train.csv')
y = np.array(d.target)  # Labels
X = np.array(d.iloc[:,2:])  # Features

# Split into train/test folds
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.5, random_state = 0)

X_tr_bags, y_tr_bags = bag_data(X_tr, y_tr, 50)

# Linear SVM
linear_svm =  LinearSVC(random_state = 0, dual = False)

# Non-linear SVM (polynomial kernel)
poly_svm = SVC(random_state = 0, kernel = 'poly')

# Apply the SVMs to the test set
yhat1 = bag_predictions(linear_svm, X_tr_bags, y_tr_bags, X_te)
yhat2 = bag_predictions(poly_svm, X_tr_bags, y_tr_bags, X_te)

# Compute AUC
auc1 = roc_auc_score(y_te, yhat1)
auc2 = roc_auc_score(y_te, yhat2)

print(auc1)
print(auc2)

0.8557191054102763
0.8334530188299514
