# Challenge Scratchbook

* This notebook explores methods for the Kernel Methods for Machine Learning Kaggle [challenge](https://www.kaggle.com/c/kernel-methods-for-machine-learning-2018-2019/data).

* Note that this is a binary classification challenge.

Our first goal is to implement three baseline methods:
1. Random classification
2. Predict 0 for every instance (this gives an idea of the proportion of 0s in the public test set)
3. Implement the Simple Pattern Recognition Algorithm (SPR) from Learning with Kernels 

Before that, we have to implement some data loaders


Now that we are done with the above, our goal is to implement SVM with Gaussian kernel.

## Imports

In [5]:
import csv
import os
import numpy as np

## Paths and Globals

In [6]:
# Root folders, resolved relative to the directory the notebook is run from
CWD = os.getcwd()
DATA_DIR = os.path.join(CWD, "data")
RESULT_DIR = os.path.join(CWD, "results")

# FILES[k] maps a role ("train_mat", "train", "test_mat", "test", "label")
# to the csv file name of dataset k, for k = 0, 1, 2.
FILES = {
    k: {
        "train_mat": f"Xtr{k}_mat100.csv",
        "train": f"Xtr{k}.csv",
        "test_mat": f"Xte{k}_mat100.csv",
        "test": f"Xte{k}.csv",
        "label": f"Ytr{k}.csv",
    }
    for k in range(3)
}

## 0 entries

In [13]:
# Baseline submission: write the header, then one row [id, 0] for each id in 0..2999
with open(os.path.join(RESULT_DIR, "results.csv"), 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')

    writer.writerow(["Id", "Bound"])
    writer.writerows([sample_id, 0] for sample_id in range(3000))

**Comment:**

* We get 0.51266 which means that the dataset is pretty balanced.

## SPR

In [37]:
class SPR:
    """
    Simple Pattern Recognition (SPR) classifier from the book *Learning with Kernels*,
    implemented here with the linear kernel k(x, x') = <x, x'>.

    The decision value of a point x is

        1/m1 * sum_{x1 in class 1} <x1, x>  -  1/m0 * sum_{x0 in class 0} <x0, x>  +  b

    i.e. the mean similarity to the class-1 training points minus the mean
    similarity to the class-0 training points, shifted by the offset ``b``.
    Labels are expected to be 0 / 1.

    Attributes
    ------------
    - X0, X1 : numpy.ndarray
        Training points of class 0 and class 1 (set by ``fit``)

    - m0, m1 : int
        Number of training points in class 0 and class 1

    - b : float
        Offset of the decision function
    """

    def __init__(self):
        self.X0 = None
        self.X1 = None
        self.m0 = 0
        self.m1 = 0
        self.b = 0

    def fit(self, X, y):
        """
        Fitting phase: store the points of each class and compute the offset ``b``.

        Parameters
        ------------
        - X : numpy.ndarray
            Data matrix (one row per sample)

        - y : numpy.array
            Labels (0 or 1), one per row of ``X``
        """
        self.X0 = X[y == 0]
        self.X1 = X[y == 1]

        self.m0 = len(self.X0)
        self.m1 = len(self.X1)

        # b = 1/2 * (mean pairwise similarity inside class 0
        #            - mean pairwise similarity inside class 1)
        mean_sim_0 = 1 / (self.m0**2) * np.sum(self.X0.dot(self.X0.T))
        mean_sim_1 = 1 / (self.m1**2) * np.sum(self.X1.dot(self.X1.T))
        self.b = 1/2 * (mean_sim_0 - mean_sim_1)

    def predict(self, X):
        """
        Predict a 0 / 1 label for every row of ``X``.

        Parameters
        ------------
        - X : numpy.ndarray
            Data matrix (one row per sample)

        Returns
        ------------
        - y_pred : numpy.array
            Float array of 0. and 1. A decision value of exactly 0 is mapped
            to class 0, so the output only ever contains valid labels.
        """
        y_pred = np.zeros(len(X))

        for i in range(len(X)):
            val = 1 / self.m1 * np.sum(self.X1.dot(X[i])) - 1 / self.m0 * np.sum(self.X0.dot(X[i])) + self.b
            # Strictly positive decision value -> class 1, anything else -> class 0
            y_pred[i] = 1.0 if val > 0 else 0.0
        return y_pred

    def score(self, y, y_pred):
        """
        Accuracy: fraction of entries where ``y_pred`` equals ``y``.

        Parameters
        ------------
        - y : numpy.array
            True labels

        - y_pred : numpy.array
            Predicted labels

        Returns
        ------------
        - float in [0, 1]
        """
        # Average the boolean "is correct" mask; summing the *labels* of the
        # correct predictions would only count correctly predicted 1s.
        return np.mean(np.asarray(y) == np.asarray(y_pred))
    

## Data loading refactoring

In [7]:
def load_data(file_id, mat=True):
    """
    Load the training features, training labels and test features of one dataset.

    Parameters
    -----------
    - file_id : int
        Key of the dataset in ``FILES``

    - mat : bool
        If True, read the files registered under "train_mat" / "test_mat",
        otherwise the ones registered under "train" / "test"

    Returns
    -----------
    - X_train : numpy.ndarray (float)
    - Y_train : numpy.array (int)
    - X_test : numpy.ndarray (float)
    """
    entry = FILES[file_id]

    if mat:
        file_names = (entry["train_mat"], entry["label"], entry["test_mat"])
    else:
        file_names = (entry["train"], entry["label"], entry["test"])

    # One list of raw rows per file, in the order: train features, labels, test features
    parsed = []
    for name in file_names:
        with open(os.path.join(DATA_DIR, name), "r", newline="") as handle:
            if "Y" in name:
                # Label file: comma separated, one header line, label in the second column
                rows = csv.reader(handle, delimiter=",")
                next(rows, None)
                parsed.append([record[1] for record in rows])
            else:
                # Feature file: space separated, no header
                parsed.append(list(csv.reader(handle, delimiter=" ")))

    train_rows, label_rows, test_rows = parsed

    X_train = np.array(train_rows).astype("float")
    Y_train = np.array(label_rows).astype("int")
    X_test = np.array(test_rows).astype("float")

    return X_train, Y_train, X_test

## Train and test on the different sets

In [41]:
# Predictions of all datasets stored back to back: dataset i fills a slice of width 1000
results = np.zeros(3000)

for i in range(len(FILES)):
    X_train, Y_train, X_test = load_data(i)

    # Fit a fresh SPR classifier on dataset i
    clf = SPR()
    clf.fit(X_train, Y_train)

    offset = i * 1000
    results[offset:offset + 1000] = clf.predict(X_test)

## Save results

In [8]:
def save_results(filename, results):
    """
    Save results in a csv file with the header ``Id,Bound`` followed by one
    ``index,prediction`` row per entry.

    Parameters
    -----------
    - filename : string
        Name of the file to be saved under the ``results`` folder
        (must end with ".csv")

    - results : numpy.array
        Resulting array (0 and 1's) of length 3000; any array-like is accepted
        and its values are converted to int

    Raises
    -----------
    - AssertionError
        If ``filename`` does not end with ".csv" or ``results`` does not hold
        exactly 3000 predictions. Both checks run before the file is opened,
        so an invalid call never creates or truncates the output file.
    """
    # Validate first: opening with mode 'w' truncates an existing file
    assert filename.endswith(".csv"), "this is not a csv extension!"
    assert len(results) == 3000, "There is not 3000 predictions"

    # Convert results to int
    results = np.asarray(results).astype("int")

    with open(os.path.join(RESULT_DIR, filename), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')

        # Write header
        writer.writerow(["Id", "Bound"])
        # Write results: one (index, prediction) row per entry
        writer.writerows(enumerate(results))

In [46]:
# Test the save results function: write the current `results` array
# to "test_results.csv" (save_results places it under RESULT_DIR)
save_results("test_results.csv", results)

## SVM with Gaussian Kernel

In [71]:
def second_order_kernel(x, y, c=0):
    """
    Polynomial kernel of degree 2: (<x, y> + c)^2.

    Parameters
    -----------
    - x, y : numpy.array
        The two points to compare

    - c : number
        Constant added to the inner product before squaring. Defaults to 0 so
        the function can be called as ``kernel(x, y)`` like ``rbf_kernel``.
    """
    return (x.dot(y) + c)**2

# Define the Gaussian Kernel (or Radial Basis Function)

def rbf_kernel(x, y, gamma=10):
    """
    Gaussian (RBF) kernel: exp(-gamma * ||x - y||^2).

    Parameters
    -----------
    - x, y : numpy.array
        The two points to compare

    - gamma : number
        Multiplier of the squared Euclidean distance in the exponent
    """
    distance = np.linalg.norm(x - y)
    return np.exp(-gamma * distance**2)

In [100]:
# Work on the dataset registered under id 2
X_train, Y_train, X_test = load_data(2)

# Kernel used to build the Gram matrix and, later on, to predict
kernel = rbf_kernel

# Gram matrix of the training set: entry (i, j) is kernel(X_train[j], X_train[i])
K = np.array([
    [kernel(col_point, row_point) for col_point in X_train]
    for row_point in X_train
])

K.shape

(2000, 2000)

In [91]:
from scipy import optimize

In [92]:
# Map the labels from {0, 1} to {-1, +1} to fit the optimization formulation
y = 2 * Y_train - 1

###########################################
# Use scipy.optimize to solve the problem #
###########################################

n = len(y)

# Regularization parameter λ (alternative that was considered: λ = 1 / n)
λ = 1 / 10000


def f(x):
    """Loss function: 1/2 * x^T K x - y^T x."""
    return 1/2 * x.T.dot(K).dot(x) - y.T.dot(x)


def grad_f(x):
    """Jacobian of the loss function (for a symmetric K): K x - y."""
    return K.dot(x) - y


# Box constraints as (min, max) pairs: x[i] lies between 0 and y[i] / (2 * n * λ),
# so the interval sits on the side of 0 given by the sign of y[i]
bounds = [
    [0, label / (2 * n * λ)] if label > 0 else [label / (2 * n * λ), 0]
    for label in y
]

x0 = np.zeros(n)
opts = {"maxiter": 15000}
res = optimize.minimize(
    f,
    x0,
    jac=grad_f,
    bounds=bounds,
    method="L-BFGS-B",
    options=opts,
)

In [93]:
# Use the optimizer's solution to classify the test points
α = res["x"]

n_test = len(X_test)
y_predict = np.zeros(n_test)

for i, x_new in enumerate(X_test):
    # Decision value: kernel expansion over the n training points
    decision = np.sum([α[j] * kernel(X_train[j], x_new) for j in range(n)])
    # sign in {-1, +1} is mapped to a label in {0, 1}
    y_predict[i] = np.sign(decision) / 2 + 1/2

y_predict

array([0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1.,
       1., 1., 0., 1., 0.

In [94]:
# Distinct predicted values on the test set and how many times each occurs
np.unique(y_predict, return_counts=True)

(array([0., 1.]), array([657, 343]))

In [95]:
# Solution vector found by the optimizer (entry "x" of the result)
α = res["x"]

In [96]:
# u: distinct values taken by the coefficients of α; c: number of occurrences of each
u, c = np.unique(α, return_counts=True)
c

array([751,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1, 344,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1, 758])

In [60]:
# Display the distinct values of α returned by np.unique
u

array([-2.5       , -2.49744715, -2.48645214, ...,  2.48677423,
        2.48958713,  2.5       ])

In [97]:
# Number of coefficients of α that are exactly 0
# (the commented-out line would display the matching value instead of its count)
#u[u == 0]
c[u == 0]

array([344])

In [98]:
# Classify the training points themselves with the learned coefficients
α = res["x"]
y_train_predict = np.zeros(n)

for i in range(n):
    x_i = X_train[i]
    # Decision value: kernel expansion over the n training points
    decision_i = np.sum([α[j] * kernel(X_train[j], x_i) for j in range(n)])
    # sign in {-1, +1} is mapped to a label in {0, 1}
    y_train_predict[i] = np.sign(decision_i) / 2 + 1/2

In [99]:
# Training accuracy: number of matching labels divided by n
(Y_train == y_train_predict).sum() / n

0.6865