# <center> Stochastic Optimization and Automatic Differentiation for Machine Learning<br/><br/>SDCA<br/><br/>Zakarya Ali</center>



In this notebook, I implement the **SDCA (Stochastic Dual Coordinate Ascent)** algorithm (from the article [Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization by Shai Shalev-Shwartz and Tong Zhang](http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf)) to estimate Support Vector Machines. First, I apply the algorithm to randomly generated data, then I use a credit fraud dataset to compare **SDCA** with **PEGASOS (Primal Estimated subGrAdient SOlver for SVM)**, a sub-gradient descent approach.

# Prerequisites

We call and create the tools we will need throughout the notebook.

In [None]:
%pylab inline
import numpy as np
import pandas as pd
from sklearn import preprocessing, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
np.random.seed(2018)

In [None]:
def hinge_losses(y, X, w, n_samples):
    """Return the hinge loss max(0, 1 - y_i * <x_i, w>) summed over the n_samples rows of X."""
    floor = np.zeros(n_samples)
    slack = np.ones(n_samples) - (y * (X.dot(w)))
    return np.fmax(floor, slack).sum()

def get_accuracy(X, y, w):
    """Score the linear classifier sign(X.w) against the labels y.

    Scores >= 0 are predicted as +1 and scores < 0 as -1; the fraction of
    correct predictions is computed by accuracy_score.
    """
    scores = np.dot(X, w)
    non_negative = scores >= 0
    negative = scores < 0
    scores[non_negative] = 1
    scores[negative] = -1
    return accuracy_score(y, scores)

def convergence_plot(plot_data, labels, ylim=(None, None)):
    """Plot one primal-objective curve per label on a single 12x8 figure.

    Parameters
    ----------
    plot_data : sequence of sequences
        ``plot_data[i]`` holds the primal objective values of the i-th run,
        one value per step.
    labels : sequence of str
        Legend entries; ``labels[i]`` names ``plot_data[i]``.
    ylim : pair or None, optional
        ``(bottom, top)`` limits for the y axis. ``(None, None)`` (the default)
        or ``None`` leaves the limits untouched.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1)
    # The default is an immutable tuple (a list default is shared between calls);
    # comparing through tuple() lets callers keep passing lists such as [0, 3].
    if ylim is not None and tuple(ylim) != (None, None):
        ax.set_ylim(ylim)
    # Draw on the explicit axes instead of relying on pyplot's "current axes" state.
    for i, label in enumerate(labels):
        ax.plot(plot_data[i], label=label)
    ax.set_title("Convergence plot")
    ax.set_ylabel("Primal Objective")
    ax.set_xlabel("Steps")
    ax.legend(loc="best")
    plt.show()

# 1. Algorithms

## 1.1. SDCA

In [None]:
def primal_param(X, alpha, lambda_, n_samples):
    """Map a dual vector alpha to the primal vector w = X^T alpha / (lambda_ * n_samples)."""
    scaling = 1 / (lambda_ * n_samples)
    weighted_rows = np.dot(np.transpose(X), alpha)
    return scaling * weighted_rows

def get_delta_alpha_q(X, y, alpha, q, lambda_, n_samples, w):
    """Return the SDCA increment for dual coordinate q.

    The unconstrained step (y_q - <x_q, w>) / (||x_q||^2 / (lambda_ * n_samples))
    is added to alpha_q, the product y_q * alpha_q is clipped to [0, 1], and the
    difference with the current alpha_q is returned.
    """
    x_q, y_q, alpha_q = X[q], y[q], alpha[q]
    curvature = (1 / (lambda_ * n_samples)) * (np.dot(np.transpose(x_q), x_q))
    score = np.dot(np.transpose(x_q), w)
    free_step = (y_q - score) / curvature
    clipped = max(0, min(1, y_q * (free_step + alpha_q)))
    return y_q * clipped - alpha_q

In [None]:
# SDCA for the hinge-loss SVM with three ways of choosing the dual coordinate:
# uniform random sampling, a fresh permutation per epoch, and one fixed cyclic order.
# The three public functions only differ in the index stream they feed to _sdca_svm.

def _sdca_svm(X, y, n_samples, T_0, lambda_, coordinates):
    """Run SDCA, visiting dual coordinates in the order produced by `coordinates`.

    Parameters
    ----------
    X, y : feature matrix and label vector, indexed positionally (X[q], y[q]).
    n_samples : number of rows of X used by the solver.
    T_0 : index of the first iterate included in the returned average.
    lambda_ : regularization strength of the primal objective.
    coordinates : iterable of int, one dual coordinate per update step.

    Returns
    -------
    (mean of w_history[T_0:], w_history, primal_history) where the two lists
    hold one entry per update step.
    """
    # initialization
    alpha = np.zeros(n_samples)
    w_history = []
    primal_history = []
    w = primal_param(X, alpha, lambda_, n_samples)

    for q in coordinates:
        # SDCA update step: only coordinate q of alpha changes, so it is updated
        # in place instead of adding a freshly allocated one-hot vector.
        alpha[q] += get_delta_alpha_q(X, y, alpha, q, lambda_, n_samples, w)
        w = primal_param(X, alpha, lambda_, n_samples)

        w_history.append(w)
        primal_history.append(hinge_losses(y, X, w, n_samples) / n_samples + (lambda_ / 2) * np.linalg.norm(w)**2)

    # np.asarray is spelled out so the function does not depend on `%pylab`
    # having injected a bare `asarray` into the namespace.
    return np.asarray(w_history[T_0:]).mean(axis=0), w_history, primal_history


def sdca_svm_random(X, y, n_samples, T_0, lambda_, nb_epochs=50):
    """SDCA drawing one coordinate uniformly at random (with replacement) per step.

    Performs n_samples * nb_epochs updates; see _sdca_svm for the return value.
    """
    # Lazy generator: each index is drawn right before its update, as in a plain loop.
    coordinates = (np.random.randint(0, n_samples) for _ in range(n_samples * nb_epochs))
    return _sdca_svm(X, y, n_samples, T_0, lambda_, coordinates)


def sdca_svm_permutation(X, y, n_samples, T_0, lambda_, nb_epochs=50):
    """SDCA sweeping over a new random permutation of the samples at every epoch.

    Performs n_samples * nb_epochs updates; see _sdca_svm for the return value.
    """
    # The inner iterable is re-evaluated for every epoch, giving a fresh permutation.
    coordinates = (q for _ in range(nb_epochs) for q in np.random.permutation(n_samples))
    return _sdca_svm(X, y, n_samples, T_0, lambda_, coordinates)


def sdca_svm_cyclic(X, y, n_samples, T_0, lambda_, nb_epochs=50):
    """SDCA sweeping over one random permutation that is reused for every epoch.

    Performs n_samples * nb_epochs updates; see _sdca_svm for the return value.
    """
    perm = np.random.permutation(n_samples)
    coordinates = (q for _ in range(nb_epochs) for q in perm)
    return _sdca_svm(X, y, n_samples, T_0, lambda_, coordinates)

## 1.2. Pegasos

Main differences with SDCA:
- We compute sub-gradients, not gradients, at each step
- The step size is always $\frac{1}{\lambda t}$

We undersample the majority class in order to get a better training set.

In [None]:
# Randomly down-sample the majority class (-1) of the training split to 1000 rows.
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names for the
# former `ratio` / `fit_sample`, which no longer exist in recent releases.
rus = RandomUnderSampler(random_state=42, sampling_strategy={-1: 1000})
X_res, y_res = rus.fit_resample(X_train, y_train)
# The solvers index labels positionally (y[q]), so keep them as a plain ndarray.
y_res = np.asarray(y_res)
n_samples = X_res.shape[0]

In [None]:
# Histogram of the labels returned by the under-sampler
plt.hist(y_res)
plt.title('Distribution of resampled credit card transaction (-1 = no fraud | 1 = fraud)')
plt.show()

In [None]:
# Size of the resampled label vector
y_res.shape

In [None]:
# Size of the resampled feature matrix
X_res.shape

## 3.1. SDCA

In [None]:
# All three runs use the default nb_epochs=50, so T_0 = 50 * n_samples // 2 makes
# the returned average cover the second half of the iterates.

# SDCA Random
gen_opt_w_sdca_avg_r, gen_w_hist_sdca_avg_r, gen_primal_hist_sdca_avg_r = sdca_svm_random(
    X_res, y_res, n_samples, T_0=50 * n_samples // 2, lambda_=50
)
# SDCA Permutation
gen_opt_w_sdca_avg_p, gen_w_hist_sdca_avg_p, gen_primal_hist_sdca_avg_p = sdca_svm_permutation(
    X_res, y_res, n_samples, T_0=50 * n_samples // 2, lambda_=50
)
# SDCA Cyclic
gen_opt_w_sdca_avg_c, gen_w_hist_sdca_avg_c, gen_primal_hist_sdca_avg_c = sdca_svm_cyclic(
    X_res, y_res, n_samples, T_0=50 * n_samples // 2, lambda_=50
)

In [None]:
# Primal objective of the three SDCA sampling schemes on the resampled data
convergence_plot(
    [gen_primal_hist_sdca_avg_r, gen_primal_hist_sdca_avg_p, gen_primal_hist_sdca_avg_c],
    ["Random", "Permutation", "Cyclic"],
)

## 3.2. PEGASOS

In [None]:
# PEGASOS without the projection step
gen_opt_w_peg_avg, gen_w_hist_peg_avg, gen_primal_hist_peg_avg = pegasos_svm(
    X_res, y_res, n_samples, T_0=50 * n_samples // 2, lambda_=50, proj=False
)
# PEGASOS with the projection step
gen_opt_w_peg_avg_proj, gen_w_hist_peg_avg_proj, gen_primal_hist_peg_avg_proj = pegasos_svm(
    X_res, y_res, n_samples, T_0=50 * n_samples // 2, lambda_=50, proj=True
)

In [None]:
# The y axis is restricted to [0.898, 0.9]; anything outside that window is not shown
convergence_plot(
    [gen_primal_hist_peg_avg, gen_primal_hist_peg_avg_proj],
    ["Pegasos", "Proj"],
    ylim=[0.898, 0.9],
)

## 3.3. Accelerated mini-batch SDCA (ASDCA)

In [None]:
# NOTE(review): T_0 and lambda_ are derived from n_samples while the solver runs on
# n_samples_train rows — confirm that n_samples (not n_samples_train) is intended.

# Non projected PEGASOS
opt_w_peg, w_hist_peg, primal_hist_peg = pegasos_svm(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples, proj=False,
)
# Projected PEGASOS
opt_w_peg_p, w_hist_peg_p, primal_hist_peg_p = pegasos_svm(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples, proj=True,
)

In [None]:
# Primal objective of both PEGASOS variants, y axis limited to [0, 3]
convergence_plot(
    [primal_hist_peg, primal_hist_peg_p],
    ["Non projected PEGASOS", "Projected PEGASOS"],
    ylim=[0,3],
)

In [None]:
# Prediction accuracy on test set: last iterate, tail-averaged iterate, and the
# ground-truth parameter beta as a reference.
print("Non projected PEGASOS", get_accuracy(X_test, y_test, w_hist_peg[-1]))
print("Projected PEGASOS", get_accuracy(X_test, y_test, w_hist_peg_p[-1]))
# Label fixed: it previously read "Average PEGASOSO".
print("Average PEGASOS", get_accuracy(X_test, y_test, opt_w_peg))
print("Average projected PEGASOS", get_accuracy(X_test, y_test, opt_w_peg_p))
print("True", get_accuracy(X_test, y_test, beta))

## 2.3. Accelerated mini-batch SDCA

In [None]:
# ASDCA
# NOTE(review): T_0 and lambda_ are derived from n_samples while the solver runs on
# n_samples_train rows — confirm that n_samples (not n_samples_train) is intended.
opt_w_sdca_batch, w_hist_sdca_batch, primal_hist_sdca_batch = asdca_svm(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples,
    batch_size = 8, theta = 0.15,
)

In [None]:
# convergence_plot expects a list of curves (plot_data[i] is drawn for labels[i]),
# so the single ASDCA history has to be wrapped in a list.
convergence_plot([primal_hist_sdca_batch],
                 ["ASDCA"])

In [None]:
# Computing prediction accuracy on test set
# Last ASDCA iterate
print("ASDCA", get_accuracy(X_test, y_test, w_hist_sdca_batch[-1]))
# Average of the iterates from index T_0 onwards
print("Average ASDCA", get_accuracy(X_test, y_test, opt_w_sdca_batch))
# Reference: the parameter beta that generated the labels
print("True", get_accuracy(X_test, y_test, beta))

## 2.4. Mini-batch PEGASOS

In [None]:
# NOTE(review): T_0 and lambda_ are derived from n_samples while the solver runs on
# n_samples_train rows — confirm that n_samples (not n_samples_train) is intended.

# Mini-batch PEGASOS
opt_w_peg_batch, w_hist_peg_batch, primal_hist_peg_batch = mini_batch_pegasos_svm(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples,
    batch_size=8, proj=False,
)
# Mini-batch PEGASOS (Projected)
opt_w_peg_batch_p, w_hist_peg_batch_p, primal_hist_peg_batch_p = mini_batch_pegasos_svm(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples,
    batch_size=8, proj=True,
)

In [None]:
# Primal objective of both mini-batch PEGASOS variants, y axis limited to [0, 1]
convergence_plot(
    [primal_hist_peg_batch, primal_hist_peg_batch_p],
    ["Mini-Batch Non projected PEGASOS", "Mini-Batch projected PEGASOS"],
    ylim=[0,1],
)

In [None]:
# Computing prediction accuracy on test set
# Last iterate of each run
print("Mini Batch PEGASOS", get_accuracy(X_test, y_test, w_hist_peg_batch[-1]))
print("Mini Batch PEGASOS (Projected)", get_accuracy(X_test, y_test, w_hist_peg_batch_p[-1]))
# Average of the iterates from index T_0 onwards
print("Average Mini Batch PEGASOS", get_accuracy(X_test, y_test, opt_w_peg_batch))
print("Average Mini Batch PEGASOS (Projected)", get_accuracy(X_test, y_test, opt_w_peg_batch_p))
# Reference: the parameter beta that generated the labels
print("True", get_accuracy(X_test, y_test, beta))

## 2.5. Results comparison

In [None]:
# Train test split for this dataset :
# a third of the rows is held out; stratify=y keeps the class proportions of y in both splits
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.33, stratify=y, random_state=2018)

In [None]:
# Class balance of the training labels. y_train has not been resampled at this
# point, so the title must not call the data "resampled".
plt.figure(figsize=(12,8))
y_train.hist()
plt.title('Distribution of credit card transactions in the training set (-1 = no fraud | 1 = fraud)')
plt.show()

The problem is highly unbalanced (0.175% of credit fraud in the dataset)

In [None]:
# Convergence comparison : one representative run per algorithm on the same axes
convergence_plot(
    [primal_hist_sdca_r, primal_hist_peg, primal_hist_sdca_batch, primal_hist_peg_batch],
    ["SDCA", "PEGASOS", "ASDCA", "Mini-Batch PEGASOS"],
    ylim = [0, 2.5],
)

# 3. Application: Credit Fraud detection (highly unbalanced dataset)

This is a dataset from a Kaggle competition : https://www.kaggle.com/mlg-ulb/creditcardfraud/data

In [None]:
def rework_labels(label):
    """Map the label 0 to -1 and return every other label unchanged."""
    return -1 if label == 0 else label

# Load the transactions and drop rows with missing values
data = pd.read_csv("data/creditcard.csv")
data = data.dropna()
print(data.shape)
# Every column except "Class" is used as a feature
X = data.drop(columns=["Class"])
# Recode the label column with rework_labels (0 becomes -1, other values are kept)
y = data["Class"].apply(rework_labels)
data.head()

Our dataset contains 30 features and 284807 observations.

In [None]:
# Size of the feature table
X.shape

In [None]:
# Size of the label vector
y.shape

In [None]:
# Standardisation of features is required before SVM :
# NOTE(review): the scaler is fitted on every row of X, so rows that later end up in
# the test split contribute to the scaling statistics — confirm this is acceptable.
X_std = preprocessing.StandardScaler().fit_transform(X)

In [None]:
def partial_sub_gradient(w, X, y, q, lambda_):
    """Sub-gradient at w of the regularized hinge loss of sample q.

    The regularization part lambda_ * w is always present; the data part
    -y[q] * X[q] is only added when the margin y[q] * <w, X[q]> is below 1.
    """
    margin_violated = y[q] * np.dot(w, X[q]) < 1
    if not margin_violated:
        return lambda_ * w
    return lambda_ * w - y[q] * X[q]

In [None]:
def pegasos_svm(X, y, n_samples, T_0, lambda_, nb_epochs=50, proj=False):
    """Compute the PEGASOS SVM for both projection and non projection options.

    Parameters
    ----------
    X, y : feature matrix and label vector, indexed positionally (X[q], y[q]).
    n_samples : number of rows of X to sample from.
    T_0 : index of the first iterate included in the returned average.
    lambda_ : regularization strength; also sets the step size 1 / (lambda_ * t).
    nb_epochs : the solver performs n_samples * nb_epochs updates.
    proj : when true, rescale w after each update so that its norm does not
        exceed 1 / sqrt(lambda_).

    Returns
    -------
    (mean of w_history[T_0:], w_history, primal_history) where the two lists
    hold one entry per update.
    """
    # initialization
    w = np.zeros(X.shape[1])
    w_history = []
    primal_history = []

    # t starts at 1 because it divides the step size; the upper bound is inclusive
    # so that exactly n_samples * nb_epochs updates are performed (the previous
    # range(1, n_samples * nb_epochs) stopped one update short).
    for t in range(1, n_samples * nb_epochs + 1):
        # compute Pegasos step size
        step_size = 1 / (lambda_ * t)
        # pick random sample
        q = np.random.randint(0, n_samples)
        # compute and apply Pegasos update rule
        w = w - step_size * partial_sub_gradient(w, X, y, q, lambda_)
        # projection step (optional)
        if proj:
            norm_w = np.linalg.norm(w)
            # A zero vector is already inside the ball; skipping it avoids 1 / 0.
            if norm_w > 0:
                w = min(1, 1 / (np.sqrt(lambda_) * norm_w)) * w

        w_history.append(w)
        primal_history.append(hinge_losses(y, X, w, n_samples) / n_samples + (lambda_ / 2) * np.linalg.norm(w)**2)

    # np.asarray is spelled out so the function does not depend on `%pylab`
    # having injected a bare `asarray` into the namespace.
    return np.asarray(w_history[T_0:]).mean(axis=0), w_history, primal_history

## 1.3. Accelerated mini-batch SDCA (ASDCA)
For accelerated mini-batch SDCA, the smooth version of the hinge loss is used.

In [None]:
def partial_gradient_smooth(w, X, y, q):
    """Gradient at w of the smoothed hinge loss of sample q.

    With margin m = y[q] * <X[q], w>:
      * m < 0        -> -y[q] * X[q]
      * 0 <= m <= 1  -> (m - 1) * y[q] * X[q]
      * otherwise    -> zero vector
    """
    margin = y[q] * np.dot(X[q], w)

    if margin < 0:
        return - y[q] * X[q]
    if 0 <= margin <= 1:
        return (margin - 1) * y[q] * X[q]
    return np.zeros(X.shape[1])

In [None]:
def asdca_svm(X, y, n_samples, T_0, lambda_, batch_size, theta=0.3, nb_epochs=50):
    """Accelerated mini-batch SDCA for the SVM, using the smoothed hinge loss gradient.

    Parameters
    ----------
    X, y : feature matrix and label vector, indexed positionally (X[q], y[q]).
    n_samples : number of rows of X to sample from.
    T_0 : index of the first iterate included in the returned average.
    lambda_ : regularization strength.
    batch_size : number of distinct samples drawn at every step.
    theta : mixing weight used in the updates of u, alpha and w.
    nb_epochs : the solver performs n_samples * nb_epochs mini-batch steps.

    Returns
    -------
    (mean of w_history[T_0:], w_history, primal_history) where the two lists
    hold one entry per step.
    """
    # initialization: one dual vector per sample, stored as a column of alpha
    alpha = np.zeros(shape=(X.shape[1], n_samples))
    bar_alpha = np.mean(alpha, axis=1)
    w = np.zeros(X.shape[1])
    w_history = []
    primal_history = []

    for _ in range(n_samples * nb_epochs):
        u = (1 - theta) * w + (theta / lambda_) * bar_alpha
        # mini-batch sampling
        batch = np.random.choice(np.arange(0, n_samples), batch_size, replace=False)
        alpha_dif = []
        # update step
        for q in batch:
            old_alpha_q = alpha[:, q].copy()
            alpha[:, q] = (1 - theta) * alpha[:, q] - theta * partial_gradient_smooth(u, X, y, q)
            alpha_dif.append(alpha[:, q] - old_alpha_q)
        # np.sum is called explicitly: the builtin sum() has no `axis` argument and
        # the bare name only resolved to numpy's version because of `%pylab`.
        bar_alpha = bar_alpha + (1 / n_samples) * np.sum(alpha_dif, axis=0)
        w = (1 - theta) * w + (theta / lambda_) * bar_alpha

        w_history.append(w)
        primal_history.append(hinge_losses(y, X, w, n_samples) / n_samples + (lambda_ / 2) * np.linalg.norm(w)**2)

    # Same reasoning for np.asarray instead of the pylab-injected `asarray`.
    return np.asarray(w_history[T_0:]).mean(axis=0), w_history, primal_history


## 1.4. Mini-batch Pegasos

In [None]:
def mini_batch_pegasos_svm(X, y, n_samples, T_0, lambda_, batch_size, nb_epochs=50, proj=False):
    """Mini-batch PEGASOS for the SVM, with an optional projection step.

    Parameters
    ----------
    X, y : feature matrix and label vector, indexed positionally (X[q], y[q]).
    n_samples : number of rows of X to sample from.
    T_0 : index of the first iterate included in the returned average.
    lambda_ : regularization strength; also sets the step size 1 / (lambda_ * t).
    batch_size : number of distinct samples drawn at every step.
    nb_epochs : the solver performs n_samples * nb_epochs mini-batch steps.
    proj : when true, rescale w after each update so that its norm does not
        exceed 1 / sqrt(lambda_).

    Returns
    -------
    (mean of w_history[T_0:], w_history, primal_history) where the two lists
    hold one entry per step.
    """
    # initialization
    w = np.zeros(X.shape[1])
    w_history = []
    primal_history = []

    # main loop; t starts at 1 because it divides the step size, and the upper
    # bound is inclusive so that exactly n_samples * nb_epochs steps are performed
    # (the previous range(1, n_samples * nb_epochs) stopped one step short).
    for t in range(1, n_samples * nb_epochs + 1):
        step_size = 1 / (lambda_ * t)
        # mini-batch sampling
        batch = np.random.choice(np.arange(0, n_samples), batch_size, replace=False)
        # Samples whose margin is below 1 contribute y[q] * X[q]; the others contribute 0.
        sum_vect = []
        for q in batch:
            if y[q] * np.dot(X[q], w) < 1:
                sum_vect.append(y[q] * X[q])
            else:
                sum_vect.append(np.zeros(X.shape[1]))

        # np.sum is called explicitly: the builtin sum() has no `axis` argument and
        # the bare name only resolved to numpy's version because of `%pylab`.
        w = w - step_size * (lambda_ * w - (1 / batch_size) * np.sum(sum_vect, axis=0))
        # Projection step
        if proj:
            norm_w = np.linalg.norm(w)
            # A zero vector is already inside the ball; skipping it avoids 1 / 0.
            if norm_w > 0:
                w = min(1, 1 / (np.sqrt(lambda_) * norm_w)) * w

        w_history.append(w)
        primal_history.append(hinge_losses(y, X, w, n_samples) / n_samples + (lambda_ / 2) * np.linalg.norm(w)**2)

    # Same reasoning for np.asarray instead of the pylab-injected `asarray`.
    return np.asarray(w_history[T_0:]).mean(axis=0), w_history, primal_history


# 2. Application: Simulated data
We now apply those algorithms on simulated data: 1000 observations with 100 features.

In [None]:
n_samples, n_features = 1000, 100
# Standard normal design matrix
X = np.random.randn(n_samples, n_features)
# Standard normal ground-truth parameter
beta = np.random.randn(n_features)
# Labels in {-1, +1}: sign of the linear score plus standard normal noise,
# with a score of exactly 0 mapped to +1
y = np.where(np.dot(X, beta) + np.random.randn(n_samples) >= 0, 1.0, -1.0)

In [None]:
# We normalize the features before applying SVM
# NOTE(review): the scaler is fitted on every row of X, i.e. before the split below,
# so test rows contribute to the scaling statistics — confirm this is acceptable.
X_std = preprocessing.StandardScaler().fit_transform(X)

# Then we split our dataset for training and testing
# (a third of the rows is held out; random_state fixes the split)
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.33, random_state=42)
# Number of training rows, passed to the solvers as their n_samples argument
n_samples_train = X_train.shape[0]

## 2.1. SDCA

In [None]:
# ASDCA on the resampled data, mini-batches of 8 samples
gen_opt_w_sdca_batch_avg, gen_w_hist_sdca_batch_avg, gen_primal_hist_sdca_batch_avg = asdca_svm(
    X_res, y_res, n_samples,
    T_0=50 * n_samples // 2, lambda_=50,
    batch_size = 8, theta = 0.15,
)

In [None]:
# The y axis is restricted to [0.905, 0.915]; anything outside that window is not shown
convergence_plot(
    [gen_primal_hist_sdca_batch_avg],
    ["ASDCA"],
    ylim=[0.905, 0.915],
)

## 3.4. Mini-batch PEGASOS

In [None]:
# Mini-batch PEGASOS without the projection step
gen_opt_w_peg_batch_avg, gen_w_hist_peg_batch_avg, gen_primal_hist_peg_batch_avg = mini_batch_pegasos_svm(
    X_res, y_res, n_samples,
    T_0=50 * n_samples // 2, lambda_=50,
    batch_size=8, proj=False,
)
# Mini-batch PEGASOS (Projected): proj must be True here, otherwise this run is
# just a second non-projected run stored under the "_proj" names.
gen_opt_w_peg_batch_avg_proj, gen_w_hist_peg_batch_avg_proj, gen_primal_hist_peg_batch_avg_proj = mini_batch_pegasos_svm(
    X_res, y_res, n_samples,
    T_0=50 * n_samples // 2, lambda_=50,
    batch_size=8, proj=True,
)

In [None]:
# The y axis is restricted to [0.898, 0.9]; anything outside that window is not shown
convergence_plot(
    [gen_primal_hist_peg_batch_avg, gen_primal_hist_peg_batch_avg_proj],
    ["Batch", "Proj"],
    ylim=[0.898, 0.9],
)

## 3.5. Comparison

In [None]:
# One representative run per algorithm on the same axes, y axis limited to [0.8, 1]
convergence_plot(
    [gen_primal_hist_sdca_avg_r, gen_primal_hist_peg_avg, gen_primal_hist_sdca_batch_avg, gen_primal_hist_peg_batch_avg],
    ["SDCA", "PEGASOS", "ASDCA", "Mini-Batch PEGASOS"],
    ylim = [0.8, 1],
)

## 3.6 Accuracy

In [None]:
# NOTE(review): T_0 and lambda_ are derived from n_samples while the solvers run on
# n_samples_train rows — confirm that n_samples (not n_samples_train) is intended.

# SDCA Random
opt_w_sdca_r, w_hist_sdca_r, primal_hist_sdca_r = sdca_svm_random(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples,
)
# SDCA Permutation
opt_w_sdca_p, w_hist_sdca_p, primal_hist_sdca_p = sdca_svm_permutation(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples,
)
# SDCA Cyclic
opt_w_sdca_c, w_hist_sdca_c, primal_hist_sdca_c = sdca_svm_cyclic(
    X_train, y_train, n_samples_train,
    T_0=50 * n_samples // 2, lambda_=1 / n_samples,
)

In [None]:
# Primal objective of the three SDCA sampling schemes on the simulated training set
convergence_plot(
    [primal_hist_sdca_r, primal_hist_sdca_p, primal_hist_sdca_c],
    ["Random", "Permutation", "Cyclic"],
)

In [None]:
# Prediction accuracy on test set
# Last iterate of each SDCA variant
print("SDCA Random:", get_accuracy(X_test, y_test, w_hist_sdca_r[-1]))
print("SDCA Permutation:", get_accuracy(X_test, y_test, w_hist_sdca_p[-1]))
print("SDCA Cyclic:", get_accuracy(X_test, y_test, w_hist_sdca_c[-1]))
# Average of the iterates from index T_0 onwards
print("SDCA Average Random:", get_accuracy(X_test, y_test, opt_w_sdca_r))
print("SDCA Average Permutation:", get_accuracy(X_test, y_test, opt_w_sdca_p))
print("SDCA Average Cyclic:", get_accuracy(X_test, y_test, opt_w_sdca_c))
# Reference: the parameter beta that generated the labels
print("True:", get_accuracy(X_test, y_test, beta))

## 2.2. PEGASOS