In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Gradient descent

## Analytical gradient descent

In [2]:
def gradient_descent(x0, f, g, learning_rate=0.05, T=1000):
    """
    X: array of values
    f: function to apply
    g: gradient of the function
    learning_rate: Learning rate for each step
    T: number of iteration
    """
    x = x0
    t = 0
    while t <= T:
        x -= learning_rate * g(x)
        t +=1
    return x

In [3]:
def test_function(x):
    return 3*(x**2) + 4*x + 1

In [4]:
def test_function_gradient(x):
    return 6*x + 4

In [5]:
X = np.random.rand(10)

In [6]:
gradient_descent(10, test_function, test_function_gradient)

-0.6666666666666665

In [7]:
X

array([0.27341011, 0.02729229, 0.67185016, 0.16542925, 0.93261355,
       0.49692679, 0.48212734, 0.9388974 , 0.18673612, 0.7766184 ])

In [8]:
gradient_descent(X)

TypeError: gradient_descent() missing 2 required positional arguments: 'f' and 'g'

## Numerical gradient descent

In [9]:
def numerical_gradient_descent(x0,f,dx,learning_rate=0.05, T=1000):
    x = x0
    t = 0
    while t <= T:
        x -= learning_rate * (f(x + dx) - f(x)) / dx
        t+=1
    return x

In [10]:
numerical_gradient_descent(10, test_function, dx=0.01)

-0.6716666666666643

## Algorithmic Differentiation [with tensorflow]
- Check Tensorflow tutorial notebook about the use of the notebook.

In [11]:
def logistic_cost(ytrue, yhat):
    return np.log(1 + np.exp(-ytrue*yhat))

# Use rosenbrock's function to test gradient descent
# Compute gradient:
    # analytically
    # numerically
    # algorithmic differentiation

In [12]:
# Creating the stochastic gradient descent function
def stochastic_gradient_descent(X, ytrue, T=1000, epsilon=0.1, learning_rate=0.5): 
    nrow = len(ytrue)
    w = np.zeros(X.shape[1])  # initialisation du vecteur w avec les w_i = 0
    t = 0
    m = len(ytrue)
    
    cost = np.inf
    costs = []

    while (np.linalg.norm(np.gradient(cost)) > epsilon) or (t <= T):
        i = np.random.randint(nrow)
        y_t = ytrue[i] 
        X_t = X[i,:]
        prediction = np.dot(X_t.T, w)
        cost = np.sum(logistic_cost(prediction, y_t))
        costs.append(cost)
        w = w - learning_rate * (1/m * X_t.T.dot(prediction - y_t))
        t += 1 
    return w, costs 

In [13]:
# def BGFS(X, y, cost_func, epsilon=0.1, T=1000):
#     d = X.shape[1]
#     wt = np.zeros(d)
#     B0 = np.identity(d, dtype=np.float64)
#     prediction = np.dot(X.T, w)
#     cost = np.sum(logistic_cost(prediction, y))
#     w = w - np.gradient(1/m * X.T.dot(prediction - y_t))
#     p0 = -1 * np.gradient(cost)
#     t = 0
#     wt_new = wt
    
#     while np.abs(wt - wt_new) <= epsilon * 
    
# def BGFS(X, y, cost_func, epsilon=0.1, T=1000):
#     d = X.shape[1]
#     w = np.zeros(d)
#     wp = w
#     B0 = np.identity(d, dtype=np.float64)
    
    
#     NewLoss = cost_func(wt, X)
#     g = np.gradient()
#     prediction = np.dot(X.T, w)
#     cost = np.sum(logistic_cost(prediction, y))
#     prev_cost = cost
#     w = w - np.gradient(1/m * X.T.dot(prediction - y))
#     g = 
#     p0 = -1 * np.gradient(cost)
#     t = 0
#     wt_new = wt
    
#     while np.abs(wt - wt_new) <= epsilon * 

In [38]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=100, n_features=5)

In [15]:
X.shape

(10, 5)

In [16]:
y

array([1, 0, 1, 1, 1, 0, 0, 0, 1, 0])

In [17]:
w

NameError: name 'w' is not defined

In [18]:
t = np.array([i for i in range(1001)])

In [19]:
t.shape

(1001,)

In [20]:
len(costs)

NameError: name 'costs' is not defined

In [21]:
plt.plot(t, costs)

NameError: name 'costs' is not defined

In [22]:
np.zeros(10).shape

(10,)

# Some classification algorithms

In [23]:
# Testing the code for the perceptron
eta = 0.1
n, d = X.shape
w = np.zeros(d)

n, d = X.shape
w = np.zeros(d)
w0 = 0.
t = 0
while t <= 100:
    i = np.random.randint(n)
    y_t = y[i] 
    X_t = X[i,:]
    if (y_t * (np.dot(w,X_t.T) + w0)) <= 0:
        w0 += eta*y_t
        w += eta * y_t * X_t
    else:
        w = w
    t +=1

In [24]:
# Perceptron function
def perceptron(X, y, eta=0.1, T=1000):
    n, d = X.shape
    w = np.zeros(d)
    w0 = 0.
    t = 0
    while t <= T:
        i = np.random.randint(n)
        y_t = y[i] 
        X_t = X[i,:]
        print((w, w0))
        if (y_t * (np.dot(w,X_t.T) + w0)) <= 0:
            w0 += eta*y_t
            w += eta * y_t * X_t
        else:
            w = w
            w0 = w0
        t +=1
    return w, w0

In [39]:
w, w0 = perceptron(X, y, T=10)

(array([0., 0., 0., 0., 0.]), 0.0)
(array([0., 0., 0., 0., 0.]), 0.0)
(array([0., 0., 0., 0., 0.]), 0.0)
(array([0., 0., 0., 0., 0.]), 0.0)
(array([0., 0., 0., 0., 0.]), 0.0)
(array([ 0.21820352, -0.11732563,  0.20659043, -0.05607717,  0.03266471]), 0.1)
(array([ 0.21820352, -0.11732563,  0.20659043, -0.05607717,  0.03266471]), 0.1)
(array([ 0.21820352, -0.11732563,  0.20659043, -0.05607717,  0.03266471]), 0.1)
(array([ 0.21820352, -0.11732563,  0.20659043, -0.05607717,  0.03266471]), 0.1)
(array([ 0.21820352, -0.11732563,  0.20659043, -0.05607717,  0.03266471]), 0.1)
(array([ 0.21820352, -0.11732563,  0.20659043, -0.05607717,  0.03266471]), 0.1)


In [26]:
w0

0.2

In [45]:
sortie_test = (np.dot(w, X.T) + w0)

In [27]:
(y[5] * w @ X[5].T + w0) / np.linalg.norm(w)

0.6771316775261573

In [42]:
def activation(x):
    if x > 0: 
        return 1
    else: 
        return -1

In [57]:
type(w0) == np.dtype('float64')

True

In [46]:
sortie_test

array([-0.52517699, -0.27305774,  0.66000704,  1.52648826, -0.421868  ,
        0.20204216, -0.68468165, -0.68401282, -0.70013726,  0.79442962,
        0.10596325, -0.43000501, -0.46480789,  0.46763364,  0.01051363,
       -0.53934054,  0.73394098, -0.20674061, -0.61351956, -0.39636847,
        0.79722777,  0.34329584,  1.26032973, -0.0916505 , -0.03172506,
        1.04507921, -0.1817996 ,  0.53171765,  0.66242078,  0.75304859,
       -0.452702  , -0.41070607,  0.42238437,  0.73986322,  0.6505345 ,
       -0.45685948, -0.14282927,  0.75954681, -0.58202525, -0.53290902,
       -0.35688217, -0.37682572, -0.62096517,  0.42705852,  0.6732538 ,
       -0.20843121,  0.52257223,  1.18269318, -0.49072721,  1.03199629,
        1.33681499, -0.25215369, -0.27564412,  0.31104379,  0.2599486 ,
       -0.1155815 ,  0.35300142,  2.06203904, -0.32955854, -0.23198058,
       -0.38502562, -0.02816661,  0.80022362, -0.38023179,  0.52440308,
        0.7173559 , -0.47814171, -0.30043102, -0.18655112,  0.82

In [58]:
np.sign(sortie_test)

array([-1., -1.,  1.,  1., -1.,  1., -1., -1., -1.,  1.,  1., -1., -1.,
        1.,  1., -1.,  1., -1., -1., -1.,  1.,  1.,  1., -1., -1.,  1.,
       -1.,  1.,  1.,  1., -1., -1.,  1.,  1.,  1., -1., -1.,  1., -1.,
       -1., -1., -1., -1.,  1.,  1., -1.,  1.,  1., -1.,  1.,  1., -1.,
       -1.,  1.,  1., -1.,  1.,  1., -1., -1., -1., -1.,  1., -1.,  1.,
        1., -1., -1., -1.,  1.,  1., -1.,  1., -1.,  1.,  1., -1., -1.,
        1.,  1., -1.,  1., -1., -1.,  1., -1.,  1.,  1.,  1.,  1., -1.,
       -1., -1.,  1.,  1., -1.,  1.,  1., -1., -1.])

In [48]:
list(map(lambda x: activation(x), sortie_test))

[-1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1]

In [30]:
activation((y[5] * w @ X[5].T + w0) / np.linalg.norm(w))

1

In [6]:
np.random.random(10)

array([0.1986317 , 0.31725294, 0.58070686, 0.11693641, 0.93201599,
       0.58055408, 0.38162891, 0.20343095, 0.07359161, 0.91957794])

In [46]:
# making the adaline algorithm
def adaline(X, y, eta=0.1, T=1000):
    n, d = X.shape
    w = np.random.random(d)
    w0 = 0
    t = 0
    h = 0
    while t <= T:
        i = np.random.randint(n)
        yt = y[i]
        Xt = X[i,:]
        h = np.sign((w0 + np.dot(w,Xt.T)))
#         h = activation(h)
        w0 += eta*(yt - h)
        w += eta * (yt - h) * Xt
        t += 1
    return w, w0
    

In [47]:
adaline(X, y, T=1000)

(array([-1.06036228e-02,  4.03209240e-01, -2.79639867e-01, -9.91285892e-02,
         7.02710903e-02,  2.39968690e-01,  4.07756888e-02,  8.29421271e-01,
        -2.38973969e-01, -4.81040723e-01,  7.28099200e-01,  2.62684751e+00,
        -4.60356297e-01,  5.73880187e-01,  3.23100450e-01, -3.69506806e-01,
        -1.54382186e-01,  4.95485556e-01,  2.87166692e-01, -2.50065139e-01,
        -1.01653944e-01,  1.44675325e-02, -1.06761257e-01,  2.16266490e-01,
        -1.79380644e-03]), -0.4)

In [37]:
X.shape

(10, 5)

# Gradient conjugate