In [1]:
from sklearn.datasets import fetch_openml
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import random
from micrograd.engine import Value
from micrograd.nn import Neuron, Layer, MLP

# Seed NumPy's global RNG and the stdlib `random` module with the same fixed
# value so code drawing from either global generator is repeatable.
np.random.seed(1337)
random.seed(1337)

In [2]:
def load_mnist(n_train=60000):
    """Fetch MNIST from OpenML and return a scaled train/test split.

    Args:
        n_train: Number of leading rows used for training; the remaining
            rows form the test set. Defaults to the traditional 60000.

    Returns:
        A tuple ``(N_data, X_train, y_train, X_test, y_test)`` where
        ``N_data`` is the number of training rows, the ``X_*`` arrays hold
        the pixel values divided by 255, and the ``y_*`` arrays are one-hot
        integer matrices with 10 columns.
    """

    def one_hot(labels, k):
        # Compare every label against 0..k-1 to build an (n, k) 0/1 matrix.
        return np.array(labels[:, None] == np.arange(k)[None, :], dtype=int)

    # load image
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

    # Depending on the scikit-learn version, fetch_openml may hand back pandas
    # objects. Coerce to plain ndarrays so that slicing and row iteration in
    # later cells operate on pixel rows rather than on column labels.
    X = np.asarray(X, dtype=float)
    y = np.array([int(k) for k in np.asarray(y)])

    # rescale the data, use the traditional train/test split
    X_train, X_test = X[:n_train] / 255.0, X[n_train:] / 255.0
    y_train, y_test = one_hot(y[:n_train], 10), one_hot(y[n_train:], 10)

    N_data = X_train.shape[0]

    return N_data, X_train, y_train, X_test, y_test

def save_images(images, filename, **kwargs):
    """Draw `images` on figure 1 via `plot_images` and save it to `filename`.

    Any extra keyword arguments are forwarded unchanged to `plot_images`.
    """
    canvas = plt.figure(1)
    canvas.clf()  # figure 1 is reused between calls, so wipe it first
    axes = canvas.add_subplot(111)
    plot_images(images, axes, **kwargs)
    # Hide the background patch of the figure and of the axes before saving.
    for owner in (canvas, axes):
        owner.patch.set_visible(False)
    plt.savefig(filename)

In [None]:
# Model hyper parameters
noise_dim = 10  # width of the noise rows drawn in generate_from_noise
gen_layer_sizes = [noise_dim, 200, 784]  # generator layer widths, consumed pairwise by init_random_params
dsc_layer_sizes = [784, 200, 1]  # discriminator layer widths, consumed pairwise by init_random_params

# Training parameters
param_scale = 0.001  # multiplier applied to the random initial weights and biases
batch_size = 100  # rows per minibatch slice
num_epochs = 50  # the optimiser runs num_epochs * num_batches iterations
step_size_max = 0.01  # Adam step size for the parameters being maximised
step_size_min = 0.01  # Adam step size for the parameters being minimised

print('Loading training data')
N_data, X_train, y_train, X_test, y_test = load_mnist()
# Alias the arrays under the names the GAN cells use; the label arrays are discarded here.
N, train_images, _, test_images, _ = N_data, X_train, y_train, X_test, y_test #load_mnist()

In [None]:
### Define generator, discriminator, and objective ###

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and `x`."""
    rectified = np.maximum(0, x)
    return rectified
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated through tanh.

    Uses the identity ``sigmoid(x) = 0.5 * (tanh(x / 2) + 1)``. The argument
    of tanh must be halved; without that the function returns sigmoid(2x),
    which disagrees with `logsigmoid` (``log(sigmoid(x))``) used by the
    objective.
    """
    return 0.5 * (np.tanh(0.5 * x) + 1.0)
def logsigmoid(x):
    """Log of the logistic sigmoid, evaluated as ``x - log(1 + exp(x))``."""
    softplus = np.logaddexp(0, x)  # log(exp(0) + exp(x))
    return x - softplus

def init_random_params(scale, layer_sizes, rs=np.random.RandomState(0)):
    """Build a list of (weights, biases) tuples, one for each layer in the net.

    Args:
        scale: Multiplier applied to every standard-normal draw.
        layer_sizes: Sequence of layer widths; consecutive pairs ``(m, n)``
            produce an ``(m, n)`` weight matrix and a length-``n`` bias.
        rs: Random state to draw from. The default instance is created once,
            when the function is defined, and is shared by every call that
            omits `rs`, so successive default calls continue one stream
            instead of repeating the same values.

    Returns:
        A list with ``len(layer_sizes) - 1`` ``(weights, biases)`` tuples.
    """
    layer_shapes = zip(layer_sizes[:-1], layer_sizes[1:])
    # Draw the weight matrix before the bias vector for each layer so the
    # sequence of random draws stays the same.
    return [(scale * rs.randn(m, n),   # weight matrix
             scale * rs.randn(n))      # bias vector
            for m, n in layer_shapes]

def batch_normalize(activations):
    """Subtract the axis-0 mean and divide by (axis-0 standard deviation + 1)."""
    centered = activations - np.mean(activations, axis=0, keepdims=True)
    spread = np.std(activations, axis=0, keepdims=True)
    return centered / (spread + 1)

def neural_net_predict(params, inputs):
    """Run `inputs` through the layers described by `params`.

    `params` is a list of (weights, bias) tuples and `inputs` is an (N x D)
    matrix. The first layer is followed by a ReLU, every middle layer by
    batch normalisation and a ReLU, and the last layer is left linear.
    """
    first_W, first_b = params[0]
    hidden = relu(np.dot(inputs, first_W) + first_b)

    for W, b in params[1:-1]:
        hidden = relu(batch_normalize(np.dot(hidden, W) + b))

    last_W, last_b = params[-1]
    return np.dot(hidden, last_W) + last_b

def generate_from_noise(gen_params, num_samples, noise_dim, rs):
    """Draw a (num_samples x noise_dim) block from `rs.rand`, push it through
    the generator network, and squash the result with `sigmoid`."""
    latent = rs.rand(num_samples, noise_dim)
    return sigmoid(neural_net_predict(gen_params, latent))

def gan_objective(gen_params, dsc_params, real_data, num_samples, noise_dim, rs):
    """Mean discriminator log-sigmoid score on real data minus the mean on
    freshly generated data."""
    synthetic = generate_from_noise(gen_params, num_samples, noise_dim, rs)
    fake_term = np.mean(logsigmoid(neural_net_predict(dsc_params, synthetic)))
    real_term = np.mean(logsigmoid(neural_net_predict(dsc_params, real_data)))
    return real_term - fake_term


### Define minimax version of adam optimizer ###

def adam_minimax(grad_both, init_params_max, init_params_min, callback=None, num_iters=100,
         step_size_max=0.001, step_size_min=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks.

    `grad_both(params_max, params_min, i)` must return the pair of gradients
    for the two parameter sets. Each iteration moves the "max" parameters
    along their Adam direction and the "min" parameters against theirs. If
    `callback` is given it is invoked before the update with both current
    parameter sets, the iteration index, and both gradients.
    Returns the final ``(params_max, params_min)``.
    """

    def moment_step(g, m, v, i, step_size):
        # One Adam moment update; returns the new moments and the step.
        m = (1 - b1) * g      + b1 * m  # First  moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))    # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        return m, v, step_size * mhat / (np.sqrt(vhat) + eps)

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max, v_max = np.zeros(len(x_max)), np.zeros(len(x_max))
    m_min, v_min = np.zeros(len(x_min)), np.zeros(len(x_min))

    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i)
        g_max = flatten(g_max_uf)[0]
        g_min = flatten(g_min_uf)[0]

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        # Ascend on the "max" parameters ...
        m_max, v_max, delta_max = moment_step(g_max, m_max, v_max, i, step_size_max)
        x_max = x_max + delta_max

        # ... and descend on the "min" parameters.
        m_min, v_min, delta_min = moment_step(g_min, m_min, v_min, i, step_size_min)
        x_min = x_min - delta_min

    return unflatten_max(x_max), unflatten_min(x_min)


In [None]:
# Model hyper-parameters
# NOTE(review): these assignments repeat the values set in the earlier
# hyper-parameter cell; keep the two in sync or drop one of them.
noise_dim = 10
gen_layer_sizes = [noise_dim, 200, 784]
dsc_layer_sizes = [784, 200, 1]

# Training parameters
param_scale = 0.001
batch_size = 100
num_epochs = 50
step_size_max = 0.01
step_size_min = 0.01

# NOTE(review): nothing is loaded in this cell; `train_images` used below is
# assigned in an earlier cell.
print("Loading training data...")

# Random initial (weights, biases) lists for the generator and discriminator.
init_gen_params = init_random_params(param_scale, gen_layer_sizes)
init_dsc_params = init_random_params(param_scale, dsc_layer_sizes)

# Number of minibatches needed to cover train_images once (last one may be short).
num_batches = int(np.ceil(len(train_images) / batch_size))
def batch_indices(iter):
    """Return the row slice for iteration `iter`, wrapping around after
    `num_batches` minibatches of `batch_size` rows each."""
    start = (iter % num_batches) * batch_size
    return slice(start, start + batch_size)

# Define training objective
# `npr` is never imported in this notebook; build the random state from the
# already-imported NumPy namespace instead.
seed = np.random.RandomState(0)
def objective(gen_params, dsc_params, iter):
    """GAN objective evaluated on the minibatch selected by `iter`.

    The module-level `seed` random state supplies the generator noise, so
    every call advances that shared stream.
    """
    idx = batch_indices(iter)
    return gan_objective(gen_params, dsc_params, train_images[idx],
                         batch_size, noise_dim, seed)

# Get gradients of objective using autograd.
# NOTE(review): `grad` is not imported in the visible cells; presumably it is
# autograd's `grad` — confirm and add the import. argnum=(0, 1) asks for the
# gradients with respect to both gen_params and dsc_params.
both_objective_grad = grad(objective, argnum=(0, 1))

# Column header for the rows printed by print_perf.
print("     Epoch     |    Objective  |       Fake probability | Real Probability  ")
def print_perf(gen_params, dsc_params, iter, gen_gradient, dsc_gradient):
    """Progress callback: on every 10th iteration print the objective and the
    mean discriminator outputs on fake and real data, then save the fake
    samples to 'gan_samples.png'. The gradient arguments are not used."""
    if iter % 10 != 0:
        return
    ability = np.mean(objective(gen_params, dsc_params, iter))
    fake_data = generate_from_noise(gen_params, 20, noise_dim, seed)
    real_data = train_images[batch_indices(iter)]
    fake_scores = sigmoid(neural_net_predict(dsc_params, fake_data))
    real_scores = sigmoid(neural_net_predict(dsc_params, real_data))
    row = "{:15}|{:20}|{:20}|{:20}".format(
        iter//num_batches, ability, np.mean(fake_scores), np.mean(real_scores))
    print(row)
    save_images(fake_data, 'gan_samples.png', vmin=0, vmax=1)

# The optimizers provided can optimize lists, tuples, or dicts of parameters.
# The generator parameters are passed in the "max" slot and the discriminator
# parameters in the "min" slot of adam_minimax; print_perf is called once per
# iteration, and the run lasts num_epochs passes over the minibatches.
optimized_params = adam_minimax(both_objective_grad,
                                init_gen_params, init_dsc_params,
                                step_size_max=step_size_max, step_size_min=step_size_min,
                                num_iters=num_epochs * num_batches, callback=print_perf)

In [None]:
# Adam
from micrograd.engine import Value
import math

# Adam hyper-parameters for the scalar demo below.
alpha = 0.01
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-8
# Safety cap: the stopping test below is an exact float comparison, so bound
# the number of iterations instead of looping forever if it never triggers.
max_steps = 100000

def func(x):
    """Objective to minimise: x*x - 4*x + 4, i.e. (x - 2)**2."""
    return x*x - 4*x + 4

def grad_func(x):
    """Analytic derivative of `func`; a reference for the autodiff gradient."""
    return 2*x - 4

theta_0 = Value(0)
m_t = 0
v_t = 0
t = 0

while t < max_steps:
    t += 1
    # Gradient of the objective at theta_0, obtained by backpropagation.
    f_pass = func(theta_0)
    f_pass.backward()
    g_t = theta_0.grad
    m_t = beta_1*m_t + (1-beta_1)*g_t       # updates the moving averages of the gradient
    v_t = beta_2*v_t + (1-beta_2)*(g_t*g_t) # updates the moving averages of the squared gradient
    m_cap = m_t/(1-(beta_1**t))             # calculates the bias-corrected estimates
    v_cap = v_t/(1-(beta_2**t))             # calculates the bias-corrected estimates
    theta_0_prev = theta_0
    # Wrap the updated number in a fresh Value rather than computing
    # `theta_0 - step` on the Value itself: that would chain every new
    # parameter onto all previous ones, so each backward() call would walk
    # the entire optimisation history and the graph would grow without bound.
    theta_0 = Value(theta_0.data - (alpha*m_cap)/(math.sqrt(v_cap)+epsilon))  # updates the parameters
    if theta_0.data == theta_0_prev.data:   # checks if it is converged or not
        break


In [5]:
# NOTE(review): this repeats the load_mnist() call made in an earlier cell and
# rebinds the same five names.
N_data, X_train, y_train, X_test, y_test = load_mnist()

In [None]:
# Layer widths, input first, for the generator and the discriminator.
# NOTE(review): same values as in the earlier hyper-parameter cells.
gen_layer_sizes = [noise_dim, 200, 784]
dsc_layer_sizes = [784, 200, 1]

In [None]:
# micrograd's MLP takes (number of inputs, list of layer output sizes): the
# first argument is the input width, not a layer count. Split each size list
# so its first entry is the input width and the remainder are the layers.
gen_model = MLP(gen_layer_sizes[0], gen_layer_sizes[1:])  # noise_dim -> 200 -> 784
dsc_model = MLP(dsc_layer_sizes[0], dsc_layer_sizes[1:])  # 784 -> 200 -> 1

In [6]:
# micrograd's MLP takes (number of inputs, list of layer output sizes); the
# images have 784 pixels, so 784 is the input width and [200, 10] the layers.
model = MLP(784, [200, 10])

In [7]:
# Wrap every entry of the first two training rows in a micrograd Value.
inputs = [[Value(pixel) for pixel in xrow] for xrow in X_train[:2]]

# forward the model to get scores
scores = [model(row) for row in inputs]

In [None]:
# losses = [(1 + -yi*scorei).relu() for yi, scorei in zip(yb, scores)]
# Scratch check: log of the ReLU of the third score of the first sample.
# NOTE(review): relies on `Value` providing a `.log()` method — confirm that
# the micrograd version in use has one.
v = scores[0][2].relu().log()

In [None]:
# loss function
def loss(batch_size=None):
    """Evaluate `model` on the training data or on a random minibatch of it.

    Args:
        batch_size: Number of randomly chosen rows to evaluate. ``None``
            evaluates every row of ``X_train``.

    Returns:
        ``(total_loss, accuracy)``: the mean per-sample loss (a micrograd
        Value) and the fraction of rows whose highest score is at the true
        class index.
    """

    # inline DataLoader :)
    # The notebook never defines X / y at module level; the loaded data lives
    # in X_train / y_train.
    if batch_size is None:
        Xb, yb = X_train, y_train
    else:
        ri = np.random.permutation(X_train.shape[0])[:batch_size]
        Xb, yb = X_train[ri], y_train[ri]

    inputs = [list(map(Value, xrow)) for xrow in Xb]

    # forward the model to get scores
    scores = list(map(model, inputs))

    # y_train rows are one-hot vectors; recover the class index so it can be
    # used to pick a single score out of each per-sample score list.
    labels = [int(np.argmax(yi)) for yi in yb]

    # Cross entropy loss: negative log of the (rectified) true-class score, so
    # that minimising the loss pushes that score up.
    # NOTE(review): relies on `Value` providing `.log()`, and a true-class
    # score of zero after the ReLU has no finite log — confirm both.
    losses = [-scorei[k].relu().log() for k, scorei in zip(labels, scores)]
    total_loss = sum(losses) * (1.0 / len(losses))

    # also get accuracy: the predicted class is the index of the largest score
    predictions = [int(np.argmax([s.data for s in scorei])) for scorei in scores]
    accuracy = [k == p for k, p in zip(labels, predictions)]
    return total_loss, sum(accuracy) / len(accuracy)

# NOTE(review): with batch_size left at None, loss() runs the forward pass on
# every row of the dataset; pass a batch_size for a quick check.
total_loss, acc = loss()
print(total_loss, acc)

In [10]:
# Show the one-hot label row of the first training example.
y_train[0]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [None]:
# NOTE(review): this cell uses PyTorch names (`nn`, `torch`, `optim`) and
# variables (`nz`, `device`, `netD`, `netG`, `lr`, `beta1`) that are not
# defined in the visible cells — confirm where they are expected to come from.
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(64, nz, 1, 1, device=device)

# Establish convention for real and fake labels during training
real_label = 1
fake_label = 0

# Setup Adam optimizers for both G and D
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

In [None]:
# Draw a 32-row block of standard-normal noise and keep only the first two rows.
input_list = np.random.randn(32, noise_dim)[:2]
# Wrap every entry in a micrograd Value, then run each row through the generator.
input_list = [[Value(entry) for entry in xrow] for xrow in input_list]
temp = [gen_model(row) for row in input_list]

In [None]:
def generator_loss():
    """Placeholder for the generator loss; not implemented yet."""
    raise NotImplementedError("generator_loss is not implemented yet")


def discriminator_loss():
    """Placeholder for the discriminator loss; not implemented yet."""
    raise NotImplementedError("discriminator_loss is not implemented yet")

In [None]:
# dsc loss
# NOTE(review): scratch copy of the body of gan_objective. `gen_params`,
# `num_samples`, `rs`, `dsc_params` and `real_data` are not defined at module
# level in the visible cells, so this cell cannot run as written.

fake_data = generate_from_noise(gen_params, num_samples, noise_dim, rs)
logprobs_fake = logsigmoid(neural_net_predict(dsc_params, fake_data))
logprobs_real = logsigmoid(neural_net_predict(dsc_params, real_data))
# NOTE(review): logsigmoid takes one argument; this bare call is unfinished.
logsigmoid()


# gen loss