# Research Perspectives on Machine Learning

## 1943 - Perceptron

The perceptron was originally a machine made to do binary classification on images. Let's build a little simulation to see if we can get the perceptron to recognize two images.

### Data prep

In [None]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

#the original perceptron read ~400 photocells; ours gets 1600, i.e. images of size 40x40
#read each image file straight into an array and report its shape
img1 = np.array(Image.open('img1_bw.jpg'))
print(img1.shape)
img2 = np.array(Image.open('img2_bw.jpg'))
print(img2.shape)

#each image is an array of pixel values; show each one in its own figure
plt.figure(0)
plt.imshow(img1, cmap='gray')
plt.figure(1)
plt.imshow(img2, cmap='gray')

In [None]:
#the perceptron worked on binary data, so threshold the pixel values:
#anything above the cutoff becomes 1, everything else becomes 0
PIXEL_CUTOFF = 130
img1_bin, img2_bin = [(img > PIXEL_CUTOFF).astype(int) for img in (img1, img2)]

#show the binarized images, one figure each
plt.figure(0)
plt.imshow(img1_bin, cmap='gray')
plt.figure(1)
plt.imshow(img2_bin, cmap='gray')

### Simulation setup

The perceptron connected each photosensor to its own "neuron". So with an image of size 40x40, we have 1600 photosensors, and thus need 1600 neurons. We will initialize the weights and use the following update rule to train the perceptron.

$$\Delta w_i = \eta \, (y - \hat{y}) \, x_i$$
$$\Delta b = \eta \, (y - \hat{y})$$

Where:
- $\Delta w_i$ is the change applied to weight $i$
- $\eta$ is the learning rate
- $y$ is the true label
- $\hat{y}$ is the predicted label
- $x_i$ is the $i$-th input value
- $\Delta b$ is the change applied to the bias

In [None]:
#Let's make a Perceptron class
class Perceptron:
    """Single-layer perceptron with a step activation.

    All parameters live in one vector, ``self.weights``: entries ``[:-1]`` are
    the per-input weights and the last entry ``[-1]`` is the bias term (in this
    specific case, the "threshold").
    """

    def __init__(self, input_size: int):
        """Randomly initialise ``input_size`` weights plus one bias.

        Values are drawn from a normal distribution with mean 0 and standard
        deviation 0.1.
        """
        self.weights = np.random.normal(0, 0.1, (input_size + 1))  # +1 for the bias term

    def step_activation(self, x: float) -> int:
        """Return 1 when the potential ``x`` is non-negative, otherwise 0."""
        return 1 if x >= 0 else 0

    def predict(self, inputs: np.ndarray) -> int:
        """Classify a single input vector as 0 or 1.

        ``inputs`` must have one entry per (non-bias) weight.
        """
        # The bias is added once, to the weighted sum. Adding it to the weight
        # vector before the dot product would scale it by sum(inputs) and
        # disagree with the bias update applied in train().
        potential = np.dot(inputs, self.weights[:-1]) + self.weights[-1]
        return self.step_activation(potential)

    def train(self, train_data: np.ndarray, labels: np.ndarray, num_epochs: int, learning_rate: float):
        """Fit the weights with the perceptron update rule.

        For every sample, on every epoch:
            w += learning_rate * (y - yhat) * x
            b += learning_rate * (y - yhat)

        ``train_data`` and ``labels`` are iterated in lockstep, so any pair of
        equally long sequences works (lists of arrays included). Correctly
        classified samples give ``y - yhat == 0`` and leave the parameters
        unchanged.
        """
        for _ in range(num_epochs):
            for x, y in zip(train_data, labels):
                error = y - self.predict(x)
                self.weights[:-1] += learning_rate * error * x  # learn the weights
                self.weights[-1] += learning_rate * error  # learn the bias

### Train a perceptron
Given our two images, can it recognize which one is which?

In [None]:
#organize data: one flattened binary image per sample
x = [img1_bin.flatten(), img2_bin.flatten()]
y = [0.0, 1.0] #0 for matt, 1 for robin

#create an instance of our model from our class
#the input size is read off the data (1600 for 40x40 images) so the model always matches the images
model = Perceptron(input_size=x[0].size)

#train the model
model.train(train_data=x,
            labels=y,
            num_epochs=10,
            learning_rate=0.01)

#test if our model learned, using the same flattened vectors it was trained on
print(f'Matt = 0, model predicted {model.predict(x[0])}')
print(f'Robin = 1, model predicted {model.predict(x[1])}')

Unless something terribly unlikely happened, it worked!

## 1968 - Perceptron - linear vs nonlinear

Let's try out two datasets. One will be easily linearly separable, the other won't. Let's see what the perceptron learns for each.

In [None]:
from sklearn.datasets import make_blobs, make_moons

#both datasets have the same number of 2d points
N_SAMPLES = 100

#two blobs, meant to be linearly separable; shift them so the overall mean sits at the origin
x_linear, y_linear = make_blobs(n_samples=N_SAMPLES, n_features=2, centers=2)
x_linear = x_linear - x_linear.mean(axis=0)

#two moons, which a straight line cannot separate
x_nonlinear, y_nonlinear = make_moons(n_samples=N_SAMPLES)

Train a perceptron for each dataset

In [None]:
#one two-input perceptron per dataset, both trained with the same settings
NUM_EPOCHS = 1000
LEARNING_RATE = 0.001

model_linear = Perceptron(input_size=2)
model_nonlinear = Perceptron(input_size=2)

model_linear.train(train_data=x_linear, labels=y_linear,
                   num_epochs=NUM_EPOCHS, learning_rate=LEARNING_RATE)
model_nonlinear.train(train_data=x_nonlinear, labels=y_nonlinear,
                      num_epochs=NUM_EPOCHS, learning_rate=LEARNING_RATE)

Visualize each model's decision boundary

In [None]:
def _plot_decision_boundary(fig_num, title, points, targets, weights):
    """Scatter a 2d dataset and overlay a perceptron's decision boundary.

    `weights` follows the Perceptron layout: weights[0] and weights[1] multiply
    the two input features and weights[-1] is the bias. The boundary is the
    line where w1*x1 + w2*x2 + b = 0, i.e. x2 = -(w1*x1 + b) / w2.
    (Indexing the bias as weights[0] would draw a line the model does not use.)
    """
    w1, w2, bias = weights[0], weights[1], weights[-1]

    plt.figure(fig_num)
    plt.title(title)
    plt.scatter(points[:, 0], points[:, 1], c=targets, cmap='winter')

    #span the data's x1 range with a margin of 1 on each side
    x1_values = np.linspace(points[:, 0].min() - 1, points[:, 0].max() + 1, 100)
    x2_values = -(w1 * x1_values + bias) / w2
    plt.plot(x1_values, x2_values, color='black')


_plot_decision_boundary(0, 'Linear dataset', x_linear, y_linear, model_linear.weights)
_plot_decision_boundary(1, 'Non-linear dataset', x_nonlinear, y_nonlinear, model_nonlinear.weights);

Even a simple nonlinearity in the dataset is enough to make the perceptron's predictions useless.

## 1982 - Hopfield Networks

Train a Hopfield network to recognize handwritten numbers

In [None]:
#Get scikit-learn's bundled handwritten digits dataset (small 8x8 images; MNIST-like, but not MNIST itself)
from sklearn import datasets
import matplotlib.pyplot as plt

digits = datasets.load_digits()

#preview the first four samples with their labels
_, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3))
for axis, digit_image, digit_label in zip(axes, digits.images, digits.target):
    axis.set_axis_off()
    axis.imshow(digit_image, cmap=plt.cm.gray_r, interpolation="nearest")
    axis.set_title("Training: %i" % digit_label)

#keep the flattened samples, then binarize (pixel value > 10) the zeros and the fives separately
x = digits.data
is_zero = digits.target == 0
is_five = digits.target == 5
x_zeros = (x[is_zero] > 10).astype(int)
x_fives = (x[is_five] > 10).astype(int)

### Define Hopfield network

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

class HopfieldNetwork:
    """Hopfield network trained with a Hebbian rule and updated one random unit at a time."""

    def __init__(self, size):
        """Create a network of ``size`` units with all-zero weights and a zero threshold."""
        self.size = size
        self.threshold = 0
        self.weights = np.zeros((size, size))

    def train(self, data):
        """Train the network's weights based on provided data.

        ``data`` is a sequence of patterns, one entry per unit in each pattern.
        The update is added on top of the current weights.

        Raises:
            ValueError: if ``data`` is empty (the normalisation below would
                otherwise divide by zero and fill the weights with NaN).
        """
        num_samples = len(data)
        if num_samples == 0:
            raise ValueError("cannot train a HopfieldNetwork on empty data")

        # Mean activity over every entry of every pattern
        mean_activity = np.mean(data)

        # Hebb rule on mean-subtracted patterns
        for sample in tqdm(data):
            delta = sample - mean_activity
            self.weights += np.outer(delta, delta)

        # Zero out diagonal (no self-connections) and normalize by the number of samples
        np.fill_diagonal(self.weights, 0)
        self.weights /= num_samples

    def predict(self, data, iterations=20, threshold=0):
        """Run the update dynamics from each starting state in ``data``.

        Returns a list with one ``(states, energies)`` pair per starting state,
        as produced by ``_update``. The entries of ``data`` are not modified.
        """
        return [self._update(sample, iterations, threshold) for sample in tqdm(data)]

    def _update(self, state, iterations, threshold):
        """Update the state of the network.

        Performs up to ``iterations`` sweeps of ``self.size`` single-unit
        updates, each on a randomly chosen unit.

        Returns:
            (states, energies): ``states`` holds a copy of the state after every
            single-unit update; ``energies`` holds the starting energy followed
            by the energy after every single-unit update.
        """
        # Work on a private copy: the caller's array (often a row of a larger
        # dataset when coming through predict) must not be overwritten.
        state = np.array(state)

        prev_energy = self._energy(state, threshold)
        energies = [prev_energy]
        states = []
        for _ in range(iterations):
            for _ in range(self.size):
                idx = np.random.randint(0, self.size)
                state[idx] = np.sign(self.weights[idx].T @ state - threshold)
                energies.append(self._energy(state, threshold))
                states.append(state.copy())
            # NOTE(review): prev_energy is never refreshed, so this stops only
            # when a sweep ends at exactly the *starting* energy, not when the
            # energy stopped changing between sweeps. Left as is on purpose:
            # tightening it shortens the returned trajectories, which callers
            # may index at fixed offsets.
            if energies[-1] == prev_energy:
                break
        return states, energies

    def _energy(self, state, threshold=None):
        """Calculate the energy of the current state.

        ``threshold`` defaults to ``self.threshold``; ``_update`` passes the
        threshold it is running with so the reported energy matches the
        dynamics that produced the state.
        """
        if threshold is None:
            threshold = self.threshold
        return -0.5 * state @ self.weights @ state + np.sum(state * threshold)

    def plot_weights(self):
        """Visualize the network's weights."""
        plt.imshow(self.weights, cmap='coolwarm')
        plt.colorbar()
        plt.title("Network Weights")
        plt.show()

### Train it

In [None]:
#one unit per pixel of an 8x8 image
model = HopfieldNetwork(size=64)
#the only patterns the network ever sees are zeros
model.train(data=x_zeros)

### Test model capabilities
If we initialize the model "state" with a different digit, does it converge to a zero?

In [None]:
#binarize the sample at index 1 and wrap it in a batch of one for predict()
test_input = np.array([(x[1] > 10).astype(int)])
plt.figure(0)
plt.title('Initial input (1)', fontsize=20)
plt.imshow(np.reshape(test_input, (8,8)), cmap ='gray');

#plt.subplots opens its own figure, so no separate plt.figure call is made here
#(an extra one would only leave an empty figure behind)
num_snapshots = 5
fig, axs = plt.subplots(1, num_snapshots, figsize=(20,5));
fig.suptitle('Model activations', fontsize=20)
states, energy = model.predict(test_input.copy(), iterations=num_snapshots)[0]
for s, ax in enumerate(axs):
    #one snapshot at the start of each sweep of model.size unit updates;
    #clamp to the last recorded state in case the run stopped early
    snapshot = states[min(s * model.size, len(states) - 1)]
    ax.imshow(np.reshape(snapshot, (8,8)), cmap='gray');

plt.figure(5, figsize=(20,5))
plt.title('Model energy', fontsize=20)
plt.ylabel('Energy')
plt.xlabel('Update Step')
plt.plot(energy);

The model tends to fall into a stable, low-energy state, which it has learned to associate with "zero". It memorizes the patterns it sees, and encodes them as weights!

Hopfield networks are not very powerful at recognizing patterns compared to modern methods, but the concepts here are quite deep.