# Computer Exercise #02 - The Perceptron Learning Algorithm ##

Included below is the core code for this computer exercise.  You will need to set the parameters of the classes according to what you would like to do.  If you accept the default parameters, make sure you understand what they are, and make sure you understand which parameters may be set.

For repeatable experiments, you may want to consider setting the random number seed when generating datasets.



## The Perceptron Algorithm

Here is a simple perceptron learning algorithm that you may modify to perform the pocket algorithm.

In [13]:
def pocket_perceptron(X, y_enter, eta = 1, eps = 20):
    """Train a perceptron and return the "pocket" (lowest-risk) weights.

    The weights start at zero and are updated with the perceptron rule
    ``w <- w + eta * y_i * x_i`` whenever sample ``i`` satisfies
    ``y_i * (w . x_i) <= 0``.  After every update the empirical risk of the
    new weights is evaluated on all of ``X``; the weights replace the pocket
    only if that risk is strictly lower than the best seen so far.

    No bias term is learned here: append a constant column to ``X`` if an
    intercept is needed.

    NOTE: the risk used for the pocket comparison is the perceptron
    criterion (sum of ``-y_i * (w . x_i)`` over offending samples), not the
    misclassification count, so it depends on the scale of ``w``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training inputs.
    y_enter : array-like of shape (n_samples,)
        Binary targets, encoded either as {0, 1} or as {-1, +1}.  Values
        greater than zero become +1, everything else becomes -1.
    eta : float, default 1
        Step size of the update.
    eps : int, default 20
        Maximum number of passes (epochs) over the data.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        The pocketed weight vector (all zeros if no update ever happened).
    """
    X = np.asarray(X, dtype=float)

    # Map the labels to +-1.  Comparing against zero (instead of testing
    # ``== 0``) leaves targets that are already -1/+1 intact; the previous
    # test silently turned every -1 into +1.
    y = np.where(np.asarray(y_enter) > 0, 1, -1)

    def emp_risk(weights):
        # Only samples with a negative margin contribute to the sum.
        return np.maximum(0, -y * (X @ weights)).sum()

    w = np.zeros(len(X[0]))
    pocket, best_risk = w, float('inf')

    for _ in range(eps):
        updated = False
        for x_i, y_i in zip(X, y):
            if np.dot(x_i, w) * y_i <= 0:
                # Rebind rather than update in place so that a vector
                # already stored in the pocket is never mutated.
                w = w + eta * x_i * y_i
                updated = True
                risk = emp_risk(w)
                if risk < best_risk:
                    best_risk, pocket = risk, w
        if not updated:
            # A full pass without any update means w can no longer change,
            # so the remaining epochs would be no-ops.
            break
    return pocket

In [2]:
def plot_decision_boundary(X, y, weights, dataset_num, data_type, bias=0.0):
    """Scatter a 2-D dataset and overlay a linear decision boundary.

    The boundary is drawn as the zero level set of ``weights . x + bias``
    over a grid covering the data.  The figure is saved in the working
    directory as ``dec_bound_{dataset_num}_{data_type}.jpg`` and then shown.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs; only the first two columns are used for plotting.
    y : array-like
        Class labels, used to colour the points.
    weights : array-like of length 2
        Weight vector of the linear classifier.
    dataset_num : int or str
        Dataset identifier used in the title and in the file name.
    data_type : str
        Name of the training method, used in the title and the file name.
    bias : float, default 0.0
        Intercept of the classifier.  The default draws a boundary through
        the origin, which is what this function always did; pass the fitted
        intercept (for example ``model.intercept_[0]``) to draw an affine
        boundary.
    """
    # Grid of points covering the data, padded by one unit on every side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500),
                         np.linspace(y_min, y_max, 500))

    # Value of the discriminant function at every grid point
    Z = np.dot(np.c_[xx.ravel(), yy.ravel()], weights) + bias
    Z = Z.reshape(xx.shape)

    # Plot title
    plt.title(f"Decision Boundary Dataset: {dataset_num}\n{data_type.title()} Algorithm Method")

    # Decision boundary: the contour on which the discriminant is zero
    plt.contour(xx, yy, Z, levels=[0], colors='black', linestyles='--')

    # Plot the points
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # Add a legend
    plt.legend(handles=scatter.legend_elements()[0], labels=['Class -1', 'Class 1'])

    # Axis labels, then save the figure in the working directory and show it
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.savefig(f"dec_bound_{dataset_num}_{data_type}.jpg")
    plt.show()

## Common Imports ###

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles

## Data Sets

### Data Set #1

In [None]:
# from first feature take all elements
# from second feature take elements 0


# X1[:,0].shape

In [None]:
# Data set #1: 100 uniform points in the unit square; a point gets label 1
# when -0.5*x1 + x2 - 0.25 > 0 and label 0 otherwise.
X1 = np.random.rand(100, 2)
y1 = (X1[:, 1] - 0.5 * X1[:, 0] - 0.25 > 0).astype(int)

plt.ylabel('$x_2$')
plt.xlabel('$x_1$')
# Scatter of the two features, coloured by class
plt.scatter(*X1.T, c=y1, alpha=0.9, edgecolors='black')

Question p.4: PLA target values need to be converted to $y = \pm1$. This is because PLA is based on stochastic gradient descent, and the solution update, shown below, would become zero for observations where $y$ is equal to 0. The $\pm1$ encoding allows the update to move in the correct direction after a classification error.
$$\overrightarrow{w_{n+1}} = \overrightarrow{w_n} + \mu*y_k*\overrightarrow{x_k}$$

The Perceptron class doesn't care what the target values are. However, the target values need to be in a binary format to allow the Perceptron class to function properly.

In [None]:
# Hold out 25% of data set #1, report the range of the target values, then
# fit a default Perceptron and score it on the held-out part.
x_train, x_test, y_train, y_test = train_test_split(X1, y1, test_size=0.25)
print(f"min: {y1.min()} and max: {y1.max()}")
clf = Perceptron().fit(x_train, y_train)
clf.score(x_test, y_test)

### Data Set #2
Shown along with a scatter plot is the decision boundary.

In [None]:
# Data set #2: uniform points in the unit square, label 1 above the
# parabola x2 = 4*(x1 - 0.5)^2 and label 0 otherwise.
X2 = np.random.rand(100, 2)
y2 = (X2[:, 1] > 4 * (X2[:, 0] - 0.5) ** 2).astype(int)
plt.scatter(*X2.T, c=y2, alpha=0.9, edgecolors='black')

# Overlay the generating parabola as the true decision boundary
x1 = np.linspace(0, 1, num=400)
x2 = 4 * (x1 - 0.5) ** 2
plt.plot(x1, x2, color='green')

plt.ylabel('$x_2$')
plt.xlabel('$x_1$')
plt.grid(True)

### Data Set #3

In [None]:
from sklearn.datasets import make_moons

# Data set #3: two interleaving half-moons with Gaussian noise
X3, y3 = make_moons(n_samples=100, noise=0.1, shuffle=True)
plt.scatter(*X3.T, c=y3, alpha=0.9, edgecolors='black')
plt.xlabel('$x_1$')
plt.grid(True)
plt.ylabel('$x_2$')

### Data Set #4

In [None]:
from sklearn.datasets import make_circles

# Data set #4: a noisy inner circle (half the radius) inside an outer circle
X4, y4 = make_circles(n_samples=100, factor=0.5, noise=0.1, shuffle=True)
plt.scatter(*X4.T, c=y4, alpha=0.9, edgecolors='black')
plt.ylabel('$x_2$')
plt.xlabel('$x_1$')
plt.grid(True)

# Training and Test Sets

You may want to put in your own value for `test_size` instead of accepting the default value.

### Important Note ###
In the `train_test_split` function you will need to put in the desired dataset, e.g. `(X1,y1)` or `(X2,y2)`, in place of `(X,y)`, or redefine `(X,y)`. 

### PLA Experiments

### Dataset #1

To do:
1. Scatter plot of test set and discriminant function
2. Plot of learning curve
3. Expected risk of classifier

In [7]:
def pocket_error(x_train, x_test, y_train, y_test):
    """Fit the pocket perceptron on the training split and report its test error.

    The learned weights are printed, the test inputs are classified as 1
    where ``x . w > 0`` and 0 elsewhere, and the fraction of test samples
    whose prediction differs from ``y_test`` is printed and returned.

    The previous version compared the *number* of predicted positives with
    the number of true positives, so a false positive and a false negative
    cancelled each other out; it also returned nothing.

    Parameters
    ----------
    x_train, y_train : array-like
        Training inputs and targets passed to ``pocket_perceptron``.
    x_test : array-like
        Test inputs.
    y_test : array-like
        Test targets; expected to be encoded as 0/1, because that is how
        the predictions are encoded.

    Returns
    -------
    float
        Misclassification rate on the test split.
    """
    pocket_coef = pocket_perceptron(x_train, y_train)
    print(pocket_coef)
    y_guess_pocket = np.where(np.dot(x_test, pocket_coef) > 0, 1, 0)
    # Element-wise comparison: every wrong prediction counts exactly once
    error = float(np.mean(y_guess_pocket != np.asarray(y_test)))
    print(f"Pocket error: {error}")
    return error

In [5]:
def learning_curve_plot_pla(data_num, epochs, train_x, test_x, train_y, test_y,  tol = 1e-3):
    """Plot the test classification error of a Perceptron against its epoch budget.

    For every ``i`` in ``1..epochs`` a fresh ``Perceptron(max_iter=i, tol=tol)``
    is fitted on the training split and ``1 - score`` on the test split is
    recorded.  The curve is saved as ``pla_learning_curve_d_{data_num}.jpg``
    in the working directory and then shown.

    Parameters
    ----------
    data_num : int or str
        Dataset identifier used in the title and the file name.
    epochs : int
        Largest ``max_iter`` value to try.
    train_x, train_y : array-like
        Training split.
    test_x, test_y : array-like
        Test split on which the error is measured.
    tol : float or None, default 1e-3
        Stopping tolerance forwarded to ``Perceptron``.
    """
    epoch_axis = np.arange(1, epochs + 1)
    learning = np.array([
        1 - Perceptron(max_iter=i, tol=tol).fit(train_x, train_y).score(test_x, test_y)
        for i in range(1, epochs + 1)
    ])

    plt.title(f"Classification Error Dataset Number: {data_num}")
    plt.axis([1,epochs,0,1])
    plt.xticks(epoch_axis)
    plt.yticks(np.arange(0,1.1,0.1))
    plt.xlabel("Epochs")
    plt.ylabel("Error")
    # Plot against the epoch numbers: without explicit x values the points
    # land on 0..epochs-1, one epoch to the left of the tick they belong to,
    # and the first one falls outside the axis range.
    plt.plot(epoch_axis, learning)
    plt.savefig(f"pla_learning_curve_d_{data_num}.jpg")
    plt.show()

In [22]:
np.random.seed(23)
X1 = np.random.rand(100, 2)
y1 = (X1[:, 1] - 0.5 * X1[:, 0] - 0.25 > 0).astype(int)

# Dataset #1 notes
# - Test size: 20%.
# - tol: first set to None, because this dataset is linearly separable and the convergence theorem guarantees
#   that PLA will converge; note that this decision suffers from data snooping since I had already seen the data.
#   With tol=None the PLA always stopped on max_iter, i.e. it did not converge. I then set tol to 1e-12, because
#   the PLA should converge and a lower threshold approximates convergence; it converges after 13 epochs.
# - eta0: left at its default value, because the convergence theorem states that convergence is theoretically
#   reached with any step size.
# - Running with different random seeds changes the accuracy on both the training and test sets, but the model
#   still converges. For example, a random seed of 7 results in error rates of 0.2 and 0.05 for the test and
#   training data, respectively. This could be the stopping condition kicking in too early: with tol=None the
#   training and test errors drop to 0, but the number of epochs increases to max_iter.
# - Switching tol from 1e-12 to 1 reduces the number of epochs to 6 and increases the test accuracy to 95%.

x1_pla_train, x1_pla_test, y1_pla_train, y1_pla_test = train_test_split(
    X1, y1, test_size=0.2
)
pla_1 = Perceptron(tol=1e-12).fit(x1_pla_train, y1_pla_train)
# Diagnostics kept for reference:
# print(pla_1.n_iter_)
# print(pla_1.score(x1_pla_test,y1_pla_test))
# print(pla_1.score(x1_pla_train,y1_pla_train))
pocket_error(
    x_train=x1_pla_train,
    x_test=x1_pla_test,
    y_train=y1_pla_train,
    y_test=y1_pla_test,
)
# Pocket weights after a single epoch with a very small step size
print(f"1 epoch: {pocket_perceptron(x1_pla_train, y1_pla_train, eta=0.0001, eps=1)}")
print(pla_1.coef_)

[-0.51106848  0.51055049]
Pocket error: 0.05
1 epoch: [-5.11068478e-05  5.10550488e-05]
[[-2.22493757  4.12214274]]


In [None]:
# Learning curve for dataset #1 over 20 epochs, with the same tol as pla_1
learning_curve_plot_pla(
    data_num=1,
    epochs=20,
    train_x=x1_pla_train,
    test_x=x1_pla_test,
    train_y=y1_pla_train,
    test_y=y1_pla_test,
    tol=1e-12,
)

In [None]:
# Decision boundaries on the full dataset #1: sklearn perceptron first,
# then the pocket algorithm trained on the training split.
# NOTE(review): only the weight vector pla_1.coef_[0] is passed, so the
# intercept fitted by sklearn (pla_1.intercept_) is not part of the line
# drawn for the perceptron.
plot_decision_boundary(
    X=X1, y=y1, weights=pla_1.coef_[0], dataset_num=1, data_type="perceptron"
)
plot_decision_boundary(
    X=X1,
    y=y1,
    weights=pocket_perceptron(x1_pla_train, y1_pla_train),
    dataset_num=1,
    data_type="pocket",
)

### Dataset #2

To do:
1. Rerun after setting tol=None then answer #2 subquestions
2. Plot learning curve
3. Scatterplot

In [None]:
np.random.seed(0)
X2 = np.random.rand(100, 2)
y2 = (X2[:, 1] > 4 * (X2[:, 0] - 0.5) ** 2).astype(int)

# Dataset #2 notes
# - Test size: 25% of the data.
# - By construction the data are not linearly separable, so PLA will never converge in theory.
# - I kept the default stopping criterion, tol, because we know PLA will not converge; however, I increased
#   the maximum number of iterations to see whether the default stopping criterion would cause a premature
#   exit from the SGD.
# - Using the default stopping criterion resulted in the model converging, which should not happen in theory;
#   therefore I reduced tol to a smaller number, but max_iter was never reached.
# - Surprisingly, PLA converged after 6 epochs.
# - The accuracy of PLA was 0.63 and 0.8 on the test and training data, respectively. The low accuracy
#   suggests we need to try a non-linear model.
# - Changing the random seed changes the number of epochs and the accuracy on both the training and test
#   data; however, it does not fix the underwhelming performance of PLA on the test data.

x2_pla_train, x2_pla_test, y2_pla_train, y2_pla_test = train_test_split(
    X2, y2, test_size=0.25
)
pla_2 = Perceptron(max_iter=100000).fit(x2_pla_train, y2_pla_train)
# Epochs run, then test accuracy, then training accuracy (one per line)
print(
    pla_2.n_iter_,
    pla_2.score(x2_pla_test, y2_pla_test),
    pla_2.score(x2_pla_train, y2_pla_train),
    sep="\n",
)
pocket_error(
    x_train=x2_pla_train,
    x_test=x2_pla_test,
    y_train=y2_pla_train,
    y_test=y2_pla_test,
)

In [None]:
# Learning curve for dataset #2 over 20 epochs (default tol)
learning_curve_plot_pla(
    data_num=2,
    epochs=20,
    train_x=x2_pla_train,
    test_x=x2_pla_test,
    train_y=y2_pla_train,
    test_y=y2_pla_test,
)

### Dataset #3
To do:
1. Rerun to avoid convergence and answer #2 subparts
2. Scatterplots
3. Learning Curve

In [None]:
np.random.seed(0)
X3, y3 = make_moons(n_samples=100, noise=0.1, shuffle=True)

# Dataset #3 notes
# - Test size: 0.2.
# - The data are not linearly separable; therefore, theoretically, PLA should not converge.
# - Initially I set max_iter to 10000, expecting PLA not to converge, and left tol at its default. This was
#   not the case: PLA converged in 7 epochs with a training score of 0.85. Afterward I decreased tol to 1e-10
#   and reran the model, but was met with convergence in the same number of epochs and the same training
#   accuracy.
# - Running PLA with different random seeds changes the number of epochs required for convergence and the
#   model accuracy, but did not fundamentally change anything.
# - The training and test accuracy were identical at 0.85.

x3_pla_train, x3_pla_test, y3_pla_train, y3_pla_test = train_test_split(
    X3, y3, test_size=0.2
)
pla_3 = Perceptron(max_iter=10000, tol=1e-10).fit(x3_pla_train, y3_pla_train)
# Epochs run, then test accuracy, then training accuracy (one per line)
print(
    pla_3.n_iter_,
    pla_3.score(x3_pla_test, y3_pla_test),
    pla_3.score(x3_pla_train, y3_pla_train),
    sep="\n",
)
pocket_error(
    x_train=x3_pla_train,
    x_test=x3_pla_test,
    y_train=y3_pla_train,
    y_test=y3_pla_test,
)

In [None]:
# Learning curve for dataset #3 over 20 epochs, with the same tol as pla_3
learning_curve_plot_pla(
    data_num=3,
    epochs=20,
    train_x=x3_pla_train,
    test_x=x3_pla_test,
    train_y=y3_pla_train,
    test_y=y3_pla_test,
    tol=1e-10,
)

### Dataset #4

To do
1. Rerun to avoid convergence and answer #2 subparts
2. Scatterplots
3. Learning Curve

In [None]:
np.random.seed(0)
X4, y4 = make_circles(n_samples=100, factor=0.5, noise=0.1, shuffle=True)

# Dataset #4 notes
# - Using the default test split.
# - We know from the data construction that the data are not linearly separable, so PLA will not converge
#   in theory.
# - Begin by setting max_iter to 10,000 under the assumption that there will be no convergence, so we let
#   the algorithm run.
# - The algorithm converges, but with a test accuracy of 0.28 and a train accuracy of 0.55. These abysmal
#   test results indicate the linear perceptron model is not suited to the underlying data and we should
#   build a new model. Changing the random seed causes the number of epochs and the accuracy to change
#   slightly, but the conclusion remains that the model is not suited to the data.

x4_pla_train, x4_pla_test, y4_pla_train, y4_pla_test = train_test_split(X4, y4)
pla_4 = Perceptron(max_iter=10000, tol=0.00001).fit(x4_pla_train, y4_pla_train)
# Epochs run, then test accuracy, then training accuracy (one per line)
print(
    pla_4.n_iter_,
    pla_4.score(x4_pla_test, y4_pla_test),
    pla_4.score(x4_pla_train, y4_pla_train),
    sep="\n",
)
pocket_error(
    x_train=x4_pla_train,
    x_test=x4_pla_test,
    y_train=y4_pla_train,
    y_test=y4_pla_test,
)

In [None]:
# Learning curve for dataset #4 over 20 epochs, with the same tol as pla_4
learning_curve_plot_pla(
    data_num=4,
    epochs=20,
    train_x=x4_pla_train,
    test_x=x4_pla_test,
    train_y=y4_pla_train,
    test_y=y4_pla_test,
    tol=0.00001,
)

In [None]:
# Decision boundaries on the full dataset #4: sklearn perceptron first,
# then the pocket algorithm trained on the training split.
# NOTE(review): only the weight vector pla_4.coef_[0] is passed, so the
# intercept fitted by sklearn (pla_4.intercept_) is not part of the line
# drawn for the perceptron.
plot_decision_boundary(
    X=X4, y=y4, weights=pla_4.coef_[0], dataset_num=4, data_type="perceptron"
)
plot_decision_boundary(
    X=X4,
    y=y4,
    weights=pocket_perceptron(x4_pla_train, y4_pla_train),
    dataset_num=4,
    data_type="pocket",
)

# Nonlinear PLA

## Augmenting a Feature Vector

Here is an example of adding polynomial features, $x_1^2$, $x_1x_2$, and $x_2^2$ to the feature vector **x** to create a feature vector of length 5 plus the bias.  For a higher-order polynomial, change the value of `degree`.

### Dataset 1 Non-linear PLA

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
np.random.seed(7)
X1 = np.random.rand(100,2)
y1 = (-0.5*X1[:,0] + X1[:,1] -0.25 > 0).astype(int)

x1_pla_train, x1_pla_test, y1_pla_train, y1_pla_test = train_test_split(X1, y1, test_size = 0.2)

# Dataset 1: keeping the degree at 2, because the data are linearly separable, so a non-linear classifier is inappropriate.
# PLA converges after 20 epochs and the accuracy of the model is 0.95 and 0.9875 for test and training data, respectively.
# This construction takes more epochs to converge, but has higher test accuracy than the linear model using the same random seed.
# Using different random number seeds changes the number of epochs and errors, but doesn't fundamentally change the results.
# Using a highly nonlinear perceptron, degree = 10, results in 14 epochs and a training accuracy of 0.9.


poly_pla_1 = PolynomialFeatures(degree=2, include_bias=True)
# Fit the feature expansion on the training split only and reuse the fitted
# transformer for the test split, instead of refitting it on test data.
x1_poly_pla_train = poly_pla_1.fit_transform(x1_pla_train)
x1_poly_pla_test = poly_pla_1.transform(x1_pla_test)

perceptron_poly_1 = Perceptron(tol = 1e-9)
perceptron_poly_1.fit(x1_poly_pla_train, y1_pla_train)

# Epochs run, then test accuracy, then training accuracy
print(perceptron_poly_1.n_iter_)
print(perceptron_poly_1.score(x1_poly_pla_test, y1_pla_test))
print(perceptron_poly_1.score(x1_poly_pla_train, y1_pla_train))
pocket_error(x1_poly_pla_train, x1_poly_pla_test, y1_pla_train, y1_pla_test)

### Dataset 2 Non-linear PLA

In [None]:
np.random.seed(0)
X2 = np.random.rand(100,2)
y2 = (X2[:,1] > 4*(X2[:,0]-0.5)**2).astype(int)

x2_pla_train, x2_pla_test, y2_pla_train, y2_pla_test = train_test_split(X2, y2, test_size = 0.25)

# Dataset #2
# Kept the polynomial features at a degree of 2 because we know the data are constructed quadratically.
# Increased the number-of-epochs stopping condition, reduced the stopping criterion, and reduced the multiplier. All of
# this was done to better fit the quadratic data.
# With the quadratic data I end up converging in 6 epochs, with a higher test accuracy than training, but the results
# are equivalent to the linear classifier, 0.8 test accuracy.
# Increasing the polynomial features degree to 3 increases the test accuracy to 0.88.

poly_pla_2 = PolynomialFeatures(degree=2, include_bias=True)
# Fit the feature expansion on the training split only and reuse the fitted
# transformer for the test split, instead of refitting it on test data.
x2_train_poly_pla_2 = poly_pla_2.fit_transform(x2_pla_train)
x2_test_poly_pla_2 = poly_pla_2.transform(x2_pla_test)

perceptron_poly_2 = Perceptron(tol = 1e-6, max_iter = 10000, eta0 = 0.000001, n_iter_no_change = 370)
perceptron_poly_2.fit(x2_train_poly_pla_2, y2_pla_train)

# Epochs run, then test accuracy, then training accuracy
print(perceptron_poly_2.n_iter_)
print(perceptron_poly_2.score(x2_test_poly_pla_2, y2_pla_test))
print(perceptron_poly_2.score(x2_train_poly_pla_2, y2_pla_train))
pocket_error(x2_train_poly_pla_2, x2_test_poly_pla_2, y2_pla_train, y2_pla_test)

### Dataset 3 Non-Linear PLA

In [None]:
np.random.seed(0)
X3,y3 = make_moons(n_samples=100, shuffle=True, noise=0.1)

# Dataset #3
# Using a test dataset size of 0.2.
# The data are not linearly separable; therefore, linear PLA should not converge.
# Trying polynomial features of degree 3 and 5. Knowing the structure of the data beforehand (violating the
# data snooping principle), the data seem to be shaped in this fashion.
# Leaving the perceptron with all default parameters.
# With a polynomial of degree 3 the training and test accuracy were 1 and the algorithm ran 15 epochs.
# Changing the random seed does little to alter this conclusion.
# Using a degree of 5 drops the test accuracy to 0.95 and the number of epochs to 14.

x3_pla_train, x3_pla_test, y3_pla_train, y3_pla_test = train_test_split(X3, y3, test_size = 0.2)
poly_pla_3 = PolynomialFeatures(degree=3, include_bias = True)
# Fit the feature expansion on the training split only and reuse the fitted
# transformer for the test split, instead of refitting it on test data.
x3_train_poly_pla3 = poly_pla_3.fit_transform(x3_pla_train)
x3_test_poly_pla3 = poly_pla_3.transform(x3_pla_test)

perceptron_poly_3 = Perceptron()
perceptron_poly_3.fit(x3_train_poly_pla3, y3_pla_train)

# Epochs run, then test accuracy, then training accuracy
print(perceptron_poly_3.n_iter_)
print(perceptron_poly_3.score(x3_test_poly_pla3, y3_pla_test))
print(perceptron_poly_3.score(x3_train_poly_pla3, y3_pla_train))
pocket_error(x3_train_poly_pla3, x3_test_poly_pla3, y3_pla_train, y3_pla_test)

### Dataset 4 Non-Linear PLA

In [None]:
np.random.seed(0)
X4,y4 = make_circles(n_samples=100, shuffle=True, noise=0.1,factor=0.5)

# Dataset #4
# Using the default test split.
# Knowing the data are circles, I will keep the perceptron default values.
# Using polynomial degrees from 2 to 6 results in test accuracies of 1.


x4_pla_train, x4_pla_test, y4_pla_train, y4_pla_test = train_test_split(X4, y4)

# One row per polynomial degree; the lists are filled in the loop below
to_df = {'degrees':[2,3,4,5,6],
         'test accuracy':[],
         'train accuracy':[],
         'epochs':[], 
         'pocket error':[]
        }

# Iterate over the 'degrees' column itself so the table rows and the fitted
# models cannot drift apart if the list of degrees is edited.
for i in to_df['degrees']:
    poly_pla_4 = PolynomialFeatures(degree=i, include_bias = True)
    # Fit the feature expansion on the training split only and reuse the
    # fitted transformer for the test split.
    x4_train_poly_pla_4 = poly_pla_4.fit_transform(x4_pla_train)
    x4_test_poly_pla_4 = poly_pla_4.transform(x4_pla_test)
    perceptron_poly_4 = Perceptron()
    perceptron_poly_4.fit(x4_train_poly_pla_4, y4_pla_train)

    to_df['test accuracy'].append(perceptron_poly_4.score(x4_test_poly_pla_4, y4_pla_test))
    to_df['train accuracy'].append(perceptron_poly_4.score(x4_train_poly_pla_4, y4_pla_train))
    to_df['epochs'].append(perceptron_poly_4.n_iter_)
    # Stores whatever pocket_error returns for this degree
    to_df['pocket error'].append(pocket_error(x4_train_poly_pla_4, x4_test_poly_pla_4, y4_pla_train, y4_pla_test))
table = pd.DataFrame(to_df)
display(table)

### Shoelace Theorem

In [None]:
def shoelace_area(x1, y1, x2, y2, x3, y3):
    """Return the area of the triangle (x1, y1), (x2, y2), (x3, y3).

    Uses the Shoelace formula: half the absolute difference between the two
    diagonal product sums, so the result does not depend on the orientation
    of the vertices.
    """
    forward_sum = x1 * y2 + x2 * y3 + x3 * y1
    backward_sum = y1 * x2 + y2 * x3 + y3 * x1
    return abs(forward_sum - backward_sum) / 2