## Validation


In [1]:
import numpy as np

#### Loading the data
We use the data provided in the files in.dta and out.dta (also used for Homework # 6). Each line of the files corresponds to a two-dimensional input x = (x1, x2), so that X = R^2, followed by the corresponding label from Y = {-1,1}.

In [2]:
train = np.loadtxt("in.dta")
test = np.loadtxt("out.dta")

#### Defining functions for linear regression with nonlinear transformation
We apply linear regression with a nonlinear transformation for classifcation (without regularization).

In [3]:
def nonlinear_transform(data):
    """Perform nonlinear transformation as per q2 spec"""
    result = []
    for row in data:
        x1 = row[0]
        x2 = row[1]

        result.append([1, x1, x2, np.multiply(x1, x1), np.multiply(x2, x2),
                       np.multiply(x1, x2), np.abs(x1 - x2), np.abs(x1 + x2)])

    return np.array(result)


def extract_labels(dataset):
    """Return correct classifications from dataset"""
    return dataset[:, 2]


def linreg(dataset, y):
    """Return weights from linear regression"""
    pseudo_inverse = np.linalg.pinv(dataset)
    weights = pseudo_inverse.dot(y)

    return weights


def linreg_weight_decay(dataset, y, reg_factor):
    """Return weights from linear regression with weight decay"""
    a = dataset.T.dot(dataset) + (np.identity(dataset.shape[1]) * reg_factor)
    b = np.linalg.inv(a)
    c = b.dot(dataset.T)
    weights = c.dot(y)

    return weights


def evaluate_points(dataset, line):
    """Return list classifying points in dataset as above or below line"""

    return np.sign(dataset.dot(line))


def calculate_error(dataset, weights, y):
    """Calculate error in weights"""
    output = evaluate_points(dataset, weights)
    comparison = np.equal(output, y)

    number_false = 0.0
    for c in comparison:
        if c == False:
            number_false += 1

    return number_false / len(y)

#### Splitting the data into training and validation sets

In [4]:
training = train[:25]
validation = train[25:]

## Question 1
Using validation set to select between five models that apply linear regression to the nonlinear transformation 0 through k, where k is the number of transformations.  
  
First, we apply whole nonlinear transform to training and validation sets. 

In [5]:
training_labels = extract_labels(training)
training_transformed = nonlinear_transform(training)

validation_labels = extract_labels(validation)
validation_transformed = nonlinear_transform(validation)

And then apply linear regression to the relevant part of the transformed data for each k and record the validation error for each.

In [6]:
validation_errors = {}
for k in [3, 4, 5, 6, 7]:
   training_data = training_transformed[:, :k+1]
   validation_data = validation_transformed[:, :k+1]

   regression_weights = linreg(training_data, training_labels)
   validation_errors[k] = calculate_error(validation_data, regression_weights,
                                          validation_labels)

print("Validation errors: ", validation_errors)

Validation errors:  {3: 0.3, 4: 0.5, 5: 0.2, 6: 0.0, 7: 0.1}


Classification error on the validation set is smallest for k = 6

## Question 2
Evaluating the out of sample classifiation error on the 5 models from Question 1. 

In [7]:
training_labels = extract_labels(training)
training_transformed = nonlinear_transform(training)

validation_labels = extract_labels(validation)
validation_transformed = nonlinear_transform(validation)

test_labels = extract_labels(test)
test_transformed = nonlinear_transform(test)

validation_errors = {}
out_sample_errors = {}

for k in [3, 4, 5, 6, 7]:
    training_data = training_transformed[:, :k+1]
    validation_data = validation_transformed[:, :k+1]
    test_data = test_transformed[:, :k+1]

    regression_weights = linreg(training_data, training_labels)

    validation_errors[k] = calculate_error(validation_data, regression_weights,
                                           validation_labels)
    out_sample_errors[k] = calculate_error(test_data, regression_weights,
                                           test_labels)

print("Validation errors: ", validation_errors)
print("Out of sample errors: ", out_sample_errors)

Validation errors:  {3: 0.3, 4: 0.5, 5: 0.2, 6: 0.0, 7: 0.1}
Out of sample errors:  {3: 0.42, 4: 0.416, 5: 0.188, 6: 0.084, 7: 0.072}


## Question 3
Reversing the role of training and validation sets. Train with last 10 examples and validate on first 25. 

In [8]:
training_labels = extract_labels(validation)
training_transformed = nonlinear_transform(validation)

validation_labels = extract_labels(training)
validation_transformed = nonlinear_transform(training)

validation_errors = {}

for k in [3, 4, 5, 6, 7]:
   training_data = training_transformed[:, :k+1]
   validation_data = validation_transformed[:, :k+1]

   regression_weights = linreg(training_data, training_labels)

   validation_errors[k] = calculate_error(validation_data, regression_weights,
                                          validation_labels)

print("Validation errors: ", validation_errors)

Validation errors:  {3: 0.28, 4: 0.36, 5: 0.2, 6: 0.08, 7: 0.12}


## Question 4
Evaluating the out of sample classifiation error on the 5 models from Question 3. 

In [9]:
training_labels = extract_labels(validation)
training_transformed = nonlinear_transform(validation)

validation_labels = extract_labels(training)
validation_transformed = nonlinear_transform(training)

test_labels = extract_labels(test)
test_transformed = nonlinear_transform(test)

validation_errors = {}
out_sample_errors = {}

for k in [3, 4, 5, 6, 7]:
   training_data = training_transformed[:, :k+1]
   validation_data = validation_transformed[:, :k+1]
   test_data = test_transformed[:, :k+1]

   regression_weights = linreg(training_data, training_labels)

   validation_errors[k] = calculate_error(validation_data, regression_weights,
                                          validation_labels)
   out_sample_errors[k] = calculate_error(test_data, regression_weights,
                                          test_labels)

print("Validation errors: ", validation_errors)
print("Out of sample errors: ", out_sample_errors)

Validation errors:  {3: 0.28, 4: 0.36, 5: 0.2, 6: 0.08, 7: 0.12}
Out of sample errors:  {3: 0.396, 4: 0.388, 5: 0.284, 6: 0.192, 7: 0.196}


## Question 5

Comparing the out of sample classification errors for the models chosen in Questions 1 and 3.

Eout for k=6 in Question 3 = 0.192  
Eout for k=6 in Question 1 = 0.084  

## Question 6
#### Validation bias

Let e1 and e2 be independent random variables, distributed uniformly over the
interval [0, 1]. Let e = min(e1, e2). The expected values of e1, e2, e are closest to

In [10]:
def validation_bias(runs):
    data = np.zeros((runs, 3))

    for row in data:
        row[0] = np.random.uniform(0.0, 1.0)
        row[1] = np.random.uniform(0.0, 1.0)
        row[2] = min(row[1], row[0])

    return data

runs = 100000
data = validation_bias(runs)

expected_e1 = np.mean(data[:, 0])
expected_e2 = np.mean(data[:, 1])
expected_min = np.mean(data[:, 2])

print("Runs: {0}".format(runs))
print("Expected e1 = {0}".format(expected_e1))
print("Expected e2 = {0}".format(expected_e2))
print("Expected min = {0}".format(expected_min))

Runs: 100000
Expected e1 = 0.5011582027679854
Expected e2 = 0.4990039527090463
Expected min = 0.3339748374582504


## Question 8
#### PLA vs SVM

In [11]:
import numpy as np
import quadprog as qp


def create_dataset(number_of_points):
    """Return dataset of random points in form x0=1, x1, x2"""
    ones = np.ones((number_of_points, 1))
    points = np.random.uniform(-1.0, 1.0, size=(number_of_points, 2))
    return np.concatenate((ones, points), axis=1)


def create_f(points):
    """Return coeficients of random straight line x0=1, m, c"""
    points = np.random.uniform(-1.0, 1.0, size=(points, 2))
    p0 = 1.0
    b = [-p0, -p0]
    w1, w2 = np.linalg.solve(points, b)
    return np.array([p0, w1, w2])


def evaluate_points(dataset, line):
    """Return list classifying points in dataset as above or below line"""
    return np.sign(dataset.dot(line))


def create_weights(dataset):
    """Return empty weight vector of appropriate size for dataset"""
    length = len(dataset[0])
    return np.zeros(length, int)


def check_classifications(dataset, weights, y):
    """Return list of misclassified points in dataset"""
    misclassified_points = []

    for point_index in range(len(dataset)):
        if np.sign(dataset[point_index].dot(weights)) != y[point_index]:
            misclassified_points.append(point_index)

    return misclassified_points


def nudge(dataset, y, weights, misclassified_points):
    """Update weights using a random misclassified point"""
    point_index = np.random.choice(misclassified_points)
    weights = weights + y[point_index] * dataset[point_index]
    return weights


def compare_weights(weights_1, weights_2, runs):
    test_points = create_dataset(runs)
    labels_1 = evaluate_points(test_points, weights_1)
    labels_2 = evaluate_points(test_points, weights_2)
    print("l1: " + str(len(labels_1)))
    print("l2: " + str(len(labels_2)))

    differences = 0
    for point in range(runs):
        if labels_1[point] == labels_2[point]:
            differences += 1

    return differences / runs


def run_perceptron(number_of_points):
    """Return weights from PLA after all points classified correctly"""

    # Ensure all points not on same side of line

    while True:
        dataset = create_dataset(number_of_points)
        target_function = create_f(2)
        labels = evaluate_points(dataset, target_function)
        if not np.all(labels == labels[0]):
            break

    weights = create_weights(dataset)

    while True:
        misclassified_points = check_classifications(dataset, weights, labels)
        if misclassified_points:
            weights = nudge(dataset, labels, weights, misclassified_points)
        else:
            break

    return compare_weights(weights, target_function, 1000000)

In [12]:
def create_G(dataset, y):
    points = dataset[:, 1:]
    G = np.zeros((points.shape[0], points.shape[0]))
    for row in range(points.shape[0]):
        for col in range(points.shape[0]):
            val = (y[row] * y[col]) * points[row].dot(points[col])

            G[row][col] = val

    return G

def create_a(N):
    return np.full((N, 1), -1.)

def create_C(y, N):
    return np.hstack((-y.reshape((N, 1)), np.identity(N)))

def create_b(N):
    return np.full((N, 1), 0.)

def SVM(dataset, y):
    N = dataset.shape[0]
    G = create_G(dataset, y)
    a = create_a(N)
    C = create_C(y, N)
    b = create_b(N)

    return qp.solve_qp(G, a, C, b, meq=1)


# Run SVM
dataset = create_dataset(10)
target_function = create_f(2)
labels = evaluate_points(dataset, target_function)
result = SVM(dataset, labels)

# Run Perceptron
x = run_perceptron(1000)

print(result)
print(x)

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)