Benjamin Ye  
CS/CNE/EE 156a: Learning Systems (Fall 2023)  
November 13, 2023

## Homework 7

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from sklearn import svm

CWD = globals()['_dh'][0]
sys.path.insert(0, str(CWD.parents[1]))
from cs156a import (LinearRegression, Perceptron, target_function_random_line,
                    generate_data, validate_binary)

DATA_DIR = CWD.parents[1] / "data"
rng = np.random.default_rng()

### Problems 1–5

In [2]:
DATA_DIR.mkdir(exist_ok=True)
raw_data = {}
for prefix in ["in", "out"]:
    if not (DATA_DIR / f"{prefix}.dta").exists():
        r = requests.get(f"http://work.caltech.edu/data/{prefix}.dta")
        with open(DATA_DIR / f"{prefix}.dta", "wb") as f:
            f.write(r.content)
    raw_data[prefix] = np.loadtxt(DATA_DIR / f"{prefix}.dta")

ns = (25, len(raw_data["in"]) - 25)
data = np.array_split(raw_data["in"], (ns[0],))
transform_funcs = (
    lambda x: np.ones((len(x), 1), dtype=float), 
    lambda x: x,
    lambda x: x[:, :1] ** 2, 
    lambda x: x[:, 1:] ** 2, 
    lambda x: np.prod(x, axis=1, keepdims=True), 
    lambda x: np.abs(x[:, :1] - x[:, 1:]), 
    lambda x: np.abs(x[:, :1] + x[:, 1:])
)
reg = LinearRegression(
    vf=validate_binary, 
    transform=lambda x: np.hstack(tuple(f(x) for f in transform_funcs[:k])),
    rng=rng
)
df = pd.DataFrame(columns=["split", "k", "training error", 
                           "validation error", "out-of-sample error"])
for i in range(2):
    for k in np.arange(3, 8):
        E_train = reg.train(data[i][:, :-1], data[i][:, -1])
        E_validate = reg.get_error(data[1 - i][:, :-1], data[1 - i][:, -1])
        E_out = reg.get_error(raw_data["out"][:, :-1], 
                                raw_data["out"][:, -1])
        df.loc[len(df)] = (f"{ns[i]}:{ns[1 - i]}", k, 
                           E_train, E_validate, E_out)
(df.style.hide(axis="index")
         .format("{:.3f}", subset=["training error", "validation error", 
                                   "out-of-sample error"])
         .set_caption("Linear regression with nonlinear transformation"))

split,k,training error,validation error,out-of-sample error
25:10,3,0.44,0.3,0.42
25:10,4,0.32,0.5,0.416
25:10,5,0.08,0.2,0.188
25:10,6,0.04,0.0,0.084
25:10,7,0.04,0.1,0.072
10:25,3,0.4,0.28,0.396
10:25,4,0.3,0.36,0.388
10:25,5,0.2,0.2,0.284
10:25,6,0.0,0.08,0.192
10:25,7,0.0,0.12,0.196


### Problem 6

In [3]:
x = rng.uniform(size=(10_000_000, 2))
e_1, e_2 = x.mean(axis=0)
e = x.min(axis=1).mean()
print("The expected values for paired independent uniform random "
      f"variables and their minimum are {e_1:.6f}, "
      f"{e_2:.6f}, and {e:.6f}, respectively.")

The expected values for paired independent uniform random variables and their minimum are 0.499952, 0.500101, and 0.333384, respectively.


### Problems 8–10

In [4]:
N_runs = 1_000
f = target_function_random_line(rng=rng)
pla = Perceptron(vf=validate_binary)
clf = svm.SVC(C=np.finfo(float).max, kernel="linear")
df = pd.DataFrame(columns=["N", "SVM > perceptron",
                           "number of support vectors"])
for N_train in (10, 100):
    N_test = 99 * N_train
    counters = np.zeros(2, dtype=float)
    for _ in range(N_runs):
        while True:
            x_train, y_train = generate_data(N_train, f, bias=True, 
                                             rng=rng)
            if not np.allclose(y_train, y_train[0]):
                break
        x_test, y_test = generate_data(N_test, f, bias=True, rng=rng)
        pla.train(x_train, y_train)
        clf.fit(x_train[:, 1:], y_train)
        counters += (
            1 - clf.score(x_test[:, 1:], y_test) 
                < pla.get_error(x_test, y_test),
            clf.n_support_.sum()
        )
    counters /= N_runs
    df.loc[len(df)] = N_train, *counters
(df.style.hide(axis="index")
         .format({"N": "{:.0f}", "SVM > perceptron": "{:.1%}", 
                  "number of support vectors": "{:.0f}"})
         .set_caption("Comparison of perceptron and support vector "
                      "machine (SVM)"))

N,SVM > perceptron,number of support vectors
10,59.2%,3
100,64.4%,3
