### Imports

In [None]:
import scipy as sci
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import colors
%matplotlib widget

### Read Data

In [None]:
# Load the data set. The first CSV column is consumed as the index and then
# thrown away by reset_index(drop=True), leaving pandas' default integer index.
file = 'Iris.csv'
data = pd.read_csv(file, index_col=0, header=0)
data = data.reset_index(drop=True)

# Row labels and row count of the full data set, reused by the cells below.
indices = data.index
N_S = len(indices)

### Preprocess

In [None]:
# Every column but the last is a feature; the last column is the class label.
X_S = data.iloc[:, :-1]
Y_S = data.iloc[:, -1]

def normalize(points):
    """Min-max scale every column of *points* into [0, 1].

    Args:
        points: Column-oriented numeric data (pandas DataFrame/Series or
            NumPy array) exposing ``min(axis=0)`` and ``max(axis=0)``.

    Returns:
        A ``(normalized, denormalize)`` pair. ``normalized`` has the same
        shape as *points*; ``denormalize`` is a function that maps values in
        the normalized space back to the original units.
    """
    lowest = points.min(axis=0)
    scale = points.max(axis=0) - lowest
    # A constant column has a zero range, and dividing by it would turn the
    # whole column into NaN (0/0). Adding the boolean mask bumps exactly those
    # zeros to one, so such a column normalizes to 0 and still round-trips.
    scale = scale + (scale == 0)

    def denormalize(points):
        """Invert the scaling computed by the enclosing ``normalize`` call."""
        return points * scale + lowest

    return (points - lowest) / scale, denormalize

# Replace the raw features by their rescaled version and keep the function
# that maps rescaled values back to the original units.
X_S, denormalize = normalize(X_S)

def label_to_int(data):
    """Encode the values of a Series as consecutive integer codes.

    Codes are assigned in order of first appearance, starting at 0.

    Args:
        data: pandas Series of labels.

    Returns:
        A ``(codes, labels)`` pair: ``codes`` is a Series with the integer
        code of every element, and ``labels`` is the array of distinct values
        such that ``labels[code]`` is the original label.
    """
    labels = data.unique()
    label_map = {label: code for code, label in enumerate(labels)}

    def encode(value):
        return label_map[value]

    return data.apply(encode), labels

# Replace the class labels by integer codes; `labels` keeps the distinct
# original values so that labels[code] recovers the label.
Y_S, labels = label_to_int(Y_S)

### Learner measures

In [None]:
# lower bound to sample size
def PAC_eta(H_norm, delta, epsilon):
    """Return (ln(H_norm) - ln(delta)) / epsilon.

    Args:
        H_norm: Size of the hypothesis space (must be positive).
        delta: Confidence parameter (must be positive).
        epsilon: Error tolerance (must be non-zero).

    Returns:
        The sample-size bound as a float.
    """
    # NumPy's natural logarithm is np.log; np.ln does not exist.
    return (np.log(H_norm) - np.log(delta)) / epsilon

# delta obtained by solving the PAC_eta relation for delta
def PAC_delta(H_norm, epsilon, eta):
    """Return H_norm / exp(eta * epsilon).

    This is the algebraic inverse of ``eta = (ln(H_norm) - ln(delta)) / epsilon``
    with respect to ``delta``.

    NOTE(review): the previous comment called this a "lower bound to
    generalization error"; the expression yields delta, not epsilon, and in
    the usual finite-hypothesis PAC bound it limits delta from above -
    confirm the intended wording.

    Args:
        H_norm: Size of the hypothesis space.
        epsilon: Error tolerance.
        eta: Number of samples.
    """
    growth = np.exp(eta * epsilon)
    return H_norm / growth


# Sampling

### Sampling Distributions

In [None]:
def pdf(pdf, x, N):
    """Evaluate one of the named sampling densities at the points *x*.

    Args:
        pdf: Name of the distribution: "Uniform", "Triangular" or "Normal".
        x: Points at which to evaluate the density; also used as the index
            of the returned Series.
        N: Size parameter that sets each distribution's location and scale.

    Returns:
        pandas Series of density values indexed by *x*.

    Raises:
        KeyError: If *pdf* is not one of the supported names.
    """
    # Each entry is a thunk so that only the requested density is computed,
    # instead of evaluating all three and discarding two.
    builders = {
        # loc=0, scale=N
        "Uniform": lambda: sci.stats.uniform.pdf(x, 0, N),
        # shape c=0.5 (mode in the middle), loc=-1, scale=N+1
        "Triangular": lambda: sci.stats.triang.pdf(x, 0.5, -1, N + 1),
        # mean N//2, standard deviation N//6
        "Normal": lambda: sci.stats.norm.pdf(x, N // 2, N // 6),
    }
    return pd.Series(builders[pdf](), index=x)

pdf_names = ["Uniform", "Triangular", "Normal"]

# Figure 1: the density each distribution assigns to every sample index.
plt.figure()
for name in pdf_names:
    plt.plot(indices, pdf(name, indices, N_S), label=name)
plt.xlabel("Sample Index")
plt.title("Probability Density Functions")
plt.legend()

# Figure 2: information content log2(1/F) per index; the legend reports the
# sum of -F*log2(F) over all indices for each distribution.
plt.figure()
for name in pdf_names:
    F = pdf(name, indices, N_S)
    I = np.log2(1 / F)
    H = (-F * np.log2(F)).sum()
    plt.plot(indices, I, label=f"{name}: {H:.2f}")
plt.xlabel("Sample Index")
plt.title("Information content")
plt.legend(title="$\\bf{Entropy}$")

### Comparison of samples

In [None]:
def set_axis_labels(ax, x, y, z):
    """Label the x, y and z axes of the 3-D axes *ax* with the given texts."""
    setters = (ax.set_xlabel, ax.set_ylabel, ax.set_zlabel)
    for apply_label, text in zip(setters, (x, y, z)):
        apply_label(text)

# For every sampling distribution, draw 60% of the labels with that
# distribution as weights and show the drawn points per class in 3-D
# (first three feature columns).
for name in pdf_names:
    plt.figure()
    ax = plt.axes(projection='3d', title=f"{name} Sampling")
    sample = Y_S.sample(frac=0.6, weights=pdf(name, indices, N_S))
    total = 0
    for k, v in enumerate(labels):
        # Boolean mask over the full data set: True where the row was drawn
        # and belongs to class k.
        matches = (sample == k).reindex(indices, fill_value=False)
        total += int(matches.sum())
        points = X_S[matches]
        # Transposing the (n, 3) array yields the three coordinate arrays and,
        # unlike zip(*rows), still unpacks to three (empty) arrays when no
        # point of this class was drawn.
        ax.scatter(*points.iloc[:, 0:3].to_numpy().T,
                   label=f"{v}: {len(points)}")
    plt.legend(title=f"Samples: {total}")
    # Take the axis names from the feature table itself rather than from the
    # loop variable left over by the last iteration.
    set_axis_labels(ax, *X_S.columns[0:3])


# Training

In [None]:
# gets B such that ||Y-XB||^2 is minimized
def linear_regression(X, Y):
    """Fit an ordinary least-squares model with an intercept term.

    Args:
        X: Feature table of shape (n_samples, n_features) (DataFrame or 2-D
            array).
        Y: Targets with n_samples entries (Series or array).

    Returns:
        NumPy array of coefficients: one per feature column, in order,
        followed by the intercept as the last entry.
    """
    # Adds a 1s column to X to represent a constant parameter
    design = np.append(X, np.ones((X.shape[0], 1)), axis=1)
    # Solve the least-squares problem directly instead of forming
    # (X^T X)^-1 through np.matrix: this is numerically more stable, still
    # returns a (minimum-norm) solution when columns are collinear, and always
    # yields a plain ndarray.
    Beta, *_ = np.linalg.lstsq(design, np.asarray(Y), rcond=None)
    return Beta


# Split the data into disjoint train/test/validation subsets by sampling
# without replacement, then fit the regression on the training subset.
weights = pdf("Uniform", indices, N_S)
set_sizes = {"train": int(0.6*N_S),
             "test": int(0.2*N_S),
             "validation": int(0.2*N_S)}

# `sets` maps each subset name to its (features, labels) pair. It is built
# separately from the size table so no dict changes meaning while iterated.
sets = {}
for k, v in set_sizes.items():
    # Only rows that still carry a weight are eligible for this subset.
    Y = Y_S.loc[weights.index].sample(n=v, weights=weights)
    # Y.index holds row labels, so select by label rather than by position.
    sets[k] = (X_S.loc[Y.index, :], Y)
    # Remove the drawn rows so later subsets cannot reuse them.
    weights = weights.drop(Y.index)

X, Y = sets["train"]

Beta = linear_regression(X, Y)

def error_str(T, F):
    """Format F wrong predictions out of F+T as 'F/total (percent)'."""
    total = F + T
    if total == 0:
        # Nothing was predicted; avoid dividing by zero.
        return "0/0"
    return f"{F}/{total} ({F/total:.2%})"


# Flatten defensively so the product below yields one prediction per row
# whether Beta is a 1-D array or a single-row matrix.
coefficients = np.asarray(Beta).ravel()
n_labels = len(labels)

# Evaluate the fitted model on every subset and plot, per predicted class, the
# correctly classified points (dots) and the misclassified ones (crosses).
for k in sets:
    X, Y = sets[k]
    X = X.copy()
    # Constant column matching the intercept appended in linear_regression.
    X["ones"] = np.ones(X.shape[0])
    # Round the regression output to a class code and clip it to the valid
    # code range; otherwise an out-of-range prediction matches no class and is
    # silently left out of both the plot and the error counts.
    Z = np.round(X @ coefficients).clip(0, n_labels - 1)

    plt.figure()
    ax = plt.axes(projection='3d', title=f"Linear Regression {k}")
    TP_total = 0
    FP_total = 0

    # Distinct names keep the inner loop from clobbering the subset name `k`.
    for label_id, label_name in enumerate(labels):
        # Predicted as this class and actually this class.
        TP = (Z == label_id) & (Y == label_id)
        TP_sum = int(TP.sum())
        TP_total += TP_sum
        # Predicted as this class but actually another one.
        FP = (Z == label_id) & (Y != label_id)
        FP_sum = int(FP.sum())
        FP_total += FP_sum

        # "CN" names the N-th colour of the default property cycle, which
        # avoids reaching into the private ax._get_lines helper.
        color = f"C{label_id}"
        if TP_sum > 0:
            ax.scatter(*X[TP].iloc[:, 0:3].to_numpy().T,
                       label=f"{label_name}: {error_str(TP_sum, FP_sum)}",
                       color=color)
        if FP_sum > 0:
            ax.scatter(*X[FP].iloc[:, 0:3].to_numpy().T,
                       marker="x", color=color)
    plt.legend(title=f"Errors: {error_str(TP_total, FP_total)}")
    set_axis_labels(ax, *X.columns[0:3])
