# Bayesian linear regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import subprocess
# from google.protobuf.internal.decoder import _DecodeVarint32
import sys
sys.path.insert(0, '..')
# from proto.py.algorithm_state_pb2 import AlgorithmState

In [None]:
# Initialize true parameters
dim = 2
# One coefficient vector per group: every entry of a vector has the same value
alphas = [np.array([level] * dim) for level in (-3, 0, 3)]
sigma2 = 1

In [None]:
# Utility to save files with Unix-like newlines
def save_np(filename, npobj):
    """Write ``npobj`` to ``filename`` as text, five decimals per value.

    The file is opened in binary mode, so the newlines emitted by
    ``np.savetxt`` are written without platform-specific translation.

    Args:
        filename: Path of the output file; it is overwritten if it exists.
        npobj: Array-like object accepted by ``np.savetxt``.
    """
    with open(filename, mode='wb') as sink:
        np.savetxt(fname=sink, X=npobj, fmt='%1.5f')

In [None]:
# Generate data
rng = 20201124
np.random.seed(rng)
n = 200
# Covariates: n rows of dim independent uniform draws on [-2, 3)
xx = np.random.uniform(low=-2.0, high=3.0, size=(n, dim))
# True allocations: two groups of n // 3 observations, the remainder in group 2
third = n // 3
cc = third*[0] + third*[1] + (n - 2*third)*[2]
# np.random.normal expects the standard deviation, whereas sigma2 is a variance
sigma = np.sqrt(sigma2)
yy = np.zeros(n)
for i in range(n):
    # Mean of the i-th response: its covariates times its group's coefficients
    mu = np.dot(xx[i, :], alphas[cc[i]])
    yy[i] = np.random.normal(loc=mu, scale=sigma)
# Save to file
# NOTE(review): the command in the "Run the executable" cell reads
# data_lru.csv / covs_lru.csv / covs_grid_lru.csv, not these logsb_* files;
# confirm which file names are intended.
save_np("../resources/csv/in/logsb_cov_mix.csv", xx)
save_np("../resources/csv/in/logsb_data.csv", yy)

In [None]:
# Generate grid points
# Single covariate row with every entry equal to -2.7, kept 2-D (1 x dim) so
# that it is written as one line. A plain ndarray is used because np.matrix
# is no longer recommended by NumPy.
xx_grid = np.array([dim*[-2.7]])
lin = np.arange(-4.0, +4.1, 0.5)
# All ordered pairs of values in `lin`, one pair per row
yy_grid = np.array(np.meshgrid(lin, lin)).T.reshape(-1, 2)
# Save to file
save_np("../resources/csv/in/logsb_grid_cov_mix.csv", xx_grid)
save_np("../resources/csv/in/logsb_grid_data.csv", yy_grid)

In [None]:
# Run the executable # TODO
# NOTE(review): the meaning of each positional argument is defined by
# ../build/run, which is not visible here. The grouping below only mirrors
# the original line layout and is for readability; the order is unchanged.
cmd = ["../build/run"]
cmd += ["Neal2", str(rng), "0", "1000", "100"]
cmd += ["LinRegUni", "../resources/asciipb/lin_reg_uni_fixed.asciipb"]
cmd += ["DP", "../resources/asciipb/dp_gamma_prior.asciipb"]
cmd += ["../lru.recordio"]
cmd += ["../resources/csv/in/data_lru.csv", "../resources/csv/in/covs_grid_lru.csv"]
cmd += ["../resources/csv/out/lru_dens.csv", "../resources/csv/out/lru_mass.csv"]
cmd += ["../resources/csv/out/lru_nclu.csv", "../resources/csv/out/lru_clus.csv"]
cmd += ["../resources/csv/in/covs_lru.csv", "../resources/csv/in/covs_grid_lru.csv"]
# Kept as the last expression of the cell so the notebook displays the result
subprocess.run(cmd, capture_output=True)

## Simulation study

In [None]:
# Utility to read file collector, courtesy of
# github.com/mberaha/utils/blob/master/proto_utils/py/recordio.py
def _decode_varint32(buf, pos):
    """Decode one base-128 varint from ``buf`` starting at index ``pos``.

    Args:
        buf: Bytes object holding the encoded stream.
        pos: Index of the first byte of the varint.

    Returns:
        Tuple ``(value, new_pos)``: the decoded value masked to 32 bits and
        the index of the first byte after the varint.

    Raises:
        ValueError: If the buffer ends in the middle of the varint or the
            varint is longer than a 64-bit value allows.
    """
    value = 0
    shift = 0
    while True:
        if pos >= len(buf):
            raise ValueError("Truncated varint in record stream")
        byte = buf[pos]
        pos += 1
        # The low 7 bits carry data; the high bit flags a continuation byte
        value |= (byte & 0x7F) << shift
        if not byte & 0x80:
            return value & 0xFFFFFFFF, pos
        shift += 7
        if shift >= 64:
            raise ValueError("Too many bytes when decoding varint")


def readManyFromFile(filename, msgType):
    """Read a sequence of length-prefixed messages from a binary file.

    Each record is a varint giving the message length, followed by that
    many bytes, which are handed to ``msgType().ParseFromString``.

    Reading is best-effort: on a truncated or unparsable record a warning
    is issued and the messages read so far are returned.

    Args:
        filename: Path of the file to read.
        msgType: Zero-argument callable returning an object that exposes
            ``ParseFromString(bytes)``.

    Returns:
        List of parsed messages, in file order (empty for an empty file).

    Raises:
        ValueError: If a length prefix itself is malformed.
    """
    import warnings

    with open(filename, "rb") as fp:
        buf = fp.read()

    out = []
    pos = 0
    while pos < len(buf):
        msg_len, pos = _decode_varint32(buf, pos)
        end = pos + msg_len
        if end > len(buf):
            # The last record announces more bytes than the file contains
            warnings.warn(
                f"{filename}: truncated record after {len(out)} message(s); "
                "stopping"
            )
            break
        msg = msgType()
        try:
            msg.ParseFromString(buf[pos:end])
        except Exception as err:
            # Keep the original best-effort behaviour, but do not hide why
            # reading stopped
            warnings.warn(
                f"{filename}: could not parse message {len(out)} ({err!r}); "
                "stopping"
            )
            break
        out.append(msg)
        pos = end
    return out

In [None]:
# Read chain
# NOTE(review): MarginalState is not defined in the visible cells (the only
# related import, AlgorithmState, is commented out) - confirm the message type.
chain = readManyFromFile(filename='../lru.recordio', msgType=MarginalState)

Compare original betas and regression betas of some iterations:

In [None]:
# The true coefficients are stored in `alphas`; no `betas` variable is
# defined in the visible cells, so referencing it raised a NameError.
# The groups are listed in the order 0, 2, 1, as in the original cell.
betas_print = [[f"{float(b):1.1f}" for b in alphas[i]] for i in (0, 2, 1)]

print("Original betas:")
print(betas_print)

print("Chain betas of iterations with 3 clusters:")
for state in chain:
    # Only iterations with exactly three clusters are shown
    if len(state.cluster_states) != 3:
        continue
    betas_chain = [
        [f"{b:1.1f}" for b in clus.lin_reg_univ_ls_state.regression_coeffs.data]
        for clus in state.cluster_states
    ]
    print(betas_chain, f"(iteration n. {state.iteration_num})", sep="\t")

Compare true and posterior clustering:

In [None]:
# Read posterior clustering
# Values are loaded as floats, then each one is converted to a Python int
cc_post = list(map(int, np.loadtxt('../resources/csv/out/lru_clus.csv')))

In [None]:
# Plot the true and the posterior cluster labels side by side: observation i
# is drawn at (i, i) and coloured by its label.
idxs = list(range(n))

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,5))

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and later removed;
# plt.get_cmap takes the same (name, lut) arguments.
size_true = len(set(cc))
cmap1 = plt.get_cmap('hsv', size_true+1)
# One scatter call per axis instead of one call per point
ax1.scatter(idxs, idxs, color=[cmap1(label) for label in cc])
ax1.set_title(f"True clustering, with {size_true} clusters")

size_post = len(set(cc_post))
cmap2 = plt.get_cmap('hsv', size_post+1)
ax2.scatter(idxs, idxs, color=[cmap2(label) for label in cc_post])
ax2.set_title(f"Posterior clustering, with {size_post} clusters")