# Bayesian linear regression

In [None]:
import numpy as np
import subprocess

In [None]:
# True parameters of the data-generating process
dim = 3
# One constant coefficient vector per mixture component: all -3, all 0, all +3
betas = [np.full(dim, level) for level in (-3, +0, +3)]
sigma2 = 1

In [None]:
# Utility to save files with Unix-like newlines
def save_np(filename, npobj):
    """Write an array to a text file with five decimals per value.

    Args:
        filename: Path of the file to create or overwrite.
        npobj: Array-like object accepted by ``np.savetxt``.

    The file is opened in binary mode, so Python performs no newline
    translation on the bytes that ``np.savetxt`` writes.
    """
    with open(filename, mode='wb') as sink:
        np.savetxt(sink, npobj, fmt='%1.5f')

In [None]:
# Generate data
# `rng` is an integer seed (not a generator object); it is also reused when
# building the command line for the executable.
rng = 20201124
np.random.seed(rng)
n = 100
# Covariates: n rows of `dim` independent standard normal draws
xx = np.random.normal(loc=0.0, scale=1.0, size=(n, dim))
# True component label of each observation, one per entry of `betas`
cc = np.random.randint(low=0, high=len(betas), size=n)
# np.random.normal expects a standard deviation as `scale`, while `sigma2`
# is a variance, hence the square root (identical draws while sigma2 == 1).
sigma = np.sqrt(sigma2)
yy = np.zeros(n)
# Draw responses one at a time so the random stream is consumed in a fixed,
# reproducible order.
for i, (x_i, c_i) in enumerate(zip(xx, cc)):
    mu = np.dot(x_i, betas[c_i])
    yy[i] = np.random.normal(loc=mu, scale=sigma)

# Generate grid points
# NOTE(review): re-seeding with the same seed makes xx_grid repeat the leading
# draws of xx (both are the first normal draw after seeding) — confirm that
# this is intended rather than independent grid covariates.
np.random.seed(rng)
yy_grid = np.arange(-5.0, +5.0, 0.1)
# One covariate row per grid point
xx_grid = np.random.normal(loc=0.0, scale=1.0, size=(yy_grid.size, dim))

In [None]:
# Save data and grid points to file: covariates and responses for the
# observations first, then covariates and responses for the grid.
for csv_path, values in (
    ("../resources/csv/in/covs_lru.csv", xx),
    ("../resources/csv/in/data_lru.csv", yy),
    ("../resources/csv/in/covs_grid_lru.csv", xx_grid),
    ("../resources/csv/in/grid_lru.csv", yy_grid),
):
    save_np(csv_path, values)

In [None]:
# Run the executable
# NOTE(review): the meaning of each positional argument is defined by
# ../build/run, which is not visible from this notebook; confirm the order
# against its usage message.
cmd = ["../build/run",
    "N2", str(rng), "0", "1000", "100",
    "LinRegUni", "../resources/asciipb/lin_reg_univ_fixed.asciipb",
    "DP", "../resources/asciipb/dp_gamma_prior.asciipb",
    "",
    # Responses are paired with the response grid (grid_lru.csv, written by
    # the save cell); the matching covariate files are passed as the last two
    # arguments. The covariate grid must not be passed in place of the grid.
    "../resources/csv/in/data_lru.csv",  "../resources/csv/in/grid_lru.csv",
    "../resources/csv/out/lru_dens.csv", "../resources/csv/out/lru_mass.csv",
    "../resources/csv/out/lru_nclu.csv", "../resources/csv/out/lru_clus.csv",
    "../resources/csv/in/covs_lru.csv",  "../resources/csv/in/covs_grid_lru.csv"
]
# Left as the last expression of the cell so the notebook displays the
# CompletedProcess (return code, captured stdout and stderr).
subprocess.run(cmd, capture_output=True)

## Simulation study

In [None]:
# Utility to read file collector, courtesy of
# github.com/mberaha/utils/blob/master/proto_utils/py/recordio.py
from google.protobuf.internal.decoder import _DecodeVarint32

def readManyFromFile(filename, msgType):
    """Read a sequence of length-delimited messages from a binary file.

    The file is read as records stored back to back, each one a varint32
    length prefix followed by that many bytes of serialized message.

    Args:
        filename: Path of the file to read.
        msgType: Zero-argument callable returning an object that exposes
            ``ParseFromString``; called once per record.

    Returns:
        List of the decoded messages, in file order. Reading stops at the
        first truncated or unparsable record: a ``RuntimeWarning`` is issued
        and the messages decoded up to that point are returned.
    """
    import warnings

    with open(filename, "rb") as fp:
        buf = fp.read()

    out = []
    pos = 0
    while pos < len(buf):
        msg_len, start = _DecodeVarint32(buf, pos)
        end = start + msg_len
        if end > len(buf):
            # The length prefix promises more bytes than the file contains.
            warnings.warn(
                f"{filename}: truncated record at byte {pos} "
                f"(expected {msg_len} bytes, found {len(buf) - start}); "
                f"returning {len(out)} message(s)",
                RuntimeWarning,
                stacklevel=2,
            )
            break
        try:
            msg = msgType()
            msg.ParseFromString(buf[start:end])
        except Exception as exc:
            # Best-effort read: keep what was decoded, but say why we stopped.
            warnings.warn(
                f"{filename}: could not parse record at byte {pos}: {exc!r}; "
                f"returning {len(out)} message(s)",
                RuntimeWarning,
                stacklevel=2,
            )
            break
        out.append(msg)
        pos = end
    return out

In [None]:
# Read chain
import sys
# Put '..' (resolved against the current working directory) first on the
# import path so that `proto.py.ls_state_pb2` can be imported below; this
# must run before the import.
sys.path.insert(0, '..')
from proto.py.ls_state_pb2 import LinRegUnivLSState

# Decode every record stored in the file into a list of LinRegUnivLSState.
# NOTE(review): presumably this file is written by the executable run earlier,
# but nothing visible in this notebook names it — confirm the path.
chain = readManyFromFile('../lru.recordio', LinRegUnivLSState)

In [None]:
# Dump the decoded chain (the list returned by readManyFromFile) for inspection
print(chain)

TODO:
* Compare the posterior similarity matrix with the true similarity matrix,
  the posterior number of clusters with the true number of components (3),
  and the values of the betas at some iteration with the true ones
  (they should roughly match `betas[0]`, ..., `betas[2]`)
* Check that the MSE on the training set is lower than the MSE you
  would get using a standard linear regression

## Vs regular linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares, fitted and scored on the same data
model = LinearRegression().fit(xx, yy)
mse_sk = mean_squared_error(y_true=yy, y_pred=model.predict(xx))
print(mse_sk)  # 12.13