In [1]:
import numpy as np

Generate the data.

In [2]:
# Dimension of each LF output: either the vocab size or the embedding size.
# It is not easy to generate samples on the probability simplex that satisfy a
# Gaussian difference, so we stick to embeddings/unconstrained logits.
vocab_size = 10
m = 3  # number of prompts

# Seed NumPy's global RNG so that Restart & Run All reproduces the same theta,
# samples, and triplet choices in the cells that follow.
SEED = 0
np.random.seed(SEED)

In [13]:
# Ground-truth theta "accuracies", one per prompt. All positive for now;
# a larger theta means a less noisy LF.
theta = np.random.random(m)*50
print("Theta:", theta)

# Mean and covariance of the multivariate-Gaussian formulation of the model.
# Each prompt owns vocab_size consecutive coordinates, all sharing the variance
# 1/(2*theta[prompt]); np.repeat lays the variances out in exactly that order.
sigma_diag = np.repeat(1/(2 * theta), vocab_size)

print("Sigma diag:", sigma_diag)

sigma = np.diag(sigma_diag)
mu = np.zeros(m * vocab_size)

print("mu:", mu) # Zero mean

Theta: [21.44408149  9.07740938 27.70640861]
Sigma diag: [0.02331646 0.02331646 0.02331646 0.02331646 0.02331646 0.02331646
 0.02331646 0.02331646 0.02331646 0.02331646 0.05508179 0.05508179
 0.05508179 0.05508179 0.05508179 0.05508179 0.05508179 0.05508179
 0.05508179 0.05508179 0.01804637 0.01804637 0.01804637 0.01804637
 0.01804637 0.01804637 0.01804637 0.01804637 0.01804637 0.01804637]
mu: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]


In [18]:
n = 10000  # number of synthetic examples to draw
all_lfs_y = []
all_diffs = []
count = 0
for count in range(1, n + 1):
    # Noise for every (prompt, coordinate) pair, sampled from the MV Gaussian.
    diff = np.random.multivariate_normal(mu, sigma)
    all_diffs.append(diff)

    # Random ground-truth y; each LF vote is y plus that LF's slice of the noise
    # (broadcasting y over the m rows).
    y = np.random.random(vocab_size)
    lfs = diff.reshape((m, vocab_size)) + y

    # Stack y underneath the LF votes so each example is [LF rows..., y].
    lfs_y = np.vstack([lfs, y])
    all_lfs_y.append(lfs_y)

all_lfs_y = np.array(all_lfs_y)
all_lfs_y.shape # Along axis 1, rows 0..m-1 are the LF votes and row m is y

(10000, 4, 10)

Mean parameter estimation.

In [19]:
def triplet(i, j, k, lfs_y=None):
    """Triplet estimate of the mean squared distance tied to row ``i``.

    Combines the three pairwise mean squared Euclidean distances between rows
    ``i``, ``j`` and ``k`` as ``0.5 * (d_ij + d_ik - d_jk)``. The notebook
    compares this value with the directly measured mean squared distance
    between row ``i`` and the ground-truth row.

    Parameters
    ----------
    i, j, k : int
        Row indices along axis 1 of ``lfs_y``.
    lfs_y : array-like of shape (n_samples, n_rows, dim), optional
        Stacked votes. Defaults to the notebook-level ``all_lfs_y`` so existing
        ``triplet(i, j, k)`` calls keep working.

    Returns
    -------
    float
        ``0.5 * (d_ij + d_ik - d_jk)``.
    """
    if lfs_y is None:
        lfs_y = all_lfs_y  # fall back to the notebook's generated data
    lfs_y = np.asarray(lfs_y)

    def mean_sq_dist(a, b):
        # Squared L2 distance per sample, averaged over all samples.
        return (np.linalg.norm(lfs_y[:, a, :] - lfs_y[:, b, :], axis=1, ord=2)**2).mean()

    return 0.5*(mean_sq_dist(i, j) + mean_sq_dist(i, k) - mean_sq_dist(j, k))

In [20]:
diff = np.zeros(m)

for i in range(m):
    # Pick two distinct partner LFs at random from those other than i.
    candidates = np.arange(m)
    j, k = np.random.choice(candidates[candidates != i], size=2, replace=False)

    # Triplet estimate (LF votes only) next to the direct measurement against
    # the ground-truth row m, printed side by side for comparison.
    diff[i] = triplet(i, j, k)
    direct = (np.linalg.norm(all_lfs_y[:, i, :] - all_lfs_y[:, m, :], axis=1)**2).mean()

    print(diff[i], direct)

0.23288287180858308 0.23369112730365055
0.5578053959351905 0.554497767481926
0.1796984318051682 0.18107175815446175


Convert to canonical parameters.

In [21]:
# Map the mean parameters (estimated squared distances) to canonical
# parameters: theta_i = vocab_size / (2 * diff_i).
theta_estimate = np.divide(vocab_size, 2*diff)

print(theta_estimate, theta)

[21.47002036  8.96369959 27.82439418] [21.44408149  9.07740938 27.70640861]


Solve the optimization problem at inference time. Here the solution is computed in closed form as the theta-weighted average of the LF votes.

In [22]:
# Take the first generated example: rows 0..m-1 are the LF votes, row m is y.
sample_lfs, true_y = all_lfs_y[0, :m], all_lfs_y[0, m]

In [23]:
# Weighted average of the LF votes, with the estimated thetas as weights.
weight_total = theta_estimate.sum()
weighted_votes = np.dot(sample_lfs.T, theta_estimate)
predicted_y = 1/weight_total * weighted_votes
print(predicted_y, true_y)

[0.53693616 0.43080623 0.68649778 0.67329944 0.17628504 0.54484161
 0.46845506 0.9088682  0.19074946 0.6546315 ] [0.54041229 0.4833929  0.62790673 0.53810944 0.20376202 0.55361529
 0.38772711 0.87666378 0.44508623 0.61130735]
