In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as dist

The function takes a random variable Z ~exp(a) [for clarity regarding the notation, here the
parameter 'a' is intended such that E(Z) = a]. Given input data X as integers, the
 output of function y is Y= X*Z

Starting with a parameter of my choice, I constructed some synthetic data.
My goal was then to create a model able to guess that starting paramater

the trick was to call a random vairiable epsilon with uniform distribution that, rearrenged,
 behaves like a exponential distribution. This trick involves computing the inverse of the
 CDF of the exp distribution.

 in brief if eps~unif(0,1)  and Z = -ln(1-eps)/a --> Z~exp(a)
 moreover y would now be Y= x*Z = x* (-ln(1-eps)/a) = x/a * (-ln(1-eps))

 I can now apply the reparametrization trick with a forward (deterministic) function
 that depends on x and a (differentiable!!), and compute values of eps in order to get
 reliable y_pred values.

In [5]:
torch.manual_seed(0)

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.a = nn.Parameter(torch.tensor([1.0]))  # Linear layer for parameter 'b'

    def forward(self, x):
        y_pred = x*self.a

        return y_pred

x_train = torch.randint(0, 21, (1, 10000)).float()  # Random input data
a_true = torch.tensor(5)      # True value of parameter 'a'
Z_train = torch.distributions.exponential.Exponential(1/a_true).sample(x_train.shape)# Generate Z from exp distribution
y_train = x_train * Z_train     # Compute y from the stochastic function

# Instantiate the model
model = MyModel()

# Define optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Training loop
epochs = 1000
eps = torch.rand_like(x_train)
for epoch in range(epochs):
    eps = torch.rand_like(x_train)
    Z = -torch.log(1-eps)
    y_pred= model(x_train) * Z


    loss = (y_pred.mean()-y_train.mean())**2
    #worth of mention that is quite meaningless here to compute the MSE of predicted vs true
    # values due to the fact that the difference between single predictions can simply derive
    # from the stochastic process. Instead look at mean (think about expected value of y that
    # depend on parameter a)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

print(y_train.mean(), y_pred.mean())
print("True parameter 'a':", a_true.item(),"Learned parameter 'a':", model.a.item())

Epoch [100/1000], Loss: 1.137718677520752
Epoch [200/1000], Loss: 1.6731791496276855
Epoch [300/1000], Loss: 0.16615675389766693
Epoch [400/1000], Loss: 0.5110507011413574
Epoch [500/1000], Loss: 0.048011913895606995
Epoch [600/1000], Loss: 0.09210830181837082
Epoch [700/1000], Loss: 3.226335684303194e-05
Epoch [800/1000], Loss: 0.09387179464101791
Epoch [900/1000], Loss: 0.2512795627117157
Epoch [1000/1000], Loss: 0.10957447439432144
tensor(51.1566) tensor(51.4876, grad_fn=<MeanBackward0>)
True parameter 'a': 5 Learned parameter 'a': 5.0678629875183105
