In [1]:
from PIL import Image
from pix2tex.cli import LatexOCR

# Run the pix2tex OCR model on the first sample image and print what it returns.
img = Image.open('math2.png')
# `model` is built once here and reused by the next cell for the second image.
model = LatexOCR()
print(model(img))

a_{n+1}=a_{n}-\frac{1}{c_{n}},b_{n+1}=b_{n}+\frac{1}{d_{n}},c_{n+1}=c_{n}+\frac{1}{a_{n+1}},d_{n+1}=d_{n}+\frac{1}{b_{n+1}}.


![](./math2.png)
$$
a_{n+1}=a_{n}-\frac{1}{c_{n}},b_{n+1}=b_{n}+\frac{1}{d_{n}},c_{n+1}=c_{n}+\frac{1}{a_{n+1}},d_{n+1}=d_{n}+\frac{1}{b_{n+1}}.
$$

In [2]:
# Second sample: reuse the `model` constructed in the previous cell on another image.
img = Image.open('math3.png')
print(model(img))

\ 2\ll\alpha f(\alpha+\varepsilon)+(\alpha+\varepsilon)f(\alpha)\mathop{\bf\Xi}\le\frac{\alpha}{\alpha+\varepsilon}+\left(\alpha+\varepsilon\right)f(\alpha)


![](math3.png)
$$
\ 2\ll\alpha f(\alpha+\varepsilon)+(\alpha+\varepsilon)f(\alpha)\mathop{\bf\Xi}\le\frac{\alpha}{\alpha+\varepsilon}+\left(\alpha+\varepsilon\right)f(\alpha)
$$

$$
2<x f(x+\varepsilon)+(x+\varepsilon) f(x) \leq \frac{x}{x+\varepsilon}+(x+\varepsilon) f(x)
$$

In [4]:
import torch

def monotonic_activation(p):
    """Monotonic activation f applied to the sign-purity score P.

    Kept as the identity for simplicity, so f(P) is P itself.
    """
    return p  # Identity function for simplicity


def grad_drop_backward(A, L_funcs, leak_params):
    """Combine per-task gradients w.r.t. ``A`` using GradDrop sign masking.

    For every task i the gradient of ``L_funcs[i](A)`` with respect to ``A`` is
    computed once. The sign-adjusted gradients ``sign(A) * grad_i`` are summed
    over dimension 0 and turned into a score ``P`` in [0, 1]. A single uniform
    sample ``U`` then decides, per position of ``P``, whether positive or
    negative contributions are kept.

    Args:
        A: Tensor that requires grad. Dimension 0 is summed over when building
            the sign statistics (treated as the batch dimension).
        L_funcs: Sequence of callables; each maps ``A`` to a scalar loss.
        leak_params: Sequence with one leak value per task. A value of 0 means
            the task gradient is fully masked, 1 means it always passes.

    Returns:
        Tensor with the shape of ``A``:
        ``sum_i (leak_i + (1 - leak_i) * M_i) * grad_i``.

    Raises:
        ValueError: If ``L_funcs`` is empty.
        IndexError: If ``leak_params`` has fewer entries than ``L_funcs``.
    """
    n = len(L_funcs)  # Number of tasks
    if n == 0:
        raise ValueError("grad_drop_backward requires at least one loss function")

    # Evaluate and differentiate each task loss exactly once; the same raw
    # gradients feed both the sign statistics and the final combination.
    # create_graph=True is kept so the returned gradient stays differentiable.
    raw_grads = [
        torch.autograd.grad(L_func(A), A, create_graph=True)[0]
        for L_func in L_funcs
    ]

    # Sign-adjusted gradients G_i, summed over dimension 0.
    # torch.sign(A) is part of the autograd graph, so a `requires_grad` test on
    # G_i is always true; the sum is therefore written unconditionally.
    # The statistics only feed boolean comparisons, so they are detached.
    sign_A = torch.sign(A)
    grads = [(sign_A * raw).sum(dim=0).detach() for raw in raw_grads]

    # Calculate P
    abs_grad_sum = sum(torch.abs(g) for g in grads)
    grad_sum = sum(grads)
    # Where every task gradient is zero the ratio would be 0/0 (NaN); dividing
    # by 1 there yields the neutral value P = 0.5.
    safe_abs_sum = torch.where(
        abs_grad_sum > 0, abs_grad_sum, torch.ones_like(abs_grad_sum)
    )
    P = 0.5 * (1 + grad_sum / safe_abs_sum)

    # Sample U, a tensor with the same shape as P. It is shared by all tasks so
    # that at each position only one sign survives.
    U = torch.rand_like(P)
    f_P = monotonic_activation(P)
    keep_positive = (f_P > U).float()
    keep_negative = (f_P < U).float()

    # Calculate masks and apply GradDrop
    new_grad = torch.zeros_like(A)
    for i in range(n):
        M_i = (keep_positive * (grads[i] > 0).float()
               + keep_negative * (grads[i] < 0).float())
        new_grad = new_grad + (leak_params[i] + (1 - leak_params[i]) * M_i) * raw_grads[i]

    return new_grad

# Example usage:
# Assuming A is the input activation tensor, L_funcs is a list of loss functions, and leak_params is a list of leak parameters
# A = torch.randn((batch_size, num_features), requires_grad=True)
# L_funcs = [loss1, loss2, ..., lossN]  # Replace with actual loss functions
# leak_params = [0.1, 0.2, ..., 0.1]  # Replace with actual leak parameters
# new_grad = grad_drop_backward(A, L_funcs, leak_params)
# Here, we would use new_grad to update the weights in the network


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple neural network with a single linear layer
class SimpleNet(nn.Module):
    """Minimal network consisting of one fully connected layer."""

    def __init__(self, input_size, output_size):
        """Create the layer mapping ``input_size`` features to ``output_size``."""
        super().__init__()
        # The attribute name fixes the parameter names
        # (`linear.weight`, `linear.bias`).
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        projected = self.linear(x)
        return projected

# Define the GradDrop hook
def grad_drop_hook(grad, P, leak_param, activation=None):
    """Apply a GradDrop mask to a single gradient tensor.

    A uniform sample ``U`` with the shape of ``P`` is drawn. Positive entries
    of ``grad`` are kept where ``f(P) > U`` and negative entries where
    ``f(P) < U``; the resulting 0/1 mask ``M`` is blended with the leak value.

    Args:
        grad: Gradient tensor to mask.
        P: Score tensor compared against ``U``; must broadcast with ``grad``.
        leak_param: Leak value; 0 applies the mask fully, 1 returns ``grad``
            unchanged.
        activation: Optional callable ``f`` applied to ``P``. When omitted the
            notebook-level ``monotonic_activation`` is used.

    Returns:
        ``(leak_param + (1 - leak_param) * M) * grad``.
    """
    if activation is None:
        # Looked up at call time so the default follows the notebook definition.
        activation = monotonic_activation
    f_P = activation(P)
    U = torch.rand_like(P)
    M = ((f_P > U).float() * (grad > 0).float()
         + (f_P < U).float() * (grad < 0).float())
    # The whole blend factor scales the gradient. Without the parentheses the
    # leak value would be added to the gradient as a constant offset.
    return (leak_param + (1 - leak_param) * M) * grad

# Test case setup
# Fix the RNG seed so the randomly initialised weights, the dummy input and the
# random mask samples drawn later are reproducible on Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

input_size = 5
output_size = 3
batch_size = 2

# Initialize the neural network
net = SimpleNet(input_size, output_size)

# Create some dummy input data
x = torch.randn(batch_size, input_size)

# Define two dummy loss functions for two different tasks
def task1_loss(output):
    """Mean-squared error between ``output`` and an all-ones target.

    The target is created with ``torch.ones_like(output)`` so it always matches
    the shape, dtype and device of ``output`` rather than depending on the
    notebook-level ``batch_size`` / ``output_size`` variables.
    """
    target = torch.ones_like(output)
    return torch.nn.functional.mse_loss(output, target)

def task2_loss(output):
    """Mean-squared error between ``output`` and an all-zeros target.

    The target is created with ``torch.zeros_like(output)`` so it always
    matches the shape, dtype and device of ``output`` rather than depending on
    the notebook-level ``batch_size`` / ``output_size`` variables.
    """
    target = torch.zeros_like(output)
    return torch.nn.functional.mse_loss(output, target)

# List of loss functions
L_funcs = [task1_loss, task2_loss]

# Leak parameters for GradDrop (one per task)
leak_params = [0.0, 0.0]  # Pure GradDrop in this case
assert len(leak_params) == len(L_funcs), "need exactly one leak parameter per task"

# Forward pass through the network
output = net(x)

# Compute gradients for each task.
# torch.autograd.grad returns the gradients directly, aligned with `params`,
# without accumulating into p.grad; retain_graph=True lets the same forward
# graph be differentiated once per task.
params = [p for p in net.parameters() if p.requires_grad]
task_gradients = []
for L_func in L_funcs:
    loss = L_func(output)
    grads_for_task = torch.autograd.grad(
        loss, params, retain_graph=True, allow_unused=True
    )
    # A parameter that a loss does not depend on contributes a zero gradient.
    task_gradients.append([
        torch.zeros_like(p) if g is None else g
        for p, g in zip(params, grads_for_task)
    ])

# Apply GradDrop
for i, p in enumerate(params):
    per_task = [g[i] for g in task_gradients]

    # Compute P for this parameter
    abs_grad_sum = sum(torch.abs(g) for g in per_task)
    grad_sum = sum(per_task)
    # Avoid 0/0 where every task gradient is zero; P is 0.5 there.
    safe_abs_sum = torch.where(
        abs_grad_sum > 0, abs_grad_sum, torch.ones_like(abs_grad_sum)
    )
    P = 0.5 * (1 + grad_sum / safe_abs_sum)

    # All tasks must share one random draw U so that only one sign survives per
    # entry. grad_drop_hook samples its own U on every call, so the masks are
    # combined inline here instead of calling it once per task.
    U = torch.rand_like(P)
    f_P = monotonic_activation(P)
    keep_positive = (f_P > U).float()
    keep_negative = (f_P < U).float()

    # Sum the masked gradients of every task. Using p.grad after the last
    # backward pass would only contain the final task's gradient.
    new_grad = torch.zeros_like(p)
    for leak_param, g in zip(leak_params, per_task):
        M = keep_positive * (g > 0).float() + keep_negative * (g < 0).float()
        new_grad += (leak_param + (1 - leak_param) * M) * g
    p.grad = new_grad

# Update parameters
optimizer = optim.SGD(net.parameters(), lr=0.1)
optimizer.step()

# Print the updated weights and biases
for name, param in net.named_parameters():
    print(f"{name}: {param}")


linear.weight: Parameter containing:
tensor([[ 0.1920, -0.0759, -0.3001, -0.1595,  0.1161],
        [-0.3508,  0.3999, -0.0971, -0.0475,  0.3453],
        [ 0.2742,  0.2636, -0.0997,  0.1964, -0.4187]], requires_grad=True)
linear.bias: Parameter containing:
tensor([ 0.3550, -0.0889, -0.1403], requires_grad=True)
