# LaTeX Symbol Recognition

This notebook trains a simple machine learning model to recognize hand-drawn LaTeX symbols. The model is a convolutional neural network (CNN) trained on the [detexify](https://github.com/kirel/detexify-data) dataset of images of LaTeX symbols.

## Running
This notebook is designed for Google Colab, which provides free training resources, but it can be easily adapted to other platforms.


- Upload the training data to Google Drive (see `prepare_data.py` for more details). The archive must be stored at the location given by `file_path` in the cell below.
- Connect to a Colab instance; training is faster on one with a GPU.
- Run all cells
- Download the finished model

**Adapted from:** [https://j3698.github.io/extexify/training-the-symbol-recognizer](https://j3698.github.io/extexify/training-the-symbol-recognizer)

In [None]:
import tarfile
import os


# Mount Google Drive so the uploaded training archive is readable from Colab.
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/images.tar.xz'

# Local directory that receives the extracted images.
output_dir = '/content/images'
os.makedirs(output_dir, exist_ok=True)

# Extract the training data. When the interpreter supports extraction
# filters, use the "data" filter so archive members cannot be written
# outside output_dir (absolute paths, "..", unsafe links) and so that
# Python 3.12+ does not emit a DeprecationWarning for an unfiltered extract.
with tarfile.open(file_path, 'r:xz') as tar:
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=output_dir, filter='data')
    else:
        # Older interpreters without filter support: plain extraction.
        tar.extractall(path=output_dir)

In [None]:
import os

# Dataset split directories to check
directories = [f'/content/images/{split}' for split in ('train', 'val', 'test')]
import os

def remove_empty_files_and_folders(directory):
    """Delete zero-byte files and empty sub-folders below ``directory``.

    The tree is walked bottom-up, so a folder that becomes empty because its
    contents were just deleted is itself removed when its parent is visited.
    ``directory`` itself is never removed. Every deletion is printed.
    """
    for parent, subfolders, filenames in os.walk(directory, topdown=False):
        # Zero-byte regular files first, so folders holding only such files
        # are empty by the time their parent inspects them.
        for name in filenames:
            candidate = os.path.join(parent, name)
            if not os.path.isfile(candidate) or os.path.getsize(candidate) != 0:
                continue
            print(f"Removing empty file: {candidate}")
            os.remove(candidate)

        # Then any direct sub-folder that has nothing left inside it.
        for name in subfolders:
            folder = os.path.join(parent, name)
            if os.listdir(folder):
                continue
            print(f"Removing empty folder: {folder}")
            os.rmdir(folder)

# Clean up every dataset split directory listed in `directories`.
# remove_empty_files_and_folders deletes zero-byte files as well as empty
# folders, and prints each path it removes.
for dir_path in directories:
    remove_empty_files_and_folders(dir_path)

# Final status line (printed regardless of whether anything was removed).
print("Empty folders removed.")

In [None]:
# needed for exporting the model to ONNX
!pip install onnx onnxscript

In [None]:
import os
import torch
import torch.optim as optim
import torch.nn as nn
try:
    from tqdm import tqdm
except ImportError:
    # tqdm is treated as optional, but the training/validation loops call
    # tqdm(...) and bar.set_postfix(...) unconditionally. Silently ignoring
    # the failed import would leave the name undefined and crash later with
    # a NameError, so provide a minimal stand-in with the same two features.
    class tqdm:
        """Fallback progress "bar": plain iteration plus a final metrics line."""

        def __init__(self, iterable, *args, **kwargs):
            self._iterable = iterable
            self._postfix = None

        def __iter__(self):
            yield from self._iterable
            # Without a live bar, report the last recorded metrics once.
            if self._postfix is not None:
                print(self._postfix)

        def set_postfix(self, ordered_dict=None, refresh=True, **kwargs):
            """Remember the latest metrics; printed when iteration finishes."""
            self._postfix = dict(ordered_dict or {}, **kwargs)
import torch.optim.lr_scheduler as lr_scheduler
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader


# Training hyperparameters.
# Batch size: 512 when CUDA is available, otherwise 2.
batch_size = 512 if torch.cuda.is_available() else 2
# Number of training epochs run by train_model.
epochs = 20
# Step size handed to the StepLR scheduler.
# NOTE(review): 21 is larger than `epochs`, and the scheduler is stepped once
# per training epoch, so the scheduled step is never reached in a single run
# — confirm this is intended.
step_size = 21
# Output width of the model's final Linear layer; presumably the number of
# symbol classes in the dataset — TODO confirm against the data.
output_size = 1072

class Model(nn.Module):
    """Convolutional classifier for 3-channel symbol images.

    Five stages of ``Conv2d`` (3x3 kernel, padding 1) -> ``ReLU`` ->
    ``BatchNorm2d``; the first stage has stride 1 and the other four have
    stride 2. The feature map is then average-pooled to 1x1, flattened, and
    passed through a linear layer with ``output_size`` outputs.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, stride) for each 3x3 convolution stage
        conv_specs = [
            (3, 64, 1),
            (64, 64, 2),
            (64, 128, 2),
            (128, 256, 2),
            (256, 512, 2),
        ]

        stages = []
        for in_ch, out_ch, stride in conv_specs:
            stages += [
                nn.Conv2d(in_ch, out_ch, 3, stride, 1),
                nn.ReLU(),
                nn.BatchNorm2d(out_ch),
            ]

        # Global average pooling followed by the linear classification head
        head = [
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(512, output_size),
        ]

        self.layers = nn.Sequential(*stages, *head)

    def forward(self, x):
        """Run batch ``x`` through the layer stack and return its output."""
        return self.layers(x)


def _checkpoint_if_improved(run_name, model, optimizer, scheduler, epoch,
                            train_loader, val_loader, best_top5):
    """Validate ``model`` and save it when top-5 accuracy beats ``best_top5``.

    On improvement, writes ``{run_name}.pt`` (via ``save``) and an ONNX export
    ``{run_name}.onnx``, then returns the new best top-5 accuracy; otherwise
    returns ``best_top5`` unchanged.
    """
    _, top5 = validate(model, val_loader)
    if top5 <= best_top5:
        return best_top5

    save(f"{run_name}.pt", model, optimizer, scheduler, epoch)

    # Trace the ONNX graph with a random batch of one, shaped like the first
    # training sample. validate() has left the model in eval mode.
    sample_shape = train_loader.dataset[0][0].shape
    dummy_input = torch.randn(1, *sample_shape)
    if torch.cuda.is_available():
        dummy_input = dummy_input.cuda()
    torch.onnx.export(model, dummy_input, f"{run_name}.onnx", export_params=True)
    print("Saved model")
    return top5


def train_model(run_name, model, criterion, optimizer, scheduler,\
                epochs, train_loader, val_loader, test_loader):
    """Train ``model`` for ``epochs`` epochs, checkpointing the best weights.

    The model is validated on ``val_loader`` before every training epoch and
    once more after the last one. Whenever top-5 validation accuracy improves,
    the model is saved to ``{run_name}.pt`` and exported to
    ``{run_name}.onnx``. The ``epoch`` stored in a checkpoint is the number of
    training epochs completed at that point.

    Args:
        run_name: Base name (without extension) of the output files.
        model: Network to train; moved to the GPU when CUDA is available.
        criterion: Loss function passed to ``train_epoch``.
        optimizer: Optimizer passed to ``train_epoch`` and stored in checkpoints.
        scheduler: LR scheduler passed to ``train_epoch`` and stored in checkpoints.
        epochs: Number of training epochs to run.
        train_loader: Loader for training batches; its first sample also
            provides the input shape for the ONNX export.
        val_loader: Loader used for model selection.
        test_loader: Accepted for interface compatibility; not used here.
    """
    if torch.cuda.is_available():
        model.cuda()

    best_top5 = 0
    for e in range(epochs):
        best_top5 = _checkpoint_if_improved(
            run_name, model, optimizer, scheduler, e,
            train_loader, val_loader, best_top5)

        train_epoch(model, optimizer, criterion, train_loader, scheduler)

    # Validation inside the loop runs *before* each training epoch, so without
    # this final pass the weights produced by the last epoch would never be
    # evaluated or saved.
    _checkpoint_if_improved(
        run_name, model, optimizer, scheduler, epochs,
        train_loader, val_loader, best_top5)


def save(filename, model, optimizer, scheduler, epoch):
    """Write a training checkpoint to ``filename`` using ``torch.save``.

    The checkpoint is a dict holding the ``state_dict()`` of the model,
    optimizer and scheduler under the keys "model", "optimizer" and
    "scheduler", plus the given ``epoch`` under "epoch".
    """
    components = (("model", model), ("optimizer", optimizer), ("scheduler", scheduler))
    checkpoint = {key: component.state_dict() for key, component in components}
    checkpoint["epoch"] = epoch
    torch.save(checkpoint, filename)


def train_epoch(model, optimizer, criterion, train_loader, scheduler):
    """Run one training pass over ``train_loader``, then step ``scheduler``.

    For every batch: move it to the GPU when CUDA is available, compute the
    loss with ``criterion``, back-propagate and apply one optimizer step.
    Running top-1 / top-5 accuracy is shown on the progress bar.
    """
    model.train()

    seen = hits1 = hits5 = 0

    progress = tqdm(train_loader)
    for inputs, targets in progress:
        if torch.cuda.is_available():
            inputs, targets = inputs.cuda(), targets.cuda()

        optimizer.zero_grad()

        scores = model(inputs)
        criterion(scores, targets).backward()
        optimizer.step()

        # Accuracy bookkeeping uses the scores computed before the update.
        seen += len(targets)
        hits1 += topk_correct(scores, targets, 1)
        hits5 += topk_correct(scores, targets, 5)
        update_bar(progress, hits1, hits5, seen)

    # The learning-rate schedule advances once per epoch.
    scheduler.step()


@torch.no_grad()
def validate(model, eval_loader):
    """Evaluate ``model`` on ``eval_loader`` with gradients disabled.

    Puts the model in eval mode, shows running accuracy on a progress bar,
    and returns ``(top1, top5)`` as fractions of the samples seen.
    """
    model.eval()

    seen = 0
    hits = {1: 0, 5: 0}  # k -> number of samples whose label is in the top k

    progress = tqdm(eval_loader)
    for images, labels in progress:
        if torch.cuda.is_available():
            images, labels = images.cuda(), labels.cuda()

        scores = model(images)

        seen += len(labels)
        for k in hits:
            hits[k] += topk_correct(scores, labels, k)
        update_bar(progress, hits[1], hits[5], seen)

    return hits[1] / seen, hits[5] / seen


def update_bar(bar, correct, correct5, total):
    """Show running top-1 / top-5 accuracy (in percent) as the bar's postfix."""
    counts = (("top1", correct), ("top5", correct5))
    bar.set_postfix({label: 100 * hits / total for label, hits in counts})


def topk_correct(out, y, k):
    """Count the rows of ``out`` whose label in ``y`` is among the ``k`` highest scores."""
    top_indices = out.topk(k, dim=1).indices
    # Compare each row's k candidates against that row's label.
    label_found = (top_indices == y.unsqueeze(1)).any(dim=1)
    return label_found.sum().item()

# loss function
criterion = nn.CrossEntropyLoss()

# datasets: one ImageFolder per split; each split derives its label indices
# from its own class sub-folders
dataset_train = ImageFolder("/content/images/train", transform = ToTensor())
dataset_val = ImageFolder("/content/images/val", transform = ToTensor())
dataset_test = ImageFolder("/content/images/test", transform = ToTensor())

# Label indices only mean the same thing in every split when the class lists
# are identical. Comparing just the counts would let splits with different
# class folders (e.g. after empty folders were removed) pass and silently
# mislabel the validation/test samples.
assert dataset_train.classes == dataset_val.classes == dataset_test.classes, \
    "Train, val and test datasets must contain exactly the same classes"
# The model's final layer has output_size outputs; more classes than that
# would make the loss fail on out-of-range targets.
assert len(dataset_train.classes) <= output_size, \
    f"Found {len(dataset_train.classes)} classes but output_size is {output_size}"
print("Classes:")
print(dataset_train.classes)

# for loading data into batches
# os.cpu_count() may return None when the count cannot be determined; fall
# back to loading in the main process in that case.
train_loader = DataLoader(dataset_train, batch_size = batch_size,
                          shuffle = True, num_workers = os.cpu_count() or 0)
val_loader = DataLoader(dataset_val, batch_size = batch_size, shuffle = False)
test_loader = DataLoader(dataset_test, batch_size = batch_size, shuffle = False)

# run configuration: run_name is the base name of the saved .pt / .onnx files
run_name = "Test"
model = Model()
optimizer = optim.Adam(model.parameters(), weight_decay = 1e-4)
scheduler = lr_scheduler.StepLR(optimizer, step_size)

train_model(run_name, model, criterion, optimizer,
            scheduler, epochs, train_loader, val_loader, test_loader)
