In [2]:
import os
from PIL import Image, ImageDraw, ImageFont
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

In [10]:
# Generate the dataset of characters for training the OCR model

FONT_PATH = "font_spanish_license_plate\MESPREG.ttf"  # the font
OUTPUT_DIR = "models/dataset_chars"
MODEL_DIR = "models"
CHARS = "0123456789BCDFGHJKLMNPQRSTVWXYZ"
IMAGES_PER_CHAR = 200
IMG_SIZE = (28, 28)
BATCH_SIZE = 64
EPOCHS = 20
LR = 0.001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Generate the dataset
def generate_dataset():
    print("Generating dataset...")
    for split in ["train", "val"]:
        split_dir = os.path.join(OUTPUT_DIR, split)
        os.makedirs(split_dir, exist_ok=True)
        for c in CHARS:
            char_dir = os.path.join(split_dir, c)
            os.makedirs(char_dir, exist_ok=True)
            for i in range(IMAGES_PER_CHAR):
                img = Image.new("L", IMG_SIZE, color=255)
                draw = ImageDraw.Draw(img)
                try:
                    font = ImageFont.truetype(FONT_PATH, 24)
                except Exception as e:
                    raise ValueError(f"No se pudo cargar la fuente: {FONT_PATH}") from e
                bbox = font.getbbox(c)
                w = bbox[2] - bbox[0]
                h = bbox[3] - bbox[1]
                draw.text(((IMG_SIZE[0]-w)/2 - bbox[0], (IMG_SIZE[1]-h)/2 - bbox[1]), c, fill=0, font=font)
                
                img.save(os.path.join(char_dir, f"{c}_{i}.png"))
    print("Dataset done.")


# Define the OCR model
class CharRecognizer(nn.Module):
    def __init__(self, num_classes=len(CHARS)):
        super(CharRecognizer, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.fc1 = nn.Linear(64*5*5, 128)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

def get_data_loaders():
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    train_dataset = datasets.ImageFolder(os.path.join(OUTPUT_DIR, "train"), transform=transform)
    val_dataset = datasets.ImageFolder(os.path.join(OUTPUT_DIR, "val"), transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    return train_loader, val_loader

# Train
def train_model():
    model = CharRecognizer().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)

    train_loader, val_loader = get_data_loaders()

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for images, labels in train_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_loss:.4f}")

        # Evaluación en validation set
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        acc = 100 * correct / total
        print(f"Validation Accuracy: {acc:.2f}%")

    # Save the model
    model_path = os.path.join(MODEL_DIR, "char_recognizer.pt")
    torch.save(model.state_dict(), model_path)
    print(f"Modelo entrenado guardado en: {model_path}")
    return model_path


  FONT_PATH = "font_spanish_license_plate\MESPREG.ttf"  # the font


In [11]:
generate_dataset()
trained_model_path = train_model()

Generating dataset...
Dataset done.


  FONT_PATH = "font_spanish_license_plate\MESPREG.ttf"  # the font


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x9216 and 1600x128)