In [1]:
import pandas as pd
import os
from datasets import DatasetDict, Dataset
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, pipeline
import evaluate
import gc
import torch

2025-08-01 16:26:59.186553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754065619.441425    3663 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754065619.509008    3663 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
from sklearn.model_selection import train_test_split

# Path configuration
base_path = "/kaggle/input/nombre-column/rows_Nombre/rows_Nombre/rows_Nombre - copia"

# 1. Collect every sample. Each numbered folder holds PNG files; the file name
#    without its extension is used as the text label of the image.
data = []
for folder_num in range(4, 30):  # folders 4 through 29
    folder_path = os.path.join(base_path, str(folder_num))

    # isdir (not just exists): a stray *file* named like a folder number must be
    # skipped too, otherwise os.listdir below would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        print(f"Advertencia: No existe la carpeta {folder_path}")
        continue

    # Process each image in the folder (sorted for a deterministic row order)
    for img_file in sorted(os.listdir(folder_path)):
        if img_file.endswith('.png'):
            img_path = os.path.join(folder_path, img_file)
            label = os.path.splitext(img_file)[0]  # strip the .png extension

            data.append({
                'folder': folder_num,
                'image_path': img_path,
                'label': label
            })

# Fail fast with a readable message: an empty list would produce a DataFrame
# without an 'image_path' column and the next step would die with a KeyError.
if not data:
    raise FileNotFoundError(f"No se encontraron imágenes .png en {base_path}")

# 2. Build the DataFrame (one row per image)
df = pd.DataFrame(data)

# Sanity check that every collected path is present on disk
df['exists'] = df['image_path'].apply(os.path.exists)
print(f"Imágenes encontradas: {df['exists'].sum()}/{len(df)}")
df = df[df['exists']].drop(columns=['exists'])

# 3. Train/test split, stratified by folder so each folder keeps the same
#    80/20 proportion in both splits. The fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['folder']
)

# 4. Crear datasets HuggingFace
def create_dataset(dataframe):
    """Turn a frame of image paths and labels into a Hugging Face ``Dataset``.

    Args:
        dataframe: table with an ``image_path`` column (files readable by PIL)
            and a ``label`` column (the transcription of each image).

    Returns:
        A ``Dataset`` with two columns: ``image`` (PIL images converted to RGB)
        and ``text`` (the labels, in the same row order as ``dataframe``).
    """
    rgb_images = []
    for image_path in dataframe['image_path']:
        # Every image is decoded eagerly and normalised to 3-channel RGB.
        rgb_images.append(Image.open(image_path).convert('RGB'))

    columns = {'image': rgb_images, 'text': list(dataframe['label'])}
    return Dataset.from_dict(columns)

# Build both splits with the same helper; 'train' is created before 'test'.
dataset = DatasetDict({
    split_name: create_dataset(split_frame)
    for split_name, split_frame in (('train', train_df), ('test', test_df))
})

# 5. Verification: report split sizes and show one label as a spot check
n_train = len(dataset['train'])
n_test = len(dataset['test'])
first_label = dataset['train'][0]['text']  # should be the file name without .png

print("\nResumen del dataset:")
print(f"Entrenamiento: {n_train} ejemplos")
print(f"Prueba: {n_test} ejemplos")
print("\nEjemplo del primer elemento:")
print(first_label)

Imágenes encontradas: 1300/1300

Resumen del dataset:
Entrenamiento: 1040 ejemplos
Prueba: 260 ejemplos

Ejemplo del primer elemento:
Ramona d. De Colombo Siani


In [3]:
# Load the TrOCR processor from the handwritten checkpoint. The rest of the
# notebook uses it to preprocess images and, through `processor.tokenizer`,
# to encode and decode the text labels.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

def prepare_examples(batch, max_label_length=64):
    """Convert a batch of raw examples into model inputs for training.

    Args:
        batch: mapping with an ``image`` entry (list of images) and a ``text``
            entry (list of label strings), as passed by ``Dataset.map`` with
            ``batched=True``.
        max_label_length: fixed length, in tokens, that every label sequence is
            padded or truncated to. Defaults to 64.

    Returns:
        The same ``batch`` mapping with two entries added: ``pixel_values``
        (output of the image processor) and ``labels`` (token ids with padding
        positions replaced by -100).
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # Image side: the processor returns the tensor consumed by the encoder.
    pixel_values = processor(batch['image'], return_tensors="pt").pixel_values

    # Text side: pad AND truncate to a fixed length. Without truncation a label
    # longer than max_label_length would yield a longer row than the others,
    # which breaks stacking the labels into a single tensor.
    token_ids = processor.tokenizer(
        batch['text'],
        padding="max_length",
        truncation=True,
        max_length=max_label_length,
    ).input_ids

    # Padding positions become -100 so the loss ignores them.
    labels = [
        [token if token != pad_token_id else -100 for token in row]
        for row in token_ids
    ]

    batch['pixel_values'] = pixel_values
    batch['labels'] = labels

    return batch

# Run the preprocessing over both splits, 8 examples per call
train_dataset = dataset['train'].map(
    prepare_examples,
    batched=True,
    batch_size=8,
)
eval_dataset = dataset['test'].map(
    prepare_examples,
    batched=True,
    batch_size=8,
)

# Expose only the model inputs, as PyTorch tensors
for prepared_split in (train_dataset, eval_dataset):
    prepared_split.set_format(type='torch', columns=['pixel_values', 'labels'])

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Map:   0%|          | 0/1040 [00:00<?, ? examples/s]

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

In [4]:
# Start from the stage-1 pre-trained TrOCR checkpoint and fine-tune from there.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

# Special-token ids and vocabulary size, taken from the tokenizer / decoder.
# These stay on `model.config`: the model uses them when it derives the decoder
# inputs from `labels` during training.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# Generation settings belong in `model.generation_config`. Putting them on
# `model.config` is deprecated, and `Seq2SeqTrainer` reads
# `model.generation_config` when `predict_with_generate=True`.
generation_config = model.generation_config
generation_config.decoder_start_token_id = processor.tokenizer.cls_token_id
generation_config.pad_token_id = processor.tokenizer.pad_token_id
generation_config.eos_token_id = processor.tokenizer.sep_token_id
generation_config.max_length = 64  # same length the labels are padded to
generation_config.early_stopping = True
generation_config.no_repeat_ngram_size = 3
generation_config.length_penalty = 2.0
generation_config.num_beams = 4


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
cer_metric = evaluate.load('cer')


def compute_cer(pred):
    """Compute the character error rate (CER) for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids, or a tuple
            whose first item is those ids) and ``label_ids`` (reference token
            ids where ignored positions are marked with -100).
            NOTE(review): assumes both are NumPy arrays, as the Trainer passes
            them to ``compute_metrics`` — confirm if called from elsewhere.

    Returns:
        ``{"cer": value}`` with the metric computed over the decoded strings.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some configurations return extra outputs next to the token ids.
        pred_ids = pred_ids[0]

    # Work on copies so the arrays owned by the caller are not modified.
    pred_ids = pred_ids.copy()
    labels_ids = pred.label_ids.copy()

    # -100 is not a valid token id and cannot be decoded. It marks ignored
    # label positions and may also appear in predictions that were padded while
    # batches were gathered, so both arrays are mapped back to the pad token.
    pred_ids[pred_ids == -100] = pad_token_id
    labels_ids[labels_ids == -100] = pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-trained",
    # Batching: 8 samples per step, gradients accumulated over 2 steps.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=15,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    # Optimisation schedule: cosine decay after a 10% warm-up.
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Evaluate, log and checkpoint once per epoch. The step-based intervals
    # (`eval_steps`, `logging_steps`) only apply to the "steps" strategies, so
    # they are not set here.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Keep the checkpoint with the lowest CER and reload it when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    # Evaluation runs generate() so that CER is computed on decoded text.
    predict_with_generate=True,
    report_to="none",
)

# Wire the model, data splits, metric and arguments into the trainer.
# The image processor is passed as the processing class.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_cer,
    processing_class=processor.image_processor,
)

In [None]:
# Reclaim unreferenced Python objects and release cached CUDA memory
# before the training run starts.
gc.collect()
torch.cuda.empty_cache()

# Launch fine-tuning; the value returned by `trainer.train()` is kept in `res`.
res = trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss,Cer
1,2.8467,5.042746,0.62664
2,1.9468,4.518231,0.567962




In [None]:
# Persist the fine-tuned model and the processor side by side in one
# directory, so the folder can be loaded on its own later.
final_model_dir = "trocr-trained-custom"
trainer.save_model(final_model_dir)
processor.save_pretrained(final_model_dir)

In [None]:
images_dir = "/kaggle/input/image-test/"

# Load the fine-tuned model from the directory written by the save step
ocr = pipeline("image-to-text", model="./trocr-trained-custom")

# Try it on a single held-out image, converted to RGB like the training data
test_image_path = os.path.join(images_dir, "image_test_3.png")
test_image = Image.open(test_image_path).convert("RGB")

prediction = ocr(test_image)
generated_text = prediction[0]['generated_text']
print(f"Predicción: {generated_text}")