In [None]:
!pip install \
    transformers==4.34.0 \
    datasets==2.14.6 \
    evaluate==0.4.1 \
    pytorch-lightning==2.0.9 \
    torch==2.0.1 \
    torchvision==0.15.2 \
    jiwer==3.0.3 \
    pandas==2.1.0 \
    numpy==1.24.4 \
    -q

In [38]:
# Cargar el CSV
csv_path = "/kaggle/input/hwr-test/output_labels.csv"
images_dir = "/kaggle/input/hwr-test/output_lines"

# Leer el CSV
df = pd.read_csv(csv_path)

# Verificar que las imágenes existen
def check_image_exists(row):
    return os.path.exists(os.path.join(images_dir, row['filename']))

df = df[df.apply(check_image_exists, axis=1)]


# Dividir en train/test (ajusta según necesites)
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Crear datasets HuggingFace
def create_dataset(dataframe):
    dataset = Dataset.from_dict({
        'image': [Image.open(os.path.join(images_dir, fname)).convert('RGB') for fname in dataframe['filename']],
        'text': list(dataframe['label'])
    })
    return dataset

dataset = DatasetDict({
    'train': create_dataset(train_df),
    'test': create_dataset(test_df)
})

In [39]:
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

def prepare_examples(batch):
    images = batch['image']
    texts = batch['text']
    
    # Procesar imágenes
    pixel_values = processor(images, return_tensors="pt").pixel_values
    
    # Procesar textos
    labels = processor.tokenizer(texts, padding="max_length", max_length=64).input_ids
    
    # Reemplazar padding token id por -100 para ignorar en loss
    labels = [[label if label != processor.tokenizer.pad_token_id else -100 for label in labels_example] for labels_example in labels]
    
    batch['pixel_values'] = pixel_values
    batch['labels'] = labels
    
    return batch

# Aplicar el preprocesamiento
train_dataset = dataset['train'].map(prepare_examples, batched=True, batch_size=8)
eval_dataset = dataset['test'].map(prepare_examples, batched=True, batch_size=8)

# Configurar formato para PyTorch
train_dataset.set_format(type='torch', columns=['pixel_values', 'labels'])
eval_dataset.set_format(type='torch', columns=['pixel_values', 'labels'])

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [40]:
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

# Configurar parámetros del modelo
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# Configuración para generación
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate

cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    # Agrega esto a tus compute_metrics
    wer_metric = evaluate.load("wer")
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    
    return {"cer": cer, "wer": wer}
    
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-trained",
    per_device_train_batch_size=4,  # Aumentar si tu GPU lo permite
    per_device_eval_batch_size=4,
    num_train_epochs=10,  # Más épocas
    fp16=True,
    learning_rate=3e-5,  # Tasa de aprendizaje más baja
    eval_strategy="steps",
    eval_steps=200,  # Evaluar con menos frecuencia
    save_steps=500,
    logging_steps=50,
    warmup_steps=100,  # Añadir warmup
    weight_decay=0.01,  # Regularización
    save_total_limit=2,
    predict_with_generate=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

  trainer = Seq2SeqTrainer(


In [88]:
import gc
gc.collect()
torch.cuda.empty_cache()

# Iniciar entrenamiento
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=50, training_loss=0.23392311096191407, metrics={'train_runtime': 64.8758, 'train_samples_per_second': 6.166, 'train_steps_per_second': 0.771, 'total_flos': 3.539581802643456e+17, 'train_loss': 0.23392311096191407, 'epoch': 10.0})

In [89]:
# Guardar el modelo entrenado
trainer.save_model("trocr-trained-custom")
processor.save_pretrained("trocr-trained-custom")

[]

In [93]:
from transformers import pipeline

images_dir = "/kaggle/input/image-test/"

# Cargar el modelo entrenado
ocr = pipeline("image-to-text", model="./trocr-trained-custom")

# Probar con una imagen de test
test_image = Image.open(os.path.join(images_dir, "image_test_3.png")).convert("RGB")
prediction = ocr(test_image)
print(f"Predicción: {prediction[0]['generated_text']}")

Device set to use cuda:0


Predicción: Bla, Gili Vide
