In [None]:
# Entrenamiento del Modelo de Clasificación de Páginas de PDF

Este notebook entrena una red neuronal convolucional (CNN) para identificar las primeras páginas en documentos PDF.

In [None]:
# Importar librerías necesarias
import sys
import os
import json
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout

# Filesystem locations used throughout the notebook (relative to the working directory)
PROCESSED_DIR, LABELS_FILE, MODEL_DIR = (
    '../data/processed',    # directory of processed page images
    '../data/labels.json',  # labels file
    '../data/models',       # directory for model output
)

In [None]:
# Fail fast when required inputs are missing, and make sure the model
# output directory exists before any later cell writes to it.
print("Verificando archivos y directorios...")
# isfile/isdir (rather than exists) also reject a path of the wrong kind,
# e.g. a directory sitting where the labels file is expected.
if not os.path.isfile(LABELS_FILE):
    raise FileNotFoundError(f"No se encontró el archivo de etiquetas: {LABELS_FILE}")
if not os.path.isdir(PROCESSED_DIR):
    raise FileNotFoundError(f"No se encontró el directorio de imágenes procesadas: {PROCESSED_DIR}")
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(MODEL_DIR, exist_ok=True)
print("✓ Verificación completada")

In [4]:
# Notebook definition in nbformat-4 layout, written as a Python dict literal
# (it uses None, so it is not valid JSON as-is). The expression is not bound
# to a name: evaluating this cell only echoes the dict as the cell output.
# Each "source" list holds the lines of one cell of the described notebook.
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Entrenamiento del Modelo de Clasificación de Páginas de PDF\n",
    "\n",
    "Este notebook entrena un modelo de CNN para identificar las primeras páginas en documentos PDF."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Importar librerías necesarias\n",
    "import sys\n",
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "import cv2\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout\n",
    "\n",
    "# Configurar rutas\n",
    "PROCESSED_DIR = '../data/processed/2-TITULOS-15-DE-NOVIEMBRE-2024-1-30'\n",
    "LABELS_FILE = '../data/labels.json'\n",
    "MODEL_DIR = '../data/models'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Verificar existencia de archivos y directorios necesarios\n",
    "print(\"Verificando archivos y directorios...\")\n",
    "if not os.path.exists(LABELS_FILE):\n",
    "    raise FileNotFoundError(f\"No se encontró el archivo de etiquetas: {LABELS_FILE}\")\n",
    "if not os.path.exists(PROCESSED_DIR):\n",
    "    raise FileNotFoundError(f\"No se encontró el directorio de imágenes procesadas: {PROCESSED_DIR}\")\n",
    "os.makedirs(MODEL_DIR, exist_ok=True)\n",
    "print(\"✓ Verificación completada\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Cargar etiquetas\n",
    "print(\"Cargando etiquetas...\")\n",
    "with open(LABELS_FILE, 'r') as f:\n",
    "    labels = json.load(f)\n",
    "print(f\"✓ Se cargaron etiquetas para {len(labels)} PDFs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Preparar datos para entrenamiento\n",
    "print(\"Preparando datos para entrenamiento...\")\n",
    "X = []\n",
    "y = []\n",
    "\n",
    "for pdf_name, info in labels.items():\n",
    "    pdf_dir = os.path.join(PROCESSED_DIR, pdf_name)\n",
    "    if not os.path.exists(pdf_dir):\n",
    "        print(f\"Advertencia: No se encontró el directorio para {pdf_name}\")\n",
    "        continue\n",
    "        \n",
    "    total_pages = info['total_pages']\n",
    "    target_pages = info['target_pages']\n",
    "    \n",
    "    for page in range(total_pages):\n",
    "        img_path = os.path.join(pdf_dir, f'page_{page}.png')\n",
    "        if not os.path.exists(img_path):\n",
    "            print(f\"Advertencia: No se encontró la página {page} de {pdf_name}\")\n",
    "            continue\n",
    "            \n",
    "        # Cargar y preprocesar imagen\n",
    "        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)\n",
    # cv2.imread signals an unreadable/corrupt file by returning None instead
    # of raising; skip such pages so cv2.resize does not crash the whole loop.
    "        if img is None:\n",
    "            print(f\"Advertencia: No se pudo leer la página {page} de {pdf_name}\")\n",
    "            continue\n",
    "        img = cv2.resize(img, (224, 224))\n",
    "        img = img / 255.0\n",
    "        X.append(img[..., np.newaxis])\n",
    "        y.append(1 if page in target_pages else 0)\n",
    "\n",
    "X = np.array(X)\n",
    "y = np.array(y)\n",
    "\n",
    "print(f\"✓ Datos preparados:\")\n",
    "print(f\"  - Total de imágenes: {len(X)}\")\n",
    "print(f\"  - Primeras páginas: {sum(y)}\")\n",
    "print(f\"  - Otras páginas: {len(y) - sum(y)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Dividir datos en entrenamiento y validación\n",
    "print(\"Dividiendo datos en conjuntos de entrenamiento y validación...\")\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "print(f\"✓ Conjunto de entrenamiento: {len(X_train)} imágenes\")\n",
    "print(f\"✓ Conjunto de validación: {len(X_val)} imágenes\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Definir el modelo\n",
    "print(\"Construyendo modelo...\")\n",
    "model = Sequential([\n",
    "    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),\n",
    "    MaxPooling2D((2, 2)),\n",
    "    Conv2D(64, (3, 3), activation='relu'),\n",
    "    MaxPooling2D((2, 2)),\n",
    "    Conv2D(64, (3, 3), activation='relu'),\n",
    "    Flatten(),\n",
    "    Dense(64, activation='relu'),\n",
    "    Dropout(0.5),\n",
    "    Dense(1, activation='sigmoid')\n",
    "])\n",
    "\n",
    "model.compile(\n",
    "    optimizer='adam',\n",
    "    loss='binary_crossentropy',\n",
    "    metrics=['accuracy']\n",
    ")\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Entrenar modelo\n",
    "print(\"Iniciando entrenamiento...\")\n",
    "history = model.fit(\n",
    "    X_train, y_train,\n",
    "    epochs=20,\n",
    "    batch_size=32,\n",
    "    validation_data=(X_val, y_val),\n",
    "    callbacks=[\n",
    "        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),\n",
    "        tf.keras.callbacks.ModelCheckpoint(\n",
    "            os.path.join(MODEL_DIR, 'best_model.h5'),\n",
    "            save_best_only=True\n",
    "        )\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Visualizar resultados\n",
    "plt.figure(figsize=(12, 4))\n",
    "\n",
    "# Gráfico de pérdida\n",
    "plt.subplot(1, 2, 1)\n",
    "plt.plot(history.history['loss'], label='Training Loss')\n",
    "plt.plot(history.history['val_loss'], label='Validation Loss')\n",
    "plt.title('Model Loss')\n",
    "plt.xlabel('Epoch')\n",
    "plt.ylabel('Loss')\n",
    "plt.legend()\n",
    "\n",
    "# Gráfico de precisión\n",
    "plt.subplot(1, 2, 2)\n",
    "plt.plot(history.history['accuracy'], label='Training Accuracy')\n",
    "plt.plot(history.history['val_accuracy'], label='Validation Accuracy')\n",
    "plt.title('Model Accuracy')\n",
    "plt.xlabel('Epoch')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    # The figure must be written to disk BEFORE plt.show(): with the inline
    # notebook backend, show() renders and closes the current figure, so a
    # later savefig() would write an empty image.
    "# Guardar gráficos\n",
    "plt.savefig(os.path.join(MODEL_DIR, 'training_history.png'))\n",
    "print(\"✓ Gráficos guardados en\", os.path.join(MODEL_DIR, 'training_history.png'))\n",
    "\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# Entrenamiento del Modelo de Clasificación de Páginas de PDF\n',
    '\n',
    'Este notebook entrena un modelo de CNN para identificar las primeras páginas en documentos PDF.']},
  {'cell_type': 'code',
   'execution_count': None,
   'metadata': {},
   'source': ['# Importar librerías necesarias\n',
    'import sys\n',
    'import os\n',
    'import json\n',
    'import numpy as np\n',
    'import cv2\n',
    'import matplotlib.pyplot as plt\n',
    'from sklearn.model_selection import train_test_split\n',
    'import tensorflow as tf\n',
    'from tensorflow.keras.models import Sequential\n',
    'from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout\n',
    '\n',
    '# Configurar rutas\n',
    "PROCESSED_DIR = '../data/processed/2-TITULOS-15-DE-NOVIEMBRE-2024-1-30'\n",
    "LABELS_FILE = '../data/labels.json'\n",
    "MODEL_DIR = '../data/models'"]},
  {'cell_type': 'code',
   'execu