# Preparación de Datos con PySpark

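Este notebook prepara los datos a partir de documentos PDF: convierte cada PDF de `data/raw` en imágenes de página, preprocesa cada página (escala de grises, 224×224, valores normalizados a [0, 1]) y guarda el resultado en `data/processed` como un DataFrame de PySpark en formato Parquet.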

In [None]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
import numpy as np
from pdf2image import convert_from_path
import cv2

# Create (or reuse) the Spark session, requesting 4g of memory for both the
# driver and the executors.
spark = (
    SparkSession.builder
    .appName("PDFPreprocessing")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Input PDFs are read from RAW_DIR; every output is written under
# PROCESSED_DIR, which is created here if it does not exist yet.
RAW_DIR = 'data/raw'
PROCESSED_DIR = 'data/processed'
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [None]:
def preprocess_image(image, target_size=(224, 224)):
    """Turn an image into a normalized grayscale array for the model.

    Args:
        image: Image exposing a ``.shape`` attribute. An input with three
            dimensions is converted with ``cv2.COLOR_RGB2GRAY`` (assumes RGB
            channel order -- confirm against the caller); any other input is
            used as-is.
        target_size: Output size handed to ``cv2.resize``.

    Returns:
        The resized image as ``float32`` divided by 255.0, or ``None`` if any
        step raised (the error is printed, not propagated).
    """
    try:
        has_channel_axis = len(image.shape) == 3
        single_channel = (
            cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) if has_channel_axis else image
        )
        scaled = cv2.resize(single_channel, target_size)
        # Dividing by 255.0 assumes 8-bit pixel values -- TODO confirm.
        return scaled.astype(np.float32) / 255.0
    except Exception as e:
        # Best-effort: callers treat None as "no features for this page".
        print(f"Error preprocesando imagen: {str(e)}")
        return None

In [None]:
def process_pdf(pdf_path):
    """Render a PDF to page images and return one record per page.

    Each page is saved as ``page_<i>.png`` (zero-based index) inside
    ``PROCESSED_DIR/<pdf_name>/`` and then passed through
    ``preprocess_image`` to obtain its feature vector.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A list of dicts with the keys ``pdf_name``, ``page_number``, ``path``
        and ``features``. ``features`` is the flattened preprocessed image as
        a list, or ``None`` when preprocessing failed for that page. If any
        other step raises, the error is printed and an empty list is returned
        (records of pages handled before the failure are discarded).
    """
    try:
        # Render all pages of the PDF to images.
        pages = convert_from_path(pdf_path)

        # Strip only a trailing ".pdf" extension (any letter case).
        # str.replace('.pdf', '') would also delete ".pdf" occurring in the
        # middle of the file name, e.g. "a.pdf.v2.pdf" -> "a.v2".
        base_name = os.path.basename(pdf_path)
        stem, ext = os.path.splitext(base_name)
        pdf_name = stem if ext.lower() == '.pdf' else base_name

        # One output directory per PDF.
        pdf_dir = os.path.join(PROCESSED_DIR, pdf_name)
        os.makedirs(pdf_dir, exist_ok=True)

        page_info = []
        for i, page in enumerate(pages):
            # Persist the rendered page as a PNG.
            page_path = os.path.join(pdf_dir, f'page_{i}.png')
            page.save(page_path)

            # Convert to an array and preprocess; None signals a failure.
            processed = preprocess_image(np.array(page))
            features = processed.flatten().tolist() if processed is not None else None

            page_info.append({
                'pdf_name': pdf_name,
                'page_number': i,
                'path': page_path,
                'features': features,
            })

        return page_info
    except Exception as e:
        # Best-effort: a broken PDF must not abort the whole batch.
        print(f"Error procesando {pdf_path}: {str(e)}")
        return []

In [None]:
# DataFrame schema: one row per PDF page. Only "features" is nullable, because
# process_pdf stores None there when a page could not be preprocessed.
schema = (
    StructType()
    .add("pdf_name", StringType(), False)
    .add("page_number", IntegerType(), False)
    .add("path", StringType(), False)
    .add("features", ArrayType(FloatType()), True)
)

# Collect the PDFs found in RAW_DIR. Sorting makes the processing order
# reproducible (os.listdir returns entries in arbitrary order), and the
# case-insensitive match also picks up files named like "REPORT.PDF".
pdf_files = sorted(f for f in os.listdir(RAW_DIR) if f.lower().endswith('.pdf'))
all_pages = []

# Process every PDF on the driver and accumulate its per-page records.
# process_pdf returns an empty list for a PDF that failed, so a bad file
# contributes no rows instead of stopping the loop.
for pdf in pdf_files:
    pdf_path = os.path.join(RAW_DIR, pdf)
    print(f"Procesando {pdf}...")
    pages = process_pdf(pdf_path)
    all_pages.extend(pages)

# Build the DataFrame with the explicit schema.
df = spark.createDataFrame(all_pages, schema)

# Persist the processed pages as Parquet, replacing any previous output.
df.write.mode("overwrite").parquet(os.path.join(PROCESSED_DIR, "processed_pages.parquet"))

In [None]:
# Report processing statistics. The PDF total counts every file that was
# attempted, including any for which process_pdf returned no pages.
print(f"Total de PDFs procesados: {len(pdf_files)}")
page_total = df.count()
print(f"Total de páginas procesadas: {page_total}")
print("\nDistribución de páginas por PDF:")
pages_per_pdf = df.groupBy("pdf_name").count()
pages_per_pdf.show()

# Quality check: print the DataFrame's summary statistics.
print("\nVerificación de calidad:")
quality_summary = df.summary()
quality_summary.show()