In [None]:
# Paso 1: Guardar los Resultados de las Predicciones

import os
import sys
import ctypes
import cv2
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.feature import StandardScalerModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def check_hadoop_libraries():
    hadoop_home = os.environ.get('HADOOP_HOME')
    if not hadoop_home:
        print("Error: HADOOP_HOME no está configurado.")
        return False

    # Verificar la existencia de winutils.exe
    winutils_path = os.path.join(hadoop_home, 'bin', 'winutils.exe')
    if not os.path.exists(winutils_path):
        print(f"Error: {winutils_path} no existe.")
        return False

    # Verificar la existencia de hadoop.dll
    hadoop_dll_path = os.path.join(hadoop_home, 'bin', 'hadoop.dll')
    if not os.path.exists(hadoop_dll_path):
        print(f"Error: {hadoop_dll_path} no existe.")
        return False

    # Intentar cargar hadoop.dll
    try:
        ctypes.cdll.LoadLibrary(hadoop_dll_path)
        print("Biblioteca nativa de Hadoop cargada correctamente.")
    except OSError as e:
        print(f"Error: No se pudo cargar la biblioteca nativa de Hadoop: {e}")
        return False

    return True

# Verificar las bibliotecas necesarias de Hadoop antes de ejecutar el script
if not check_hadoop_libraries():
    sys.exit(1)

# Configuración del entorno
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Configurar las bibliotecas nativas de Hadoop
os.environ['HADOOP_HOME'] = 'C:\\Program Files\\winutils'
os.environ['JAVA_HOME'] = 'C:\\Program Files\\Java\\jdk-21'
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['HADOOP_HOME'], 'bin')
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['JAVA_HOME'], 'bin')

# Inicializar Spark
spark = SparkSession.builder \
    .appName("PDFFirstPagePredictor") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.executor.extraJavaOptions", "-Djava.library.path=" + os.path.join(os.environ['HADOOP_HOME'], 'bin')) \
    .getOrCreate()

def calculate_metrics(predicted_pages, target_pages):
    true_positives = len(set(predicted_pages) & set(target_pages))
    false_positives = len(set(predicted_pages) - set(target_pages))
    false_negatives = len(set(target_pages) - set(predicted_pages))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

def predict_first_pages(pdf_dir, model_path, scaler_path, target_pages, output_file):
    # Verificar si el modelo y el escalador existen
    if not os.path.exists(model_path):
        print(f"Error: El modelo no existe en la ruta {model_path}")
        return
    if not os.path.exists(scaler_path):
        print(f"Error: El escalador no existe en la ruta {scaler_path}")
        return

    # Cargar el modelo y el escalador
    lr_model = LogisticRegressionModel.load(model_path)
    scaler_model = StandardScalerModel.load(scaler_path)

    # Procesar las páginas del PDF
    image_data = []
    for page_num in range(len(os.listdir(pdf_dir))):
        image_path = os.path.join(pdf_dir, f'page_{page_num}.png')
        if os.path.exists(image_path):
            # Leer la imagen y convertirla a un vector
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is not None:
                resized = cv2.resize(image, (224, 224))
                normalized = resized.astype(np.float32) / 255.0
                features = normalized.flatten().tolist()
                image_data.append((image_path, features))

    # Crear DataFrame
    schema = StructType([
        StructField("path", StringType(), False),
        StructField("features", ArrayType(FloatType()), True)
    ])
    df = spark.createDataFrame(image_data, schema)

    # Convertir features a vector
    array_to_vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())
    df = df.withColumn("features_vector", array_to_vector_udf("features"))

    # Escalar características
    df_scaled = scaler_model.transform(df)

    # Realizar predicciones
    predictions = lr_model.transform(df_scaled)

    # Filtrar las primeras páginas predichas
    first_pages = predictions.filter(col("prediction") == 1).select("path").collect()

    # Guardar resultados en un archivo de texto
    with open(output_file, 'w') as f:
        for row in first_pages:
            f.write(row["path"] + '\n')

    # Mostrar resultados
    print("Páginas predichas como primeras páginas:")
    for row in first_pages:
        print(row["path"])

    # Calcular precisión
    predicted_pages = [int(row["path"].split('_')[-1].split('.')[0]) for row in first_pages]
    precision, recall, f1_score = calculate_metrics(predicted_pages, target_pages)
    print(f"Precisión: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1_score:.4f}")

# Ejemplo de uso
pdf_dir = 'data/pdf_pages'
model_path = 'data/models/logistic_regression_model'
scaler_path = 'data/processed/scaler_model'
target_pages = [
      0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 114, 124,
      134, 140, 146, 152, 162, 170, 178, 184, 190, 198, 206, 214, 222, 244,
      254, 264, 274, 284, 290, 296, 306, 312, 322, 332, 346, 356, 360
    ]
output_file = 'predicted_first_pages.txt'
predict_first_pages(pdf_dir, model_path, scaler_path, target_pages, output_file)

spark.stop()

Biblioteca nativa de Hadoop cargada correctamente.
Páginas predichas como primeras páginas:
data/pdf_pages\page_0.png
data/pdf_pages\page_10.png
data/pdf_pages\page_20.png
data/pdf_pages\page_30.png
data/pdf_pages\page_40.png
data/pdf_pages\page_50.png
data/pdf_pages\page_60.png
data/pdf_pages\page_70.png
data/pdf_pages\page_80.png
data/pdf_pages\page_90.png
data/pdf_pages\page_100.png
data/pdf_pages\page_110.png
data/pdf_pages\page_114.png
data/pdf_pages\page_116.png
data/pdf_pages\page_124.png
data/pdf_pages\page_134.png
data/pdf_pages\page_140.png
data/pdf_pages\page_146.png
data/pdf_pages\page_152.png
data/pdf_pages\page_162.png
data/pdf_pages\page_170.png
data/pdf_pages\page_178.png
data/pdf_pages\page_184.png
data/pdf_pages\page_190.png
data/pdf_pages\page_198.png
data/pdf_pages\page_206.png
data/pdf_pages\page_214.png
data/pdf_pages\page_222.png
data/pdf_pages\page_244.png
data/pdf_pages\page_254.png
data/pdf_pages\page_264.png
data/pdf_pages\page_274.png
data/pdf_pages\page_284

In [1]:
import os
import sys
import ctypes
import cv2
import json
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.feature import StandardScalerModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def check_hadoop_libraries():
    hadoop_home = os.environ.get('HADOOP_HOME')
    if not hadoop_home:
        print("Error: HADOOP_HOME no está configurado.")
        return False

    # Verificar la existencia de winutils.exe
    winutils_path = os.path.join(hadoop_home, 'bin', 'winutils.exe')
    if not os.path.exists(winutils_path):
        print(f"Error: {winutils_path} no existe.")
        return False

    # Verificar la existencia de hadoop.dll
    hadoop_dll_path = os.path.join(hadoop_home, 'bin', 'hadoop.dll')
    if not os.path.exists(hadoop_dll_path):
        print(f"Error: {hadoop_dll_path} no existe.")
        return False

    # Intentar cargar hadoop.dll
    try:
        ctypes.cdll.LoadLibrary(hadoop_dll_path)
        print("Biblioteca nativa de Hadoop cargada correctamente.")
    except OSError as e:
        print(f"Error: No se pudo cargar la biblioteca nativa de Hadoop: {e}")
        return False

    return True

# Verificar las bibliotecas necesarias de Hadoop antes de ejecutar el script
if not check_hadoop_libraries():
    sys.exit(1)

# Configuración del entorno
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Configurar las bibliotecas nativas de Hadoop
os.environ['HADOOP_HOME'] = 'C:\\Program Files\\winutils'
os.environ['JAVA_HOME'] = 'C:\\Program Files\\Java\\jdk-21'
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['HADOOP_HOME'], 'bin')
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['JAVA_HOME'], 'bin')

# Inicializar Spark
spark = SparkSession.builder \
    .appName("PDFFirstPagePredictor") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.executor.extraJavaOptions", "-Djava.library.path=" + os.path.join(os.environ['HADOOP_HOME'], 'bin')) \
    .getOrCreate()

def load_target_pages(labels_path, document_name):
    with open(labels_path, 'r') as f:
        labels = json.load(f)
    return labels[document_name]["target_pages"]

def calculate_metrics(predicted_pages, target_pages):
    true_positives = len(set(predicted_pages) & set(target_pages))
    false_positives = len(set(predicted_pages) - set(target_pages))
    false_negatives = len(set(target_pages) - set(predicted_pages))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

def predict_first_pages(pdf_dir, model_path, scaler_path, target_pages, output_file):
    # Verificar si el modelo y el escalador existen
    if not os.path.exists(model_path):
        print(f"Error: El modelo no existe en la ruta {model_path}")
        return
    if not os.path.exists(scaler_path):
        print(f"Error: El escalador no existe en la ruta {scaler_path}")
        return

    # Cargar el modelo y el escalador
    lr_model = LogisticRegressionModel.load(model_path)
    scaler_model = StandardScalerModel.load(scaler_path)

    # Procesar las páginas del PDF
    image_data = []
    for page_num in range(len(os.listdir(pdf_dir))):
        image_path = os.path.join(pdf_dir, f'page_{page_num}.png')
        if os.path.exists(image_path):
            # Leer la imagen y convertirla a un vector
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is not None:
                resized = cv2.resize(image, (224, 224))
                normalized = resized.astype(np.float32) / 255.0
                features = normalized.flatten().tolist()
                label = 1 if page_num in target_pages else 0
                image_data.append((image_path, label, features))

    # Crear DataFrame
    schema = StructType([
        StructField("path", StringType(), False),
        StructField("label", IntegerType(), False),
        StructField("features", ArrayType(FloatType()), True)
    ])
    df = spark.createDataFrame(image_data, schema)

    # Convertir features a vector
    array_to_vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())
    df = df.withColumn("features_vector", array_to_vector_udf("features"))

    # Escalar características
    df_scaled = scaler_model.transform(df)

    # Realizar predicciones
    predictions = lr_model.transform(df_scaled)

    # Filtrar las primeras páginas predichas
    first_pages = predictions.filter(col("prediction") == 1).select("path").collect()

    # Guardar resultados en un archivo de texto
    with open(output_file, 'w') as f:
        for row in first_pages:
            f.write(row["path"] + '\n')

    # Mostrar resultados
    print("Páginas predichas como primeras páginas:")
    for row in first_pages:
        print(row["path"])

    # Calcular precisión
    predicted_pages = [int(row["path"].split('_')[-1].split('.')[0]) for row in first_pages]
    precision, recall, f1_score = calculate_metrics(predicted_pages, target_pages)
    print(f"Precisión: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1_score:.4f}")

    return first_pages, df_scaled

# Ejemplo de uso
pdf_dir = 'data/pdf_pages'
model_path = 'data/models/logistic_regression_model'
scaler_path = 'data/processed/scaler_model'
labels_path = 'data/labels.json'
document_name = "2-TITULOS-15-DE-NOVIEMBRE-2024"
target_pages = load_target_pages(labels_path, document_name)
output_file = 'predicted_first_pages.txt'
first_pages, df_scaled = predict_first_pages(pdf_dir, model_path, scaler_path, target_pages, output_file)

# Realizar Tuning del Modelo
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Definir el modelo de regresión logística
lr = LogisticRegression(featuresCol="scaled_features", labelCol="label")

# Definir el evaluador
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# Definir la cuadrícula de hiperparámetros
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.maxIter, [10, 20, 50]) \
    .build()

# Definir el validador cruzado
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Preparar datos para entrenamiento
final_df = df_scaled.select("label", "scaled_features")
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

# Entrenar el modelo con validación cruzada
cvModel = crossval.fit(train_df)

# Evaluar el modelo
predictions = cvModel.transform(test_df)
roc_auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {roc_auc:.4f}")

# Guardar el mejor modelo
best_model = cvModel.bestModel
best_model.write().overwrite().save(os.path.join('data/models', "best_logistic_regression_model"))

spark.stop()

Biblioteca nativa de Hadoop cargada correctamente.
Páginas predichas como primeras páginas:
data/pdf_pages\page_0.png
data/pdf_pages\page_10.png
data/pdf_pages\page_20.png
data/pdf_pages\page_30.png
data/pdf_pages\page_40.png
data/pdf_pages\page_50.png
data/pdf_pages\page_60.png
data/pdf_pages\page_70.png
data/pdf_pages\page_80.png
data/pdf_pages\page_90.png
data/pdf_pages\page_100.png
data/pdf_pages\page_110.png
data/pdf_pages\page_114.png
data/pdf_pages\page_116.png
data/pdf_pages\page_124.png
data/pdf_pages\page_134.png
data/pdf_pages\page_140.png
data/pdf_pages\page_146.png
data/pdf_pages\page_152.png
data/pdf_pages\page_162.png
data/pdf_pages\page_170.png
data/pdf_pages\page_178.png
data/pdf_pages\page_184.png
data/pdf_pages\page_190.png
data/pdf_pages\page_198.png
data/pdf_pages\page_206.png
data/pdf_pages\page_214.png
data/pdf_pages\page_222.png
data/pdf_pages\page_244.png
data/pdf_pages\page_254.png
data/pdf_pages\page_264.png
data/pdf_pages\page_274.png
data/pdf_pages\page_284

Py4JJavaError: An error occurred while calling o204.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 6402.0 failed 1 times, most recent failure: Lost task 7.0 in stage 6402.0 (TID 62071) (Loboreapper executor driver): java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:318)
	at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:346)
	at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:796)
	at java.base/java.net.Socket$SocketInputStream.read(Socket.java:1099)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:291)
	at java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:347)
	at java.base/java.io.BufferedInputStream.implRead(BufferedInputStream.java:420)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:399)
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:208)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1160)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1213)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:320)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:187)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:167)
	at org.apache.spark.rdd.OrderedRDDFunctions.$anonfun$sortByKey$1(OrderedRDDFunctions.scala:64)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.OrderedRDDFunctions.sortByKey(OrderedRDDFunctions.scala:63)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.x$3$lzycompute(BinaryClassificationMetrics.scala:192)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.x$3(BinaryClassificationMetrics.scala:181)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.confusions$lzycompute(BinaryClassificationMetrics.scala:183)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.confusions(BinaryClassificationMetrics.scala:183)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.createCurve(BinaryClassificationMetrics.scala:275)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.roc(BinaryClassificationMetrics.scala:106)
	at org.apache.spark.mllib.evaluation.BinaryClassificationMetrics.areaUnderROC(BinaryClassificationMetrics.scala:126)
	at org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate(BinaryClassificationEvaluator.scala:101)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:318)
	at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:346)
	at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:796)
	at java.base/java.net.Socket$SocketInputStream.read(Socket.java:1099)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:291)
	at java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:347)
	at java.base/java.io.BufferedInputStream.implRead(BufferedInputStream.java:420)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:399)
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:208)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1160)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1213)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)


In [None]:
# Paso 3: Realizar Tuning del Modelo

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Definir el evaluador
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# Definir la cuadrícula de hiperparámetros
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.maxIter, [10, 20, 50]) \
    .build()

# Definir el validador cruzado
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Entrenar el modelo con validación cruzada
cvModel = crossval.fit(train_df)

# Evaluar el modelo
predictions = cvModel.transform(test_df)
roc_auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {roc_auc:.4f}")

# Guardar el mejor modelo
best_model = cvModel.bestModel
best_model.write().overwrite().save(os.path.join(MODEL_DIR, "best_logistic_regression_model"))

In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import os
import sys
import ctypes
import cv2
import json
import numpy as np
import PyPDF2
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.feature import StandardScalerModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Configuración del Entorno y Carga de Bibliotecas
def check_hadoop_libraries():
    hadoop_home = os.environ.get('HADOOP_HOME')
    if not hadoop_home:
        print("Error: HADOOP_HOME no está configurado.")
        return False

    winutils_path = os.path.join(hadoop_home, 'bin', 'winutils.exe')
    if not os.path.exists(winutils_path):
        print(f"Error: {winutils_path} no existe.")
        return False

    hadoop_dll_path = os.path.join(hadoop_home, 'bin', 'hadoop.dll')
    if not os.path.exists(hadoop_dll_path):
        print(f"Error: {hadoop_dll_path} no existe.")
        return False

    try:
        ctypes.cdll.LoadLibrary(hadoop_dll_path)
        print("Biblioteca nativa de Hadoop cargada correctamente.")
    except OSError as e:
        print(f"Error: No se pudo cargar la biblioteca nativa de Hadoop: {e}")
        return False

    return True

if not check_hadoop_libraries():
    sys.exit(1)

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['HADOOP_HOME'] = 'C:\\Program Files\\winutils'
os.environ['JAVA_HOME'] = 'C:\\Program Files\\Java\\jdk-21'
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['HADOOP_HOME'], 'bin')
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['JAVA_HOME'], 'bin')

spark = SparkSession.builder \
    .appName("PDFFirstPagePredictor") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.executor.extraJavaOptions", "-Djava.library.path=" + os.path.join(os.environ['HADOOP_HOME'], 'bin')) \
    .getOrCreate()

# Definición de Funciones Auxiliares
def load_target_pages(labels_path, document_name):
    with open(labels_path, 'r') as f:
        labels = json.load(f)
    return labels[document_name]["target_pages"]

def calculate_metrics(predicted_pages, target_pages):
    true_positives = len(set(predicted_pages) & set(target_pages))
    false_positives = len(set(predicted_pages) - set(target_pages))
    false_negatives = len(set(target_pages) - set(predicted_pages))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

def predict_first_pages(pdf_dir, model_path, scaler_path, target_pages, output_file):
    if not os.path.exists(model_path):
        print(f"Error: El modelo no existe en la ruta {model_path}")
        return
    if not os.path.exists(scaler_path):
        print(f"Error: El escalador no existe en la ruta {scaler_path}")
        return

    lr_model = LogisticRegressionModel.load(model_path)
    scaler_model = StandardScalerModel.load(scaler_path)

    image_data = []
    for page_num in range(len(os.listdir(pdf_dir))):
        image_path = os.path.join(pdf_dir, f'page_{page_num}.png')
        if os.path.exists(image_path):
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is not None:
                resized = cv2.resize(image, (224, 224))
                normalized = resized.astype(np.float32) / 255.0
                features = normalized.flatten().tolist()
                label = 1 if page_num in target_pages else 0
                image_data.append((image_path, label, features))

    schema = StructType([
        StructField("path", StringType(), False),
        StructField("label", IntegerType(), False),
        StructField("features", ArrayType(FloatType()), True)
    ])
    df = spark.createDataFrame(image_data, schema)

    array_to_vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())
    df = df.withColumn("features_vector", array_to_vector_udf("features"))

    df_scaled = scaler_model.transform(df)

    predictions = lr_model.transform(df_scaled)

    first_pages = predictions.filter(col("prediction") == 1).select("path").collect()

    with open(output_file, 'w') as f:
        for row in first_pages:
            f.write(row["path"] + '\n')

    print("Páginas predichas como primeras páginas:")
    for row in first_pages:
        print(row["path"])

    predicted_pages = [int(row["path"].split('_')[-1].split('.')[0]) for row in first_pages]
    precision, recall, f1_score = calculate_metrics(predicted_pages, target_pages)
    print(f"Precisión: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1_score:.4f}")

    return predicted_pages, df_scaled

def split_pdf(input_pdf_path, output_dir, first_pages):
    os.makedirs(output_dir, exist_ok=True)

    with open(input_pdf_path, 'rb') as input_pdf_file:
        pdf_reader = PyPDF2.PdfReader(input_pdf_file)
        total_pages = len(pdf_reader.pages)

        first_pages.append(total_pages)

        for i in range(len(first_pages) - 1):
            start_page = first_pages[i]
            end_page = first_pages[i + 1]

            pdf_writer = PyPDF2.PdfWriter()
            for page_num in range(start_page, end_page):
                pdf_writer.add_page(pdf_reader.pages[page_num])

            output_pdf_path = os.path.join(output_dir, f'segment_{i + 1}.pdf')
            with open(output_pdf_path, 'wb') as output_pdf_file:
                pdf_writer.write(output_pdf_file)

            print(f'Segmento {i + 1} guardado en {output_pdf_path}')

# Ejecución del Modelo y Predicción de Primeras Páginas
pdf_dir = 'data/pdf_pages'
model_path = 'data/models/logistic_regression_model'
scaler_path = 'data/processed/scaler_model'
labels_path = 'data/labels.json'
document_name = "2-TITULOS-15-DE-NOVIEMBRE-2024"
target_pages = load_target_pages(labels_path, document_name)
output_file = 'predicted_first_pages.txt'
predicted_pages, df_scaled = predict_first_pages(pdf_dir, model_path, scaler_path, target_pages, output_file)

# Corte del PDF Basado en las Predicciones
input_pdf_path = 'data/raw/2-TITULOS-15-DE-NOVIEMBRE-2024.pdf'  # Asegúrate de que esta ruta sea correcta y el archivo exista
output_dir = 'data/output'
split_pdf(input_pdf_path, output_dir, predicted_pages)

# Realizar Tuning del Modelo
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol="scaled_features", labelCol="label")

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.maxIter, [10, 20, 50]) \
    .build()

crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

final_df = df_scaled.select("label", "scaled_features")
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

cvModel = crossval.fit(train_df)

predictions = cvModel.transform(test_df)
roc_auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {roc_auc:.4f}")

best_model = cvModel.bestModel
best_model.write().overwrite().save(os.path.join('data/models', "best_logistic_regression_model"))

spark.stop()

Biblioteca nativa de Hadoop cargada correctamente.
Páginas predichas como primeras páginas:
data/pdf_pages\page_0.png
data/pdf_pages\page_10.png
data/pdf_pages\page_20.png
data/pdf_pages\page_30.png
data/pdf_pages\page_40.png
data/pdf_pages\page_50.png
data/pdf_pages\page_60.png
data/pdf_pages\page_70.png
data/pdf_pages\page_80.png
data/pdf_pages\page_90.png
data/pdf_pages\page_100.png
data/pdf_pages\page_110.png
data/pdf_pages\page_114.png
data/pdf_pages\page_116.png
data/pdf_pages\page_124.png
data/pdf_pages\page_134.png
data/pdf_pages\page_140.png
data/pdf_pages\page_146.png
data/pdf_pages\page_152.png
data/pdf_pages\page_162.png
data/pdf_pages\page_170.png
data/pdf_pages\page_178.png
data/pdf_pages\page_184.png
data/pdf_pages\page_190.png
data/pdf_pages\page_198.png
data/pdf_pages\page_206.png
data/pdf_pages\page_214.png
data/pdf_pages\page_222.png
data/pdf_pages\page_244.png
data/pdf_pages\page_254.png
data/pdf_pages\page_264.png
data/pdf_pages\page_274.png
data/pdf_pages\page_284

Py4JJavaError: An error occurred while calling o20503.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 6524.0 failed 1 times, most recent failure: Lost task 13.0 in stage 6524.0 (TID 63201) (Loboreapper executor driver): java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:318)
	at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:346)
	at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:796)
	at java.base/java.net.Socket$SocketInputStream.read(Socket.java:1099)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:291)
	at java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:347)
	at java.base/java.io.BufferedInputStream.implRead(BufferedInputStream.java:420)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:399)
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:208)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1160)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1213)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2488)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1202)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1196)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1289)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1256)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1242)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1242)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:61)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:47)
	at breeze.optimize.CachedDiffFunction.calculate(CachedDiffFunction.scala:24)
	at breeze.optimize.FirstOrderMinimizer.calculateObjective(FirstOrderMinimizer.scala:53)
	at breeze.optimize.FirstOrderMinimizer.initialState(FirstOrderMinimizer.scala:47)
	at breeze.optimize.FirstOrderMinimizer.iterations(FirstOrderMinimizer.scala:99)
	at org.apache.spark.ml.classification.LogisticRegression.trainImpl(LogisticRegression.scala:1005)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:634)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:497)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:287)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:318)
	at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:346)
	at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:796)
	at java.base/java.net.Socket$SocketInputStream.read(Socket.java:1099)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:291)
	at java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:347)
	at java.base/java.io.BufferedInputStream.implRead(BufferedInputStream.java:420)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:399)
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:208)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1160)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1176)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1213)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)


In [1]:
import os
import sys
import ctypes
import cv2
import numpy as np
import PyPDF2
from pdf2image import convert_from_path
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.feature import StandardScalerModel

# Configuración del Entorno y Carga de Bibliotecas
def check_hadoop_libraries():
    hadoop_home = os.environ.get('HADOOP_HOME')
    if not hadoop_home:
        print("Error: HADOOP_HOME no está configurado.")
        return False

    winutils_path = os.path.join(hadoop_home, 'bin', 'winutils.exe')
    if not os.path.exists(winutils_path):
        print(f"Error: {winutils_path} no existe.")
        return False

    hadoop_dll_path = os.path.join(hadoop_home, 'bin', 'hadoop.dll')
    if not os.path.exists(hadoop_dll_path):
        print(f"Error: {hadoop_dll_path} no existe.")
        return False

    try:
        ctypes.cdll.LoadLibrary(hadoop_dll_path)
        print("Biblioteca nativa de Hadoop cargada correctamente.")
    except OSError as e:
        print(f"Error: No se pudo cargar la biblioteca nativa de Hadoop: {e}")
        return False

    return True

if not check_hadoop_libraries():
    sys.exit(1)

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['HADOOP_HOME'] = 'C:\\Program Files\\winutils'
os.environ['JAVA_HOME'] = 'C:\\Program Files\\Java\\jdk-21'
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['HADOOP_HOME'], 'bin')
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['JAVA_HOME'], 'bin')

spark = SparkSession.builder \
    .appName("PDFFirstPagePredictor") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.executor.extraJavaOptions", "-Djava.library.path=" + os.path.join(os.environ['HADOOP_HOME'], 'bin')) \
    .getOrCreate()

# Definición de Funciones Auxiliares
def convert_pdf_to_images(pdf_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    images = convert_from_path(pdf_path)
    for i, image in enumerate(images):
        image_path = os.path.join(output_dir, f'page_{i}.png')
        image.save(image_path, 'PNG')
    return len(images)

def predict_first_pages(pdf_dir, model_path, scaler_path, output_file):
    if not os.path.exists(model_path):
        print(f"Error: El modelo no existe en la ruta {model_path}")
        return
    if not os.path.exists(scaler_path):
        print(f"Error: El escalador no existe en la ruta {scaler_path}")
        return

    lr_model = LogisticRegressionModel.load(model_path)
    scaler_model = StandardScalerModel.load(scaler_path)

    image_data = []
    for page_num in range(len(os.listdir(pdf_dir))):
        image_path = os.path.join(pdf_dir, f'page_{page_num}.png')
        if os.path.exists(image_path):
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is not None:
                resized = cv2.resize(image, (224, 224))
                normalized = resized.astype(np.float32) / 255.0
                features = normalized.flatten().tolist()
                image_data.append((image_path, 0, features))  # Label is not used

    schema = StructType([
        StructField("path", StringType(), False),
        StructField("label", IntegerType(), False),
        StructField("features", ArrayType(FloatType()), True)
    ])
    df = spark.createDataFrame(image_data, schema)

    array_to_vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())
    df = df.withColumn("features_vector", array_to_vector_udf("features"))

    df_scaled = scaler_model.transform(df)

    predictions = lr_model.transform(df_scaled)

    first_pages = predictions.filter(col("prediction") == 1).select("path").collect()

    with open(output_file, 'w') as f:
        for row in first_pages:
            f.write(row["path"] + '\n')

    print("Páginas predichas como primeras páginas:")
    for row in first_pages:
        print(row["path"])

    predicted_pages = [int(row["path"].split('_')[-1].split('.')[0]) for row in first_pages]
    return predicted_pages

def split_pdf(input_pdf_path, output_dir, first_pages):
    os.makedirs(output_dir, exist_ok=True)

    with open(input_pdf_path, 'rb') as input_pdf_file:
        pdf_reader = PyPDF2.PdfReader(input_pdf_file)
        total_pages = len(pdf_reader.pages)

        first_pages.append(total_pages)

        for i in range(len(first_pages) - 1):
            start_page = first_pages[i]
            end_page = first_pages[i + 1]

            pdf_writer = PyPDF2.PdfWriter()
            for page_num in range(start_page, end_page):
                pdf_writer.add_page(pdf_reader.pages[page_num])

            output_pdf_path = os.path.join(output_dir, f'segment_{i + 1}.pdf')
            with open(output_pdf_path, 'wb') as output_pdf_file:
                pdf_writer.write(output_pdf_file)

            print(f'Segmento {i + 1} guardado en {output_pdf_path}')

# Rutas de Entrada y Salida
pdf_path = 'data/raw/2-TITULOS-15-DE-NOVIEMBRE-2024.pdf'  # Ruta del archivo PDF de entrada
images_output_dir = 'data/PDFTEST'  # Directorio donde se guardarán las imágenes
model_path = 'data/models/logistic_regression_model'  # Ruta del modelo
scaler_path = 'data/processed/scaler_model'  # Ruta del escalador
output_file = 'predicted_first_pages2.txt'  # Archivo donde se guardarán las predicciones
output_dir = 'data/output1'  # Directorio donde se guardarán los PDFs cortados

# Convertir el PDF en Imágenes
num_pages = convert_pdf_to_images(pdf_path, images_output_dir)

# Hacer la Predicción de las Primeras Páginas
predicted_pages = predict_first_pages(images_output_dir, model_path, scaler_path, output_file)

# Cortar el PDF Basado en las Predicciones
split_pdf(pdf_path, output_dir, predicted_pages)

spark.stop()

Biblioteca nativa de Hadoop cargada correctamente.
Páginas predichas como primeras páginas:
data/PDFTEST\page_0.png
data/PDFTEST\page_10.png
data/PDFTEST\page_20.png
data/PDFTEST\page_30.png
data/PDFTEST\page_40.png
data/PDFTEST\page_50.png
data/PDFTEST\page_60.png
data/PDFTEST\page_70.png
data/PDFTEST\page_80.png
data/PDFTEST\page_90.png
data/PDFTEST\page_100.png
data/PDFTEST\page_110.png
data/PDFTEST\page_114.png
data/PDFTEST\page_116.png
data/PDFTEST\page_124.png
data/PDFTEST\page_134.png
data/PDFTEST\page_140.png
data/PDFTEST\page_146.png
data/PDFTEST\page_152.png
data/PDFTEST\page_162.png
data/PDFTEST\page_170.png
data/PDFTEST\page_178.png
data/PDFTEST\page_184.png
data/PDFTEST\page_190.png
data/PDFTEST\page_198.png
data/PDFTEST\page_206.png
data/PDFTEST\page_214.png
data/PDFTEST\page_222.png
data/PDFTEST\page_244.png
data/PDFTEST\page_254.png
data/PDFTEST\page_264.png
data/PDFTEST\page_274.png
data/PDFTEST\page_284.png
data/PDFTEST\page_290.png
data/PDFTEST\page_296.png
data/PDFT