In [0]:
%pip install openpyxl
import pandas as pd
import openpyxl
from pyspark.sql import SparkSession
import os
import numpy as np

In [0]:

# Resolve the workbook location relative to the current working directory.
source_dir = os.path.join("..", "source_files")
xlsx_path = os.path.abspath(os.path.join(source_dir, "2023-2024.xlsx"))

# Load the sheet as-is: no header row is inferred (columns are numbered),
# every cell is read as text, and NA detection is switched off so blank
# cells are not turned into NaN at load time.
pdf = pd.read_excel(xlsx_path, header=None, dtype=str, na_filter=False)
# Build flat column names from the two-level header stored in sheet rows 1 and 2.
# Empty or whitespace-only cells become NaN so the upper labels can be forward-filled.
headers_raw = pdf.iloc[[1, 2], :].replace(r'^\s*$', np.nan, regex=True)

# From column 4 onwards, carry each upper-row label to the right over the blank
# cells that follow it (presumably merged group cells in the workbook; TODO confirm).
headers_raw.iloc[0, 4:] = headers_raw.iloc[0, 4:].ffill()

row0 = headers_raw.iloc[0].fillna('').astype(str)
row1 = headers_raw.iloc[1].fillna('').astype(str)

# Combine both levels as "<upper>_<lower>"; columns without an upper label keep
# the lower label unchanged.
# NOTE(review): an upper label over a blank lower cell produces a trailing "_".
# The generated names are intentionally left as they were; confirm whether that
# case occurs in the workbook before changing it.
combined = pd.Series(
    np.where(row0 != "", row0 + "_" + row1, row1),
    index=headers_raw.columns,
)
combined.iloc[0] = 'consecutivo'  # the first column always gets this fixed name

headers = (
    combined
    .astype(str)                              # ensure strings
    .str.strip()                              # trim surrounding whitespace
    .str.lower()                              # lowercase
    .str.replace("cantidad ", "", regex=True)  # drop the "cantidad " prefix
    .str.replace(r"[-\s]+", "_", regex=True)   # runs of whitespace / hyphens -> "_"
)
headers_clean = headers.tolist()

# Fail fast on repeated names instead of letting them reach the Spark/Delta write.
duplicated_names = headers[headers.duplicated()].unique().tolist()
if duplicated_names:
    raise ValueError(
        f"Duplicate column names after header cleanup: {duplicated_names}"
    )

print(f"✅ Headers extraídos exitosamente: {headers_clean[0:3]}...")

In [0]:
# The data section starts at the first row whose first column is exactly '-'.
# Every row above it is discarded; the marker row itself is kept as the first
# row of data_pdf.
# NOTE(review): the previous comment said "the first 4 rows" are skipped, but
# the offset is found by searching for the marker; confirm it sits where expected.
marker_positions = np.flatnonzero((pdf.iloc[:, 0] == '-').to_numpy())
if marker_positions.size == 0:
    raise ValueError(
        "No row with '-' in the first column was found; "
        "cannot locate the start of the data section."
    )
# Positional offset (not an index label), because it is consumed by .iloc below.
start_row = int(marker_positions[0])
data_pdf = pdf.iloc[start_row:].reset_index(drop=True)

# Obtain the Spark session explicitly: getOrCreate() returns the session that
# already exists in the runtime if there is one, so the cell no longer depends
# on a `spark` global being injected by the notebook environment.
spark = SparkSession.builder.getOrCreate()

# Create the Spark DataFrame and rename its columns, by position, to the
# cleaned header names.
spark_df = spark.createDataFrame(data_pdf).toDF(*headers_clean)

# Save as a Delta table, overwriting whatever the table currently holds.
# NOTE(review): the table name says 2024_2025 while the workbook read by this
# notebook is 2023-2024.xlsx; confirm which period is intended.
table = "workspace.default.bronze_2024_2025"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(table)
)

print(f"✅ Tabla e ingestión completada: {table}")