In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import NumericType, StringType, TimestampType, DateType
import pandas as pd

# -----------------------------
# 1 Inicializa Spark
# -----------------------------
spark = SparkSession.builder \
    .appName("Mega_Qualidade_Report_Serverless") \
    .getOrCreate()

# -----------------------------
# 2 Lê Silver
# -----------------------------
df = spark.table("saas_project.core.silver_data")
print(" Silver carregada para análise de qualidade")
df.show(5, truncate=False)

total_linhas = df.count()
total_colunas = len(df.columns)
duplicados = total_linhas - df.dropDuplicates().count()

# -----------------------------
# 3 Estatísticas detalhadas por coluna (Enterprise)
# -----------------------------
col_stats = []

for field in df.schema.fields:
    c = field.name
    tipo = str(field.dataType)
    nulos = df.filter(F.col(c).isNull()).count()
    nulos_pct = round(nulos / total_linhas * 100, 2)
    unicos = df.select(c).distinct().count()
    constante = unicos == 1

    if isinstance(field.dataType, NumericType):

        resumo_row = df.agg(
            F.mean(c).alias("media"),
            F.stddev(c).alias("desvio"),
            F.min(c).alias("min"),
            F.max(c).alias("max")
        ).first()

        media_val = resumo_row["media"]
        desvio_val = resumo_row["desvio"]
        min_val = resumo_row["min"]
        max_val = resumo_row["max"]

        q1, mediana, q3 = df.approxQuantile(c, [0.25, 0.5, 0.75], 0.01)
        iqr = q3 - q1

        outliers_count = df.filter(
            (F.col(c) < q1 - 1.5 * iqr) |
            (F.col(c) > q3 + 1.5 * iqr)
        ).count()

        col_stats.append({
            "coluna": c,
            "tipo": tipo,
            "nulos": nulos,
            "nulos_%": nulos_pct,
            "unicos": unicos,
            "constante": constante,
            "media": media_val,
            "desvio_padrao": desvio_val,
            "min": min_val,
            "max": max_val,
            "q1": q1,
            "mediana": mediana,
            "q3": q3,
            "iqr": iqr,
            "outliers": outliers_count
        })

    elif isinstance(field.dataType, StringType):

        len_stats = df.withColumn("len", F.length(F.col(c))).agg(
            F.mean("len").alias("mean_len"),
            F.min("len").alias("min_len"),
            F.max("len").alias("max_len")
        ).first()

        mean_len = round(len_stats["mean_len"], 2)
        min_len = len_stats["min_len"]
        max_len = len_stats["max_len"]
        empty_strings = df.filter(F.col(c) == "").count()

        top5_df = df.groupBy(c).count().orderBy(F.col("count").desc()).limit(5)
        top5_dict = {row[c]: row["count"] for row in top5_df.collect()}

        col_stats.append({
            "coluna": c,
            "tipo": tipo,
            "nulos": nulos,
            "nulos_%": nulos_pct,
            "unicos": unicos,
            "constante": constante,
            "mean_len": mean_len,
            "min_len": min_len,
            "max_len": max_len,
            "empty_strings": empty_strings,
            "top5_values": top5_dict
        })

    elif isinstance(field.dataType, (TimestampType, DateType)):

        resumo_row = df.agg(
            F.min(c).alias("min"),
            F.max(c).alias("max")
        ).first()

        col_stats.append({
            "coluna": c,
            "tipo": tipo,
            "nulos": nulos,
            "nulos_%": nulos_pct,
            "unicos": unicos,
            "constante": constante,
            "min": resumo_row["min"],
            "max": resumo_row["max"]
        })

    else:
        col_stats.append({
            "coluna": c,
            "tipo": tipo,
            "nulos": nulos,
            "nulos_%": nulos_pct,
            "unicos": unicos,
            "constante": constante
        })

col_stats_df = pd.DataFrame(col_stats)

# -----------------------------
# 4 Alertas profissionais
# -----------------------------
alertas = []

for s in col_stats:
    if s["nulos_%"] > 5:
        alertas.append(f" Coluna '{s['coluna']}' tem {s['nulos_%']}% nulos")
    if s["constante"]:
        alertas.append(f" Coluna '{s['coluna']}' é constante (1 único valor)")

if duplicados > 0:
    alertas.append(f" Existem {duplicados} linhas duplicadas no dataset")

alertas_df = pd.DataFrame({"alerta": alertas})

# -----------------------------
# 5 Resumo geral do dataset
# -----------------------------
resumo_geral_df = pd.DataFrame([{
    "total_linhas": total_linhas,
    "total_colunas": total_colunas,
    "duplicados": duplicados,
    "percentual_duplicados": round(duplicados / total_linhas * 100, 2)
}])

# -----------------------------
# 6 Combina tudo em mega arquivo
# -----------------------------
mega_report_df = pd.concat(
    [
        resumo_geral_df,
        pd.DataFrame({"coluna": [""]}),
        col_stats_df,
        pd.DataFrame({"coluna": [""]}),
        alertas_df
    ],
    axis=0,
    ignore_index=True
)

# >>> CORREÇÃO DO ERRO DO ARROW <<<
if "top5_values" in mega_report_df.columns:
    mega_report_df["top5_values"] = mega_report_df["top5_values"].astype(str)

# -----------------------------
# SAÍDA LIMITADA NO NOTEBOOK (10 linhas)
# -----------------------------
display(mega_report_df.head(10))

# -----------------------------
# 7 Salva CSV único
# -----------------------------
report_file = "/Volumes/saas_project/core/quality/qualidade_report.csv"
mega_report_df.to_csv(report_file, index=False)
print(f" Mega arquivo de qualidade profissional salvo: {report_file}")

# -----------------------------
# 8 Fim
# -----------------------------
print(" Mega relatório de qualidade gerado com métricas completas e alertas profissionais")

 Silver carregada para análise de qualidade
+---+----------+-----+-------------------+-------+--------------+-------------------+
|id |nome      |idade|email              |salario|cidade        |ingestion_time     |
+---+----------+-----+-------------------+-------+--------------+-------------------+
|6  |Pessoa 5  |49   |user5@exemplo.com  |7172.05|Rio           |2026/02/19-18:05:36|
|54 |Pessoa 53 |71   |user53@exemplo.com |5166.14|NULL          |2026/02/19-18:05:36|
|56 |Pessoa 55 |44   |user55@exemplo.com |5647.81|NULL          |2026/02/19-18:05:36|
|100|Pessoa 99 |61   |user99@exemplo.com |7439.98|Rio           |2026/02/19-18:05:36|
|107|Pessoa 106|49   |user106@exemplo.com|6449.11|Belo Horizonte|2026/02/19-18:05:36|
+---+----------+-----+-------------------+-------+--------------+-------------------+
only showing top 5 rows


total_linhas,total_colunas,duplicados,percentual_duplicados,coluna,tipo,nulos,nulos_%,unicos,constante,media,desvio_padrao,min,max,q1,mediana,q3,iqr,outliers,mean_len,min_len,max_len,empty_strings,top5_values,alerta
1000000.0,7.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,
,,,,id,IntegerType(),0.0,0.0,1000000.0,False,500000.5,288675.2789323441,1.0,1000000.0,245577.0,502251.0,744968.0,499391.0,0.0,,,,,,
,,,,nome,StringType(),0.0,0.0,1000000.0,False,,,,,,,,,,12.89,8.0,13.0,0.0,"{'Pessoa 99': 1, 'Pessoa 53': 1, 'Pessoa 5': 1, 'Pessoa 55': 1, 'Pessoa 106': 1}",
,,,,idade,StringType(),15532.0,1.55,64.0,False,,,,,,,,,,2.03,2.0,4.0,0.0,"{'44': 15861, '38': 15817, '41': 15801, '59': 15793, '45': 15779}",
,,,,email,StringType(),100867.0,10.09,899134.0,False,,,,,,,,,,21.89,17.0,22.0,0.0,"{None: 100867, 'user53@exemplo.com': 1, 'user5@exemplo.com': 1, 'user99@exemplo.com': 1, 'user55@exemplo.com': 1}",
,,,,salario,DoubleType(),0.0,0.0,555689.0,False,,,2000.01,,3964.82,5982.64,7969.4,4004.58,50187.0,,,,,,
,,,,cidade,StringType(),399678.0,39.97,4.0,False,,,,,,,,,,8.66,3.0,14.0,0.0,"{None: 399678, 'Rio': 200361, 'São Paulo': 200172, 'Belo Horizonte': 199789}",
,,,,ingestion_time,StringType(),0.0,0.0,1.0,True,,,,,,,,,,19.0,19.0,19.0,0.0,{'2026/02/19-18:05:36': 1000000},
,,,,,,,,,,,,,,,,,,,,,,,,


 Mega arquivo de qualidade profissional salvo: /Volumes/saas_project/core/quality/qualidade_report.csv
 Mega relatório de qualidade gerado com métricas completas e alertas profissionais
