# Linkage manual review dataset — Elasticsearch candidates with controlled rank sampling

Este notebook cria, a partir de um DataFrame Spark com atributos de linkage, as seguintes colunas:

- `es_candidates`: lista (array) com até **N** candidatos retornados do Elasticsearch (via `_msearch`), onde N é o valor de `query.max_candidates` no YAML
- `target_pos`: posição-alvo sorteada por linha segundo uma **distribuição configurada em YAML**
- `es_candidate`: candidato escolhido de `es_candidates[target_pos-1]` com fallback configurável

Pré-requisitos:
- Cluster Spark disponível
- Acesso ao Elasticsearch (rede + credenciais se necessário)
- Pacotes Python: `pyyaml`, `requests`

> Observação: a abordagem usa `_msearch` para reduzir o overhead (lote por partição).


## 1) Configuração YAML (exemplo)

In [None]:
# Point this at the YAML config for your environment (Databricks / Jupyter / filesystem).
# Local alternative: yaml_path = "linkage_cfg.yaml"
yaml_path = "/dbfs/FileStore/linkage_cfg.yaml"  # Databricks example
print(f"YAML path: {yaml_path}")


Exemplo de `linkage_cfg.yaml`:

```yaml
elasticsearch:
  hosts: ["https://meu-es:9200"]
  index: "sinasc_index"
  auth:
    user_env: "ES_USER"
    pass_env: "ES_PASS"
  tls:
    verify_certs: false   # apenas para testes; use true (certificado válido) em produção

query:
  max_candidates: 10
  source_fields: ["nome", "nome_mae", "dt_nasc", "sexo", "municipio_res"]
  fields:
    - col: "nome"
      es_field: "nome"
      type: "match"
      fuzziness: "AUTO"
      boost: 3.0
      operator: "AND"
    - col: "nome_mae"
      es_field: "nome_mae"
      type: "match"
      fuzziness: "AUTO"
      boost: 2.0
      operator: "AND"
    - col: "dt_nasc"
      es_field: "dt_nasc"
      type: "term"
      boost: 2.0
    - col: "sexo"
      es_field: "sexo"
      type: "term"
      boost: 1.0
    - col: "municipio_res"
      es_field: "municipio_res"
      type: "term"
      boost: 1.0

sampling:
  position_distribution_pct:
    "1": 55
    "2": 20
    "3": 10
    "4": 7
    "5": 4
    "6": 2
    "7": 1
    "8": 1
  seed: 42
  fallback_when_short:
    mode: "last_available"   # options: null | last_available | first_available
```


## 2) Carregar YAML e validar distribuição

In [None]:
import os
import yaml

# Load the YAML configuration. The context manager closes the file handle
# (the bare open() call leaked it) and the explicit encoding keeps accented
# values readable regardless of the machine's locale.
with open(yaml_path, "r", encoding="utf-8") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

dist = cfg["sampling"]["position_distribution_pct"]
seed = int(cfg["sampling"].get("seed", 42))

# The percentages must add up to exactly 100. Raise explicitly instead of
# using `assert`, which is stripped when Python runs with -O.
s = sum(int(v) for v in dist.values())
if s != 100:
    raise ValueError(f"Distribuição precisa somar 100 (atual: {s})")

max_candidates = int(cfg["query"]["max_candidates"])
# A missing or empty `fallback_when_short` section means "no fallback" (None).
fallback_mode = (cfg["sampling"].get("fallback_when_short") or {}).get("mode")

print("OK! max_candidates =", max_candidates)
print("fallback_mode =", fallback_mode)
print("seed =", seed)


## 3) DataFrame de entrada (exemplo)

In [None]:
# Expected input: a Spark DataFrame named `df` holding the linkage columns
# (rename to match your real schema):
#   nome, nome_mae, dt_nasc, sexo, municipio_res
# Ways to obtain it:
#   df = spark.table("minha_tabela_linkage_features")
#   df = spark.read.parquet("...")

# Probe for `df` so that running this cell too early prints a hint
# instead of failing with a traceback.
try:
    df
except NameError:
    print("Defina o DataFrame 'df' antes de continuar.")
else:
    print("df já existe. Colunas:", df.columns)


## 4) Sortear a posição-alvo `target_pos` conforme o YAML

In [None]:
from pyspark.sql import functions as F

# (position, percentage) pairs ordered by position.
items = sorted(((int(k), int(v)) for k, v in dist.items()), key=lambda x: x[0])

# Cumulative thresholds in [0, 1]: a position is chosen when the uniform draw
# is at or below its threshold and above every earlier threshold.
cum = []
acc = 0
for pos, pct in items:
    acc += pct
    cum.append((pos, acc / 100.0))

# Draw ONE uniform number per row into a helper column and compare every
# branch against that column. Embedding F.rand(seed) directly in each WHEN
# branch places a separate nondeterministic expression in every branch, and
# CASE WHEN only evaluates the branches it reaches, so different branches can
# end up comparing different draws and the sampled distribution drifts away
# from the configured one.
rand_col = "__target_pos_u"
u = F.col(rand_col)

case_expr = None
for pos, thr in cum:
    cond = (u <= F.lit(thr))
    case_expr = F.when(cond, F.lit(pos)) if case_expr is None else case_expr.when(cond, F.lit(pos))
# Safety net for floating-point rounding of the last threshold.
case_expr = case_expr.otherwise(F.lit(items[-1][0]))

df2 = (
    df.withColumn(rand_col, F.rand(seed))
    .withColumn("target_pos", case_expr.cast("int"))
    .drop(rand_col)
)
# Sanity check: observed counts per sampled position.
df2.select("target_pos").groupBy("target_pos").count().orderBy("target_pos").show()


## 5) Buscar candidatos no Elasticsearch via `_msearch` (por partição)

In [None]:
import json
import requests
from typing import Iterator, Dict, Any
from pyspark.sql import types as T

# Elasticsearch connection settings read from the YAML config.
es_cfg = cfg["elasticsearch"]
es_hosts = es_cfg["hosts"]
es_index = es_cfg["index"]
# TLS verification stays on unless the config explicitly disables it.
verify = bool((es_cfg.get("tls") or {}).get("verify_certs", True))

# Credentials are optional: the config names the environment variables that
# hold them. A missing `auth` section (or missing variable) yields empty
# strings, which fetch_partition treats as "no authentication".
auth_cfg = es_cfg.get("auth") or {}
user = os.getenv(auth_cfg["user_env"], "") if auth_cfg.get("user_env") else ""
pwd = os.getenv(auth_cfg["pass_env"], "") if auth_cfg.get("pass_env") else ""

# Per-column query definitions and the `_source` fields to return per hit.
fields_cfg = cfg["query"]["fields"]
src_fields = cfg["query"].get("source_fields", [])

def build_es_query(row: Dict[str, Any], fields=None, size=None, source_fields=None) -> Dict[str, Any]:
    """Build the Elasticsearch search body for one input row.

    Each configured field whose value in `row` is present and non-blank
    contributes one `should` clause (`match` or `term`); at least one clause
    must match for a document to be returned.

    Args:
        row: Column name -> value mapping for the input record.
        fields: Field definitions (dicts with `col`, `es_field`, `type` and
            optional `boost`, `operator`, `fuzziness`). Defaults to the
            module-level `fields_cfg`.
        size: Maximum number of hits. Defaults to the module-level
            `max_candidates`.
        source_fields: `_source` fields to return. Defaults to the
            module-level `src_fields`.

    Returns:
        A dict with `size`, `_source` and `query`, ready to be JSON-encoded.

    Raises:
        ValueError: If a field definition uses a type other than `match`
            or `term`.
    """
    import datetime

    fields = fields_cfg if fields is None else fields
    size = max_candidates if size is None else size
    source_fields = src_fields if source_fields is None else source_fields

    should = []
    for f in fields:
        val = row.get(f["col"])

        # Skip attributes the row does not have (null or blank string).
        if val is None or (isinstance(val, str) and not val.strip()):
            continue

        # date/datetime values are not JSON-serializable; send ISO-8601 text.
        if isinstance(val, datetime.date):
            val = val.isoformat()

        qtype = f["type"]
        es_field = f["es_field"]
        boost = float(f.get("boost", 1.0))

        if qtype == "match":
            clause = {
                "match": {
                    es_field: {
                        "query": val,
                        "boost": boost,
                        "operator": f.get("operator", "OR"),
                        "fuzziness": f.get("fuzziness", "AUTO"),
                    }
                }
            }
        elif qtype == "term":
            clause = {"term": {es_field: {"value": val, "boost": boost}}}
        else:
            raise ValueError(f"Tipo não suportado: {qtype}")

        should.append(clause)

    if should:
        query = {"bool": {"should": should, "minimum_should_match": 1}}
    else:
        # Elasticsearch treats a bool query with no clauses as match_all, which
        # would hand arbitrary documents to a row that has nothing to search
        # by. Ask for no hits explicitly instead.
        query = {"match_none": {}}

    return {
        "size": size,
        "_source": source_fields,
        "query": query,
    }

# One Elasticsearch hit: document id, relevance score and the `_source`
# payload serialized as a JSON string.
_hit_struct = (
    T.StructType()
    .add("es_id", T.StringType(), True)
    .add("es_score", T.DoubleType(), True)
    .add("es_source", T.StringType(), True)
)
cand_schema = T.ArrayType(_hit_struct)

# Output rows keep every df2 column and gain the candidate array at the end.
out_schema = T.StructType([
    *df2.schema.fields,
    T.StructField("es_candidates", cand_schema, True),
])

def _msearch_batch(session, url, batch, auth):
    """Run one `_msearch` request for `batch`; return one candidate list per row, in order."""
    ndjson_lines = []
    for r in batch:
        # Each search is a header line followed by a body line (NDJSON).
        ndjson_lines.append(json.dumps({"index": es_index}))
        ndjson_lines.append(json.dumps(build_es_query(r.asDict(recursive=True))))
    body = ("\n".join(ndjson_lines) + "\n").encode("utf-8")

    resp = session.post(
        url,
        data=body,
        headers={"Content-Type": "application/x-ndjson"},
        auth=auth,
        verify=verify,
        timeout=60,
    )
    resp.raise_for_status()

    responses = resp.json().get("responses", [])
    if len(responses) != len(batch):
        raise RuntimeError(f"Resposta do ES desalinhada: {len(responses)} != {len(batch)}")

    results = []
    for pr in responses:
        # _msearch answers HTTP 200 even when an individual search failed; the
        # failure is reported inside that search's entry. Surface it instead
        # of silently recording "no candidates" for the row.
        if "error" in pr:
            detail = json.dumps(pr["error"], ensure_ascii=False)
            raise RuntimeError(f"Erro do ES em uma busca do _msearch: {detail}")

        results.append([
            {
                "es_id": h.get("_id"),
                "es_score": float(h.get("_score") or 0.0),
                "es_source": json.dumps(h.get("_source", {}), ensure_ascii=False),
            }
            for h in pr.get("hits", {}).get("hits", [])
        ])
    return results


def fetch_partition(rows: Iterator[Any], batch_size: int = 500) -> Iterator[Any]:
    """Attach Elasticsearch candidates to every row of a Spark partition.

    Rows are consumed lazily in batches of `batch_size`; each batch becomes one
    `_msearch` request, so neither the request body nor the in-memory buffer
    grows with the partition size. One HTTP session is reused for the whole
    partition.

    Reads `es_hosts`, `es_index`, `user`, `pwd` and `verify` from module scope.

    Args:
        rows: Iterator over the partition's Row objects.
        batch_size: Number of searches per `_msearch` request (>= 1).

    Yields:
        One tuple per input row: the original values followed by the list of
        candidate dicts (`es_id`, `es_score`, `es_source`).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        RuntimeError: If the response is misaligned with the request or any
            individual search reports an error.
        requests.HTTPError: If the `_msearch` request itself fails.
    """
    from itertools import islice

    if batch_size < 1:
        raise ValueError(f"batch_size deve ser >= 1 (atual: {batch_size})")

    base = es_hosts[0].rstrip("/")
    url = f"{base}/_msearch"
    auth = (user, pwd) if (user or pwd) else None

    rows = iter(rows)
    with requests.Session() as session:
        while True:
            batch = list(islice(rows, batch_size))
            if not batch:
                break
            for r, cands in zip(batch, _msearch_batch(session, url, batch, auth)):
                yield tuple(list(r) + [cands])

# Query Elasticsearch partition by partition, then rebuild a DataFrame using
# the schema that includes the candidate array.
rows_with_candidates = df2.rdd.mapPartitions(fetch_partition)
df3 = rows_with_candidates.toDF(schema=out_schema)

# Quick look: number of candidates returned next to each sampled position.
n_cands = F.size("es_candidates").alias("n_cands")
df3.select("target_pos", n_cands).show(10, truncate=False)


## 6) Criar `es_candidate` escolhendo a posição sorteada (com fallback)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Return type of the picker UDF: a single candidate with the same three
# fields stored in each element of `es_candidates`.
cand_struct = (
    T.StructType()
    .add("es_id", T.StringType(), True)
    .add("es_score", T.DoubleType(), True)
    .add("es_source", T.StringType(), True)
)

def pick_candidate(cands, target_pos: int):
    """Return the candidate at 1-based `target_pos`, applying the fallback when the list is short.

    A missing or zero `target_pos` is treated as position 1. When the list has
    fewer than `target_pos` entries, the module-level `fallback_mode` decides
    the result: "last_available" returns the last candidate,
    "first_available" the first, and anything else returns None. An empty or
    missing list always returns None.
    """
    if not cands:
        return None

    wanted = (target_pos or 1) - 1
    if wanted < len(cands):
        return cands[wanted]

    # The list is shorter than the sampled position: apply the configured fallback.
    if fallback_mode == "first_available":
        return cands[0]
    return cands[-1] if fallback_mode == "last_available" else None

# Wrap the picker as a Spark UDF that returns a single candidate struct.
pick_udf = F.udf(pick_candidate, returnType=cand_struct)

chosen = pick_udf(F.col("es_candidates"), F.col("target_pos"))
df4 = df3.withColumn("es_candidate", chosen)

# Preview: sampled position next to the id and score of the chosen candidate.
preview_cols = [
    F.col("target_pos"),
    F.col("es_candidate.es_id").alias("es_id"),
    F.col("es_candidate.es_score").alias("es_score"),
]
df4.select(*preview_cols).show(20, truncate=False)


## 7) Auditoria: distribuição observada e taxa de nulos

In [None]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Per sampled position: number of rows and number of rows that ended up with
# a chosen candidate.
per_position = (
    df4
    .withColumn("has_candidate", F.col("es_candidate").isNotNull())
    .groupBy("target_pos")
    .agg(
        F.count("*").alias("n"),
        F.sum(F.col("has_candidate").cast("int")).alias("n_with_candidate"),
    )
)

# A window with an empty partitionBy() spans every group, so this is the grand total.
total_rows = F.sum("n").over(Window.partitionBy())

aud = (
    per_position
    .withColumn("pct_rows", F.col("n") / total_rows * 100)
    .withColumn("pct_with_candidate", F.col("n_with_candidate") / F.col("n") * 100)
    .orderBy("target_pos")
)

aud.show(50, truncate=False)

# Overall share of rows left without a chosen candidate.
is_missing = F.col("es_candidate").isNull().cast("int")
df4.select(F.mean(is_missing).alias("null_rate")).show()


## 8) Exportar para revisão manual

In [None]:
# Suggestions:
# - keep the linkage columns + es_candidate (es_candidates is optional)
# - save as parquet/delta for internal review
# - or generate CSV (mind the size of es_candidates)

wanted_cols = (
    "nome", "nome_mae", "dt_nasc", "sexo", "municipio_res",
    "target_pos",
    "es_candidate",
    "es_candidates",
)
# Keep only the columns that actually exist, preserving the order above.
available = set(df4.columns)
cols_out = [name for name in wanted_cols if name in available]

df_out = df4.select(*cols_out)
df_out.show(5, truncate=False)

# Example write:
# df_out.write.mode("overwrite").parquet("/mnt/data/linkage_manual_review.parquet")
