In [2]:
# !pip install seaborn
# !pip install scikit-learn

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pyspark.sql.types as T
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SparkSession

import jellyfish
from elasticsearch import Elasticsearch
import json
from datetime import datetime
import time
import yaml
import jellyfish
import sklearn



In [4]:
import random

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [6]:
import pandas as pd
# Raise the pandas display caps so wide/long preview frames are rendered
# without truncation (up to 999 rows and 999 columns).
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [7]:
# Build (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("assigning_labels")
    .master("spark://barravento:7077")
    # Elasticsearch connector: jar coordinates plus cluster/index settings.
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.1.3")
    .config("spark.es.nodes", "barravento")
    .config("spark.es.port", "9200")
    .config("spark.es.nodes.wan.only", "false")
    .config("spark.es.resource", "dbb2")
    # Adaptive query execution and partition sizing.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", 16)
    .config("spark.sql.files.maxPartitionBytes", "256m")
    .getOrCreate()
)

sc = spark.sparkContext
# Register the HDFS directory Spark uses for checkpointed data.
sc.setCheckpointDir("hdfs://barravento:9000/spark-checkpoints")

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.elasticsearch#elasticsearch-spark-30_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-878f470d-1aad-48ba-9f3a-ce9a195dd570;1.0
	confs: [default]
	found org.elasticsearch#elasticsearch-spark-30_2.12;8.1.3 in central
	found org.scala-lang#scala-reflect;2.12.8 in central
	found org.slf4j#slf4j-api;1.7.6 in central
	found commons-logging#commons-logging;1.1.1 in central
	found javax.xml.bind#jaxb-api;2.3.1 in central
	found com.google.protobuf#protobuf-java;2.5.0 in central
	found org.apache.spark#spark-yarn_2.12;3.2.0 in central
:: resolution report :: resolve 279ms :: artifacts dl 3ms
	:: modules in use:
	com.google.protobuf#protobuf-java;2.5.0 from central in [default]
	commons-logging#commons-lo

# Functions

In [29]:
def select_side_by_side(
    df,
    cfg,
    extras_front=None,
    extras_back=None,
    strict=False,
    field_order=None
):
    """
    Reorder columns for an A-vs-B comparison driven by cfg['dataset'], keeping the original names.

    Returns df.select(...) with the columns in this order:
      extras_front
      left_id, right_id
      [left_field, right_field] for each logical field in cfg['dataset']['fields']
      extras_back

    A column name that shows up more than once in that sequence (e.g. an id that is
    also listed in extras_front) is selected only once, at its first position, so the
    result never carries duplicate column names.

    Params
    ------
    df: DataFrame to reorder; only `df.columns` and `df.select` are used.
    cfg: dict providing cfg['dataset']['keys']['left_id'], cfg['dataset']['keys']['right_id']
         and cfg['dataset']['fields'], a mapping logical_field -> spec with 'left' and 'right'
         column names.
    extras_front, extras_back: optional lists of extra column names placed before / after
         the id and field columns. Names that are not present in df are silently dropped.
    strict: if True, raise when an expected id/field column does not exist in df.
            if False, create a NULL column (typed as string) with the expected name.
    field_order: optional list with the order of the logical fields
                 (e.g. ["nome", "nome_mae", "dt_nasc", "sexo"]).
                 If None, the order of cfg['dataset']['fields'] is used.

    Raises
    ------
    ValueError: strict is True and an expected id/field column is missing from df.
    KeyError: cfg lacks one of the keys above, or field_order names a logical field
              that is not in cfg['dataset']['fields'].
    """
    ds = cfg["dataset"]
    left_id = ds["keys"]["left_id"]
    right_id = ds["keys"]["right_id"]
    fields = ds["fields"]  # dict: logical_field -> {left, right, sim, weight, penalty}

    existing = set(df.columns)

    # Keep only the extras that actually exist in the DataFrame.
    extras_front = [c for c in (extras_front or []) if c in existing]
    extras_back = [c for c in (extras_back or []) if c in existing]

    # Order of the logical fields.
    logical_fields = field_order if field_order is not None else list(fields.keys())

    def col_or_null(colname: str):
        if colname in existing:
            return F.col(colname)
        if strict:
            raise ValueError(f"Coluna ausente no DataFrame: '{colname}'")
        # An untyped NULL literal has NullType, which several writers (Parquet
        # among them) reject; give the placeholder a concrete string type.
        return F.lit(None).cast("string").alias(colname)

    def wanted_names():
        # Lazily yields the target column order so errors surface in the same
        # sequence as the columns are laid out.
        yield from extras_front
        yield left_id
        yield right_id
        for lf in logical_fields:
            spec = fields[lf]
            lcol, rcol = spec["left"], spec["right"]
            yield lcol
            yield rcol
        yield from extras_back

    cols = []
    seen = set()
    for name in wanted_names():
        if name in seen:
            # Already selected earlier; selecting it again would produce
            # duplicate column names in the output.
            continue
        seen.add(name)
        cols.append(col_or_null(name))

    return df.select(*cols)

# Lendo bases

## Lendo arquivo de configuração

In [26]:
CONFIG_PATH = "config_traindata.yaml"

# Read the YAML explicitly as UTF-8 so parsing does not depend on the
# platform's default locale encoding. safe_load only builds plain Python
# objects (dicts, lists, scalars).
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

cfg

{'version': 1,
 'dataset': {'keys': {'left_id': 'id_cidacs_a', 'right_id': 'id_cidacs_b'},
  'label': 'match_status',
  'fields': {'nome': {'left': 'nome_a',
    'right': 'nome_b',
    'sim': 'jaro_winkler',
    'weight': 1,
    'penalty': 0.1},
   'nome_mae': {'left': 'nome_mae_a',
    'right': 'nome_mae_b',
    'sim': 'jaro_winkler',
    'weight': 1,
    'penalty': 0.1},
   'dt_nasc': {'left': 'dt_nasc_a',
    'right': 'dt_nasc_b',
    'sim': 'hamming',
    'weight': 1,
    'penalty': 0.1},
   'sexo': {'left': 'sexo_a',
    'right': 'sexo_b',
    'sim': 'overlap',
    'weight': 1,
    'penalty': 0.1}}}}

## Lendo base construída para a revisão manual

In [31]:
# Candidate pairs prepared for manual review (B records with their ES candidate from A).
link_df = (
    spark.read
    .parquet('hdfs://barravento:9000/data/df_for_manual_review.parquet')
)
link_df.limit(2).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,target_pos,es_candidates,es_candidate,es_candidate_id,es_candidate_score,id_cidacs_a,nome_a,nome_mae_a,dt_nasc_a,sexo_a
0,1059,CRISTIANE SILVA DOS SANTOS,CRISTIANE SILVA DOS SANTOS,20071212,1,1,"[(1059, 32.776756, {}), (778174, 25.224531, {}...","(1059, 32.776756, {})",1059,32.776756,1059,GABRIEL VASCONCELOS SANTOS,CRISTIANE SILVA DOS SANTOS,20071212,1
1,493476,ARIELA LUIZA DA,ANA DANIELE LIMA DOS,20080719,1,1,"[(493476, 73.399155, {}), (416052, 48.723595, ...","(493476, 73.399155, {})",493476,73.399155,493476,ARIELA LUIZA DA SILVA,ANA DANIELE LIMA DOS SANTOS,20080719,2


In [11]:
# Synthetic dataset A.
base_a = (
    spark.read
    .parquet('hdfs://barravento:9000/data/synthetic-dataset-A.parquet')
)
base_a.limit(2).toPandas()

                                                                                

Unnamed: 0,id_cidacs_a,nome_a,nome_mae_a,dt_nasc_a,sexo_a
0,1,YASMIM VITORIA MATIAS FONSECA,TACIANY DOS SANTOS,20071122,2
1,2,PEDRO HENRIQUE MARTINS DE CARVALHO,FRANCILEIDE DOS SANTOS ALVES,20061102,1


In [12]:
# Synthetic dataset B.
base_b = (
    spark.read
    .parquet('hdfs://barravento:9000/data/synthetic-datasets-b-1000.parquet')
)
base_b.limit(2).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,788,RUAN CESAR COSTA DE JESUS,JUSSARA CAROLINA R ALBUQUERQUE,20080531,1
1,1261,YASMIN MUNIZ MARCELINO,VERA LUCIA RIBEIRO,20080516,2


# Organizando base para revisão manual

In [33]:
# Lay out each A/B field pair in adjacent columns: Elasticsearch ranking
# columns first, the candidate columns last.
link_df = select_side_by_side(
    df=link_df,
    cfg=cfg,
    strict=False,
    extras_front=["target_pos", "es_candidate_score", "es_candidate_id"],
    extras_back=["es_candidates", "es_candidate"],
)

link_df.limit(5).toPandas()

Unnamed: 0,target_pos,es_candidate_score,es_candidate_id,id_cidacs_a,id_cidacs_b,nome_a,nome_b,nome_mae_a,nome_mae_b,dt_nasc_a,dt_nasc_b,sexo_a,sexo_b,es_candidates,es_candidate
0,1,32.776756,1059,1059,1059,GABRIEL VASCONCELOS SANTOS,CRISTIANE SILVA DOS SANTOS,CRISTIANE SILVA DOS SANTOS,CRISTIANE SILVA DOS SANTOS,20071212,20071212,1,1,"[(1059, 32.776756, {}), (778174, 25.224531, {}...","(1059, 32.776756, {})"
1,1,73.399155,493476,493476,493476,ARIELA LUIZA DA SILVA,ARIELA LUIZA DA,ANA DANIELE LIMA DOS SANTOS,ANA DANIELE LIMA DOS,20080719,20080719,2,1,"[(493476, 73.399155, {}), (416052, 48.723595, ...","(493476, 73.399155, {})"
2,1,80.12896,364678,364678,364678,JENIFER VITORIA DA SILVA SANTOS,JENIFER VITORIA DA SILVA SANTOS,DANIELE OLIVEIRA SILVA,DANIELE OLIVEIRA SILVA,20061018,20061018,2,2,"[(364678, 80.12896, {}), (567916, 40.68721, {}...","(364678, 80.12896, {})"
3,3,58.0167,851014,851014,996184,ARTHUR GABRIEL ABREU BARROS,ARTHUR GABRIEL ABREU BARROS,ADRIANA CAETANO SANTOS NUNES,MAIRA VANESSA SANTOS,20100616,20090604,2,2,"[(996184, 89.75126, {}), (812302, 58.0167, {})...","(851014, 58.0167, {})"
4,1,71.86185,750623,750623,750623,ATHANY CORREA DE CARVALHO,ATHANY CORREA DE,FABIANA SILVA DE ASSIS,FABIANA SILVA DE,20090725,20090725,2,1,"[(750623, 71.86185, {}), (779657, 42.328606, {...","(750623, 71.86185, {})"


# Atribuindo rótulo em "match_status"

In [35]:
# Label a pair 1 when both sides carry the same id, 0 otherwise.
link_df = link_df.withColumn(
    'match_status',
    F.when(F.col('id_cidacs_a') == F.col('id_cidacs_b'), F.lit(1)).otherwise(0),
)
# Class balance of the resulting label.
link_df.select("match_status").groupBy("match_status").count().show()

                                                                                

+------------+-----+
|match_status|count|
+------------+-----+
|           1|  560|
|           0|  440|
+------------+-----+



## Escrevendo o dataset rotulado em Parquet

In [36]:
# Persist the labelled pairs, replacing any previous output at this path.
link_df.write.mode("overwrite").parquet("hdfs://barravento:9000/data/result/train_dataset_raw.parquet")

                                                                                