In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pyspark.sql.types as T
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SparkSession

import jellyfish
from elasticsearch import Elasticsearch
import json
from datetime import datetime
import time
import yaml
import jellyfish
import sklearn
from sklearn.metrics import roc_curve, auc



In [2]:
# !pip install seaborn
# !pip install scikit-learn

In [3]:
import random

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [5]:
import pandas as pd
# Raise the pandas display limits so previews of long / wide frames are not truncated.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [6]:
# Wall-clock timestamp (seconds since the epoch) taken when this cell runs.
# NOTE(review): no visible cell reads `start` afterwards — presumably meant for an
# elapsed-time report at the end of the notebook; confirm or remove.
start = time.time()

In [7]:
# Build (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("TrainDataSet")
    .master("spark://barravento:7077")
    # Elasticsearch connector package and its connection settings.
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.1.3")
    .config("spark.es.nodes", "barravento")
    .config("spark.es.port", "9200")
    .config("spark.es.nodes.wan.only", "false")
    .config("spark.es.resource", "dbb2")
    # Adaptive query execution, shuffle and file-partition sizing.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", 16)
    .config("spark.sql.files.maxPartitionBytes", "256m")
    .getOrCreate()
)

sc = spark.sparkContext
# Directory on HDFS where Spark stores checkpoint data.
# NOTE(review): no visible cell calls checkpoint(); confirm this is still needed.
sc.setCheckpointDir("hdfs://barravento:9000/spark-checkpoints")

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.elasticsearch#elasticsearch-spark-30_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-38cf9473-4c75-443d-b36a-9287fb2e57c9;1.0
	confs: [default]
	found org.elasticsearch#elasticsearch-spark-30_2.12;8.1.3 in central
	found org.scala-lang#scala-reflect;2.12.8 in central
	found org.slf4j#slf4j-api;1.7.6 in central
	found commons-logging#commons-logging;1.1.1 in central
	found javax.xml.bind#jaxb-api;2.3.1 in central
	found com.google.protobuf#protobuf-java;2.5.0 in central
	found org.apache.spark#spark-yarn_2.12;3.2.0 in central
:: resolution report :: resolve 320ms :: artifacts dl 2ms
	:: modules in use:
	com.google.protobuf#protobuf-java;2.5.0 from central in [default]
	commons-logging#commons-lo

# Funções

In [8]:
def jaro_winkler(col1, col2, weight, penality):
    """Weighted Jaro-Winkler similarity between two values.

    Args:
        col1: First value; converted with ``str`` before comparison.
        col2: Second value; converted with ``str`` before comparison.
        weight: Multiplier applied to the similarity.
        penality: Score returned as-is (not multiplied by ``weight``) when
            either value is ``None`` or the empty string.

    Returns:
        float: ``penality`` when a value is missing, otherwise
        ``jaro_winkler_similarity(col1, col2) * weight``.
    """
    if col1 is None or col2 is None or col1 == "" or col2 == "":
        # Always hand back a Python float: this function is wrapped in a UDF
        # declared as DoubleType, and Spark yields NULL when a UDF returns a
        # value whose Python type does not match the declared type (e.g. int).
        return float(penality)
    return float(jellyfish.jaro_winkler_similarity(str(col1), str(col2)) * weight)
# Spark UDF wrapper around `jaro_winkler`; the result column is typed as double.
udf_jaro_winkler = F.udf(f=jaro_winkler, returnType=DoubleType())

def hamming(col1, col2, weight, penality):
    """Weighted, length-normalised Hamming similarity between two values.

    The similarity is ``1 - distance / max_len``, where ``distance`` comes from
    ``jellyfish.hamming_distance`` and ``max_len`` is the length of the longer
    string representation.

    Args:
        col1: First value; converted with ``str`` before comparison.
        col2: Second value; converted with ``str`` before comparison.
        weight: Multiplier applied to the similarity.
        penality: Score returned as-is (not multiplied by ``weight``) when
            either value is ``None`` or the empty string.

    Returns:
        float: ``penality`` when a value is missing, otherwise the weighted
        similarity.
    """
    if col1 is None or col2 is None or col1 == "" or col2 == "":
        return float(penality)

    # Stringify BEFORE measuring the length: non-string values (for example
    # dates stored as integers) have no len() and would raise TypeError.
    left, right = str(col1), str(col2)
    max_size = max(len(left), len(right))
    if max_size == 0:
        # Both values stringify to "" — treat like a missing value.
        return float(penality)

    distance = jellyfish.hamming_distance(left, right)
    return float((1.0 - distance / max_size) * weight)
# Spark UDF wrapper around `hamming`; the result column is typed as double.
udf_hamming = F.udf(f=hamming, returnType=DoubleType())


def overlap(col1, col2, weight, penality):
    """Exact-match score between two values.

    Args:
        col1: First value.
        col2: Second value.
        weight: Score returned when both values are present and equal.
        penality: Score returned when either value is ``None`` or the empty
            string, or when the values differ.

    Returns:
        float: ``weight`` on an exact match, ``penality`` otherwise.
    """
    if col1 is None or col2 is None or col1 == "" or col2 == "" or col1 != col2:
        # Coerce to float: the UDF wrapping this function is declared as
        # DoubleType and Spark yields NULL for a mismatching Python type (int).
        return float(penality)
    return float(weight)
# Spark UDF wrapper around `overlap`; the result column is typed as double.
udf_overlap = F.udf(f=overlap, returnType=DoubleType())

def sim_hub(col1, col2, sim_type, weight, penality):
    """Dispatch a pair of values to the similarity function named by ``sim_type``.

    Args:
        col1: First value; non-null values are converted with ``str``.
        col2: Second value; non-null values are converted with ``str``.
        sim_type: ``"jaro_winkler"`` or ``"hamming"``; any other value falls
            back to ``overlap`` (exact match).
        weight: Weight forwarded to the similarity function.
        penality: Penalty forwarded to the similarity function.

    Returns:
        The score produced by the selected similarity function.
    """
    # Keep nulls as None. str(None) is the non-empty string "None", which would
    # bypass the missing-value checks downstream and make two null values look
    # like a perfect match instead of receiving the penalty.
    left = None if col1 is None else str(col1)
    right = None if col2 is None else str(col2)

    if sim_type == "jaro_winkler":
        return jaro_winkler(left, right, weight, penality)
    if sim_type == "hamming":
        return hamming(left, right, weight, penality)
    # Unknown similarity names are treated as exact-match comparison.
    return overlap(left, right, weight, penality)
# Spark UDF wrapper around `sim_hub`; the result column is typed as double.
udf_sim_hub = F.udf(f=sim_hub, returnType=DoubleType())

def calcula_similaridades(df, config, params):
    """Add the per-field similarity columns for one parameter set.

    For every entry of ``config['dataset']['fields']`` three columns are added
    to ``df``:

    * ``sim_<right>_<left>`` — result of ``udf_sim_hub`` on the two columns,
      using the field's ``sim`` type and the weight / penalty from ``params``;
    * ``w_<field>`` — the literal weight ``params["w_<field>"]``;
    * ``p_<field>`` — the literal penalty ``params["p_<field>"]``.

    Returns the DataFrame with the added columns.
    """
    result = df
    for field_name, spec in config['dataset']['fields'].items():
        weight_key = f"w_{field_name}"
        penalty_key = f"p_{field_name}"
        weight = params[weight_key]
        penalty = params[penalty_key]
        right_col = spec['right']
        left_col = spec['left']
        sim_type = spec['sim']

        score = udf_sim_hub(
            F.col(right_col),
            F.col(left_col),
            F.lit(sim_type),
            F.lit(weight),
            F.lit(penalty),
        )
        result = result.withColumn(f"sim_{right_col}_{left_col}", score)
        result = result.withColumn(weight_key, F.lit(params[weight_key]))
        result = result.withColumn(penalty_key, F.lit(params[penalty_key]))
    return result

def calcula_similaridade(df, config, params):
    """Add ``total_score``: the sum of the per-field similarity columns divided
    by the sum of the field weights.

    Expects ``df`` to already contain one ``sim_<right>_<left>`` column per
    entry of ``config['dataset']['fields']``; weights are read from
    ``params["w_<field>"]``.
    """
    fields = config['dataset']['fields']

    # Normalisation constant: sum of all field weights.
    score_max = sum(params[f"w_{name}"] for name in fields)

    # Names of the similarity columns, in configuration order.
    sim_columns = [
        f"sim_{spec['right']}_{spec['left']}" for spec in fields.values()
    ]

    summed_scores = sum(F.col(name) for name in sim_columns)
    return df.withColumn("total_score", F.lit(summed_scores) / F.lit(score_max))

# Lendo base

In [9]:
# Raw candidate pairs for the training set, read from HDFS and spread over 128 partitions.
link_df = (
    spark.read
    .parquet("hdfs://barravento:9000/data/result/train_dataset_raw.parquet")
    .repartition(128)
)
# Two-row preview as a pandas frame.
# NOTE(review): the preview shows personal fields (names, birth dates); clear the
# cell output before sharing the notebook.
link_df.limit(2).toPandas()

                                                                                

Unnamed: 0,target_pos,es_candidate_score,es_candidate_id,id_cidacs_a,id_cidacs_b,nome_a,nome_b,nome_mae_a,nome_mae_b,dt_nasc_a,dt_nasc_b,sexo_a,sexo_b,es_candidates,es_candidate,match_status
0,3,49.92177,84146,84146,509978,MARIA VITORIA PESSOA BARBOSA,MARIA VITORIA PESSOA BARBOSA,JAMILE DE OLIVEIRA SILVA,GIZELLE DE OLIVEIRA LIMA,20090212,20080203,2,2,"[(509978, 93.56702, {}), (819674, 49.92177, {}...","(84146, 49.92177, {})",0
1,1,92.05257,124538,124538,124538,WELLINGTON EDGAR ALVES SANTOS,WELLINGTON EDGAR ALVES,SORAIA ARAUJO NOGUEIRA,SORAIA ARAUJO,20100710,20100710,1,2,"[(124538, 92.05257, {}), (831032, 57.37523, {}...","(124538, 92.05257, {})",1


# Calculando similaridades

In [10]:
CONFIG_PATH = "config_traindata_all.yaml"

# Parse the experiment configuration from YAML and display it.
with open(CONFIG_PATH, mode="r") as config_file:
    cfg = yaml.safe_load(config_file)

cfg

{'version': 1,
 'dataset': {'keys': {'left_id': 'id_cidacs_a', 'right_id': 'id_cidacs_b'},
  'label': 'match_status',
  'fields': {'nome': {'left': 'nome_a',
    'right': 'nome_b',
    'sim': 'jaro_winkler',
    'weight': {'low': 0.0, 'high': 6.0, 'step': 0.05},
    'penalty': {'low': 0.0, 'high': 1.0, 'step': 0.01}},
   'nome_mae': {'left': 'nome_mae_a',
    'right': 'nome_mae_b',
    'sim': 'jaro_winkler',
    'weight': {'low': 0.0, 'high': 6.0, 'step': 0.05},
    'penalty': {'low': 0.0, 'high': 1.0, 'step': 0.01}},
   'dt_nasc': {'left': 'dt_nasc_a',
    'right': 'dt_nasc_b',
    'sim': 'hamming',
    'weight': {'low': 0.0, 'high': 6.0, 'step': 0.05},
    'penalty': {'low': 0.0, 'high': 1.0, 'step': 0.01}},
   'sexo': {'left': 'sexo_a',
    'right': 'sexo_b',
    'sim': 'overlap',
    'weight': {'low': 0.0, 'high': 6.0, 'step': 0.05},
    'penalty': {'low': 0.0, 'high': 1.0, 'step': 0.01}}}}}

# Criando grade de experimentos

In [11]:
# import numpy as np
# from itertools import product

# def arange_inclusive(low, high, step):
#     # evita drift de float
#     n = int(round((high - low) / step)) + 1
#     return [round(low + i * step, 10) for i in range(n)]

# def iter_all_param_sets(cfg):
#     fields = cfg["dataset"]["fields"]  # dict: nome, nome_mae, dt_nasc, sexo...

#     # constrói grid por campo
#     grids = {}
#     for key, spec in fields.items():
#         w = spec["weight"]
#         p = spec["penalty"]
#         w_vals = arange_inclusive(w["low"], w["high"], w["step"])
#         p_vals = arange_inclusive(p["low"], p["high"], p["step"])
#         grids[key] = list(product(w_vals, p_vals))  # (w, p)

#     field_keys = list(fields.keys())

#     # produto cartesiano entre os campos
#     for combo in product(*(grids[k] for k in field_keys)):
#         # combo é uma tupla: [(w_nome,p_nome), (w_nome_mae,p_nome_mae), ...]
#         out = {}
#         for k, (w, p) in zip(field_keys, combo):
#             out[f"w_{k}"] = w
#             out[f"p_{k}"] = p
#         yield out

In [11]:
def sample_param_sets(cfg, n, seed=2026):
    """Draw ``n`` random weight / penalty combinations from the configured grids.

    Each field in ``cfg["dataset"]["fields"]`` defines a ``weight`` and a
    ``penalty`` range (``low``, ``high``, ``step``). For every sample, one
    weight and one penalty are drawn uniformly from each field's grid.

    Args:
        cfg: Configuration dict with the ``dataset.fields`` section.
        n: Number of parameter sets to draw.
        seed: Seed for the NumPy random generator.

    Returns:
        list[dict]: ``n`` dicts with keys ``w_<field>`` and ``p_<field>`` and
        float values.
    """
    rng = np.random.default_rng(seed)
    fields = cfg["dataset"]["fields"]
    names = list(fields.keys())

    def _grid(spec):
        # The small epsilon keeps `high` itself inside the grid; rounding
        # strips floating-point noise from the generated steps.
        return np.round(np.arange(spec["low"], spec["high"] + 1e-9, spec["step"]), 10)

    weight_grid = {name: _grid(fields[name]["weight"]) for name in names}
    penalty_grid = {name: _grid(fields[name]["penalty"]) for name in names}

    def _draw_one():
        # Draw order (weight, then penalty, field by field) fixes the random stream.
        sample = {}
        for name in names:
            sample[f"w_{name}"] = float(rng.choice(weight_grid[name]))
            sample[f"p_{name}"] = float(rng.choice(penalty_grid[name]))
        return sample

    return [_draw_one() for _ in range(n)]


In [13]:
# Draw 50 parameter sets with a fixed seed and preview the first two.
rows = sample_param_sets(cfg=cfg, n=50, seed=2026)
rows[:2]

[{'w_nome': 5.15,
  'p_nome': 0.18,
  'w_nome_mae': 0.15,
  'p_nome_mae': 0.64,
  'w_dt_nasc': 2.2,
  'p_dt_nasc': 0.47,
  'w_sexo': 0.45,
  'p_sexo': 0.37},
 {'w_nome': 3.85,
  'p_nome': 0.35,
  'w_nome_mae': 5.0,
  'p_nome_mae': 0.79,
  'w_dt_nasc': 4.25,
  'p_dt_nasc': 0.91,
  'w_sexo': 4.35,
  'p_sexo': 0.17}]

In [14]:
# Sanity check: number of sampled parameter sets.
len(rows)

50

In [15]:
# it = iter_all_param_sets(cfg)
# for _ in range(5):
#     print(next(it))

In [16]:
# len(it)

# Criando as linhas da base de treino

In [15]:
# Schema of the metrics table: confusion-matrix counts, derived rates, and the
# weight / penalty of every field. All columns are nullable doubles.
schema = StructType([
    StructField(column_name, DoubleType(), True)
    for column_name in (
        "VP", "FP", "FN", "VN",
        "precision", "recall", "specificity", "accuracy",
        "w_nome", "p_nome",
        "w_nome_mae", "p_nome_mae",
        "w_dt_nasc", "p_dt_nasc",
        "w_sexo", "p_sexo",
    )
])

# Empty frame with that schema; the experiment results are stored in `metrics`.
metrics = spark.createDataFrame(data=[], schema=schema)

In [16]:
def _confusion_metrics(y_true, y_score, threshold):
    """Confusion-matrix counts and derived rates for one score threshold.

    A pair is predicted as a link when ``y_score >= threshold``. Labels are
    1 (link) and 0 (non-link); rows with any other label are not counted.
    Rates whose denominator is zero are returned as ``None``.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    is_link = y_true == 1
    is_non_link = y_true == 0
    above = y_score >= threshold
    below = y_score < threshold

    vp = float(np.sum(is_link & above))      # true positives
    fn = float(np.sum(is_link & below))      # false negatives
    vn = float(np.sum(is_non_link & below))  # true negatives
    fp = float(np.sum(is_non_link & above))  # false positives

    def _ratio(numerator, denominator):
        return numerator / denominator if denominator else None

    return {
        "VP": vp,
        "FP": fp,
        "FN": fn,
        "VN": vn,
        "precision": _ratio(vp, vp + fp),
        "recall": _ratio(vp, vp + fn),
        "specificity": _ratio(vn, vn + fp),
        "accuracy": _ratio(vp + vn, vp + fp + fn + vn),
    }


# Results are accumulated on the driver and turned into a Spark DataFrame once,
# after the loop. Chaining `union` per iteration makes the query plan grow with
# every parameter set and forces each action (count, toPandas, write) to
# recompute all earlier iterations. Starting from an empty list also makes the
# cell safe to re-run.
metric_records = []
for row in rows:
    # Per-field similarities, then the normalised total score.
    link_df_ = calcula_similaridades(link_df, cfg, row)
    link_df_ = calcula_similaridade(link_df_, cfg, row)

    # Bring labels and scores to the driver (single Spark job per parameter set).
    df = link_df_.select(["match_status", "total_score"]).toPandas()
    y_true = df["match_status"].values   # classes (1: link, 0: non-link)
    y_score = df["total_score"].values   # similarity assigned to each pair

    # Cut-off chosen on the ROC curve with Youden's J statistic (tpr - fpr).
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    best_idx = np.argmax(tpr - fpr)
    best_threshold = float(thresholds[best_idx])

    # Confusion matrix and rates are computed from the arrays already collected,
    # instead of running a second Spark aggregation over the same data.
    record = _confusion_metrics(y_true, y_score, best_threshold)
    record.update({key: float(value) for key, value in row.items()})
    metric_records.append(record)

    print(f"Número de registros na base de treino: {len(metric_records)}")

# One row per parameter set, columns ordered as declared in `schema`.
metrics = spark.createDataFrame(
    [tuple(record[field.name] for field in schema.fields) for record in metric_records],
    schema,
)

26/02/02 21:14:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Número de registros na base de treino: 1


                                                                                

Número de registros na base de treino: 2


                                                                                

Número de registros na base de treino: 3


                                                                                

Número de registros na base de treino: 4


                                                                                

Número de registros na base de treino: 5


                                                                                

Número de registros na base de treino: 6


                                                                                

Número de registros na base de treino: 7


                                                                                

Número de registros na base de treino: 8


                                                                                

Número de registros na base de treino: 9


                                                                                

Número de registros na base de treino: 10


                                                                                

Número de registros na base de treino: 11


                                                                                

Número de registros na base de treino: 12


                                                                                

Número de registros na base de treino: 13


                                                                                

Número de registros na base de treino: 14


                                                                                

Número de registros na base de treino: 15


                                                                                

Número de registros na base de treino: 16


                                                                                

Número de registros na base de treino: 17


                                                                                

Número de registros na base de treino: 18


                                                                                

Número de registros na base de treino: 19


                                                                                

Número de registros na base de treino: 20


                                                                                

Número de registros na base de treino: 21


                                                                                

Número de registros na base de treino: 22


                                                                                

Número de registros na base de treino: 23


                                                                                

Número de registros na base de treino: 24


                                                                                

Número de registros na base de treino: 25


                                                                                

Número de registros na base de treino: 26


                                                                                

Número de registros na base de treino: 27


                                                                                

Número de registros na base de treino: 28


                                                                                

Número de registros na base de treino: 29


                                                                                

Número de registros na base de treino: 30


                                                                                

Número de registros na base de treino: 31


                                                                                

Número de registros na base de treino: 32


                                                                                

Número de registros na base de treino: 33


                                                                                

Número de registros na base de treino: 34


                                                                                

Número de registros na base de treino: 35


                                                                                

Número de registros na base de treino: 36


                                                                                

Número de registros na base de treino: 37


                                                                                

Número de registros na base de treino: 38


                                                                                

Número de registros na base de treino: 39
Número de registros na base de treino: 40


                                                                                

Número de registros na base de treino: 41


                                                                                

Número de registros na base de treino: 42


                                                                                

Número de registros na base de treino: 43


                                                                                

Número de registros na base de treino: 44


                                                                                

Número de registros na base de treino: 45


                                                                                

Número de registros na base de treino: 46


                                                                                

Número de registros na base de treino: 47


                                                                                

Número de registros na base de treino: 48


                                                                                

Número de registros na base de treino: 49


                                                                                

Número de registros na base de treino: 50


In [17]:
# Preview the first five metric rows as a pandas frame.
metrics.limit(5).toPandas()

                                                                                

Unnamed: 0,VP,FP,FN,VN,precision,recall,specificity,accuracy,w_nome,p_nome,w_nome_mae,p_nome_mae,w_dt_nasc,p_dt_nasc,w_sexo,p_sexo
0,422.0,15.0,138.0,425.0,0.965675,0.753571,0.965909,0.847,5.15,0.18,0.15,0.64,2.2,0.47,0.45,0.37
1,435.0,13.0,125.0,427.0,0.970982,0.776786,0.970455,0.862,3.85,0.35,5.0,0.79,4.25,0.91,4.35,0.17
2,301.0,0.0,259.0,440.0,1.0,0.5375,1.0,0.741,5.15,0.65,0.55,0.3,1.0,0.97,4.4,0.92
3,431.0,14.0,129.0,426.0,0.968539,0.769643,0.968182,0.857,1.7,0.64,3.65,0.76,0.7,0.52,3.85,0.83
4,423.0,15.0,137.0,425.0,0.965753,0.755357,0.965909,0.848,3.95,0.45,2.75,0.34,0.95,0.28,0.85,0.22


In [18]:
# Persist the metrics table as Parquet on HDFS, replacing any previous output.
metrics.write.mode("overwrite").parquet("hdfs://barravento:9000/data/result/train_dataset_final.parquet")

                                                                                

In [19]:
# Also export the metrics as CSV with a header row, written from a single
# partition, replacing any previous output.
(
    metrics.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("hdfs://barravento:9000/data/result/train_dataset_final.csv")
)

                                                                                ]