In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
#import data_transform_utilities.flatten as flatten
#from data_transform_utilities.text_parsers import  clean_str, extract_json, json_str_to_array, normalize_and_tokenize_text
#import matplotlib.pyplot as plt

import re
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

import pysolr
from sqlalchemy import create_engine
import json
import requests

from sqlalchemy import create_engine, update, Table, MetaData
from sqlalchemy.orm import sessionmaker

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np

In [None]:
# Spark settings for this notebook's session. The MySQL Connector/J package is
# listed so the JDBC reads in the following cells can use its driver class.
spark_conf = (
    SparkConf()
    .set("spark.cores", "12")
    .set("spark.driver.cores", "12")
    .set("spark.speculation", False)
    .set("spark.jars.packages", "com.mysql:mysql-connector-j:9.2.0")
)

# Reuse a running session when there is one, otherwise start a local one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Decision data overview")
    .config(conf=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# SQLAlchemy engine for the local MySQL "decision" database.
# NOTE(review): the URL embeds the user name and password in plain text —
# consider reading the connection string from the environment instead.
engine = create_engine("mysql+pymysql://decision:1234@localhost/decision?charset=utf8")
# Look-back window in days; interpolated into the JDBC subqueries that load
# vacancies and applicants.
days_to_read = 1800

## Carrega os dados de vagas

In [None]:
# Expose the recent vacancies as the temp view "vacancies". The date filter is
# part of the subquery handed to the JDBC source.
(
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8")
    .option("dbtable", f"(SELECT * FROM vacancies WHERE requested_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .load()
    .createOrReplaceTempView("vacancies")
)

# Carrega os dados de candidatos

In [None]:
# Expose the recent applicants as the temp view "applicants". The date filter
# is part of the subquery handed to the JDBC source.
(
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8")
    .option("dbtable", f"(SELECT * FROM applicants WHERE created_at > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .load()
    .createOrReplaceTempView("applicants")
)

# Carrega o modelo

In [None]:
# Version label that the SQL queries further down attach to every row as the
# `model_version` column. The next cell assigns the same value again.
model_version = "0.0.1"

In [None]:
# Load the model and tokenizer
model_version = "0.0.1"
#model_name = "neuralmind/bert-base-portuguese-cased"
# Paths handed to from_pretrained (presumably local directories one level
# above the notebook's working directory — confirm they exist before running).
model_name = "../trained_model_bert_20250508"
tokenizer_name = "../tokenizer_model_bert_20250508" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModel.from_pretrained(model_name)
# NOTE(review): the name `model` is bound again by the MLflow load cell later
# in the notebook, so this instance is only current until that cell runs.
model.eval()

# Novo modelo com SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
import mlflow
import mlflow.sentence_transformers
from sentence_transformers import SentenceTransformer, InputExample, models, losses, evaluation

In [None]:
# Name of the registered model and address of the MLflow tracking server.
MODEL_NAME = 'applicant_job_similarity'
MLFLOW_TRACKING_URI = 'http://192.168.101.186:5000'
# Point the mlflow client at that server for the calls that follow.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# Load a fixed version of the registered model from the MLflow registry.
# The URI is built from MODEL_NAME so the registered name is spelled in one
# place only; it resolves to the same "models:/applicant_job_similarity/21".
# NOTE(review): this registry version (21) is unrelated to the "0.0.1" label
# stored in `model_version` — confirm which one should be recorded per vector.
MODEL_REGISTRY_VERSION = 21
model = mlflow.sentence_transformers.load_model(f"models:/{MODEL_NAME}/{MODEL_REGISTRY_VERSION}")
#model = models.Transformer("neuralmind/bert-base-portuguese-cased")

In [None]:
# Embedding width of the loaded model. The Qdrant collections created further
# down are declared with size=768, so this value is expected to match.
model.get_sentence_embedding_dimension()

In [None]:
#model = SentenceTransformer("./modelo_finetuned_afinidade")

In [None]:
# Smoke test: encode one short job title, asking for a normalized embedding.
emb_vaga = model.encode("Vaga: Desenvolvedor Python", normalize_embeddings=True)

In [None]:
# Display the embedding computed in the previous cell.
emb_vaga

# Função de embedding

In [None]:
# Mean-pooled sequence embedding
def get_embedding(text, model, tokenizer):
    """Return the attention-masked mean of the model's last hidden states.

    Args:
        text: Input passed to `tokenizer` (truncated to 512 tokens).
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a `last_hidden_state` tensor.
        tokenizer: Callable producing a mapping that includes "attention_mask".

    Returns:
        The pooled embedding, squeezed and converted to a numpy array.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Broadcast the mask over the hidden dimension so padded positions add 0.
    weights = encoded["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
    # Clamp the token count to avoid dividing by zero on an all-padding row.
    token_count = torch.clamp(weights.sum(1), min=1e-9)
    pooled = torch.sum(hidden_states * weights, 1) / token_count
    return pooled.squeeze().numpy()

In [None]:
def get_embedding2(text, model):
    """Encode `text` through `model.encode`, requesting normalized embeddings.

    Args:
        text: Value forwarded unchanged to `model.encode`.
        model: Object exposing `encode(text, normalize_embeddings=...)`.

    Returns:
        Whatever `model.encode` returns for this input.
    """
    embedding = model.encode(text, normalize_embeddings=True)
    return embedding

# Cria client do banco de Vetores

In [None]:
# Qdrant client bound to localhost:6333; the collection, upsert, retrieve and
# query cells below all go through this object.
client = QdrantClient(host="localhost", port=6333)

# Cria as coleções no Qdrant

In [None]:
# Make sure both collections exist. Each one stores 768-dimensional vectors
# compared with cosine distance; existing collections are left untouched.
for _collection in ("applicants", "vacancies"):
    if client.collection_exists(collection_name=_collection):
        continue
    client.create_collection(
        collection_name=_collection,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

# Inicia a inserção no banco de Vetores

In [None]:
def insert_into_db(c, collection_name):
    """Upsert a single record into a Qdrant collection.

    Args:
        c: Dict with the keys "id", "embeddings" and "description". An
            optional "model_version" key is stored in the payload when set.
        collection_name: Name of the target Qdrant collection.

    The point is written through the notebook-level `client`; nothing is
    returned.
    """
    vector = c["embeddings"]
    # The encoders in this notebook return numpy arrays; hand Qdrant a plain
    # list of Python floats instead of the array object.
    if hasattr(vector, "tolist"):
        vector = vector.tolist()

    payload = {"description": c["description"]}
    # The caller passes the model version alongside the embedding; keep it on
    # the point so vectors from different model versions can be told apart.
    if c.get("model_version") is not None:
        payload["model_version"] = c["model_version"]

    client.upsert(
        collection_name=collection_name,
        points=[PointStruct(id=c["id"], vector=vector, payload=payload)],
    )

In [None]:
def insert_batch(batch, collection_name, model, batch_size=64):
    """Embed every row of `batch` and upsert it into `collection_name`.

    Args:
        batch: Iterable of rows exposing `id`, `description` and
            `model_version` attributes.
        collection_name: Name of the target Qdrant collection.
        model: Encoder forwarded to `get_embedding2`.
        batch_size: Number of descriptions sent to the encoder per call.

    Descriptions are encoded in chunks rather than one call per row; each row
    is then written with `insert_into_db`. Nothing is returned.
    """
    rows = list(batch)
    for start in range(0, len(rows), batch_size):
        chunk = rows[start:start + batch_size]
        # One encoder call per chunk: `get_embedding2` forwards the list to
        # `model.encode`, which yields one embedding per input text.
        vectors = get_embedding2([row.description for row in chunk], model)
        for row, vector in zip(chunk, vectors):
            insert_into_db(
                {
                    "id": row.id,
                    "description": row.description,
                    "embeddings": vector,
                    "model_version": row.model_version,
                },
                collection_name,
            )

In [None]:
# Build one text description per vacancy and collect the rows to the driver.
# The emptiness filter looks at the source columns only: the fixed labels
# ('país: ', 'estado: ', 'cidade: ') are always part of `description`, so a
# check on `description` itself can never reject a row.
vacancies = spark.sql(f"""
    SELECT
        id,
        description,
        dt,
        model_version
    FROM
        (SELECT
            id,
            CONCAT(
                COALESCE(main_activities, ''), '\n',
                COALESCE(technical_and_behavioral_skills, ''), '\n',
                COALESCE(behavioral_skills, ''), '\n',
                'país: ', COALESCE(country, ''), '\n',
                'estado: ', COALESCE(state, ''), '\n',
                'cidade: ', COALESCE(city, ''), '\n'
            ) AS description,
            CONCAT(
                COALESCE(main_activities, ''),
                COALESCE(technical_and_behavioral_skills, ''),
                COALESCE(behavioral_skills, ''),
                COALESCE(country, ''),
                COALESCE(state, ''),
                COALESCE(city, '')
            ) AS source_text,
            CURRENT_DATE() AS dt,
            '{model_version}' AS model_version
        FROM
            vacancies v
        ORDER BY id DESC
        ) AS t
    WHERE LENGTH(TRIM(REGEXP_REPLACE(source_text, '\n', ''))) > 0
    -- LIMIT 500
""").collect()



In [None]:
# Embed every collected vacancy row and upsert it into the "vacancies" collection.
insert_batch(vacancies, "vacancies", model)

In [None]:
# Build one text description per applicant and collect the rows to the driver.
# The emptiness filter looks at the source columns only: the fixed label
# 'Endereço: ' is always part of `description`, so a check on `description`
# itself can never reject a row.
applicants = spark.sql(f"""
    SELECT
        id,
        description,
        dt,
        model_version
    FROM
        (SELECT
            id,
            CONCAT(
                COALESCE(technical_knowledge, ''), '\n',
                COALESCE(cv_pt, ''), '\n',
                'Endereço: ', COALESCE(location, '')
            ) AS description,
            CONCAT(
                COALESCE(technical_knowledge, ''),
                COALESCE(cv_pt, ''),
                COALESCE(location, '')
            ) AS source_text,
            CURRENT_DATE() AS dt,
            '{model_version}' AS model_version
        FROM
            applicants a
        ORDER BY id DESC
        ) AS t
    WHERE LENGTH(TRIM(REGEXP_REPLACE(source_text, '\n', ''))) > 0
    -- LIMIT 500

""").collect()

In [None]:
# Embed every collected applicant row and upsert it into the "applicants" collection.
insert_batch(applicants, "applicants", model)

In [None]:
from qdrant_client.models import ExtendedPointId

In [None]:
# Fetch the stored vacancy point with id 14153, vector included, to use as the
# search query in the cells below.
# NOTE(review): indexing with [0] assumes the id exists in the collection; an
# empty result list would raise IndexError here.
ponto_a = client.retrieve(
    collection_name="vacancies",
    ids=[14153],
    with_vectors=True
)[0]

In [None]:
# Show the description text stored with the retrieved vacancy.
print(ponto_a.payload["description"])

In [None]:
# Search the "applicants" collection with the vacancy's vector and keep the
# three best matches.
resultados = client.query_points(
    collection_name="applicants",
    query=ponto_a.vector,  # vector of the vacancy retrieved above
    limit=3
)

# Display results
for r in resultados.points:
    print(r.payload["description"])
    print("----------------------------------------------------------\n")
    # NOTE(review): the disabled line below reads the payload key 'descricao',
    # while points in this notebook are stored under "description".
    #print(f"ID: {r.id} | Score: {r.score:.4f} | Descrição: {r.payload['descricao']}")