In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
#import data_transform_utilities.flatten as flatten
#from data_transform_utilities.text_parsers import  clean_str, extract_json, json_str_to_array, normalize_and_tokenize_text
#import matplotlib.pyplot as plt

import re
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

import pysolr
from sqlalchemy import create_engine
import json
import requests

from sqlalchemy import create_engine, update, Table, MetaData
from sqlalchemy.orm import sessionmaker

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, MultiVectorConfig, MultiVectorComparator, NamedVectorStruct, NamedVector


import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, models
import numpy as np

from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
import mlflow
import mlflow.sentence_transformers
from sentence_transformers import SentenceTransformer, InputExample, models, losses, evaluation

In [None]:
# Number of local worker threads the Spark session is allowed to use.
SPARK_LOCAL_CORES = 12

spark_conf = SparkConf()
# NOTE(review): "spark.cores" is kept for backward compatibility, but it does not
# appear among Spark's documented properties — confirm whether it is needed.
spark_conf.set("spark.cores", str(SPARK_LOCAL_CORES))
spark_conf.set("spark.driver.cores", str(SPARK_LOCAL_CORES))
spark_conf.set("spark.speculation", False)
# The MySQL JDBC driver is pulled in as a package so spark.read.jdbc can reach the database.
spark_conf.set("spark.jars.packages", "com.mysql:mysql-connector-j:9.2.0")

# A plain "local" master runs everything on one worker thread, which made the
# core settings above ineffective; "local[N]" actually requests N threads.
spark = (
    SparkSession.builder
    .master(f"local[{SPARK_LOCAL_CORES}]")
    .appName("Decision data overview")
    .config(conf=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
import os
from urllib.parse import quote_plus

# Database settings are taken from the environment so credentials do not have to
# live in the notebook. The defaults reproduce the URL that was previously hard-coded.
# NOTE(review): the spark.read.jdbc cells further down still embed the credentials
# in their JDBC URLs.
DB_USER = os.environ.get("DECISION_DB_USER", "decision")
DB_PASSWORD = os.environ.get("DECISION_DB_PASSWORD", "1234")
DB_HOST = os.environ.get("DECISION_DB_HOST", "localhost")
DB_NAME = os.environ.get("DECISION_DB_NAME", "decision")

# quote_plus keeps special characters in the user/password from breaking the URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}/{DB_NAME}?charset=utf8"
)

# Look-back window (in days) used by the JDBC queries that load the source tables.
days_to_read = 3000

# Carrega os dados de vagas

In [None]:
# Expose the vacancies requested within the last `days_to_read` days as the
# temp view "vacancies"; the date filter is pushed to MySQL through the subquery.
(
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8")
    .option(
        "dbtable",
        f"(SELECT * FROM vacancies WHERE requested_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t",
    )
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .load()
    .createOrReplaceTempView("vacancies")
)

# Carrega os dados de candidatos

In [None]:
# Expose the applicants created within the last `days_to_read` days as the
# temp view "applicants"; the date filter is pushed to MySQL through the subquery.
(
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8")
    .option(
        "dbtable",
        f"(SELECT * FROM applicants WHERE created_at > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t",
    )
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .load()
    .createOrReplaceTempView("applicants")
)

In [None]:
# Expose the vacancy/applicant links updated within the last `days_to_read` days
# as the temp view "vacancies_applicants".
(
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8")
    .option(
        "dbtable",
        f"(SELECT * FROM vacancies_applicants WHERE last_update > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t",
    )
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .load()
    .createOrReplaceTempView("vacancies_applicants")
)

# Carrega o modelo

In [None]:
# Version tag that the Spark SQL queries below interpolate into their
# `model_version` column.
model_version = "0.0.1"

In [None]:
# Carrega modelo e tokenizer
#model_version = "0.0.1"
#model_name = "neuralmind/bert-base-portuguese-cased"
##model_name = "../trained_model_bert_20250508"
#tokenizer_name = model_name #"../tokenizer_model_bert_20250508" 
#tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
#model = AutoModel.from_pretrained(model_name)
#model.eval()

In [None]:
# Baseline sentence encoder assembled by hand: Portuguese BERT + mean pooling.
# NOTE: `model` is rebound a few cells below by the MLflow load, so this
# instance is only in effect if that later cell is skipped.

# Token-level encoder, truncating inputs at 512 tokens.
word_embedding_model = models.Transformer(
    model_name_or_path='neuralmind/bert-base-portuguese-cased',
    max_seq_length=512,
)

# Sentence vector = mean of the token embeddings (CLS and max pooling disabled).
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain both modules into a single SentenceTransformer pipeline.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Novo modelo com SentenceTransformers (carregado do MLflow)

In [None]:
# Name of the registered model in the MLflow registry.
MODEL_NAME = 'applicant_job_similarity'
# MLflow tracking server used by this notebook.
# NOTE(review): hard-coded private address — consider reading it from the environment.
MLFLOW_TRACKING_URI = 'http://192.168.101.186:5000'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# Registry version of the fine-tuned model to load.
# NOTE(review): this is independent of the `model_version` tag ("0.0.1") stamped
# on the stored rows — confirm the two are meant to differ.
MODEL_REGISTRY_VERSION = 21

# Build the registry URI from MODEL_NAME instead of repeating the name literally;
# this rebinds `model`, replacing the hand-assembled baseline from the earlier cell.
model = mlflow.sentence_transformers.load_model(f"models:/{MODEL_NAME}/{MODEL_REGISTRY_VERSION}")
#model = models.Transformer("neuralmind/bert-base-portuguese-cased")

In [None]:
# Smoke test: encode one sample sentence to confirm the loaded model works.
# The sample text previously misspelled "Python" as "Pyhton".
emb_vaga = model.encode("Vaga: Desenvolvedor Python", normalize_embeddings=True)

# Função de embedding

In [None]:
# Função para gerar embedding médio da sequência
#def get_embedding(text, model, tokenizer):
#    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#    with torch.no_grad():
#        outputs = model(**inputs)
#    # Média dos embeddings dos tokens (ignorando padding)
#    attention_mask = inputs["attention_mask"]
#    embeddings = outputs.last_hidden_state
#    mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
#    sum_embeddings = torch.sum(embeddings * mask_expanded, 1)
#    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
#    mean_embedding = sum_embeddings / sum_mask
#    return mean_embedding.squeeze().numpy()

In [None]:
def get_embedding(text, model):
    """Encode `text` with `model` and return the normalized embedding.

    Args:
        text: Input forwarded unchanged to ``model.encode``.
        model: Any object exposing ``encode(text, normalize_embeddings=...)``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``normalize_embeddings=True``.
    """
    embedding = model.encode(text, normalize_embeddings=True)
    return embedding

# Cria client do banco de Vetores

In [None]:
# Client for the Qdrant instance at localhost:6333; every collection operation
# in this notebook goes through it.
client = QdrantClient(host="localhost", port=6333)

# Cria as coleções no Qdrant

In [None]:
# "applicants" and "vacancies" share one layout: three named vectors
# (title / description / location), each 768-dimensional with cosine distance.
# Collections that already exist are left untouched.
for named_collection in ("applicants", "vacancies"):
    if client.collection_exists(collection_name=named_collection):
        continue
    client.create_collection(
        collection_name=named_collection,
        vectors_config={
            vector_name: VectorParams(size=768, distance=Distance.COSINE)
            for vector_name in ("title", "description", "location")
        },
    )

# "job_titles" stores a single unnamed 768-dimensional cosine vector per point.
job_titles_exists = client.collection_exists(collection_name="job_titles")
if not job_titles_exists:
    client.create_collection(
        collection_name="job_titles",
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

# Inicia a inserção no banco de Vetores

In [None]:
def insert_into_db(c, collection_name):
    """Upsert one record with its three named vectors into a Qdrant collection.

    Args:
        c: Mapping with the keys ``id``, ``title``, ``description``, ``location``,
            ``title_embeddings``, ``description_embeddings`` and
            ``location_embeddings``. An optional ``model_version`` key is stored
            in the payload when present (it used to be silently discarded).
        collection_name: Target collection; it must define the named vectors
            "title", "description" and "location".
    """

    def _as_plain_list(vector):
        # Objects exposing tolist() (e.g. numpy arrays) are converted so the
        # point carries plain Python lists; anything else is passed through.
        return vector.tolist() if hasattr(vector, "tolist") else vector

    payload = {"title": c["title"], "description": c["description"], "location": c["location"]}
    # Keep track of which model produced the vectors, when the caller says so.
    model_version = c.get("model_version") if hasattr(c, "get") else None
    if model_version is not None:
        payload["model_version"] = model_version

    client.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=c["id"],
                vector={
                    "title": _as_plain_list(c["title_embeddings"]),
                    "description": _as_plain_list(c["description_embeddings"]),
                    "location": _as_plain_list(c["location_embeddings"]),
                },
                payload=payload,
            )
        ],
    )

In [None]:
def insert_batch(batch, collection_name, model):
    """Embed every row of `batch` and upsert it, one point at a time.

    Args:
        batch: Iterable of rows exposing the attributes ``id``, ``title``,
            ``description``, ``location`` and ``model_version``.
        collection_name: Qdrant collection passed on to ``insert_into_db``.
        model: Encoder passed on to ``get_embedding``.
    """
    for row in batch:
        record = {
            "id": row.id,
            "title": row.title,
            "description": row.description,
            "location": row.location,
            # One embedding per text field, computed in this fixed order.
            "title_embeddings": get_embedding(row.title, model),
            "description_embeddings": get_embedding(row.description, model),
            "location_embeddings": get_embedding(row.location, model),
            "model_version": row.model_version,
        }
        insert_into_db(record, collection_name)

In [43]:
# Build one row per vacancy that has at least one linked applicant, with the three
# lower-cased text fields that get embedded (description, title, location).
# Digits are stripped from the title; rows whose title or description end up
# empty are dropped by the outer WHERE.
# NOTE(review): `location` always contains the literal labels ('país: ', ...),
# so the outer location filter can never exclude a row.
vacancies_query = f"""
    SELECT
        *
    FROM
        (SELECT
            id,
            LOWER(
                CONCAT(
                    COALESCE(main_activities, ''), '\n', 
                    COALESCE(technical_and_behavioral_skills, ''), '\n',
                    COALESCE(behavioral_skills, ''), '\n'
                )
            ) AS description,
            
            TRIM(REGEXP_REPLACE(LOWER(title), '[0-9]+','')) as title,
            
            TRIM(LOWER(CONCAT(
                'país: ', COALESCE(country, ''), '\n',
                'estado: ', COALESCE(state, ''), '\n',
                'cidade: ', COALESCE(city, ''), '\n'
            ))) AS location,
            
            CURRENT_DATE() AS dt,
            '{model_version}' AS model_version
        FROM 
            vacancies v
        WHERE 
            v.id IN (SELECT vacancy_id FROM vacancies_applicants group by 1)
        ORDER BY id DESC
        ) AS t
    WHERE 
        LENGTH(TRIM(REGEXP_REPLACE(title, '\n', ''))) > 0
        AND LENGTH(TRIM(REGEXP_REPLACE(description, '\n', ''))) > 0
        AND LENGTH(TRIM(REGEXP_REPLACE(location, '\n', ''))) > 0
    -- LIMIT 500
"""

# Materialize the rows on the driver, where the embeddings are computed.
vacancies = spark.sql(vacancies_query).collect()

                                                                                

In [None]:
# Embed every collected vacancy row and upsert it into the "vacancies" collection.
insert_batch(vacancies, "vacancies", model)

In [41]:
# Build one row per applicant that is linked to at least one vacancy, with the
# lower-cased text fields that get embedded (description, title, location).
# Digits are stripped from the professional title; rows whose title or location
# end up empty (or NULL) are dropped by the outer WHERE.
# NOTE(review): `description` always contains the literal label 'Endereço: ',
# so the outer description filter can never exclude a row.
applicants_query = f"""
    SELECT
        *
    FROM
        (SELECT
            id,
            LOWER(CONCAT(
                COALESCE(technical_knowledge, ''), '\n',
                COALESCE(cv_pt, ''), '\n',
                'Endereço: ', COALESCE(location, '')
            )) AS description,
            TRIM(REGEXP_REPLACE(LOWER(professional_title), '[0-9]+','')) as title,
            CURRENT_DATE() AS dt,
            '{model_version}' AS model_version,
            LOWER(TRIM(location)) AS location
        FROM 
            applicants a
        WHERE 
            a.id IN (SELECT applicant_id FROM vacancies_applicants group by 1)
        ORDER BY id
        ) AS t
    WHERE 
        LENGTH(TRIM(REGEXP_REPLACE(description, '\n', ''))) > 0
        AND LENGTH(TRIM(REGEXP_REPLACE(title, '\n', ''))) > 0
        AND LENGTH(TRIM(REGEXP_REPLACE(location, '\n', ''))) > 0
"""

# Materialize the rows on the driver, where the embeddings are computed.
applicants = spark.sql(applicants_query).collect()

                                                                                

In [42]:
# Embed every collected applicant row and upsert it into the "applicants" collection.
insert_batch(applicants, "applicants", model)

In [None]:
# One row per applicant with a non-empty professional title (lower-cased, digits
# removed), exposed under the column name `description`. The result stays a
# lazy DataFrame here; it is collected by the cell that inserts it.
job_titles_query = f"""
    SELECT
        *
    FROM
        (SELECT
            id,
            TRIM(REGEXP_REPLACE(LOWER(professional_title), '[0-9]+','')) as description,
            CURRENT_DATE() AS dt,
            '{model_version}' AS model_version
        FROM
            applicants
        order by id
        ) AS t
    WHERE
        LENGTH(description) > 0
"""

job_titles = spark.sql(job_titles_query)

In [None]:
# The job_titles rows only carry id / description / dt / model_version, and the
# "job_titles" collection stores a single unnamed vector per point. insert_batch
# reads row.title / row.location and writes named vectors, so it cannot be used
# here; the points are built and upserted directly instead.
JOB_TITLES_CHUNK_SIZE = 256  # points sent per upsert request

job_title_rows = job_titles.collect()
for chunk_start in range(0, len(job_title_rows), JOB_TITLES_CHUNK_SIZE):
    chunk = job_title_rows[chunk_start:chunk_start + JOB_TITLES_CHUNK_SIZE]
    # Encode the whole chunk in one call; the result has one vector per input text.
    chunk_vectors = get_embedding([row.description for row in chunk], model)
    client.upsert(
        collection_name="job_titles",
        points=[
            PointStruct(
                id=row.id,
                # Plain Python floats, whatever container type the encoder returned.
                vector=[float(value) for value in vector],
                payload={"description": row.description, "model_version": row.model_version},
            )
            for row, vector in zip(chunk, chunk_vectors)
        ],
    )

# Buscas no Qdrant

In [None]:
from qdrant_client.models import ExtendedPointId

In [None]:
# Fetch applicant 47079 together with its stored vectors; it serves as the
# reference point for the weighted search below.
# The [0] raises IndexError if the lookup returns no point for this id.
point = client.retrieve(collection_name="applicants", ids=[47079], with_vectors=True)[0]

In [None]:
collection_name = "applicants"
# Relative importance of each named vector in the combined score.
weights = {
    "title": 0.5,
    "description": 0.3,
    "location": 0.2
}

# Named vectors of the reference point, keyed by field name.
query_vectors = point.vector

# One nearest-neighbour query per field; each hit contributes score * weight.
# A point only accumulates the fields in which it made the top 10, so combined
# scores are partial sums, not full weighted averages.
scores = {}

for field, weight in weights.items():
    # query_points replaces the deprecated client.search; `using` selects the
    # named vector the query is compared against.
    results = client.query_points(
        collection_name=collection_name,
        query=query_vectors[field],
        using=field,
        limit=10,
    ).points
    for r in results:
        scores[r.id] = scores.get(r.id, 0) + r.score * weight

# (point id, combined score) pairs, best match first.
sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)

#print("Resultados combinados:")
#for point_id, score in sorted_results:
#    print(f"ID: {point_id}, Score: {score}, {}")

In [None]:
# Display the (point id, combined score) pairs, best match first.
sorted_results

In [None]:
# Fetch applicant 47024 with its stored vectors; the cells below use it as the
# query point. The [0] raises IndexError if the lookup returns no point.
ponto_a = client.retrieve(
    collection_name="applicants",
    ids=[47024],
    with_vectors=True
)[0]

In [None]:
# Show the description text stored in the payload of the query point.
print(ponto_a.payload["description"])

In [None]:
# `ponto_a.vector` holds all named vectors of the point, but a query takes a
# single vector: search by the description embedding and tell Qdrant which
# named vector to compare against via `using`.
resultados = client.query_points(
    collection_name="applicants",
    query=ponto_a.vector["description"],
    using="description",
    limit=3
)

# Print the description of each of the closest applicants.
for r in resultados.points:
    print(r.payload["description"])
    print("----------------------------------------------------------\n")
    #print(f"ID: {r.id} | Score: {r.score:.4f} | Descrição: {r.payload['descricao']}")

In [None]:
# Display the full "title" embedding of the query point (long output).
ponto_a.vector["title"]

In [None]:
from qdrant_client.http.models import QueryRequest, SearchRequest, WithPayloadInterface

In [None]:
# The "applicants" collection only defines named vectors, so the request must
# say which one the query vector belongs to; a bare vector would address an
# unnamed vector.
title_query = SearchRequest(
    vector=NamedVector(name="title", vector=ponto_a.vector["title"]),
    limit=10,
)

In [None]:
# One search request per named vector of the query point. SearchRequest takes
# the query through its `vector` field; `vector_name` / `query_vector` are not
# part of the model, and every entry previously passed the string "title" as
# the vector. NamedVector pairs each embedding with the vector it targets.
search_requests = [
    SearchRequest(
        vector=NamedVector(name=vector_name, vector=ponto_a.vector[vector_name]),
        limit=10,
    )
    for vector_name in ("title", "description", "location")
]

In [None]:


# Fusion helpers are imported here because the top-level qdrant import does not
# include them.
from qdrant_client.models import Fusion, FusionQuery, Prefetch

# Combine the three per-field searches with Distribution-Based Score Fusion.
# Each Prefetch runs one nearest-neighbour search on a named vector; the outer
# query fuses their scores (Fusion.RRF would use Reciprocal Rank Fusion instead).
# The collection is "applicants": `ponto_a` comes from it, and no collection
# called "candidatos" is created in this notebook.
results = client.query_points(
    collection_name="applicants",
    prefetch=[
        Prefetch(query=ponto_a.vector[vector_name], using=vector_name, limit=10)
        for vector_name in ("title", "description", "location")
    ],
    query=FusionQuery(fusion=Fusion.DBSF),
    with_payload=True,
    limit=10,
)

In [None]:
# Leftover inspection cell: this only evaluates the `client.search` attribute and
# calls nothing.
client.search