In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
from data_transform_utilities.text_parsers import  clean_str, extract_json, json_str_to_array, normalize_and_tokenize_text
from data_transform_utilities.score import generate_score_from_status

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

from sqlalchemy import create_engine
import json
import requests
import os
import torch

from torch.utils.data import DataLoader
import os

import mlflow
from sentence_transformers import SentenceTransformer, InputExample, models, losses, evaluation
import os
from sklearn.model_selection import train_test_split
import mlflow.pytorch
import mlflow.sentence_transformers
import mlflow.sentence_transformers
import time

## Define algumas variáveis de ambiente para poder rodar ROCm com placas de vídeo AMD

In [None]:
# Environment switches for the GPU / distributed-training stack, applied in one go.
# os.environ only stores strings, so every value is given as a string.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TORCH_DISTRIBUTED_DEBUG": "DETAIL",
        "NCCL_P2P_DISABLE": "1",
        "NCCL_SHM_DISABLE": "1",
        "NCCL_DEBUG": "INFO",
    }
)

In [None]:
# Quick accelerator sanity check: whether torch reports a usable device, followed by
# the HIP version string exposed by the installed torch build.
gpu_available = torch.cuda.is_available()
hip_version = torch.version.hip
print(gpu_available)
print(hip_version)

In [None]:
# Number of worker threads for the local Spark master.
SPARK_LOCAL_CORES = 12

spark_conf = SparkConf()
# NOTE(review): "spark.cores" does not appear to be a documented Spark property; it is
# kept so the session configuration stays a superset of the previous one. Confirm and remove.
spark_conf.set("spark.cores", str(SPARK_LOCAL_CORES))
spark_conf.set("spark.driver.cores", str(SPARK_LOCAL_CORES))
#spark_conf.set("spark.driver.memory", "16g")
# SparkConf values are strings; pass the boolean in the form Spark documents.
spark_conf.set("spark.speculation", "false")
# Pulls the MySQL JDBC driver used by the spark.read.jdbc cells.
spark_conf.set("spark.jars.packages", "com.mysql:mysql-connector-j:9.2.0")

# In local mode the parallelism comes from the master URL: a bare "local" master runs a
# single worker thread, so the core count has to be requested as local[N].
spark = (
    SparkSession.builder
    .master(f"local[{SPARK_LOCAL_CORES}]")
    .appName("Decision data overview")
    .config(conf=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Extrai os dados do banco Relacional para Treinamento do Modelo e Embeddings

## Cria conexão com o banco relacional

In [None]:
# Connection settings are read from the environment when present, so real credentials do
# not have to be stored in the notebook. The fallbacks reproduce the local development
# database this notebook was written against.
# NOTE: a password containing URL-special characters (e.g. "@", "/", ":") must be
# percent-encoded before being placed in DECISION_DB_PASSWORD.
db_user = os.environ.get("DECISION_DB_USER", "decision")
db_password = os.environ.get("DECISION_DB_PASSWORD", "1234")
db_host = os.environ.get("DECISION_DB_HOST", "localhost")
db_name = os.environ.get("DECISION_DB_NAME", "decision")

engine = create_engine(f"mysql+pymysql://{db_user}:{db_password}@{db_host}/{db_name}?charset=utf8")

# Look-back window, in days, interpolated into the extraction queries.
days_to_read = 1800

# Registra UDFs

In [None]:
# Make the Python helpers callable from Spark SQL under the names used by the queries.
# clean_str is registered without an explicit return type (Spark's default applies).
spark.udf.register(name="clean_str", f=clean_str)
spark.udf.register(
    name="generate_score_from_status",
    f=generate_score_from_status,
    returnType=FloatType(),
)

## Carrega os dados de vagas

In [None]:
# The date filter is wrapped as a derived table so it is evaluated by MySQL and only the
# rows inside the look-back window are transferred.
vacancies_query = f"(SELECT * FROM vacancies WHERE requested_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t"

vacancies_sdf = spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=vacancies_query,
    properties={"driver": "com.mysql.cj.jdbc.Driver"},
)
# Expose the result to Spark SQL as the "vacancies" view.
vacancies_sdf.createOrReplaceTempView("vacancies")

## Carrega os dados de candidatos

In [None]:
# The date filter is wrapped as a derived table so it is evaluated by MySQL and only the
# rows inside the look-back window are transferred.
applicants_query = f"(SELECT * FROM applicants WHERE created_at > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t"

applicants_sdf = spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=applicants_query,
    properties={"driver": "com.mysql.cj.jdbc.Driver"},
)
# Expose the result to Spark SQL as the "applicants" view.
applicants_sdf.createOrReplaceTempView("applicants")

## Carrega os dados de candidatos que se candidataram a uma vaga

In [None]:
# The date filter is wrapped as a derived table so it is evaluated by MySQL and only the
# rows inside the look-back window are transferred.
vacancies_applicants_query = f"(SELECT * FROM vacancies_applicants WHERE application_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t"

vacancies_applicants_sdf = spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=vacancies_applicants_query,
    properties={"driver": "com.mysql.cj.jdbc.Driver"},
)
# Expose the result to Spark SQL as the "vacancies_applicants" view.
vacancies_applicants_sdf.createOrReplaceTempView("vacancies_applicants")

## Extrai apenas os campos textuais mais significativos para os datasets

In [None]:
# One row per application, joined to its vacancy and applicant, keeping only the
# lower-cased text fields (NULLs replaced by '') plus the score derived from the status.
# The IN-subquery restricts the set to vacancies whose applications show at least five
# distinct statuses.
# NOTE(review): `status` is unqualified in the SELECT list; presumably it resolves to
# vacancies_applicants.status (the subquery uses va.status) - confirm.
# The query has no placeholders, so a plain (non-f) string is used.
tmp_data_sql = """
    SELECT
        va.vacancy_id,
        va.applicant_id,
        LOWER(TRIM(clean_str(v.title))) as vacancy_title,
        LOWER(COALESCE(v.country, '')) AS country,
        LOWER(COALESCE(v.city, '')) AS city,
        LOWER(COALESCE(v.state, '')) AS state,
        LOWER(COALESCE(v.main_activities, '')) AS main_activities,
        LOWER(COALESCE(v.behavioral_skills, '')) AS behavioral_skills,
        LOWER(COALESCE(v.technical_and_behavioral_skills, '')) AS technical_and_behavioral_skills,
        LOWER(COALESCE(a.location, '')) AS applicant_location,
        LOWER(COALESCE(a.professional_title, '')) AS professional_title,
        LOWER(COALESCE(a.technical_knowledge, '')) AS technical_knowledge,
        LOWER(COALESCE(a.cv_pt,'')) AS cv_pt,
        LOWER(COALESCE(a.area_of_expertise,'')) AS area_of_expertise,
        generate_score_from_status(status) AS artificial_score
    FROM
        vacancies_applicants va 
        LEFT JOIN vacancies v ON v.id = va.vacancy_id 
        LEFT JOIN applicants a ON a.id = va.applicant_id
    WHERE
        va.vacancy_id IN
            (SELECT
                va.vacancy_id
            FROM
                vacancies_applicants va
            GROUP BY 1
            HAVING COUNT(DISTINCT va.status) >= 5)          
"""

tmp_data_sdf = spark.sql(tmp_data_sql)
tmp_data_sdf.createOrReplaceTempView("tmp_data")

In [None]:
# Builds the final training rows from tmp_data:
#   - vacancy_description: main_activities alone when it equals
#     technical_and_behavioral_skills, otherwise the three skill/activity fields joined
#     by newlines, then passed through CLEAN_STR;
#   - applicant_description: technical knowledge, CV and area of expertise joined by newlines.
# Rows with empty titles/locations, descriptions of 150 characters or fewer, or a NULL
# score are discarded.
# The string is deliberately not raw: '\n' becomes a real newline inside the SQL literal.
train_sql = """
    SELECT
        *
    FROM
        (SELECT
            vacancy_id,
            CLEAN_STR(
                if(main_activities = technical_and_behavioral_skills,
                main_activities
                ,
                CONCAT(
                    main_activities, '\n', 
                    technical_and_behavioral_skills, '\n',
                    behavioral_skills
                )
            )) AS vacancy_description,
            
            vacancy_title,
            CONCAT( state, ', ', city) AS vacancy_location,
            
            applicant_id,
            CLEAN_STR(professional_title) AS applicant_title,
            TRIM(CLEAN_STR(CONCAT(technical_knowledge, '\n', cv_pt, '\n', area_of_expertise))) AS applicant_description,
            applicant_location,
            artificial_score
        FROM 
            tmp_data v
        ) AS t
    WHERE
        LENGTH(vacancy_title) > 0
        AND LENGTH(vacancy_description) > 150
        AND LENGTH(vacancy_location) > 0
        AND LENGTH(applicant_title) > 0
        AND LENGTH(applicant_description) > 150
        AND LENGTH(applicant_location) > 0
        AND artificial_score IS NOT NULL
    
"""

train_df = spark.sql(train_sql)

In [None]:
# Collect the Spark query result into a pandas DataFrame; the rest of the notebook
# works on this in-memory copy.
df = train_df.toPandas()

In [None]:
# Display the collected frame for a visual check.
# NOTE(review): the rows include applicant CV text; clear this output before sharing.
df

# Formatando os dados para o SentenceTransformer

In [None]:
# Check that every label is normalised to the [0, 1] range before building the examples.
# (NaN scores also fail this check, since between() is False for NaN.)
assert df['artificial_score'].between(0.0, 1.0).all(), "Coluna 'artificial_score' deve estar entre 0 e 1"


In [None]:
def _row_to_example(row):
    """Build one InputExample pairing a vacancy's texts with an applicant's texts.

    `row` is a named tuple produced by `df.itertuples()`; the label is the row's
    `artificial_score` converted to a plain float.
    """
    vacancy_texts = [row.vacancy_title, row.vacancy_description, row.vacancy_location]
    applicant_texts = [row.applicant_title, row.applicant_description, row.applicant_location]
    return InputExample(texts=[vacancy_texts, applicant_texts], label=float(row.artificial_score))


# NOTE(review): each side is passed as a list of three strings rather than one string.
# sentence-transformers' Transformer.tokenize appears to treat list/tuple items as
# (text_a, text_b) pairs, in which case the third element (location) would be ignored.
# Confirm against the installed version and the inference code before changing the format.
input_examples = list(map(_row_to_example, df.itertuples()))

In [None]:
# Hold out 30% of the examples for evaluation; the fixed seed keeps the split repeatable.
train_samples, test_samples = train_test_split(
    input_examples,
    test_size=0.3,
    random_state=1,
)

# Definição do Modelo

In [None]:
# Base checkpoint for the encoder. Alternatives that were tried / can be swapped in:
#   "google-bert/bert-base-uncased"
#   "google-bert/bert-base-multilingual-cased"
#   "google-bert/bert-base-multilingual-uncased"
model_name = "neuralmind/bert-base-portuguese-cased"

# Token-level encoder followed by mean pooling over its word embeddings.
bert = models.Transformer(model_name)
embedding_dim = bert.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode="mean")

model = SentenceTransformer(modules=[bert, pooling])

# Hiperparâmetros

In [None]:
batch_size = 30
epochs = 4

# Steps per epoch are derived from a fixed sample budget, not from the size of the
# training set.
samples_per_epoch = 1000
steps_per_epoch = samples_per_epoch // batch_size
total_steps = epochs * steps_per_epoch

# Fractions of the whole schedule, truncated to whole steps.
warmup_steps = int(0.1 * total_steps)
# NOTE(review): the training cell passes steps_per_epoch as its evaluation interval,
# so this value looks unused - confirm which one is intended.
evaluation_steps = int(0.2 * total_steps)

In [None]:
# Shuffled mini-batches over the training split.
train_dataloader = DataLoader(dataset=train_samples, batch_size=batch_size, shuffle=True)
# Loss that compares the cosine similarity of the two embeddings with the example label.
train_loss = losses.CosineSimilarityLoss(model=model)

# Directory the fit call writes the model to; created up front (no error if it exists).
output_dir = "./trained_model"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Name under which the trained model is registered in the MLflow model registry.
MODEL_NAME = 'applicant_job_similarity'
# Address of the MLflow tracking server that receives the runs.
MLFLOW_TRACKING_URI = 'http://localhost:5000'

# The tracking URI must be set before the experiment is selected.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name='Applicant Job Similarity')

# Execução do Treino e publicação do Modelo no MLFlowServer

In [None]:
def mlflow_logging_callback(score, epoch, steps):
    """Log an evaluation score to the active MLflow run as the ``val_score`` metric.

    Args:
        score: evaluation score to record.
        epoch: epoch reported by the training loop; converted to ``int`` and used
            as the metric step.
        steps: step counter supplied by the training loop; not used.
    """
    metric_step = int(epoch)
    mlflow.log_metric("val_score", score, step=metric_step)


# Similarity evaluator over the held-out examples; "val" prefixes its metric names.
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="val")

# The context manager closes the run when the block exits (including on error), so no
# explicit mlflow.end_run() call is needed inside it.
with mlflow.start_run():
    # Record the configuration first so it is kept even if training fails part-way.
    mlflow.log_param("base_model", model_name)
    mlflow.log_param("loss", "CosineSimilarityLoss")

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        warmup_steps=warmup_steps,
        output_path=output_dir,
        show_progress_bar=True,
        evaluator=evaluator,
        callback=mlflow_logging_callback,
        evaluation_steps=steps_per_epoch,
    )

    # Final evaluation of the trained model on the held-out examples.
    metrics = evaluator(model, output_path=None, epoch=-1, steps=-1)
    if isinstance(metrics, dict):
        for name, value in metrics.items():
            mlflow.log_metric(name, value)
    else:
        # Some sentence-transformers releases return a single score instead of a dict
        # of named metrics; log it under one fixed name rather than failing on .items().
        mlflow.log_metric("val_final_score", float(metrics))

    # Log the model as a run artifact and register it under MODEL_NAME.
    mlflow.sentence_transformers.log_model(model, "trained_model", registered_model_name=MODEL_NAME)