In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
#import data_transform_utilities.flatten as flatten
#from data_transform_utilities.text_parsers import  clean_str, extract_json, json_str_to_array, normalize_and_tokenize_text
#import matplotlib.pyplot as plt

import re
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

import pysolr
from sqlalchemy import create_engine
import json
import requests
import os

from sqlalchemy import create_engine, update, Table, MetaData
from sqlalchemy.orm import sessionmaker

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_SHM_DISABLE"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"

In [None]:
spark_conf = SparkConf()
spark_conf.set("spark.cores", "12")
spark_conf.set("spark.driver.cores", "12")
#spark_conf.set("spark.driver.memory", "16g")
spark_conf.set("spark.speculation", False)
spark_conf.set("spark.jars.packages", "com.mysql:mysql-connector-j:9.2.0")

spark = SparkSession \
    .builder.master("local") \
    .appName("Decision data overview") \
    .config(conf=spark_conf) \
    .enableHiveSupport() \
    .getOrCreate()

:: loading settings :: url = jar:file:/home/allan/Workspace/fiap-datathon-decision/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/allan/.ivy2/cache
The jars for the packages stored in: /home/allan/.ivy2/jars
com.mysql#mysql-connector-j added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f8a54e0b-c508-4a6e-9471-1a334283e446;1.0
	confs: [default]
	found com.mysql#mysql-connector-j;9.2.0 in central
	found com.google.protobuf#protobuf-java;4.29.0 in central
:: resolution report :: resolve 203ms :: artifacts dl 5ms
	:: modules in use:
	com.google.protobuf#protobuf-java;4.29.0 from central in [default]
	com.mysql#mysql-connector-j;9.2.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------------------

# Extrai os dados do banco Relacional para Treinamento do Modelo e Embeddings

## Cria conexão com banco relacionsl

In [9]:
engine = create_engine("mysql+pymysql://decision:1234@localhost/decision?charset=utf8")
days_to_read = 1800

## Carrega os dados de vagas

In [None]:
spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=f"(SELECT * FROM vacancies WHERE requested_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t",
    properties={"driver": "com.mysql.cj.jdbc.Driver"}
).createOrReplaceTempView("vacancies")

25/05/09 21:41:09 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


# Carrega os dados de candidatos

In [10]:
spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=f"(SELECT * FROM applicants WHERE created_at > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t",
    properties={"driver": "com.mysql.cj.jdbc.Driver"}
).createOrReplaceTempView("applicants")

## Carrega os dados de candidatos que se canditaram a uma vaga

In [12]:
spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=f"(SELECT * FROM vacancies_applicants WHERE application_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t",
    properties={"driver": "com.mysql.cj.jdbc.Driver"}
).createOrReplaceTempView("vacancies_applicants")

## Extrai apenas os campos textuais mais significativos para os datasets

In [None]:
storage_location = "../datasets/decision/"

In [None]:
spark.sql(f"""
    SELECT
        v.id AS vacancy_id,
        CONCAT(
            COALESCE(main_activities, ''), '\n', 
            COALESCE(technical_and_behavioral_skills, ''), '\n',
            COALESCE(behavioral_skills, '')
        ) AS description,
        CURRENT_DATE() AS dt
    FROM
        vacancies v
""").repartition(1) \
    .write \
    .partitionBy('dt') \
    .option('header','true') \
    .option('sep',';') \
    .mode('overwrite') \
    .csv(f'{storage_location}/export/vacancies')

In [None]:
spark.sql(f"""
    SELECT
        a.id AS applicant_id,
        CONCAT(
            COALESCE(technical_knowledge, ''), '\n',
            COALESCE(cv_pt, '')
        ) AS description,
        CURRENT_DATE() AS dt
    FROM
        applicants a
""").repartition(1). \
    write \
    .partitionBy('dt') \
    .option('header','true') \
    .option('sep',';') \
    .mode('overwrite') \
    .csv(f'{storage_location}/export/applicants')

## Cria o dataset para treino do Modelo com os dados que relaciona os melhores candidatos para as vagas

In [None]:
spark.sql(f"""
    SELECT
        *
    FROM
        (SELECT
            va.id AS id,
            CONCAT(
                COALESCE(main_activities, ''), '\n', 
                COALESCE(technical_and_behavioral_skills, ''), '\n',
                COALESCE(behavioral_skills, '')
            ) AS vacancy_description,
            CONCAT(
                COALESCE(technical_knowledge, ''), '\n',
                COALESCE(cv_pt, '')
            ) AS applicant_description,
            IF( LOWER(status) LIKE '%contratado%', 1, 0 ) AS label,
            CURRENT_DATE() AS dt
        FROM 
            vacancies_applicants va
            LEFT JOIN applicants a
                ON va.applicant_id = a.id
            LEFT JOIN vacancies v
                ON va.vacancy_id = v.id
        ) AS t
    WHERE
        LENGTH(TRIM(vacancy_description)) > 0
        AND LENGTH(TRIM(applicant_description)) > 0
    
""").repartition(1). \
    write \
    .partitionBy('dt') \
    .option('header','true') \
    .option('sep',';') \
    .option('quoteAll','true') \
    .option('escapeQuotes', 'true') \
    .mode('overwrite') \
    .csv(f'{storage_location}/train/vacancies_applicants')

In [None]:
train_df = spark.sql(f"""
    SELECT
        id,
        REGEXP_REPLACE(vacancy_description, '\n', ' ') AS vacancy_description,
        REGEXP_REPLACE(applicant_description, '\n', ' ') AS applicant_description,
        label
    FROM
        (SELECT
            va.id AS id,
            CONCAT(
                COALESCE(main_activities, ''), '\n', 
                COALESCE(technical_and_behavioral_skills, ''), '\n',
                COALESCE(behavioral_skills, '')
            ) AS vacancy_description,
            CONCAT(
                COALESCE(technical_knowledge, ''), '\n',
                COALESCE(cv_pt, '')
            ) AS applicant_description,
            IF( LOWER(status) LIKE '%contratado%', 1, 0 ) AS label,
            CURRENT_DATE() AS dt
        FROM 
            vacancies_applicants va
            LEFT JOIN applicants a
                ON va.applicant_id = a.id
            LEFT JOIN vacancies v
                ON va.vacancy_id = v.id
        ) AS t
    WHERE
        LENGTH(TRIM(vacancy_description)) > 0
        AND LENGTH(TRIM(applicant_description)) > 0

""")

In [None]:
df = train_df.toPandas()

In [None]:
df

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df = df.dropna()

In [None]:
df.info()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

In [None]:
def tokenize(batch):
    return tokenizer(
        text=batch["vacancy_description"],
        text_pair=batch["applicant_description"],
        padding="max_length", # garante que todos os exemplos do batch tenham o mesmo comprimento — exatamente o que o torch.stack espera.
        truncation=True,
        max_length=512
    )

In [None]:
dataset = Dataset.from_pandas(df)

In [None]:
dataset = dataset.map(tokenize, batched=True)

In [None]:
#dataset = dataset.rename_column("label", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=2)

In [None]:
args = TrainingArguments(
    output_dir="./model",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

25/05/07 17:07:16 WARN TransportChannelHandler: Exception in connection from fedora/192.168.101.88:35385
java.io.IOException: Connection timed out
	at java.base/sun.nio.ch.SocketDispatcher.read0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:47)
	at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:330)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:284)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:259)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:417)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:254)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:357)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)
	at io.netty.channel.nio.NioEventLoop.processSelectedK

: 

In [None]:

trainer.train()

TrainOutput(global_step=16902, training_loss=0.2363526517294551, metrics={'train_runtime': 10872.7876, 'train_samples_per_second': 12.436, 'train_steps_per_second': 1.555, 'total_flos': 3.557603512839168e+16, 'train_loss': 0.2363526517294551, 'epoch': 3.0})

In [None]:
trainer.save_model("trained_model_bert_20250508")

In [None]:
tokenizer.save_pretrained("tokenizer_model_bert_20250508")