In [9]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import os

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.9/150.0 MB 18.1 MB/s eta 0:00:09
   ---------------------------------------- 0.9/150.0 MB 18.1 MB/s eta 0:00:09
   ---------------------------------------- 1.1/150.0 MB 9.7 MB/s eta 0:00:16
   ---------------------------------------- 1.1/150.0 MB 9.7 MB/s eta 0:00:16
   ---------------------------------------- 1.1/150.0 MB 9.7 MB/s eta 0:00:16
   ---------------------------------------- 1.1/150.0 MB 9.7 MB/s eta 0:00:16
   ---------------------------------------- 1.2/150.0 MB 3.7 MB/s eta 0:00:41
   ---------------------------------------- 1.9/150.0 MB 5.6 MB/s eta 0:00:27
   ---------------------------------------- 1.9/150.0 MB 5.6 MB/s eta 0:00:27
    --------------------------------------- 3.3/150.0 MB 7.0 MB/s eta 0:00:21

In [25]:

def carregar_dados(path):
    return pd.read_parquet(path)

def extrair_features_completas(df):
    df.columns = df.columns.astype(str)
    # Vetorização textual combinada de CV, objetivo e atividades da vaga
    texto_completo = (
        df['cv'].fillna('') + ' ' +
        df['objetivo_profissional'].fillna('') + ' ' +
        df['titulo_profissional'].fillna('') + ' ' +
        df['principais_atividades_vaga'].fillna('')
    )

    tfidf = TfidfVectorizer(max_features=500)
    X_texto = tfidf.fit_transform(texto_completo)

    # Selecionar todas as colunas one-hot e numéricas, incluindo as de match
    X_estrut = df.filter(
        regex=r'^(tipo_contratacao_|nivel_profissional_|nivel_academico_|nivel_ingles_|nivel_espanhol_|ingles_vaga_|espanhol_vaga_|feature_mesma_cidade$|^match_)'
    ).reset_index(drop=True)

    # Concatenar tudo
    X_final = pd.concat([pd.DataFrame(X_texto.toarray()), X_estrut.reset_index(drop=True)], axis=1)
    return X_final, tfidf

def treinar_modelo_supervisionado(df):
    df.columns = df.columns.astype(str)
    X, tfidf = extrair_features_completas(df)
    X.columns = X.columns.astype(str)
    y = df['contratado']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("Distribuição antes do SMOTE:")
    print(y_train.value_counts())
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

    # smote = SMOTE(random_state=42, sampling_strategy=0.5, k_neighbors=3)
    # X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    # print("\nDistribuição após SMOTE:")
    # print(y_train_bal.value_counts())

    # scale_pos_weight = len(y_train_bal[y_train_bal == 0]) / len(y_train_bal[y_train_bal == 1])
    clf = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        max_depth=32,
        learning_rate=0.05,
        n_estimators=300,
        max_delta_step=10,
        eta=0.1,
        subsample=0.5,
        eval_metric='auc',
        nthread=16,
        colsample_bytree=0.8,
        random_state=42,
        objective='binary:logitraw'
    )
    # clf.fit(X_train_bal, y_train_bal)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nROC AUC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

    mlflow.set_experiment("modelo_candidato_sucesso")
    with mlflow.start_run():
        mlflow.log_params(clf.get_params())
        # Métricas adicionais
        mlflow.log_metric("acuracia", clf.score(X_test, y_test))
        mlflow.log_metric("precision", precision_score(y_test, y_pred))
        mlflow.log_metric("recall", recall_score(y_test, y_pred))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred))
        mlflow.log_metric("roc_auc", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
        
        # Feature importance
        importances = clf.feature_importances_
        feature_names = X.columns.tolist()
        fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
        fi_df.to_csv("feature_importances.csv", index=False)
        mlflow.log_artifact("feature_importances.csv")
        
        # Input example e assinatura
        input_example = X_test.iloc[:1]
        signature = infer_signature(X_test, clf.predict(X_test))
        mlflow.sklearn.log_model(clf, "modelo_xgboost", input_example=input_example, signature=signature)

    joblib.dump(clf, "modelo_xgboost.pkl")
    joblib.dump(tfidf, "vetorizador_tfidf.pkl")

    return clf

if __name__ == "__main__":
    path = "C:\\Users\\ffporto\\Desktop\\Estudo\\FIAP\\fase05\\data\\"
    df = carregar_dados(f"{path}dataset_processado.parquet")
    df.columns = df.columns.astype(str)
    clf = treinar_modelo_supervisionado(df)
    df.to_parquet(f"{path}dataset_clusterizado.parquet", index=False)
    print("Modelos treinados e salvos com sucesso!")


Distribuição antes do SMOTE:
contratado
0    29988
1     1578
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     12852
           1       0.47      0.19      0.27       677

    accuracy                           0.95     13529
   macro avg       0.71      0.59      0.62     13529
weighted avg       0.93      0.95      0.94     13529


Confusion Matrix:
[[12706   146]
 [  549   128]]

ROC AUC: 0.789143336638775




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Modelos treinados e salvos com sucesso!


In [None]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, cv, DMatrix
import joblib
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import numpy as np
import os
import matplotlib.pyplot as plt

def carregar_dados(path):
    return pd.read_parquet(path)

def extrair_features_completas(df):
    df.columns = df.columns.astype(str)
    texto_completo = (
        df['cv'].fillna('') + ' ' +
        df['objetivo_profissional'].fillna('') + ' ' +
        df['titulo_profissional'].fillna('') + ' ' +
        df['principais_atividades_vaga'].fillna('')
    )

    tfidf = TfidfVectorizer(max_features=100)
    X_texto = tfidf.fit_transform(texto_completo)

    X_estrut = df.filter(
        regex=r'^(tipo_contratacao_|nivel_profissional_|nivel_academico_|nivel_ingles_|nivel_espanhol_|ingles_vaga_|espanhol_vaga_|feature_mesma_cidade$|^match_|^qtd_keywords_cv$|^sim_cv_atividade$)'
    ).reset_index(drop=True)

    X_final = pd.concat([pd.DataFrame(X_texto.toarray()), X_estrut.reset_index(drop=True)], axis=1)
    return X_final, tfidf

def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio
    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    sum_weight = sum(wtrain) + sum(wtest)
    if sum(wtrain) > 0:
        wtrain *= sum_weight / sum(wtrain)
    if sum(wtest) > 0:
        wtest *= sum_weight / sum(wtest)
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return dtrain, dtest, param

def plot_threshold_curve(y_test, y_probs):
    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
    best_thresh = thresholds[np.argmax(f1_scores)]

    plt.figure(figsize=(10,6))
    plt.plot(thresholds, f1_scores[:-1], label="F1 Score")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.title("F1 Score vs Threshold")
    plt.grid(True)
    plt.legend()
    plt.savefig("f1_threshold_plot.png")
    plt.close()

    print(f"Melhor threshold para F1: {best_thresh:.2f}")
    return best_thresh


def grid_search_xgboost(X_train, y_train):
    param_grid = {
        'max_depth': [8, 16, 32],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_delta_step': [5,10,20],
        'n_estimators': [150, 250, 350],
        'nthread': [8,16,32],
        'subsample': [0.5, 0.7, 0.8],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }
    clf = XGBClassifier(scale_pos_weight=float(np.sum(y_train == 0)) / np.sum(y_train == 1),
                        eval_metric='auc',
                        objective='binary:logistic',
                        use_label_encoder=False,
                        random_state=42)
    grid = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=3, verbose=1, n_jobs=-1)
    grid.fit(X_train, y_train)
    print("Melhores parâmetros do GridSearch:", grid.best_params_)
    return grid.best_estimator_

def treinar_modelo_supervisionado(df):
    df.columns = df.columns.astype(str)
    X, tfidf = extrair_features_completas(df)
    X.columns = X.columns.astype(str)
    y = df['contratado']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("Distribuição original:")
    print(y_train.value_counts())

    dtrain = DMatrix(X_train, label=y_train)
    dtest = DMatrix(X_test, label=y_test)

    param = {
        'max_depth': 32,
        'learning_rate': 0.05,
        'max_delta_step': 10,
        'n_estimators': 300,
        'nthread': 16,
        'eta': 0.1,
        'subsample': 0.5,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 42
    }
    num_round = 300

    print("\nExecutando cross-validation com xgb.cv...")
    cv_results = cv(
        param,
        dtrain,
        num_boost_round=num_round,
        nfold=5,
        seed=42,
        metrics=['auc'],
        fpreproc=fpreproc,
        early_stopping_rounds=10,
        verbose_eval=10
    )

    best_num_round = len(cv_results)
    print(f"\nMelhor número de rounds: {best_num_round}")

    clf = grid_search_xgboost(X_train, y_train)
    clf.set_params(n_estimators=best_num_round)
    clf.fit(X_train, y_train)

    y_probs = clf.predict_proba(X_test)[:, 1]
    best_thresh = plot_threshold_curve(y_test, y_probs)
    y_pred = (y_probs >= best_thresh).astype(int)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nROC AUC:", roc_auc_score(y_test, y_probs))

    mlflow.set_experiment("modelo_candidato_sucesso")
    with mlflow.start_run():
        mlflow.log_params(clf.get_params())
        mlflow.log_metric("acuracia", clf.score(X_test, y_test))
        mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_probs))
        mlflow.log_metric("precision", precision_score(y_test, y_pred))
        mlflow.log_metric("recall", recall_score(y_test, y_pred))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred))
        mlflow.log_artifact("f1_threshold_plot.png")

        importances = clf.feature_importances_
        feature_names = X.columns.tolist()
        fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
        fi_df.to_csv("feature_importances.csv", index=False)
        mlflow.log_artifact("feature_importances.csv")

        input_example = X_test.iloc[:1]
        signature = infer_signature(X_test, clf.predict(X_test))
        mlflow.sklearn.log_model(clf, "modelo_xgboost", input_example=input_example, signature=signature)

    joblib.dump(clf, "modelo_xgboost.pkl")
    joblib.dump(tfidf, "vetorizador_tfidf.pkl")

    return clf

if __name__ == "__main__":
    path = "C:\\Users\\ffporto\\Desktop\\Estudo\\FIAP\\fase05\\data\\"
    df = carregar_dados(f"{path}dataset_processado.parquet")
    df.columns = df.columns.astype(str)
    clf = treinar_modelo_supervisionado(df)
    df.to_parquet(f"{path}dataset_clusterizado.parquet", index=False)
    print("Modelos treinados e salvos com sucesso!")

In [46]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, cv, DMatrix
from imblearn.combine import SMOTETomek, SMOTEENN
import joblib
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import numpy as np
import os

def carregar_dados(path):
    return pd.read_parquet(path)

def extrair_features_completas(df):
    df.columns = df.columns.astype(str)
    texto_completo = (
        df['cv'].fillna('') + ' ' +
        df['objetivo_profissional'].fillna('') + ' ' +
        df['titulo_profissional'].fillna('') + ' ' +
        df['principais_atividades_vaga'].fillna('')
    )

    tfidf = TfidfVectorizer(max_features=100)
    X_texto = tfidf.fit_transform(texto_completo)

    X_estrut = df.filter(
       regex=r'^(tipo_contratacao_|nivel_profissional_|nivel_academico_|nivel_ingles_|nivel_espanhol_|ingles_vaga_|espanhol_vaga_|feature_mesma_cidade$|^match_|^qtd_keywords_cv$|^sim_cv_atividade$)'
    ).reset_index(drop=True)

    X_final = pd.concat([pd.DataFrame(X_texto.toarray()), X_estrut.reset_index(drop=True)], axis=1)
    return X_final, tfidf

def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio
    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    sum_weight = sum(wtrain) + sum(wtest)
    if sum(wtrain) > 0:
        wtrain *= sum_weight / sum(wtrain)
    if sum(wtest) > 0:
        wtest *= sum_weight / sum(wtest)
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return dtrain, dtest, param

def treinar_modelo_supervisionado(df):
    df.columns = df.columns.astype(str)
    X, tfidf = extrair_features_completas(df)
    X.columns = X.columns.astype(str)
    y = df['contratado']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("Distribuição original do treinamento:")
    print(y_train.value_counts())

    # Aplicar SMOTE APENAS no conjunto de treinamento
    smoteenn = SMOTEENN(random_state=42)
    X_train_res, y_train_res = smoteenn.fit_resample(X_train, y_train)

    print("\nDistribuição após SMOTE no treinamento:")
    print(y_train_res.value_counts())

    dtrain = DMatrix(X_train_res, label=y_train_res)
    dtest = DMatrix(X_test, label=y_test) # O dtest não deve ser reamostrado
    
    param = {
        'max_depth': 8,
        'learning_rate': 0.05,
        'max_delta_step': 1,
        'n_estimators': 300,
        'nthread': 16,
        # 'eta': 0.1,
        'subsample': 0.5,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 42
    }
    num_round = param['n_estimators']

    print("\nExecutando cross-validation com xgb.cv...")
    cv_results = cv(
        param,
        dtrain,
        num_boost_round=num_round,
        nfold=5,
        seed=42,
        metrics=['auc'],
        fpreproc=fpreproc,
        early_stopping_rounds=10,
        verbose_eval=10
    )

    best_num_round = len(cv_results)
    print(f"\nMelhor número de rounds: {best_num_round}")

    clf = XGBClassifier(
        max_depth=param['max_depth'],
        learning_rate=param['learning_rate'],
        max_delta_step=param['max_delta_step'],
        n_estimators=best_num_round,
        nthread=param['nthread'],
        # eta=param['eta'],
        subsample=param['subsample'],
        colsample_bytree=param['colsample_bytree'],
        # scale_pos_weight=float(np.sum(y_train == 0)) / np.sum(y_train == 1),
        objective=param['objective'],
        eval_metric=param['eval_metric'],
        use_label_encoder=False,
        random_state=42
    )
    clf.fit(X_train_res, y_train_res)
    y_pred = clf.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nROC AUC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

    mlflow.set_experiment("modelo_candidato_sucesso")
    with mlflow.start_run():
        mlflow.log_params(clf.get_params())
        mlflow.log_metric("acuracia", clf.score(X_test, y_test))
        mlflow.log_metric("roc_auc", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
        mlflow.log_metric("precision", precision_score(y_test, y_pred))
        mlflow.log_metric("recall", recall_score(y_test, y_pred))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred))

        importances = clf.feature_importances_
        feature_names = X.columns.tolist()
        fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
        fi_df.to_csv("feature_importances.csv", index=False)
        mlflow.log_artifact("feature_importances.csv")

        input_example = X_test.iloc[:1]
        signature = infer_signature(X_test, clf.predict(X_test))
        mlflow.sklearn.log_model(clf, "modelo_xgboost", input_example=input_example, signature=signature)

    joblib.dump(clf, "modelo_xgboost.pkl")
    joblib.dump(tfidf, "vetorizador_tfidf.pkl")

    return clf

if __name__ == "__main__":
    path = "C:\\Users\\ffporto\\Desktop\\Estudo\\FIAP\\fase05\\data\\"
    df = carregar_dados(f"{path}dataset_processado.parquet")
    df.columns = df.columns.astype(str)
    clf = treinar_modelo_supervisionado(df)
    df.to_parquet(f"{path}dataset_clusterizado.parquet", index=False)
    print("Modelos treinados e salvos com sucesso!")


Distribuição original do treinamento:
contratado
0    29988
1     1578
Name: count, dtype: int64

Distribuição após SMOTE no treinamento:
contratado
1    29528
0    23742
Name: count, dtype: int64

Executando cross-validation com xgb.cv...


Parameters: { "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)
Parameters: { "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)
Parameters: { "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)
Parameters: { "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)


[0]	train-auc:0.86939+0.00292	test-auc:0.86127+0.00524
[10]	train-auc:0.96273+0.00112	test-auc:0.95483+0.00340
[20]	train-auc:0.97711+0.00088	test-auc:0.97061+0.00238
[30]	train-auc:0.98333+0.00034	test-auc:0.97743+0.00181
[40]	train-auc:0.98662+0.00032	test-auc:0.98117+0.00166
[50]	train-auc:0.98914+0.00029	test-auc:0.98397+0.00144
[60]	train-auc:0.99114+0.00019	test-auc:0.98623+0.00127
[70]	train-auc:0.99275+0.00016	test-auc:0.98800+0.00116
[80]	train-auc:0.99399+0.00013	test-auc:0.98941+0.00108
[90]	train-auc:0.99502+0.00009	test-auc:0.99065+0.00096
[100]	train-auc:0.99577+0.00011	test-auc:0.99154+0.00084
[110]	train-auc:0.99640+0.00008	test-auc:0.99228+0.00077
[120]	train-auc:0.99687+0.00008	test-auc:0.99283+0.00073
[130]	train-auc:0.99731+0.00004	test-auc:0.99338+0.00070
[140]	train-auc:0.99768+0.00004	test-auc:0.99384+0.00069
[150]	train-auc:0.99797+0.00004	test-auc:0.99421+0.00065
[160]	train-auc:0.99821+0.00004	test-auc:0.99449+0.00063
[170]	train-auc:0.99844+0.00005	test-auc:0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     12852
           1       0.29      0.40      0.34       677

    accuracy                           0.92     13529
   macro avg       0.63      0.67      0.65     13529
weighted avg       0.93      0.92      0.93     13529


Confusion Matrix:
[[12191   661]
 [  406   271]]

ROC AUC: 0.8012708365801597




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Modelos treinados e salvos com sucesso!


In [70]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder # Import OneHotEncoder
from xgboost import XGBClassifier, cv, DMatrix
from imblearn.combine import SMOTEENN
from mlflow.models.signature import infer_signature
import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from preprocess import pre_processar # Import the pre_processar function


def fpreproc(dtrain, dtest, param):
    """
    Pré-processamento dos dados de treino e teste antes da validação cruzada do XGBoost.

    Esta função ajusta dinamicamente o parâmetro `scale_pos_weight` com base na proporção
    entre classes majoritária e minoritária. Além disso, ela reescala os pesos dos conjuntos
    de treino e teste para garantir que o impacto das instâncias seja proporcional e
    comparável durante o processo de validação cruzada.

    Args:
        dtrain (xgboost.DMatrix): Conjunto de dados de treino com rótulos e pesos.
        dtest (xgboost.DMatrix): Conjunto de dados de teste com rótulos e pesos.
        param (dict): Dicionário de parâmetros do modelo XGBoost.

    Returns:
        Tuple[xgboost.DMatrix, xgboost.DMatrix, dict]: Os objetos `dtrain` e `dtest`
        com pesos atualizados, e o dicionário de parâmetros com `scale_pos_weight` ajustado.
    """
    label = dtrain.get_label()
    # Check if there are instances of the minority class to avoid division by zero
    if np.sum(label == 1) > 0:
        ratio = float(np.sum(label == 0)) / np.sum(label == 1)
        param['scale_pos_weight'] = ratio
    else:
        # If no positive samples, scale_pos_weight might not be applicable or set to a default
        param['scale_pos_weight'] = 1.0 # Or handle as an error
        print("Warning: No positive samples found in dtrain for scale_pos_weight calculation.")

    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()

    # Only re-scale weights if they exist and sum > 0 to avoid division by zero
    if wtrain is not None and sum(wtrain) > 0:
        sum_weight_train = sum(wtrain)
    else:
        sum_weight_train = 0

    if wtest is not None and sum(wtest) > 0:
        sum_weight_test = sum(wtest)
    else:
        sum_weight_test = 0

    total_sum_weight = sum_weight_train + sum_weight_test

    if total_sum_weight > 0:
        if sum_weight_train > 0:
            wtrain *= total_sum_weight / sum_weight_train
        if sum_weight_test > 0:
            wtest *= total_sum_weight / sum_weight_test

        if wtrain is not None:
            dtrain.set_weight(wtrain)
        if wtest is not None:
            dtest.set_weight(wtest)
    return dtrain, dtest, param


def criar_coluna_contratado_refinada(df):
    """
    Refina a coluna 'contratado' com base na situação do candidato e separa o dataset
    entre dados de treinamento (com rótulo definido) e dados em andamento (sem rótulo).

    Define `contratado = 1` para situações claramente bem-sucedidas no processo seletivo,
    `contratado = 0` para rejeições ou desistências, e mantém como NaN os casos indefinidos
    ou em andamento. Após o processamento, separa o DataFrame original em dois subconjuntos:
    um para treinamento supervisionado e outro com candidatos ainda em processo.

    Args:
        df (pd.DataFrame): DataFrame contendo, entre outras, a coluna 'situacao_candidado'.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]:
            - df_treinamento: Subconjunto com a coluna 'contratado' preenchida (0 ou 1), pronto para treinamento.
            - df_em_andamento: Subconjunto com candidatos sem definição final, sem a coluna 'contratado'.
    """
    contratado_status = [
        'contratado pela decision',
        'contratado como hunting',
        'proposta aceita'
    ]
    nao_contratado_status = [
        'nao aprovado pelo cliente',
        'desistiu',
        'nao aprovado pelo rh',
        'nao aprovado pelo requisitante',
        'sem interesse nesta vaga',
        'desistiu da contratacao',
        'recusado'
    ]

    df['contratado'] = np.nan

    df.loc[df['situacao_candidado'].isin(contratado_status), 'contratado'] = 1
    df.loc[df['situacao_candidado'].isin(nao_contratado_status), 'contratado'] = 0

    df_treinamento = df.dropna(subset=['contratado']).copy()
    df_treinamento['contratado'] = df_treinamento['contratado'].astype(int)

    # df_em_andamento now explicitly contains only rows where 'contratado' was NaN,
    # and the 'contratado' column is dropped as it's not applicable for prediction
    df_em_andamento = df[df['contratado'].isna()].copy()
    if 'contratado' in df_em_andamento.columns:
        df_em_andamento.drop(columns=['contratado'], inplace=True)

    return df_treinamento, df_em_andamento


def carregar_dados(path):
    """
    Carrega dados a partir de um arquivo no formato Parquet.

    Args:
        path (str): Caminho completo para o arquivo .parquet.

    Returns:
        pd.DataFrame: DataFrame contendo os dados carregados do arquivo.
    """
    return pd.read_parquet(path)


def extrair_e_transformar_features(df_input, tfidf_model=None, ohe_models=None, original_feature_columns=None, is_training=True):
    """
    Extrai e transforma um conjunto completo de features a partir de dados textuais e estruturados,
    aplicando vetorização TF-IDF e One-Hot Encoding.

    Args:
        df_input (pd.DataFrame): DataFrame contendo as colunas de texto e estruturadas.
        tfidf_model (TfidfVectorizer, opcional): Modelo TF-IDF previamente ajustado.
            Se None e is_training for True, um novo modelo será ajustado.
        ohe_models (dict, opcional): Dicionário de objetos OneHotEncoder previamente ajustados.
            As chaves são os nomes das colunas e os valores são os objetos OHE.
            Se None e is_training for True, novos modelos serão ajustados.
        original_feature_columns (list, opcional): Lista de nomes das colunas de features esperadas
            para garantir consistência na ordem e presença das colunas.
        is_training (bool): Indica se a função está sendo chamada para o conjunto de treino (True)
            ou para predição/teste (False).

    Returns:
        Tuple[pd.DataFrame, TfidfVectorizer, dict, list]:
            - Um DataFrame com as features combinadas (TF-IDF + One-Hot + numéricas/binárias).
            - O objeto `TfidfVectorizer` (ajustado ou passado).
            - O dicionário de objetos `OneHotEncoder` (ajustados ou passados).
            - A lista de nomes das colunas finais de features.
    """
    df = df_input.copy()
    df.columns = df.columns.astype(str)

    # 1. Pré-processamento de texto e features diretas (comum a treino e inferência)
    # A função pre_processar agora faz isso e não mais o One-Hot Encoding.
    df = pre_processar(df)


    texto_completo = (
            df['cv'].fillna('') + ' ' +
            df['objetivo_profissional'].fillna('') + ' ' +
            df['titulo_profissional'].fillna('') + ' ' +
            df['principais_atividades_vaga'].fillna('')
    )

    # TF-IDF
    if is_training:
        tfidf = TfidfVectorizer(max_features=100)
        X_texto = tfidf.fit_transform(texto_completo)
    else:
        if tfidf_model is None:
            raise ValueError("tfidf_model must be provided for prediction/test.")
        tfidf = tfidf_model
        X_texto = tfidf.transform(texto_completo)

    X_texto_df = pd.DataFrame(X_texto.toarray())
    # Ensure TF-IDF column names are strings
    X_texto_df.columns = [f'tfidf_{i}' for i in range(X_texto_df.shape[1])]


    # One-Hot Encoding
    cols_to_encode = [
        "tipo_contratacao", "nivel_profissional", "nivel_academico",
        "nivel_ingles", "nivel_espanhol", "ingles_vaga", "espanhol_vaga",
        "nivel_academico_vaga"
    ]

    ohe_fitted_models = {}
    df_encoded_features = pd.DataFrame(index=df.index) # Initialize with original index

    for col in cols_to_encode:
        if is_training:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            encoded_data = ohe.fit_transform(df[[col]])
            ohe_fitted_models[col] = ohe # Save the fitted OHE model
        else:
            if ohe_models is None or col not in ohe_models:
                raise ValueError(f"OneHotEncoder for column '{col}' must be provided for prediction/test.")
            ohe = ohe_models[col]
            encoded_data = ohe.transform(df[[col]])

        new_cols_names = ohe.get_feature_names_out([col])
        temp_df = pd.DataFrame(encoded_data, columns=new_cols_names, index=df.index)
        df_encoded_features = pd.concat([df_encoded_features, temp_df], axis=1)

    # Numeric and binary features (already processed by pre_processar)
    X_numeric_binary = df.filter(
        regex=r'^(match_ingles|match_nivel_academico|match_area_atuacao|match_localidade|match_pcd|qtd_keywords_cv|match_cv_atividade$)'
    ).reset_index(drop=True)

    # Combine all features
    X_final = pd.concat([X_texto_df, df_encoded_features.reset_index(drop=True), X_numeric_binary], axis=1)
    X_final.columns = X_final.columns.astype(str)

    if is_training:
        # For training, capture the final column names
        final_feature_columns = X_final.columns.tolist()
    else:
        # For prediction, reindex to match the training columns, filling missing with 0
        if original_feature_columns is None:
            raise ValueError("original_feature_columns must be provided for prediction/test.")
        X_final = X_final.reindex(columns=original_feature_columns, fill_value=0)
        final_feature_columns = original_feature_columns # Just to return consistently

    return X_final, tfidf, ohe_fitted_models, final_feature_columns


def treinar_modelo_supervisionado(df_treinamento_input):
    """
    Treina um modelo supervisionado XGBoost com balanceamento de classes e clustering como feature adicional.

    Esta função realiza o pré-processamento completo dos dados, incluindo:
    - Extração de features TF-IDF e estruturadas (com One-Hot Encoding).
    - Balanceamento das classes com SMOTEENN (over + under sampling).
    - Ajuste dos hiperparâmetros via cross-validation com `xgb.cv`.
    - Treinamento final com o número ideal de árvores (`n_estimators`).
    - Avaliação com métricas de classificação e registro no MLflow.
    - Salvamento do modelo, do vetor TF-IDF e dos OneHotEncoders via `joblib`.

    Args:
        df_treinamento_input (pd.DataFrame): DataFrame contendo os dados de treino,
            já rotulados com a coluna 'contratado'.

    Returns:
        Tuple[XGBClassifier, TfidfVectorizer, dict, List[str]]:
            - O modelo XGBoost treinado.
            - O vetorizador TF-IDF ajustado.
            - O dicionário de OneHotEncoders ajustados.
            - A lista de nomes das colunas finais de features.
    """
    df_treinamento_input.columns = df_treinamento_input.columns.astype(str)

    # Extrai e transforma features no conjunto de TREINO.
    # tfidf e ohe_models serão ajustados aqui.
    X, tfidf_model, ohe_models, original_feature_columns = extrair_e_transformar_features(
        df_treinamento_input, is_training=True
    )

    y = df_treinamento_input['contratado']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print("Distribuição original do treinamento:")
    print(y_train.value_counts())

    # Apply SMOTEENN on the training set
    smoteenn = SMOTEENN(random_state=42)
    X_train_res, y_train_res = smoteenn.fit_resample(X_train, y_train)

    print("\nDistribuição após SMOTEENN no treinamento:")
    print(y_train_res.value_counts())

    # Prepare DMatrix for xgb.cv with the resampled data
    dtrain = DMatrix(X_train_res, label=y_train_res)
    # X_test needs to be transformed using the fitted TF-IDF and OHE models.
    # For now, X_test already is, as it's a split of X which was generated with the fitted models.
    # However, if we were loading X_test from an external source, it would need the full transformation.
    # Let's create DMatrix for X_test as well.
    dtest = DMatrix(X_test, label=y_test)

    param = {
        'max_depth': 8,
        'learning_rate': 0.05,
        'max_delta_step': 1,
        'nthread': 16,
        'subsample': 0.5,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 42
    }
    num_round = 300  # Max rounds for CV

    print("\nExecutando cross-validation com xgb.cv...")
    cv_results = cv(
        param,
        dtrain,
        num_boost_round=num_round,
        nfold=5,
        seed=42,
        metrics=['auc'],
        fpreproc=fpreproc,
        early_stopping_rounds=10,
        verbose_eval=10
    )

    best_num_round = len(cv_results) if len(cv_results) > 0 else num_round
    print(f"\nMelhor número de rounds: {best_num_round}")

    clf = XGBClassifier(
        max_depth=param['max_depth'],
        learning_rate=param['learning_rate'],
        max_delta_step=param['max_delta_step'],
        n_estimators=best_num_round,
        nthread=param['nthread'],
        subsample=param['subsample'],
        colsample_bytree=param['colsample_bytree'],
        # scale_pos_weight is calculated based on the RESAMPLED data for the final model fit
        scale_pos_weight=float(np.sum(y_train_res == 0)) / np.sum(y_train_res == 1),
        objective=param['objective'],
        eval_metric=param['eval_metric'],
        use_label_encoder=False,
        random_state=42
    )
    # Fit the model on the RESAMPLED training data!
    clf.fit(X_train_res, y_train_res)

    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]

    print("\nClassification Report (Padrão 0.5):")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix (Padrão 0.5):")
    print(confusion_matrix(y_test, y_pred))
    print("\nROC AUC:", roc_auc_score(y_test, y_pred_proba))

    mlflow.set_experiment("modelo_candidato_sucesso")
    with mlflow.start_run():
        mlflow.log_params(clf.get_params())
        mlflow.log_metric("acuracia", clf.score(X_test, y_test))
        mlflow.log_metric("roc_auc", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
        mlflow.log_metric("precision_class1", precision_score(y_test, y_pred, pos_label=1))
        mlflow.log_metric("recall_class1", recall_score(y_test, y_pred, pos_label=1))
        mlflow.log_metric("f1_score_class1", f1_score(y_test, y_pred, pos_label=1))

        importances = clf.feature_importances_
        feature_names = original_feature_columns
        fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
        fi_df.to_csv("feature_importances.csv", index=False)
        mlflow.log_artifact("feature_importances.csv")

        # Use the raw X_test (before DMatrix conversion if needed) as input_example
        input_example = X_test.iloc[:1]
        signature = infer_signature(X_test, clf.predict(X_test))
        mlflow.sklearn.log_model(clf, "modelo_xgboost", input_example=input_example, signature=signature)

    joblib.dump(clf, "modelo_xgboost.pkl")
    joblib.dump(tfidf_model, "vetorizador_tfidf.pkl")  # Save the fitted tfidf vectorizer
    joblib.dump(ohe_models, "one_hot_encoders.pkl") # Save the dictionary of fitted OHE models
    joblib.dump(original_feature_columns, "feature_columns.pkl") # Save the list of feature names

    return clf, tfidf_model, ohe_models, original_feature_columns


if __name__ == "__main__":
    path = "C:\\Users\\ffporto\\Desktop\\Estudo\\FIAP\\fase05\\data\\"
    # Ensure dataset_processado.parquet is generated by the new preprocess.py first
    df = carregar_dados(f"{path}dataset_processado.parquet")
    df.columns = df.columns.astype(str)

    # 1. Prepare training data and "in-progress" data
    # This separation happens on the raw, pre-processed (text/direct features) dataframe
    df_treinamento, df_em_andamento = criar_coluna_contratado_refinada(df)

    # 2. Train the model and get all fitted transformers and feature columns
    clf, tfidf_model, ohe_models, original_feature_columns = treinar_modelo_supervisionado(df_treinamento)

    # 3. Prepare "in-progress" data for prediction using the loaded/fitted transformers
    X_em_andamento, _, _, _ = extrair_e_transformar_features(
        df_em_andamento,
        tfidf_model=tfidf_model,
        ohe_models=ohe_models,
        original_feature_columns=original_feature_columns,
        is_training=False # IMPORTANT: Set to False for prediction/test
    )

    # 4. Make probability predictions for "in-progress" candidates
    probabilities_em_andamento = clf.predict_proba(X_em_andamento)[:, 1]

    # 5. Add predictions back to the 'df_em_andamento' DataFrame
    df_em_andamento['prob_contratado'] = probabilities_em_andamento

    # 6. Classify with an adjusted threshold for actionable insights
    # Adjust this threshold based on your desired balance of precision and recall for 'contratado'
    threshold_predicao = 0.5  # Example: You might want to experiment with this value
    df_em_andamento['predicao_contratado'] = (df_em_andamento['prob_contratado'] > threshold_predicao).astype(int)

    print("\n--- Candidatos Em Andamento com Previsões ---")
    # Display top 10 candidates with highest probability of being hired
    print(df_em_andamento[['situacao_candidado', 'prob_contratado', 'predicao_contratado']].sort_values(
        by='prob_contratado', ascending=False).head(10))

    # You can save this DataFrame with predictions for further analysis
    df_em_andamento.to_parquet(f"{path}dataset_em_andamento_com_predicao.parquet", index=False)
    print("\nModelos treinados e salvos com sucesso! Previsões para candidatos em andamento geradas.")

ModuleNotFoundError: No module named 'preprocess'