# Jupyter Notebook - Preprocessamento e Enriquecimento de Dados para Matching de Vagas e Candidatos

## Etapa 1: Importação de bibliotecas essenciais

In [None]:
from nltk.corpus import stopwords
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import unicodedata
import re
import json
import string
import nltk

## Etapa 2: Carregamento dos arquivos JSON

In [13]:
# Carregar os dados dos arquivos de amostra
path = "C:\\Users\\ffporto\\Desktop\\Estudo\\FIAP\\fase05\\data\\"

with open(f"{path}prospects.json", "r", encoding="utf-8") as f:
    prospects_data = json.load(f)

with open(f"{path}vagas.json", "r", encoding="utf-8") as f:
    vagas_data = json.load(f)

with open(f"{path}applicants.json", "r", encoding="utf-8") as f:
    applicants_data = json.load(f)

# Quantidade de registros em cada dataset
len(prospects_data), len(vagas_data), len(applicants_data)


(14222, 14081, 42482)

## Etapa 3: Consolidação dos dados em um único DataFrame

In [22]:
# Lista para armazenar os registros consolidados
registros = []

# Itera por vaga e seus respectivos prospects
for id_vaga, dados_vaga in prospects_data.items():
    prospects = dados_vaga.get("prospects", [])
    vaga_info = vagas_data.get(id_vaga, {})

    for prospect in prospects:
        codigo_candidato = prospect.get("codigo")
        dados_candidato = applicants_data.get(codigo_candidato, {})

        if not dados_candidato:
            continue  # ignora se candidato não estiver presente no applicants

        # Define o rótulo
        contratado = int(prospect.get("situacao_candidado") == "Contratado pela Decision")

        # Monta um registro consolidado com informações úteis de vaga, prospect e candidato
        registro = {
            "id_vaga": id_vaga,
            "id_candidato": codigo_candidato,
            "situacao_candidado": prospect.get("situacao_candidado"),
            "recrutador": prospect.get("recrutador"),
            "data_candidatura": prospect.get("data_candidatura"),
            "comentario": prospect.get("comentario"),
            "contratado": contratado,
            # Dados da vaga
            "titulo_vaga": vaga_info.get("informacoes_basicas", {}).get("titulo_vaga"),
            "vaga_sap": vaga_info.get("informacoes_basicas", {}).get("vaga_sap"),
            "cliente": vaga_info.get("informacoes_basicas", {}).get("cliente"),
            "tipo_contratacao": vaga_info.get("informacoes_basicas", {}).get("tipo_contratacao"),
            "objetivo_vaga": vaga_info.get("informacoes_basicas", {}).get("objetivo_vaga"),
            "estado_vaga": vaga_info.get("perfil_vaga", {}).get("estado"),
            "cidade_vaga": vaga_info.get("perfil_vaga", {}).get("cidade"),
            "vaga_especifica_para_pcd": vaga_info.get("perfil_vaga", {}).get("vaga_especifica_para_pcd"),
            "faixa_etaria_vaga": vaga_info.get("perfil_vaga", {}).get("faixa_etaria"),
            "nivel_profissional": vaga_info.get("perfil_vaga", {}).get("nivel profissional"),
            "nivel_academico_vaga": vaga_info.get("perfil_vaga", {}).get("nivel_academico"),
            "ingles_vaga": vaga_info.get("perfil_vaga", {}).get("nivel_ingles"),
            "espanhol_vaga": vaga_info.get("perfil_vaga", {}).get("nivel_espanhol"),
            "outro_idioma_vaga": vaga_info.get("perfil_vaga", {}).get("outro_idioma"),
            "area_atuacao_vaga": vaga_info.get("perfil_vaga", {}).get("areas_atuacao"),
            "principais_atividades_vaga": vaga_info.get("perfil_vaga", {}).get("principais_atividades"),
            "competencia_tec_e_comp_vaga": vaga_info.get("perfil_vaga", {}).get("competencia_tecnicas_e_comportamentais"),
            "valor_venda": vaga_info.get("beneficios", {}).get("valor_venda"),
            "valor_compra_1": vaga_info.get("beneficios", {}).get("valor_compra_1"),
            "valor_compra_2": vaga_info.get("beneficios", {}).get("valor_compra_2"),
            # Dados do candidato
            "nome": dados_candidato.get("infos_basicas", {}).get("nome"),
            "pcd": dados_candidato.get("informacoes_pessoais", {}).get("pcd"),
            "email": dados_candidato.get("infos_basicas", {}).get("email"),
            "local_candidato": dados_candidato.get("infos_basicas", {}).get("local"),
            "objetivo_profissional": dados_candidato.get("infos_basicas", {}).get("objetivo_profissional"),
            "titulo_profissional": dados_candidato.get("informacoes_profissionais", {}).get("titulo_profissional"),
            "area_atuacao": dados_candidato.get("informacoes_profissionais", {}).get("area_atuacao"),
            "conhecimentos_tecnicos": dados_candidato.get("informacoes_profissionais", {}).get("conhecimentos_tecnicos"),
            "certificacoes": dados_candidato.get("informacoes_profissionais", {}).get("certificacoes"),
            "outras_certificacoes": dados_candidato.get("informacoes_profissionais", {}).get("outras_certificacoes"),
            "remuneracao": dados_candidato.get("informacoes_profissionais", {}).get("remuneracao"),
            "nivel_profissional": dados_candidato.get("informacoes_profissionais", {}).get("nivel_profissional"),
            "nivel_academico": dados_candidato.get("formacao_e_idiomas", {}).get("nivel_academico"),
            "nivel_ingles": dados_candidato.get("formacao_e_idiomas", {}).get("nivel_ingles"),
            "nivel_espanhol": dados_candidato.get("formacao_e_idiomas", {}).get("nivel_espanhol"),
            "outro_idioma": dados_candidato.get("formacao_e_idiomas", {}).get("outro_idioma"),
            "cargo_atual": dados_candidato.get("cargo_atual"),
            "cv": dados_candidato.get("cv_pt")
        }

        registros.append(registro)

# Criar DataFrame
df = pd.DataFrame(registros)

# Exibir amostra
df.sample(5, random_state=42)


Unnamed: 0,id_vaga,id_candidato,situacao_candidado,recrutador,data_candidatura,comentario,contratado,titulo_vaga,vaga_sap,cliente,...,conhecimentos_tecnicos,certificacoes,outras_certificacoes,remuneracao,nivel_academico,nivel_ingles,nivel_espanhol,outro_idioma,cargo_atual,cv
34230,12007,45340,Sem interesse nesta vaga,Luna Correia,06-12-2023,Não busca por posições PJ,0,DBA de Oracle,Não,Hernandez-Wyatt,...,,,,,,,,-,{},
36037,11828,19491,Encaminhado ao Requisitante,Melina Montenegro,25-10-2023,,0,Java Developer 621,Não,Bell Group,...,,,,,,,,-,{},+55 11 97804.2069\n+55 44 99157.4334\n\nespeci...
8790,6563,27099,Inscrito,Caroline Machado,03-09-2021,,0,Supply Chain - 12208858,Não,Nelson-Page,...,,"SAP FI, SAP FI/CO",,5.0,Pós Graduação Completo,Intermediário,Intermediário,Português - Avançado,{},"brasileira, 38 anos solteira\n(31) 3785 2470 (..."
23445,4323,25063,Não Aprovado pelo Cliente,Bruna Aparecida,05-03-2021,Projeto considerou o profissional com pouco co...,0,Web builder - MSC,Não,Miller-Curry,...,,,,,,,,-,{},"perfil\ncanadense, nascido em quebec, 31 anos...."
5762,5703,27688,Encaminhado ao Requisitante,Stella Vieira,23-06-2021,CLT: 7k Dispo: 15 dias (negociavel),0,Consultor Back - PL -RE-355139,Não,"Morris, Moran and Dodson",...,,,,,,,,-,{},objetivo\nanalista de sistemas sr\nresumo das ...


## Etapa 4: Função de limpeza e normalização de texto + Etapa 5: Pré-processamento e engenharia de features

In [90]:
# Cópia de trabalho do dataframe
df_proc = df.copy()

# Normalização de colunas textuais e substituição de nulos
nltk.download('stopwords')
stopwords_pt = set(stopwords.words('portuguese'))

def extrair_keywords_linha(atividades, competencias, n_top=10):
    texto = f"{atividades or ''} {competencias or ''}"
    texto = texto.lower()
    texto = re.sub(r'[^a-zà-ú0-9\s]', '', texto)
    tokens = texto.split()
    stopwords_pt = set(stopwords.words('portuguese'))
    stopwords_custom = {
        'vaga', 'atividades', 'responsabilidades', 'trabalhar', 'empresa',
        'experiência', 'profissional', 'atuar', 'área', 'conhecimento',
        'suporte', 'realizar', 'projetos', 'cliente', 'analista', 'tecnologia'
    }
    stopwords_total = stopwords_pt | stopwords_custom
    tokens = [t for t in tokens if t not in stopwords_total and len(t) > 2]
    mais_frequentes = Counter(tokens).most_common(n_top)
    keywords = [palavra for palavra, _ in mais_frequentes]
    return keywords

def contar_keywords_cv_linha(cv, keywords):
    if not isinstance(cv, str):
        return 0
    cv = cv.lower()
    return sum(1 for kw in keywords if kw in cv)

def calcular_similaridade_cv_atividade(df):
    tfidf = TfidfVectorizer(max_features=500)

    textos_candidato = df['cv'].fillna("").astype(str)
    textos_vaga = df['principais_atividades_vaga'].fillna("").astype(str)

    tfidf_matrix = tfidf.fit_transform(pd.concat([textos_candidato, textos_vaga], ignore_index=True))
    tfidf_candidato = tfidf_matrix[:len(df)]
    tfidf_vaga = tfidf_matrix[len(df):]

    similaridades = [cosine_similarity(tfidf_candidato[i], tfidf_vaga[i])[0][0] for i in range(len(df))]

    df['match_cv_atividade'] = similaridades
    return df

def match_texto_in_texto(base, alvo):
    if isinstance(base, str) and isinstance(alvo, str):
        return int(base in alvo)
    return 0

def normalizar_tipo_contratacao(texto):
    """
    Normaliza os tipos de contratação da coluna, tratando combinações, sinônimos,
    capitalização e ordem dos termos.
    """
    # 1. Tratamento inicial de valores nulos/vazios
    if pd.isna(texto) or not isinstance(texto, str) or str(texto).strip() == "":
        return "vazio"
    # 2. Converte para string, remove espaços e minúsculas
    texto = str(texto).strip().lower()
    # 3. Remover acentos
    texto = unicodedata.normalize('NFKD', texto).encode('ascii', 'ignore').decode('utf-8')
    # 4. Substituições e padronizações específicas
    texto = texto.replace('pj/autonomo', 'pj_autonomo') # Unifica PJ/Autônomo
    texto = texto.replace('clt full', 'clt_full') # Padroniza CLT Full
    texto = texto.replace('clt cotas', 'clt_cotas') # Padroniza CLT Cotas
    texto = texto.replace('candidato podera escolher', '') # Remove frase redundante
    texto = texto.replace('estagiario', 'estagio') # Padroniza Estagiário
    # 5. Remover caracteres não alfanuméricos exceto underscores e espaços
    texto = re.sub(r'[^a-z0-9_\s]', '', texto)
    # 6. Compactar múltiplos espaços
    texto = re.sub(r'\s+', ' ', texto).strip()
    # 7. Separar os termos, remover vazios e normalizar a ordem para combinações
    # Ex: 'clt_full pj_autonomo' e 'pj_autonomo clt_full' viram a mesma lista e são ordenadas
    termos_individuais = sorted(list(set(filter(None, texto.split(' ')))))
    # Junta os termos novamente com '_' para formar a categoria combinada
    texto_normalizado = '_'.join(termos_individuais)
    # 8. Mapeamento final para categorias complexas (se necessário, pode ser mais genérico)
    # Essas regras são importantes para agrupar categorias que têm várias palavras
    # mas que você quer tratar como um único tipo.
    # A ordem é importante: das combinações mais longas para as mais curtas.
    if "clt_cotas_cooperado_estagio_hunting_pj_autonomo" in texto_normalizado:
        return "clt_cotas_cooperado_estagio_hunting_pj_autonomo"
    elif "clt_cotas_cooperado_estagio_pj_autonomo" in texto_normalizado:
        return "clt_cotas_cooperado_estagio_pj_autonomo"
    elif "clt_cotas_clt_full_cooperado_estagio_pj_autonomo" in texto_normalizado:
        return "clt_cotas_clt_full_cooperado_estagio_pj_autonomo"
    elif "clt_cotas_clt_full_cooperado_pj_autonomo" in texto_normalizado:
        return "clt_cotas_clt_full_cooperado_pj_autonomo"
    elif "clt_cotas_clt_full_pj_autonomo" in texto_normalizado:
        return "clt_cotas_clt_full_pj_autonomo"
    elif "clt_cotas_pj_autonomo" in texto_normalizado:
        return "clt_cotas_pj_autonomo"
    elif "clt_full_cooperado_pj_autonomo" in texto_normalizado:
        return "clt_full_cooperado_pj_autonomo"
    elif "clt_full_hunting_pj_autonomo" in texto_normalizado:
        return "clt_full_hunting_pj_autonomo"
    elif "cooperado_hunting_pj_autonomo" in texto_normalizado:
        return "cooperado_hunting_pj_autonomo"
    elif "cooperado_pj_autonomo" in texto_normalizado:
        return "cooperado_pj_autonomo"
    elif "clt_cotas_cooperado" in texto_normalizado:
        return "clt_cotas_cooperado"
    elif "clt_cotas_clt_full" in texto_normalizado:
        return "clt_cotas_clt_full"
    elif "clt_full_cooperado" in texto_normalizado:
        return "clt_full_cooperado"
    elif "clt_full_hunting" in texto_normalizado:
        return "clt_full_hunting"
    elif "clt_full_pj_autonomo" in texto_normalizado:
        return "clt_full_pj_autonomo"
    elif "pj_autonomo_hunting" in texto_normalizado:
        return "pj_autonomo_hunting"
    elif "clt_full" in texto_normalizado:
        return "clt_full"
    elif "pj_autonomo" in texto_normalizado:
        return "pj_autonomo"
    elif "hunting" in texto_normalizado:
        return "hunting"
    elif "cooperado" in texto_normalizado:
        return "cooperado"
    elif "clt_cotas" in texto_normalizado:
        return "clt_cotas"
    elif "estagio" in texto_normalizado:
        return "estagio"

    return texto_normalizado if texto_normalizado else "vazio"

def limpa_texto(texto):
    """
    Realiza uma série de limpezas em um texto:
    1. Converte a entrada para string.
    2. Remove espaços em branco no início e fim.
    3. Converte para minúsculas.
    4. Remove acentos.
    5. Remove caracteres não alfanuméricos (mantém letras, números e espaços).
    6. Compacta múltiplos espaços em um único espaço.

    Args:
        texto (str ou qualquer tipo): O texto a ser limpo.

    Returns:
        str: O texto limpo, ou "vazio" se a entrada for inválida/nula.
    """
    if pd.isna(texto) or not isinstance(texto, str) or str(texto).strip() == "":
        return "vazio" # Retorna "vazio" para NaN, não-strings ou strings vazias/apenas espaços
    # 1. Converte para string (garante que números, etc., sejam tratados como texto)
    texto = str(texto)
    # 2. Remove espaços em branco no início e fim e converte para minúsculas
    texto = texto.strip().lower()
    # 3. Remove acentos (normalização Unicode)
    texto = unicodedata.normalize('NFKD', texto).encode('ascii', 'ignore').decode('utf-8')
    # 4. Remove caracteres não alfanuméricos (mantém letras, números e espaços)
    texto = re.sub(r'[^a-z0-9\s,]', '', texto)
    # 5. Compacta múltiplos espaços em um único espaço
    texto = re.sub(r'\s+', ' ', texto).strip() # .strip() final para pegar espaços extras criados por re.sub
    return texto

colunas_texto = [
    "situacao_candidado", "recrutador", "comentario",
    "titulo_vaga", "vaga_sap", "cliente", "objetivo_vaga",
    "estado_vaga", "cidade_vaga", "vaga_especifica_para_pcd",
    "nivel_profissional", "nivel_academico_vaga", "ingles_vaga", "espanhol_vaga",
    "outro_idioma_vaga", "area_atuacao_vaga", "principais_atividades_vaga",
    "competencia_tec_e_comp_vaga", "nome", "pcd", "objetivo_profissional",
    "titulo_profissional", "area_atuacao", "conhecimentos_tecnicos", 
    "certificacoes", "outras_certificacoes", "nivel_academico",
    "nivel_ingles", "nivel_espanhol", "outro_idioma", "cargo_atual", "cv"
]

for col in colunas_texto:
    df_proc[col] = df_proc[col].apply(limpa_texto)

# Feature: distância geográfica entre cidade do candidato e cidade da vaga (proxy simples)
df["match_ingles"] = (df["nivel_ingles"] == df["ingles_vaga"]).astype(int)
df["match_nivel_academico"] = (df["nivel_academico"] == df["nivel_academico_vaga"]).astype(int)
df["match_area_atuacao"] = df.apply(
    lambda row: match_texto_in_texto(row["area_atuacao"], row["area_atuacao_vaga"]),
    axis=1
)
df['match_qtd_keywords_cv'] = df.apply(
        lambda row: contar_keywords_cv_linha(
            row['cv'],
            extrair_keywords_linha(row['principais_atividades_vaga'], row['competencia_tec_e_comp_vaga'])
        ),
        axis=1
    )
df['match_titulo'] = (df['titulo_profissional'] == df['titulo_vaga']).astype(int)
df["match_localidade"] = (df["local_candidato"] == df["cidade_vaga"]).astype(int)
df["match_pcd"] = (df["pcd"] == df["vaga_especifica_para_pcd"]).astype(int)
calcular_similaridade_cv_atividade(df)
# Aplica a função de normalização
df_proc['tipo_contratacao'] = df_proc['tipo_contratacao'].apply(normalizar_tipo_contratacao)

# handle_unknown='ignore' evita erros se surgir uma categoria nova que não foi vista no fit
# DataFrame para armazenar as novas colunas codificadas
df_encoded_features = pd.DataFrame()

cols_to_encode = [
    "tipo_contratacao","nivel_profissional","nivel_academico", 
    "nivel_ingles","nivel_espanhol","ingles_vaga","espanhol_vaga",
    "nivel_academico_vaga"
]

for col in cols_to_encode:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_data = ohe.fit_transform(df_proc[[col]])
    # Ohe.get_feature_names_out() nos dá os nomes corretos para as novas colunas
    new_cols_names = ohe.get_feature_names_out([col])
    # Crie um DataFrame temporário para as colunas codificadas
    temp_df = pd.DataFrame(encoded_data, columns=new_cols_names, index=df_proc.index)
    # Concatene as novas colunas ao DataFrame de features codificadas
    df_encoded_features = pd.concat([df_encoded_features, temp_df], axis=1)

df_final = pd.concat([df_proc, df_encoded_features], axis=1)

df_final = df_final.drop(columns=cols_to_encode)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ffporto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
# Exibir colunas resultantes
pd.set_option('display.max_columns', None)  
df_final.head()

Unnamed: 0,id_vaga,id_candidato,situacao_candidado,recrutador,data_candidatura,comentario,contratado,titulo_vaga,vaga_sap,cliente,objetivo_vaga,estado_vaga,cidade_vaga,vaga_especifica_para_pcd,faixa_etaria_vaga,outro_idioma_vaga,area_atuacao_vaga,principais_atividades_vaga,competencia_tec_e_comp_vaga,valor_venda,valor_compra_1,valor_compra_2,nome,pcd,email,local_candidato,objetivo_profissional,titulo_profissional,area_atuacao,conhecimentos_tecnicos,certificacoes,outras_certificacoes,remuneracao,outro_idioma,cargo_atual,cv,match_ingles,match_nivel_academico,match_area_atuacao,match_localidade,match_pcd,match_titulo,match_qtd_keywords_cv,match_cv_atividade,tipo_contratacao_clt_cotas,tipo_contratacao_clt_cotas_clt_full,tipo_contratacao_clt_cotas_clt_full_cooperado_estagio_pj_autonomo,tipo_contratacao_clt_cotas_clt_full_cooperado_pj_autonomo,tipo_contratacao_clt_cotas_clt_full_pj_autonomo,tipo_contratacao_clt_cotas_cooperado,tipo_contratacao_clt_cotas_cooperado_estagio_hunting_pj_autonomo,tipo_contratacao_clt_cotas_cooperado_estagio_pj_autonomo,tipo_contratacao_clt_cotas_pj_autonomo,tipo_contratacao_clt_full,tipo_contratacao_clt_full_cooperado,tipo_contratacao_clt_full_cooperado_pj_autonomo,tipo_contratacao_clt_full_hunting,tipo_contratacao_clt_full_hunting_pj_autonomo,tipo_contratacao_clt_full_pj_autonomo,tipo_contratacao_cooperado,tipo_contratacao_cooperado_hunting_pj_autonomo,tipo_contratacao_cooperado_pj_autonomo,tipo_contratacao_estagio,tipo_contratacao_hunting,tipo_contratacao_pj_autonomo,tipo_contratacao_vazio,nivel_profissional_analista,nivel_profissional_especialista,nivel_profissional_estagiario,nivel_profissional_gerente,nivel_profissional_junior,nivel_profissional_lider,nivel_profissional_pleno,nivel_profissional_senior,nivel_profissional_tecnico de nivel medio,nivel_profissional_vazio,nivel_academico_doutorado completo,nivel_academico_doutorado cursando,nivel_academico_doutorado incompleto,nivel_academico_ensino fundamental completo,nivel_academico_ensino fundamental cursando,nivel_academico_ensino fundamental incompleto,nivel_academico_ensino medio completo,nivel_academico_ensino medio cursando,nivel_academico_ensino medio incompleto,nivel_academico_ensino superior completo,nivel_academico_ensino superior cursando,nivel_academico_ensino superior incompleto,nivel_academico_ensino tecnico completo,nivel_academico_ensino tecnico cursando,nivel_academico_ensino tecnico incompleto,nivel_academico_mestrado completo,nivel_academico_mestrado cursando,nivel_academico_mestrado incompleto,nivel_academico_pos graduacao completo,nivel_academico_pos graduacao cursando,nivel_academico_pos graduacao incompleto,nivel_academico_vazio,nivel_ingles_avancado,nivel_ingles_basico,nivel_ingles_fluente,nivel_ingles_intermediario,nivel_ingles_nenhum,nivel_ingles_vazio,nivel_espanhol_avancado,nivel_espanhol_basico,nivel_espanhol_fluente,nivel_espanhol_intermediario,nivel_espanhol_nenhum,nivel_espanhol_vazio,ingles_vaga_avancado,ingles_vaga_basico,ingles_vaga_fluente,ingles_vaga_intermediario,ingles_vaga_nenhum,ingles_vaga_tecnico,ingles_vaga_vazio,espanhol_vaga_avancado,espanhol_vaga_basico,espanhol_vaga_fluente,espanhol_vaga_intermediario,espanhol_vaga_nenhum,espanhol_vaga_tecnico,espanhol_vaga_vazio,nivel_academico_vaga_doutorado cursando,nivel_academico_vaga_ensino fundamental completo,nivel_academico_vaga_ensino medio completo,nivel_academico_vaga_ensino medio incompleto,nivel_academico_vaga_ensino superior completo,nivel_academico_vaga_ensino superior cursando,nivel_academico_vaga_ensino superior incompleto,nivel_academico_vaga_ensino tecnico completo,nivel_academico_vaga_ensino tecnico cursando,nivel_academico_vaga_ensino tecnico incompleto,nivel_academico_vaga_mestrado completo,nivel_academico_vaga_mestrado cursando,nivel_academico_vaga_pos graduacao completo,nivel_academico_vaga_pos graduacao cursando,nivel_academico_vaga_pos graduacao incompleto,nivel_academico_vaga_vazio
0,4530,25632,encaminhado ao requisitante,ana livia moreira,25-03-2021,"encaminhado para pj r 72,00hora",0,consultor control m,nao,"morris, moran and dodson",vazio,rio de janeiro,rio de janeiro,nao,De: Até:,vazio,ti desenvolvimentoprogramacao,experiencia comprovada em projetos de controlm,experiencia comprovada em projetos de controlm,-,R$,,jose vieira,vazio,josé_vieira@gmail.com,,vazio,vazio,vazio,vazio,vazio,vazio,,,vazio,dados pessoais estado civil casado idade 33 an...,0,0,1,0,0,0,2,0.381487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4530,25529,encaminhado ao requisitante,ana livia moreira,22-03-2021,"encaminhado para r 6000,00 clt full , nao tem ...",0,consultor control m,nao,"morris, moran and dodson",vazio,rio de janeiro,rio de janeiro,nao,De: Até:,vazio,ti desenvolvimentoprogramacao,experiencia comprovada em projetos de controlm,experiencia comprovada em projetos de controlm,-,R$,,srta isabela cavalcante,nao,srta._isabela_cavalcante@hotmail.com,"Rio de Janeiro, Rio de Janeiro",analista de redes e teleprocessamentoinfraestr...,analista de redes e teleprocessamentoinfraestr...,"ti governanca, ti infraestrutura, ti projetos,...",vazio,vazio,vazio,"R$ 5600,00 / mês / CLT",,vazio,"solteiro, 47 anos estrada meringuava, no 1763 ...",0,1,0,0,1,0,2,0.318026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4531,25364,contratado pela decision,juliana cassiano,17-03-2021,data de inicio 12042021,1,20212607395peoplesoft application enginedomain...,nao,gonzalez and sons,contratacao,sao paulo,sao paulo,nao,De: Até:,vazio,gestao e alocacao de recursos de ti,key skills required for the job are peoplesoft...,o recurso peoplesoft tem como responsabilidade...,-,hora,,sra yasmin fernandes,nao,sra._yasmin_fernandes@hotmail.com,"São Paulo, São Paulo",lideranca desenvolvimento,lideranca desenvolvimento,ti projetos,"peoplesoft peopletools 849, 853, 855, 857 peop...",vazio,capacitacao peoplesoft formacao tecnica people...,15.000 - Mensal PJ,,vazio,area de atuacao lider de consultoria gerenciam...,0,0,0,0,1,0,4,0.019821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4531,25360,encaminhado ao requisitante,juliana cassiano,17-03-2021,vazio,0,20212607395peoplesoft application enginedomain...,nao,gonzalez and sons,contratacao,sao paulo,sao paulo,nao,De: Até:,vazio,gestao e alocacao de recursos de ti,key skills required for the job are peoplesoft...,o recurso peoplesoft tem como responsabilidade...,-,hora,,alexia barbosa,vazio,alexia_barbosa@hotmail.com,,vazio,vazio,vazio,vazio,vazio,vazio,,,vazio,informacoes pessoais estado civil casado nacio...,0,0,1,0,0,0,5,0.038739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4533,26338,contratado pela decision,stella vieira,29-04-2021,vazio,1,20212605708microfocus application life cycle m...,nao,barneswoods,contratacao,sao paulo,sao paulo,nao,De: Até:,vazio,gestao e alocacao de recursos de ti,arquiteto foco na area e automacao requerido e...,arquiteto foco na area e automacao requerido e...,"207,00 -",hora,,arthur almeida,vazio,arthur_almeida@gmail.com,,vazio,vazio,vazio,vazio,vazio,vazio,,,vazio,"solteiro, brasileiro, 21061987 habilitacao cat...",0,0,1,0,0,0,6,0.263252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
df2 = pd.read_parquet(f"{path}dataset_processado.parquet")

In [94]:
df2.filter(regex=r'^(tipo_contratacao_|nivel_profissional_|nivel_academico_|nivel_ingles_|nivel_espanhol_|ingles_vaga_|espanhol_vaga_|feature_mesma_cidade$|^match_)').reset_index(drop=True).head()

Unnamed: 0,match_ingles,match_nivel_academico,match_area_atuacao,match_localidade,match_pcd,tipo_contratacao_clt_cotas,tipo_contratacao_clt_cotas_clt_full,tipo_contratacao_clt_cotas_clt_full_cooperado_estagio_pj_autonomo,tipo_contratacao_clt_cotas_clt_full_cooperado_pj_autonomo,tipo_contratacao_clt_cotas_clt_full_pj_autonomo,tipo_contratacao_clt_cotas_cooperado,tipo_contratacao_clt_cotas_cooperado_estagio_hunting_pj_autonomo,tipo_contratacao_clt_cotas_cooperado_estagio_pj_autonomo,tipo_contratacao_clt_cotas_pj_autonomo,tipo_contratacao_clt_full,tipo_contratacao_clt_full_cooperado,tipo_contratacao_clt_full_cooperado_pj_autonomo,tipo_contratacao_clt_full_hunting,tipo_contratacao_clt_full_hunting_pj_autonomo,tipo_contratacao_clt_full_pj_autonomo,tipo_contratacao_cooperado,tipo_contratacao_cooperado_hunting_pj_autonomo,tipo_contratacao_cooperado_pj_autonomo,tipo_contratacao_estagio,tipo_contratacao_hunting,tipo_contratacao_pj_autonomo,tipo_contratacao_vazio,nivel_profissional_analista,nivel_profissional_especialista,nivel_profissional_estagiario,nivel_profissional_gerente,nivel_profissional_junior,nivel_profissional_lider,nivel_profissional_pleno,nivel_profissional_senior,nivel_profissional_tecnico de nivel medio,nivel_profissional_vazio,nivel_academico_doutorado completo,nivel_academico_doutorado cursando,nivel_academico_doutorado incompleto,nivel_academico_ensino fundamental completo,nivel_academico_ensino fundamental cursando,nivel_academico_ensino fundamental incompleto,nivel_academico_ensino medio completo,nivel_academico_ensino medio cursando,nivel_academico_ensino medio incompleto,nivel_academico_ensino superior completo,nivel_academico_ensino superior cursando,nivel_academico_ensino superior incompleto,nivel_academico_ensino tecnico completo,nivel_academico_ensino tecnico cursando,nivel_academico_ensino tecnico incompleto,nivel_academico_mestrado completo,nivel_academico_mestrado cursando,nivel_academico_mestrado incompleto,nivel_academico_pos graduacao completo,nivel_academico_pos graduacao cursando,nivel_academico_pos graduacao incompleto,nivel_academico_vazio,nivel_ingles_avancado,nivel_ingles_basico,nivel_ingles_fluente,nivel_ingles_intermediario,nivel_ingles_nenhum,nivel_ingles_vazio,nivel_espanhol_avancado,nivel_espanhol_basico,nivel_espanhol_fluente,nivel_espanhol_intermediario,nivel_espanhol_nenhum,nivel_espanhol_vazio,ingles_vaga_avancado,ingles_vaga_basico,ingles_vaga_fluente,ingles_vaga_intermediario,ingles_vaga_nenhum,ingles_vaga_tecnico,ingles_vaga_vazio,espanhol_vaga_avancado,espanhol_vaga_basico,espanhol_vaga_fluente,espanhol_vaga_intermediario,espanhol_vaga_nenhum,espanhol_vaga_tecnico,espanhol_vaga_vazio,nivel_academico_vaga_doutorado cursando,nivel_academico_vaga_ensino fundamental completo,nivel_academico_vaga_ensino medio completo,nivel_academico_vaga_ensino medio incompleto,nivel_academico_vaga_ensino superior completo,nivel_academico_vaga_ensino superior cursando,nivel_academico_vaga_ensino superior incompleto,nivel_academico_vaga_ensino tecnico completo,nivel_academico_vaga_ensino tecnico cursando,nivel_academico_vaga_ensino tecnico incompleto,nivel_academico_vaga_mestrado completo,nivel_academico_vaga_mestrado cursando,nivel_academico_vaga_pos graduacao completo,nivel_academico_vaga_pos graduacao cursando,nivel_academico_vaga_pos graduacao incompleto,nivel_academico_vaga_vazio
0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
def extrair_keywords_linha(atividades, competencias, n_top=10):
    texto = f"{atividades or ''} {competencias or ''}"
    texto = texto.lower()
    texto = re.sub(r'[^a-zà-ú0-9\s]', '', texto)
    tokens = texto.split()
    stopwords_pt = set(stopwords.words('portuguese'))
    stopwords_custom = {
        'vaga', 'atividades', 'responsabilidades', 'trabalhar', 'empresa',
        'experiência', 'profissional', 'atuar', 'área', 'conhecimento',
        'suporte', 'realizar', 'projetos', 'cliente', 'analista', 'tecnologia'
    }
    stopwords_total = stopwords_pt | stopwords_custom
    tokens = [t for t in tokens if t not in stopwords_total and len(t) > 2]
    mais_frequentes = Counter(tokens).most_common(n_top)
    keywords = [palavra for palavra, _ in mais_frequentes]
    return keywords

def contar_keywords_cv_linha(cv, keywords):
    if not isinstance(cv, str):
        return 0
    cv = cv.lower()
    return sum(1 for kw in keywords if kw in cv)

In [96]:
df2['qtd_keywords_cv'] = df2.apply(
        lambda row: contar_keywords_cv_linha(
            row['cv'],
            extrair_keywords_linha(row['principais_atividades_vaga'], row['competencia_tec_e_comp_vaga'])
        ),
        axis=1
    )

In [101]:
df2['situacao_candidado'].value_counts()

situacao_candidado
prospect                          16618
encaminhado ao requisitante       13809
inscrito                           3185
nao aprovado pelo cliente          2967
contratado pela decision           2255
desistiu                           2061
nao aprovado pelo rh               1583
nao aprovado pelo requisitante      653
entrevista tecnica                  485
entrevista com cliente              363
sem interesse nesta vaga            361
em avaliacao pelo rh                339
aprovado                            183
contratado como hunting             172
desistiu da contratacao              48
documentacao pj                       4
documentacao clt                      3
recusado                              2
documentacao cooperado                2
encaminhar proposta                   1
proposta aceita                       1
Name: count, dtype: int64