In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import data_transform_utilities.flatten as flatten
from data_transform_utilities.text_parsers import  clean_str, extract_json, json_str_to_array, normalize_and_tokenize_text
import matplotlib.pyplot as plt

import re
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

import pysolr
from sqlalchemy import create_engine
import json
import requests

from sqlalchemy import create_engine, update, Table, MetaData
from sqlalchemy.orm import sessionmaker

In [2]:
spark_conf = SparkConf()
spark_conf.set("spark.cores", "12")
spark_conf.set("spark.driver.cores", "12")
spark_conf.set("spark.driver.memory", "16g")
spark_conf.set("spark.speculation", False)
spark_conf.set("spark.jars.packages", "com.mysql:mysql-connector-j:9.2.0")

spark = SparkSession \
    .builder.master("local") \
    .appName("Decision data overview") \
    .config(conf=spark_conf) \
    .enableHiveSupport() \
    .getOrCreate()

:: loading settings :: url = jar:file:/home/allan/Workspace/fiap-datathon-decision/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/allan/.ivy2/cache
The jars for the packages stored in: /home/allan/.ivy2/jars
com.mysql#mysql-connector-j added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4c05e2ca-f344-44eb-abb5-8ca86a9d6cd3;1.0
	confs: [default]
	found com.mysql#mysql-connector-j;9.2.0 in central
	found com.google.protobuf#protobuf-java;4.29.0 in central
:: resolution report :: resolve 153ms :: artifacts dl 4ms
	:: modules in use:
	com.google.protobuf#protobuf-java;4.29.0 from central in [default]
	com.mysql#mysql-connector-j;9.2.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------------------

# Registra as UDFs para o Spark

In [3]:
spark.udf.register("json_str_to_array", json_str_to_array, ArrayType(StringType()))
spark.udf.register("normalize_and_tokenize_text", normalize_and_tokenize_text, ArrayType(StringType()))

<function data_transform_utilities.text_parsers.normalize_and_tokenize_text(text: str)>

# Cria conexão com o banco MySQL

In [4]:
engine = create_engine("mysql+pymysql://decision:1234@localhost/decision?charset=utf8")

# Analise dos dados reais sobre requisitos das Vagas 

In [5]:
spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table="vacancies",
    properties={"driver": "com.mysql.cj.jdbc.Driver"}
).createOrReplaceTempView("vacancies")

25/05/03 10:50:32 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table="vacancies_applicants",
    properties={"driver": "com.mysql.cj.jdbc.Driver"}
).createOrReplaceTempView("vacancies_applicants")

# Tokenização dos dados textuais das vagas e dos candidatos

In [None]:
spark.sql(f"""
    SELECT
        v.id,
        normalize_and_tokenize_text(
            CONCAT(
                COALESCE(main_activities, ''), '\n', 
                COALESCE(technical_and_behavioral_skills, ''), '\n',
                COALESCE(behavioral_skills, '')
            )
        ) AS infos,
        get_json_object(llm_result, '$.requirements[*].name') as requirements
    FROM
        vacancies v
        LEFT JOIN vacancies_applicants va ON v.id = va.vacancy_id
    WHERE
        va.applicant_id IS NOT NULL
""").createOrReplaceTempView("tmp_vacancy_infos")

In [8]:
spark.sql(f"""
    SELECT
        TRIM(id) as id,
        TRIM(SUBSTR(info, 0, 255)) as token
    FROM
        tmp_vacancy_infos
        LATERAL VIEW explode(infos) AS info
    GROUP BY 1,2
    HAVING LENGTH(token) > 0
""").cache().createOrReplaceTempView("tmp_vacancy_infos_cleaned")

In [9]:
tokens = spark.sql(f"""SELECT id as vacancy_id, token FROM tmp_vacancy_infos_cleaned group by 1,2""").coalesce(1) \
    .write \
    .option("spark.speculation", False) \
    .jdbc( \
    url="jdbc:mysql://localhost:3306/decision", \
    table="vacancies_tokens_tmp_2", \
    mode="overwrite", \
    properties={"driver":"com.mysql.jdbc.Driver", "user":"decision", "password":"1234"} \
    )

Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary.
                                                                                

# Observando o resultado da tokenização dos textos das vagas

In [10]:
spark.sql(f"""
    SELECT
        token,
        count(distinct id) as vacancies
    FROM
        tmp_vacancy_infos_cleaned
    GROUP BY 1
    ORDER BY 2 desc
""").show(10, False)



+-------------------------------------------------------------------------------+---------+
|token                                                                          |vacancies|
+-------------------------------------------------------------------------------+---------+
|1                                                                              |1276     |
|8                                                                              |695      |
|00pm                                                                           |647      |
|00am to 5                                                                      |619      |
|primary skill  others please note the skill at the beginning of job description|547      |
|outros detalhes do trabalho                                                    |515      |
|etc                                                                            |509      |
|horas                                                                          

                                                                                

In [11]:
spark.sql(f"""
    SELECT
        token,
        count(distinct id) as vacancies
    FROM
        tmp_vacancy_infos_cleaned
    GROUP BY 1
""").cache().createOrReplaceTempView("tmp_vacancy_infos_cleaned_grouped")

In [12]:
spark.sql(f"""
    SELECT
        *
    FROM
        tmp_vacancy_infos_cleaned_grouped
    ORDER BY vacancies desc
    limit 10
""").show()



+--------------------+---------+
|               token|vacancies|
+--------------------+---------+
|                   1|     1276|
|                   8|      695|
|                00pm|      647|
|           00am to 5|      619|
|primary skill  ot...|      547|
|outros detalhes d...|      515|
|                 etc|      509|
|               horas|      409|
|           desejável|      407|
|          atividades|      385|
+--------------------+---------+



                                                                                

# Análise dos dados reais das habilidads dos Candidatos

In [13]:
spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table="applicants",
    properties={"driver": "com.mysql.cj.jdbc.Driver"}
).createOrReplaceTempView("applicants")

In [None]:
spark.sql(f"""
    SELECT
        a.id,
        normalize_and_tokenize_text(
            CONCAT(
                COALESCE(technical_knowledge, ''), '\n',
                COALESCE(cv_pt, '')
            )
        ) AS infos
    FROM
        applicants a
    GROUP BY 1,2
    
""").createOrReplaceTempView("tmp_applicant_infos")

In [15]:
spark.sql(f"""
    SELECT
        id,
        TRIM(SUBSTR(i, 0, 255)) as token
    FROM
        tmp_applicant_infos
        LATERAL VIEW explode(infos) AS i
    GROUP BY 1,2
    HAVING LENGTH(token) > 0 
    
""").cache().createOrReplaceTempView("tmp_applicant_infos_cleaned")

In [16]:
spark.sql(f"""SELECT id as applicant_id, token FROM tmp_applicant_infos_cleaned group by 1,2""").coalesce(1) \
    .write \
    .option("spark.speculation", False) \
    .jdbc( \
    url="jdbc:mysql://localhost:3306/decision", \
    table="applicants_tokens_tmp_2", \
    mode="overwrite", \
    properties={"driver":"com.mysql.jdbc.Driver", "user":"decision", "password":"1234"} \
    )

                                                                                

In [17]:
spark.sql(f"""
    SELECT
        token,
        count(distinct id) as applicants
    FROM
        tmp_applicant_infos_cleaned
    GROUP BY 1
""").cache().createOrReplaceTempView("tmp_applicant_infos_cleaned_grouped")

In [18]:
spark.sql(f"""
    SELECT
        *
    FROM
        tmp_applicant_infos_cleaned_grouped
    order by applicants desc
    limit 10
""").show()



+--------------------+----------+
|               token|applicants|
+--------------------+----------+
|experiência profi...|     12604|
|  formação acadêmica|     11816|
|             idiomas|      9556|
|            formação|      6713|
|            objetivo|      6371|
|              inglês|      5425|
|                  sp|      4774|
|           são paulo|      4549|
|         experiência|      4395|
|              brasil|      4384|
+--------------------+----------+



                                                                                

# Verifica a interseção dos requisitos de vagas e de candidatos

In [19]:
spark.sql(f"""
    SELECT
        *
    FROM
        (SELECT
            a.token as a_infos,
            v.token as v_infos,
            applicants,
            vacancies,
            ((vacancies + applicants) * ( abs(vacancies - applicants) / IF(vacancies > applicants, applicants, vacancies))) / (2.0 + ( abs(vacancies - applicants) * ( abs(vacancies - applicants) / IF(vacancies > applicants, applicants, vacancies)) ) )  as score
        FROM
            tmp_applicant_infos_cleaned_grouped a
            FULL OUTER JOIN tmp_vacancy_infos_cleaned_grouped v
                ON a.token = v.token
        WHERE
            a.token IS NOT NULL AND v.token IS NOT NULL
        ) AS t
    -- WHERE
        -- a_infos = 'python'
        -- score > 1.0
    ORDER BY score desc
""").show(100, False)

                                                                                

+-----------------------------------------+-----------------------------------------+----------+---------+------------------+
|a_infos                                  |v_infos                                  |applicants|vacancies|score             |
+-----------------------------------------+-----------------------------------------+----------+---------+------------------+
|8                                        |8                                        |625       |695      |15.02439024390244 |
|descrição                                |descrição                                |211       |237      |10.608378870673953|
|local                                    |local                                    |332       |321      |9.414154652686761 |
|soft skills                              |soft skills                              |104       |94       |6.875             |
|conhecimento técnico                     |conhecimento técnico                     |89        |112      |6.5388967468

# Analisa se os candidatos que foram contratados possuem os mesmos requisitos das vagas

In [27]:
spark.sql(f"""
    SELECT
        applicant_id,
        vacancy_id
        -- status
    FROM
        vacancies_applicants
    WHERE
        lower(status) like '%contratado%'
""").cache().createOrReplaceTempView("tmp_vacancy_applicants_hired")

In [23]:
spark.sql(f"""
    SELECT
        *
    FROM
        tmp_vacancy_applicants_hired 
    
""").show()


+------------+----------+
|applicant_id|vacancy_id|
+------------+----------+
|       12585|         2|
|       12651|        13|
|       12664|        13|
|       12671|        13|
|       12703|        13|
|       12721|        13|
|       12734|        13|
|       12745|        13|
|       12630|        14|
|       14193|        15|
|        8025|        15|
|        8471|        15|
|       12628|        16|
|       12629|        17|
|       12681|        20|
|       12665|        24|
|       12384|        28|
|       12895|        30|
|       12896|        30|
|       12667|        37|
+------------+----------+
only showing top 20 rows



In [28]:
spark.sql(f"""
    SELECT
        vh.applicant_id,
        vh.vacancy_id,
        COUNT(DISTINCT IF(a.infos = v.infos, a.infos, NULL)) as same_infos,
        COUNT(DISTINCT IF(a.infos = vv.infos, a.infos, NULL)) as same_infos_llm
        -- COLLECT_SET(IF(a.infos = v.infos, a.infos, NULL)) as a_infos,
        -- COLLECT_SET(IF(a.infos = vv.infos, a.infos, NULL)) as a_infos_llm
    FROM
        tmp_vacancy_applicants_hired vh
        LEFT JOIN tmp_applicant_infos_cleaned a
            ON a.id = vh.applicant_id
        LEFT JOIN tmp_vacancy_infos_cleaned v 
            ON v.id = vh.vacancy_id
        LEFT JOIN tmp_vacancy_infos_cleaned_llm vv 
            ON vv.id = vh.vacancy_id
            
    GROUP BY 1,2
    ORDER BY 3 desc
    
""").show(100, False)



+------------+----------+----------+--------------+
|applicant_id|vacancy_id|same_infos|same_infos_llm|
+------------+----------+----------+--------------+
|33862       |8573      |38        |0             |
|15385       |806       |16        |0             |
|15106       |710       |10        |0             |
|14135       |6933      |7         |0             |
|25636       |4546      |7         |0             |
|24281       |6901      |5         |6             |
|17996       |1635      |5         |0             |
|43265       |11828     |5         |0             |
|20100       |2596      |5         |0             |
|5167        |2745      |5         |0             |
|12563       |6901      |5         |6             |
|21488       |3083      |5         |0             |
|20382       |2748      |5         |0             |
|31557       |9387      |5         |0             |
|21467       |3195      |5         |7             |
|30623       |6901      |5         |5             |
|9758       

                                                                                

In [37]:
spark.sql(f"""
    SELECT
        vh.applicant_id,
        vh.vacancy_id,
        COUNT(DISTINCT IF(a.infos = trim(lower(vv.infos)), a.infos, NULL)) as same_infos_llm,
        -- COLLECT_SET(IF(a.infos = v.infos, a.infos, NULL)) as a_infos,
        COLLECT_SET(IF(a.infos = trim(lower(vv.infos)), a.infos, NULL)) as a_infos_llm
    FROM
        vacancies_applicants vh
        -- tmp_vacancy_applicants_hired vh
        LEFT JOIN tmp_applicant_infos_cleaned a 
            ON a.id = vh.vacancy_id
        LEFT JOIN tmp_vacancy_infos_cleaned_llm vv 
            ON vv.id = vh.vacancy_id
            
    GROUP BY 1,2
    ORDER BY 3 desc
    
""").show(100, False)



+------------+----------+--------------+-----------------------+
|applicant_id|vacancy_id|same_infos_llm|a_infos_llm            |
+------------+----------+--------------+-----------------------+
|18121       |6637      |1             |[banco de dados oracle]|
|16290       |5319      |1             |[javascript]           |
|18415       |10873     |1             |[angular]              |
|16726       |5319      |1             |[javascript]           |
|27836       |6637      |1             |[banco de dados oracle]|
|17352       |5319      |1             |[javascript]           |
|41149       |10873     |1             |[angular]              |
|17962       |5319      |1             |[javascript]           |
|29224       |6637      |1             |[banco de dados oracle]|
|23617       |5319      |1             |[javascript]           |
|41323       |10873     |1             |[angular]              |
|24621       |5319      |1             |[javascript]           |
|29651       |6637      |

                                                                                

In [None]:
spark.sql(f"""
    SELECT
        vh.applicant_id,
        vh.vacancy_id,
        COUNT(DISTINCT IF(a.infos = v.infos, a.infos, NULL)) as same_infos
    FROM
        tmp_vacancy_applicants_hired vh
        LEFT JOIN tmp_applicant_infos_cleaned a
            ON a.id = vh.applicant_id
        LEFT JOIN tmp_vacancy_infos_cleaned v 
            ON v.id = vh.vacancy_id
            
    GROUP BY 1,2
    ORDER BY 3 desc
    
""").show(100, False)

# Melhores candidatos

In [46]:
spark.sql(f"""
    SELECT
        a.id as applicant_id,
        v.id as vacancy_id,
        ARRAY_SIZE(COLLECT_SET(a.infos)) as applicant_infos,
        ARRAY_SIZE(COLLECT_SET(v.infos)) as vacancy_infos,
        COUNT(DISTINCT IF(a.infos = v.infos, a.infos, NULL)) as same_infos
    FROM
        tmp_vacancy_infos_cleaned v
        LEFT JOIN vacancies_applicants va
            ON v.id = va.vacancy_id 
        LEFT JOIN tmp_applicant_infos_cleaned a
            ON a.id = va.applicant_id
    WHERE
        v.id = 10482
        -- and LENGTH(v.infos) < 80
    GROUP BY 1,2
    ORDER BY 3 desc
    
""").show(100, False)



+------------+----------+---------------+-------------+----------+
|applicant_id|vacancy_id|applicant_infos|vacancy_infos|same_infos|
+------------+----------+---------------+-------------+----------+
|40041       |10482     |175            |4            |0         |
|40046       |10482     |141            |4            |0         |
|40606       |10482     |88             |4            |0         |
|40081       |10482     |88             |4            |0         |
|40113       |10482     |84             |4            |0         |
|40629       |10482     |75             |4            |0         |
|40034       |10482     |67             |4            |0         |
|40026       |10482     |61             |4            |0         |
|40061       |10482     |50             |4            |0         |
|40638       |10482     |48             |4            |0         |
|40610       |10482     |46             |4            |0         |
|40630       |10482     |38             |4            |0      

                                                                                

In [49]:
spark.sql(f"""
    SELECT
        id,
        ARRAY_SIZE(SPLIT(infos, '\n')) as infos
    FROM
        tmp_vacancy_infos_cleaned v
    WHERE
        v.id = 10482
    
""").show()

                                                                                

+-----+-----+
|   id|infos|
+-----+-----+
|10482|    1|
|10482|    1|
|10482|    1|
|10482|    1|
+-----+-----+



# Procura dados pessoais nos dados de Curriculo

In [44]:
spark.sql(f"""
    SELECT
        id,
        name,
        email,
        regexp_extract(cv_pt, '([\.\_a-z0-9]+@[a-z0-9]+\.[a-z0-9]+)') as cv_pt_email
    FROM
        applicants
    WHERE
        -- regexp_like(cv_pt, '[a-z0-9]+@[a-z0-9]\.[a-z0-9]+')
        lower(cv_pt) like '%@gmail.com%'

""").show(100, False)

+-----+-------------------------------+-----------------------------------------+--------------------------------------+
|id   |name                           |email                                    |cv_pt_email                           |
+-----+-------------------------------+-----------------------------------------+--------------------------------------+
|426  |Fernanda Brito                 |fernanda_brito@gmail.com                 |smpas8@hotmail.com                    |
|871  |Dra. Isabel Cassiano           |dra._isabel_cassiano@hotmail.com         |robson.barata@gmail.com               |
|1139 |Dra. Ana Júlia da Cunha        |dra._ana_júlia_da_cunha@gmail.com        |rogerio.vieira.mota@gmail.com         |
|1446 |Melina da Conceição            |melina_da_conceição@hotmail.com          |robertopmendes@gmail.com              |
|1561 |Cauã Peixoto                   |cauã_peixoto@hotmail.com                 |videira.will@gmail.com                |
|1870 |Arthur Miguel da Conceiçã

In [45]:
spark.sql(f"""
    select count(1) as total from
    (SELECT
        id,
        name,
        email,
        regexp_extract(cv_pt, '([\.\_a-z0-9]+@[a-z0-9]+\.[a-z0-9]+)') as cv_pt_email
    FROM
        applicants
    WHERE
        -- regexp_like(cv_pt, '[a-z0-9]+@[a-z0-9]\.[a-z0-9]+')
        lower(cv_pt) like '%@gmail.com%'
    ) as t
    
    where email <> cv_pt_email
""").show(100, False)

[Stage 188:>                                                        (0 + 1) / 1]

+-----+
|total|
+-----+
|615  |
+-----+



                                                                                

# Visualização dos dados de vaga e candidato para conferência

In [32]:
spark.sql(f"""
    SELECT
        va.vacancy_id,
        a.id as a_id,
        a.infos,
        v.infos
    FROM
        vacancies_applicants va
        LEFT JOIN  tmp_applicant_infos_cleaned  a
            ON va.applicant_id = a.id
        LEFT JOIN tmp_vacancy_infos_cleaned v
            ON va.vacancy_id = v.id
        LEFT JOIN tmp_vacancy_infos_cleaned_llm vl     
            ON va.vacancy_id = vl.id
    WHERE
        -- LENGth(vl.infos) < 150
        LENGth(v.infos) < 150
""").show(10, False)



+----------+-----+-----+---------------------------------------------------------------------------+
|vacancy_id|a_id |infos|infos                                                                      |
+----------+-----+-----+---------------------------------------------------------------------------+
|474       |14596|     |a versão java  6 com algumas tecnologias como toplink                      |
|474       |14596|     |hibernate                                                                  |
|474       |14596|     |swing com banco db2a ideia é trabalhar em time que utiliza metodologia ágil|
|474       |14596|     |hibernate                                                                  |
|474       |14596|     |swing com banco db2a ideia é trabalhar em time que utiliza metodologia ágil|
|474       |14596|     |com manutenção evolutiva e atendimento de incidentes em produção           |
|474       |14596|     |a versão java  6 com algumas tecnologias como toplink              

                                                                                