# Notebook para cálculo do NDCG sobre o score gerado pelo modelo

In [None]:
import numpy as np
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

from data_transform_utilities.text_parsers import clean_str
from data_transform_utilities.score import generate_score_from_status

import os
from pyspark.sql.functions import udf

from sqlalchemy import create_engine

from numpy import dot
from numpy.linalg import norm
import mlflow

In [None]:
# Spark settings for this notebook, applied in a single call.
# NOTE(review): "spark.cores" does not look like a documented Spark property and
# master("local") normally means a single worker thread -- confirm whether
# something like "local[12]" was intended.
spark_conf = SparkConf().setAll(
    [
        ("spark.cores", "12"),
        ("spark.driver.cores", "12"),
        ("spark.speculation", False),
        # Maven coordinates of the MySQL JDBC driver used by the JDBC reads/writes.
        ("spark.jars.packages", "com.mysql:mysql-connector-j:9.2.0"),
    ]
)

# Build (or reuse) the local, Hive-enabled session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Decision data overview")
    .config(conf=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Expose the project helpers to Spark SQL under the names used by the queries below.
# NOTE(review): no return type is passed; PySpark's documented default for a plain
# Python function is a string result -- presumably why artificial_score is cast to
# float after toPandas(). Confirm.
spark.udf.register(name="generate_score_from_status", f=generate_score_from_status)
spark.udf.register(name="clean_str", f=clean_str)

In [None]:
# Look-back window, in days, interpolated into the JDBC source queries.
days_to_read = 3_000

# SQLAlchemy engine pointing at the local "decision" MySQL database.
# NOTE(review): credentials are hard-coded in the URL -- consider reading them
# from the environment.
engine = create_engine(
    "mysql+pymysql://decision:1234@localhost/decision?charset=utf8"
)

## Carrega os dados de vagas

In [None]:
# Load the vacancies requested within the last `days_to_read` days over JDBC and
# expose them to Spark SQL as the "vacancies" temp view.
vacancies_source = f"(SELECT * FROM vacancies WHERE requested_date > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t"
vacancies_df = spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=vacancies_source,
    properties=dict(driver="com.mysql.cj.jdbc.Driver"),
)
vacancies_df.createOrReplaceTempView("vacancies")

# Carrega os dados de candidatos

In [None]:
# Load the applicants created within the last `days_to_read` days over JDBC and
# expose them to Spark SQL as the "applicants" temp view.
applicants_source = f"(SELECT * FROM applicants WHERE created_at > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t"
applicants_df = spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=applicants_source,
    properties=dict(driver="com.mysql.cj.jdbc.Driver"),
)
applicants_df.createOrReplaceTempView("applicants")

In [None]:
# Load the vacancy/applicant links updated within the last `days_to_read` days
# over JDBC and expose them as the "vacancies_applicants" temp view.
links_source = f"(SELECT * FROM vacancies_applicants WHERE last_update > DATE_ADD(current_date(), INTERVAL -{days_to_read} DAY)) AS t"
links_df = spark.read.jdbc(
    url="jdbc:mysql://decision:1234@localhost:3306/decision?charset=utf8",
    table=links_source,
    properties=dict(driver="com.mysql.cj.jdbc.Driver"),
)
links_df.createOrReplaceTempView("vacancies_applicants")

# Carrega os dados (histórico recente) para montar o dataset para o NDCG

In [None]:
# One row per (vacancy, applicant) link, joined with the vacancy and applicant
# text fields. Text columns are null-coalesced to '' and lower-cased, and the
# link status is turned into a relevance label by the registered
# generate_score_from_status UDF. Only vacancies whose links show at least
# five distinct statuses are kept.
tmp_data_query = """
    SELECT
        va.vacancy_id,
        va.applicant_id,
        LOWER(TRIM(clean_str(v.title))) as vacancy_title,
        LOWER(COALESCE(v.country, '')) AS country,
        LOWER(COALESCE(v.city, '')) AS city,
        LOWER(COALESCE(v.state, '')) AS state,
        LOWER(COALESCE(v.main_activities, '')) AS main_activities,
        LOWER(COALESCE(v.behavioral_skills, '')) AS behavioral_skills,
        LOWER(COALESCE(v.technical_and_behavioral_skills, '')) AS technical_and_behavioral_skills,
        LOWER(COALESCE(a.location, '')) AS applicant_location,
        LOWER(COALESCE(a.professional_title, '')) AS professional_title,
        LOWER(COALESCE(a.technical_knowledge, '')) AS technical_knowledge,
        LOWER(COALESCE(a.cv_pt,'')) AS cv_pt,
        LOWER(COALESCE(a.area_of_expertise,'')) AS area_of_expertise,
        generate_score_from_status(status) AS artificial_score
    FROM
        vacancies_applicants va 
        LEFT JOIN vacancies v ON v.id = va.vacancy_id 
        LEFT JOIN applicants a ON a.id = va.applicant_id
    WHERE
        va.vacancy_id IN
            (SELECT
                va.vacancy_id
            FROM
                vacancies_applicants va
            GROUP BY 1
            HAVING COUNT(DISTINCT va.status) >= 5)          
"""
spark.sql(tmp_data_query).createOrReplaceTempView("tmp_data")

In [None]:
# Build the texts that will be embedded for each (vacancy, applicant) row:
#   - vacancy_description: main_activities alone when it equals
#     technical_and_behavioral_skills, otherwise the three skill/activity
#     fields joined by newlines;
#   - vacancy_location: "state, city";
#   - applicant_description: technical_knowledge, cv_pt and area_of_expertise
#     joined by newlines.
# Rows are kept only when titles and locations are non-empty and both
# descriptions are longer than 150 characters.
# The '\n' escapes are resolved by Python (the literal is not raw), so the SQL
# text receives real newline characters inside the quoted separators.
tmp_results_query = """
    SELECT
        *
    FROM
        (SELECT
            vacancy_id,
            CLEAN_STR(
                if(main_activities = technical_and_behavioral_skills,
                main_activities
                ,
                CONCAT(
                    main_activities, '\n', 
                    technical_and_behavioral_skills, '\n',
                    behavioral_skills
                )
            )) AS vacancy_description,
            
            vacancy_title,
            CONCAT( state, ', ', city) AS vacancy_location,
            
            applicant_id,
            CLEAN_STR(professional_title) AS applicant_title,
            TRIM(CLEAN_STR(CONCAT(technical_knowledge, '\n', cv_pt, '\n', area_of_expertise))) AS applicant_description,
            applicant_location,
            artificial_score
        FROM 
            tmp_data v
        ) AS t
    WHERE
        LENGTH(vacancy_title) > 0
        AND LENGTH(vacancy_description) > 150
        AND LENGTH(vacancy_location) > 0
        AND LENGTH(applicant_title) > 0
        AND LENGTH(applicant_description) > 150
        AND LENGTH(applicant_location) > 0
    
"""
spark.sql(tmp_results_query).createOrReplaceTempView("tmp_results")

In [None]:
# Attach to every row the number of rows its vacancy has (window count over
# vacancy_id) and keep only vacancies with at least five candidate rows.
results_query = """
    SELECT
        *
    FROM
        (SELECT 
            *,
            SUM(1) OVER (PARTITION BY vacancy_id) AS items 
        FROM 
            tmp_results 
        ORDER BY 
            vacancy_id 
        ) AS t
    WHERE
        items >= 5
"""
results = spark.sql(results_query)

In [None]:
# Materialize the Spark query result as a pandas DataFrame for the scoring steps.
dataset = results.toPandas()

# Carrega o modelo

In [None]:
# Registered model name and MLflow tracking server the model is fetched from.
MODEL_NAME = 'applicant_job_similarity'
MLFLOW_TRACKING_URI = 'http://192.168.101.186:5000'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# The MODEL_VERSION environment variable overrides the pinned default version.
model_version = os.environ.get("MODEL_VERSION", 27)

# Build the registry URI from MODEL_NAME so the constant and the loaded model
# cannot drift apart.
model = mlflow.sentence_transformers.load_model(f"models:/{MODEL_NAME}/{model_version}")

In [None]:
def get_embedding(text, model):
    """Encode ``text`` with ``model`` and return the resulting embedding.

    Args:
        text: Input passed straight to ``model.encode``.
        model: Object exposing ``encode(text, normalize_embeddings=...)``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``normalize_embeddings=True``.
    """
    embedding = model.encode(text, normalize_embeddings=True)
    return embedding

# Calcula o score (Cosine Similarity)

In [None]:
# Score every (vacancy, applicant) row with the model: one cosine similarity
# per field pair (title, description, location), averaged with equal weights.
# If different weights are ever wanted, the same weights must also be applied
# later in the API.

# Vacancy-side texts repeat on every applicant row of the same vacancy, so each
# distinct text is encoded once and its embedding reused.
vacancy_embeddings = {}


def _vacancy_embedding(text):
    """Return the embedding of a vacancy-side text, encoding each distinct text once."""
    if text not in vacancy_embeddings:
        vacancy_embeddings[text] = get_embedding(text, model)
    return vacancy_embeddings[text]


model_scores = []
for _, doc in dataset.iterrows():
    # Vacancy (query) embeddings
    query_title = _vacancy_embedding(doc.vacancy_title)
    query_description = _vacancy_embedding(doc.vacancy_description)
    query_location = _vacancy_embedding(doc.vacancy_location)

    # Applicant embeddings
    title = get_embedding(doc.applicant_title, model)
    description = get_embedding(doc.applicant_description, model)
    location = get_embedding(doc.applicant_location, model)

    # cosine_similarity returns a 1x1 matrix here; [0] keeps a 1-element row so
    # the arithmetic below matches the previous implementation exactly.
    titles_score = cosine_similarity([query_title], [title])[0]
    description_score = cosine_similarity([query_description], [description])[0]
    location_score = cosine_similarity([query_location], [location])[0]

    mean_score = (titles_score + description_score + location_score) / 3
    # Store a plain float instead of a 1-element array.
    model_scores.append(float(mean_score[0]))

# Assign the whole column at once, in row order, instead of cell-by-cell writes.
dataset["model_score"] = model_scores

In [None]:
# Make sure the relevance labels are numeric before they are handed to the metric.
label_column = "artificial_score"
dataset[label_column] = dataset[label_column].astype("float64")

In [None]:
# Collapse the rows into one row per vacancy, where each score column holds the
# list of that vacancy's values.
score_columns = ["vacancy_id", "artificial_score", "model_score"]
ndcg_dataset = dataset[score_columns].groupby("vacancy_id").agg(list)

In [None]:
# Notebook display only: number of non-null entries per column of the grouped frame.
ndcg_dataset.count()

# Trunca as listas de score para apenas 5 itens

In [None]:
# Keep only the first five entries of each per-vacancy list so that every row
# has the same length.
# NOTE(review): the five kept items are simply the first five in row order, not
# the top five by any score -- confirm this is the intended evaluation set.
for score_column in ("artificial_score", "model_score"):
    ndcg_dataset[score_column] = ndcg_dataset[score_column].apply(lambda scores: scores[:5])

In [None]:
# Notebook display only: show the per-vacancy score lists after truncation.
ndcg_dataset

In [None]:
# NDCG@5 across vacancies: status-derived labels are the ground truth and the
# model's similarity scores are the predicted ranking.
true_relevance = ndcg_dataset["artificial_score"].to_list()
predicted_relevance = ndcg_dataset["model_score"].to_list()
score = ndcg_score(true_relevance, predicted_relevance, k=5)
print(f"NDCG@5: {score:.4f}")

# Registra o resultado do NDCG no banco de dados para monitoramento

In [None]:
# Append the computed NDCG value as a one-row, one-column ("score") result to
# the ndcg_results table for monitoring.
# The driver class is the Connector/J "cj" class, matching the JDBC reads in
# this notebook (the legacy com.mysql.jdbc.Driver name is deprecated).
spark.sql(f"""SELECT {score} AS score""").write.jdbc(
    url="jdbc:mysql://localhost:3306/decision",
    table="ndcg_results",
    mode="append",
    properties={
        "driver": "com.mysql.cj.jdbc.Driver",
        "user": "decision",
        "password": "1234",
    },
)