In [1]:
import os
import glob

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, udf, explode, collect_list, array
from pyspark.sql.types import StringType
from pyspark.sql import functions as F

from utils.mongodb_to_parquet import get_pyspark_session
from utils.gdrive_utils import connect_to_gdrive, sync_gdrive_db_to_local

from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, FloatType
from rapidfuzz import process, fuzz

# Import data

## Import parquet-ified resumes & JD

In [2]:
spark = get_pyspark_session()

:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-09682370-5076-47b2-8baa-5888b1d48b2e;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;10.5.0 in central
	found org.mongodb#mongodb-driver-sync;5.1.4 in central
	[5.1.4] org.mongodb#mongodb-driver-sync;[5.1.1,5.1.99)
	found org.mongodb#bson;5.1.4 in central
	found org.mongodb#mongodb-driver-core;5.1.4 in central
	found org.mongodb#bson-record-codec;5.1.4 in central
:: resolution report :: resolve 3180ms :: artifacts dl 9ms
	:: modules in use:
	org.mongodb#bson;5.1.4 from central in [default]
	org.mongodb#bson-record-codec;5.1.4 from central in [default]
	org.mongodb#mongodb-driver-core;5.1.4 from central in [default]
	org.mongodb#mongodb-driver-sync;5.1.4 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;10.5.0 fr

In [3]:
def read_silver_table(table, silver_db, spark):
    """
    Helper function to read all partitions of a silver table
    """
    folder_path = os.path.join(silver_db, table)
    files_list = [os.path.join(folder_path, os.path.basename(f)) for f in glob.glob(os.path.join(folder_path, '*'))]
    df = spark.read.option("header", "true").parquet(*files_list)
    return df

In [4]:
df_jd = read_silver_table('job_descriptions', 'datamart/silver', spark)
df_resumes = read_silver_table('resumes', 'datamart/silver', spark)
df_labels = read_silver_table('labels', 'datamart/silver', spark)

                                                                                

In [5]:
# Select relevant columns for skill processing
df_resumes_skills = df_resumes.select('id', 'hard_skills', 'soft_skills') 
df_jd_skills = df_jd.select('id', 'required_hard_skills', 'required_soft_skills')

In [6]:
# Convert to lowercase
df_resumes_skills = df_resumes_skills.withColumn("hard_skills", expr("transform(hard_skills, x -> lower(x))"))
df_resumes_skills = df_resumes_skills.withColumn("soft_skills", expr("transform(soft_skills, x -> lower(x))"))

df_jd_skills = df_jd_skills.withColumn("required_hard_skills", expr("transform(required_hard_skills, x -> lower(x))"))
df_jd_skills = df_jd_skills.withColumn("required_soft_skills", expr("transform(required_soft_skills, x -> lower(x))"))

## Import skills list silver table

In [76]:
sync_gdrive_db_to_local()

Downloading data files...: 100%|██████████| 6/6 [00:15<00:00,  2.62s/it]

Download complete.





# Skills Standardization

In [7]:
df_hard_skills_keywords = read_silver_table('hardskill', 'datamart/silver', spark)
df_hard_skills_keywords.show()

                                                                                

+----------+--------------------+--------------------+
|keyword_id|             keyword|             example|
+----------+--------------------+--------------------+
|  43232202|document manageme...|adobe systems ado...|
|  43232306|data base user in...|     adsense tracker|
|  43232201|content workflow ...|      atlassian jira|
|  43232303|customer relation...|blackbaud the rai...|
|  43231601| accounting software|computerease cons...|
|  43232305|data base reporti...|database reportin...|
|  43232306|data base user in...|             databox|
|  43233501|electronic mail s...|      email software|
|  43231602|enterprise resour...|enterprise resour...|
|  43231605|time accounting s...|exact software ma...|
|  43232403|enterprise applic...|extensible markup...|
|  43231601| accounting software|fund accounting s...|
|  43232102|graphics or photo...|graphic presentat...|
|  43231505|human resources s...|        halogen e360|
|  43231505|human resources s...|    halogen epraisal|
|  4323150

In [8]:
df_soft_skills_keywords = read_silver_table('softskill', 'datamart/silver', spark)
df_soft_skills_keywords.show()

+----------+--------------------+--------------------+
|keyword_id|             keyword|            examples|
+----------+--------------------+--------------------+
|   2.a.1.a|reading comprehen...| getting information|
|   2.a.1.a|reading comprehen...|monitor processes...|
|   2.a.1.a|reading comprehen...|identifying objec...|
|   2.a.1.a|reading comprehen...|judging the quali...|
|   2.a.1.a|reading comprehen...|processing inform...|
|   2.a.1.a|reading comprehen...|evaluating inform...|
|   2.a.1.a|reading comprehen...|analyzing data or...|
|   2.a.1.a|reading comprehen...|making decisions ...|
|   2.a.1.a|reading comprehen...|updating and usin...|
|   2.a.1.a|reading comprehen...|interacting with ...|
|   2.a.1.a|reading comprehen...|drafting, laying ...|
|   2.a.1.a|reading comprehen...|interpreting the ...|
|   2.a.1.a|reading comprehen...|training and teac...|
|   2.a.1.a|reading comprehen...|provide consultat...|
|   2.a.1.a|reading comprehen...|performing admini...|
|   2.a.1.

## Fuzzy Matching

In [79]:
# df_resumes_skills = df_resumes_skills.withColumnRenamed("id", "resume_id")
# df_jd_skills = df_jd_skills.withColumnRenamed("id", "job_id")
# df_joined = df_labels.join(df_resumes_skills, on="resume_id", how="inner") 
# df_joined = df_joined.join(df_jd_skills, on="job_id", how="inner")
# df_joined.show()

In [23]:
def fuzzy_match_array_udf_factory(mapping_dict):
    def match_array(arr):
        keyword_matches = []
        example_matches = []
        scores = []

        reference_values = list(mapping_dict.keys())
        
        if arr:
            for x in arr:
                if x:
                    match = process.extractOne(x, reference_values, scorer=fuzz.token_set_ratio)
                    score = float(match[1])
                    if match and score > 80:
                        example_matches.append(match[0])
                        keyword_matches.append(mapping_dict[match[0]])
                        scores.append(score)  # similarity score as float
                    else:
                        example_matches.append(None)
                        keyword_matches.append(None)
                        scores.append(None)
                else:
                    keyword_matches.append(None)
                    example_matches.append(None)
                    scores.append(None)
            return keyword_matches, example_matches, scores
        return None
    
    schema = StructType([
        StructField("keyword_matches", ArrayType(StringType()), nullable=True),
        StructField("example_matches", ArrayType(StringType()), nullable=True),
        StructField("scores", ArrayType(FloatType()), nullable=True)
    ])
    
    return udf(match_array, schema)




In [None]:
# import math

# def fuzzy_match_array_udf_factory(reference_values):
#     counts = Counter(reference_values)
#     weighted_choices = [(k, int(v)) for k, v in counts.items()]

#     max_weight = counts.most_common(1)[0][1]

#     def match_array(arr):
#         matches = []
#         scores = []
        
#         if arr:
#             for item in arr:
#                 best_match = None
#                 best_score = -1.0
#                 best_original_score = -1.0
#                 if item:
#                     for ref, weight in weighted_choices:
#                         original_score = fuzz.ratio(item, ref)
#                         score = fuzz.ratio(item, ref) + math.log(weight)
#                         if score > best_score:
#                             best_score = score
#                             best_original_score = original_score
#                             best_match = ref
#                 matches.append(best_match)
#                 scores.append(best_original_score)
#             return matches, scores
#         return None
    
#     schema = StructType([
#         StructField("matches", ArrayType(StringType()), nullable=True),
#         StructField("scores", ArrayType(FloatType()), nullable=True)
#     ])
    
#     return udf(match_array, schema)


**Hard skills convert to keywords**

Standardize hard skills

In [None]:
hard_skills_ref = df_hard_skills_keywords.select("example").rdd.flatMap(lambda x: x).collect()

mapping_dict = dict(df_hard_skills_keywords.rdd.map(lambda row: (row['example'], row['keyword'])).collect())

fuzzy_match_udf = fuzzy_match_array_udf_factory(mapping_dict)

df_matched = df_resumes_skills.withColumn("hard_skills_fuzzy", fuzzy_match_udf("hard_skills"))

df_matched = df_matched \
    .withColumn("hard_skills_category", df_matched["hard_skills_fuzzy"]["keyword_matches"]) \
    .withColumn("hard_skills_standardized", df_matched["hard_skills_fuzzy"]["example_matches"]) \
    .withColumn("hard_skills_similarity_scores", df_matched["hard_skills_fuzzy"]["scores"]) \
    .drop("hard_skills_fuzzy")

df_matched.show()



+------------+--------------------+--------------------+--------------------+------------------------+-----------------------------+
|          id|         hard_skills|         soft_skills|hard_skills_category|hard_skills_standardized|hard_skills_similarity_scores|
+------------+--------------------+--------------------+--------------------+------------------------+-----------------------------+
|RES_s93wTCLp|[atm, c++, ccna, ...|[clients, procure...|[NULL, object or ...|    [NULL, c++, NULL,...|         [NULL, 100.0, NUL...|
|RES_Z7yf1tu6|[sql, pl/sql, tra...|[communication, t...|[data base user i...|    [structured query...|         [100.0, 100.0, 10...|
|RES_hAp1XnJZ|[database design,...|[excellent commun...|[data base manage...|    [database design ...|         [100.0, NULL, NUL...|
|RES_zsgGxd2s|[microsoft access...|[collaboration, c...|[data base user i...|    [microsoft access...|         [100.0, 100.0, NU...|
|RES_RrDNNvMz|[r, sql, data ana...|[innovative, deta...|[object or co

                                                                                



In [84]:

# # reference_list = ["java", "python", "sql", "machine learning"]

# # def fuzzy_match_array_udf_factory(reference_values):
# #     def match_array(arr):
# #         if arr:
# #             # For each element, find the best fuzzy match in reference_values
# #             return [process.extractOne(x, reference_values)[0] if x else None for x in arr]
# #         return None
# #     return udf(match_array, ArrayType(StringType()))


# # Only the match
# # def fuzzy_match_array_udf_factory():
# #     def match_array(arr1, arr2):
# #         if arr1 and arr2:
# #             return [
# #                 process.extractOne(x, arr2)[0] if x else None
# #                 for x in arr1
# #             ]
# #         return None
# #     return udf(match_array, ArrayType(StringType()))

# # Match and score
# def fuzzy_match_array_udf_factory():
#     def match_array(arr1, arr2):
#         if arr1 and arr2:
#             matches = []
#             scores = []
#             for x in arr1:
#                 if x:
#                     match = process.extractOne(x, arr2)
#                     if match:
#                         matches.append(match[0])
#                         scores.append(float(match[1]))  # similarity score as float
#                     else:
#                         matches.append(None)
#                         scores.append(None)
#                 else:
#                     matches.append(None)
#                     scores.append(None)
#             return matches, scores
#         return None, None
    
#     schema = StructType([
#         StructField("matches", ArrayType(StringType()), nullable=True),
#         StructField("scores", ArrayType(FloatType()), nullable=True)
#     ])
#     return udf(match_array, schema)


# fuzzy_match_udf = fuzzy_match_array_udf_factory()

# df_matched = df_joined.withColumn("hard_skills_fuzzy", fuzzy_match_udf("hard_skills", "required_hard_skills"))

# df_matched = df_matched \
#     .withColumn("hard_skills_matched", df_matched["hard_skills_fuzzy"]["matches"]) \
#     .withColumn("hard_skills_similarity_scores", df_matched["hard_skills_fuzzy"]["scores"]) \
#     .drop("hard_skills_fuzzy")

# df_matched.show()


In [25]:
row = df_matched.filter(df_matched.id == "RES_s93wTCLp").first().asDict()
for skill, category, keyword, score in zip(row['hard_skills'],
                        row['hard_skills_category'], 
                        row['hard_skills_standardized'],
                        row['hard_skills_similarity_scores']):
    print(f"Original skill: {skill}, Matched category: {category}, Keyword: {keyword}, Score: {score}"  )

Original skill: atm, Matched category: None, Keyword: None, Score: None
Original skill: c++, Matched category: object or component oriented development software, Keyword: c++, Score: 100.0
Original skill: ccna, Matched category: None, Keyword: None, Score: None
Original skill: cisco, Matched category: access software, Keyword: cisco anyconnect, Score: 100.0
Original skill: dsl, Matched category: None, Keyword: None, Score: None
Original skill: ethernet, Matched category: None, Keyword: None, Score: None
Original skill: lan, Matched category: lan software, Keyword: lan software, Score: 100.0
Original skill: linux, Matched category: operating system software, Keyword: linux, Score: 100.0
Original skill: lotus notes, Matched category: electronic mail software, Keyword: ibm lotus notes, Score: 100.0
Original skill: matlab, Matched category: analytical or scientific software, Keyword: the mathworks matlab, Score: 100.0
Original skill: microsoft excel, Matched category: spreadsheet software,

Convert hard skills to keyword

**Soft skills convert to keywords**

In [44]:
soft_skills_ref = df_soft_skills_keywords.select("keyword").dropDuplicates().rdd.flatMap(lambda x: x).collect()

fuzzy_match_udf = fuzzy_match_array_udf_factory(soft_skills_ref)

df_matched_soft = df_resumes_skills.withColumn("soft_skills_fuzzy", fuzzy_match_udf("soft_skills"))

df_matched_soft = df_matched_soft \
    .withColumn("soft_skills_matched", df_matched_soft["soft_skills_fuzzy"]["matches"]) \
    .withColumn("soft_skills_similarity_scores", df_matched_soft["soft_skills_fuzzy"]["scores"]) \
    .drop("soft_skills_fuzzy")

df_matched_soft.show()

+------------+--------------------+--------------------+--------------------+-----------------------------+
|          id|         hard_skills|         soft_skills| soft_skills_matched|soft_skills_similarity_scores|
+------------+--------------------+--------------------+--------------------+-----------------------------+
|RES_s93wTCLp|[atm, c++, ccna, ...|[clients, procure...|[active listening...|         [43.47826, 38.461...|
|RES_Z7yf1tu6|[sql, pl/sql, tra...|[communication, t...|[coordination, sp...|         [64.0, 37.5, 40.0...|
|RES_hAp1XnJZ|[database design,...|[excellent commun...|[service orientat...|         [44.89796, 48.780...|
|RES_zsgGxd2s|[microsoft access...|[collaboration, c...|[coordination, se...|         [72.0, 60.869564,...|
|RES_RrDNNvMz|[r, sql, data ana...|[innovative, deta...|[service orientat...|         [48.275864, 47.05...|
|RES_7rYoKTcb|[matlab, simulink...|[communication, t...|[coordination, sp...|         [64.0, 37.5, 40.0...|
|RES_dI74kKQw|[html, html5, 

In [45]:
row = df_matched_soft.filter(df_matched_soft.id == "RES_mSxlSENu").first().asDict()
for skill, match, score in zip(row['soft_skills'],
                        row['soft_skills_matched'], 
                        row['soft_skills_similarity_scores']):
    print(f"Original skill: {skill}, Matched word: {match}, Score: {score}"  )

Original skill: communication, Matched word: coordination, Score: 64.0
Original skill: teamwork, Matched word: speaking, Score: 37.5
Original skill: leadership, Matched word: active learning, Score: 40.0
Original skill: problem-solving, Matched word: complex problem solving, Score: 73.68421173095703
Original skill: analytical skills, Matched word: critical thinking, Score: 47.05882263183594
Original skill: project management, Matched word: time management, Score: 80.0
Original skill: client handling, Matched word: active learning, Score: 60.0
Original skill: written communication skills, Matched word: service orientation, Score: 51.0638313293457


## Embedding

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_mistralai import MistralAIEmbeddings

def embed_match_array_udf_factory(keywords_list):

    def match_array(arr):
        embeddings = MistralAIEmbeddings(model="mistral-embed")
   
        vectorstore = InMemoryVectorStore.from_texts(
            keywords_list,
            embedding=embeddings
        )
        
        if arr:
            matches = []
            scores = []

            for x in arr:
                if x:
                    result = vectorstore.similarity_search_with_score(x, k=1)
                    top_doc, score = result[0]

                    matches.append(top_doc.page_content)
                    scores.append(score)

            return matches, scores
        return None, None
        
    schema = StructType([
        StructField("matches", ArrayType(StringType()), nullable=True),
        StructField("scores", ArrayType(FloatType()), nullable=True)
    ])
    
    return udf(match_array, schema)


In [50]:
soft_skills_ref = df_soft_skills_keywords.select("keyword").dropDuplicates().rdd.flatMap(lambda x: x).collect()

embed_match_udf = embed_match_array_udf_factory(soft_skills_ref)

df_matched_soft = df_resumes_skills.withColumn("soft_skills_embed", embed_match_udf("soft_skills"))

df_matched_soft = df_matched_soft \
    .withColumn("soft_skills_matched", df_matched_soft["soft_skills_embed"]["matches"]) \
    .withColumn("soft_skills_similarity_scores", df_matched_soft["soft_skills_embed"]["scores"]) \
    .drop("soft_skills_embed")

df_matched_soft.show()

[Stage 79:>                                                         (0 + 1) / 1]

+------------+--------------------+--------------------+--------------------+-----------------------------+
|          id|         hard_skills|         soft_skills| soft_skills_matched|soft_skills_similarity_scores|
+------------+--------------------+--------------------+--------------------+-----------------------------+
|RES_s93wTCLp|[atm, c++, ccna, ...|[clients, procure...|[writing, negotia...|         [0.6538537, 0.714...|
|RES_Z7yf1tu6|[sql, pl/sql, tra...|[communication, t...|[speaking, coordi...|         [0.7909141, 0.760...|
|RES_hAp1XnJZ|[database design,...|[excellent commun...|[active listening...|         [0.74233043, 0.72...|
|RES_zsgGxd2s|[microsoft access...|[collaboration, c...|[coordination, se...|         [0.7823948, 0.755...|
|RES_RrDNNvMz|[r, sql, data ana...|[innovative, deta...|[critical thinkin...|         [0.6805059, 0.726...|
|RES_7rYoKTcb|[matlab, simulink...|[communication, t...|[speaking, coordi...|         [0.7909141, 0.760...|
|RES_dI74kKQw|[html, html5, 

                                                                                

In [51]:
row = df_matched_soft.filter(df_matched_soft.id == "RES_mSxlSENu").first().asDict()
for skill, match, score in zip(row['soft_skills'],
                        row['soft_skills_matched'], 
                        row['soft_skills_similarity_scores']):
    print(f"Original skill: {skill}, Matched word: {match}, Score: {score}"  )



Original skill: communication, Matched word: speaking, Score: 0.7909141182899475
Original skill: teamwork, Matched word: coordination, Score: 0.7602235078811646
Original skill: leadership, Matched word: critical thinking, Score: 0.747788667678833
Original skill: problem-solving, Matched word: complex problem solving, Score: 0.8326230049133301
Original skill: analytical skills, Matched word: critical thinking, Score: 0.777920126914978
Original skill: project management, Matched word: time management, Score: 0.82828289270401
Original skill: client handling, Matched word: management of personnel resources, Score: 0.6217860579490662
Original skill: written communication skills, Matched word: writing, Score: 0.7942733764648438


                                                                                