## Processing of Job Descriptions Required Education

Goal:<br>
Transform the single variable 'required_education' into two variables: 'required_education_level' and 'required_education_field'.

Assumptions:<br>
- Choose the education level based on the lowest level referred to (e.g. bachelor required, master preferred --> bachelor)
- something else?

In [1]:
# Initialisation
from typing import Optional

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T

from utils.spark_utils import pyspark_df_info

In [8]:
# Build (or reuse) a local Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("Silver Processing Education")
    .master("local[*]")
    .getOrCreate()
)
# Only report errors so Spark's log output does not flood the cell outputs
spark.sparkContext.setLogLevel("ERROR")

In [9]:
# Load every Parquet file of the silver layer and print a column overview
silver_path = 'data/silver/*.parquet'
df = spark.read.parquet(silver_path)
pyspark_df_info(df)


Total entries: 6241
Data columns (total 32 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   _id                       6241               string         
3   fit                       6241               string         
4   snapshot_date             6241               date           
5   fit_score                 6241               double         
6   company_name              3908               string         
7   role_title                6046               string         
8   employment_type           6196               string         
9   job_location              4075               string         
10  about_the_company         3664               string         
11  job_responsibilities      6241               array<string>  
12  required_hard_skills      6241  

In [10]:
# Keep only the identifier columns and the raw education requirement text
education_columns = ['job_id', '_id', 'snapshot_date', 'job_snapshot', 'required_education']
df_education = df.select(*education_columns)
df_education.show(5, truncate=False)
print(f"Total records: {df_education.count()}")

# Drop the rows that have no education requirement to parse
has_required_education = F.col('required_education').isNotNull()
df_education = df_education.where(has_required_education)
print(f"Records after filtering nulls: {df_education.count()}")

+-----------+------------------------+-------------+------------+--------------------------------------------------------------------------------------------------+
|job_id     |_id                     |snapshot_date|job_snapshot|required_education                                                                                |
+-----------+------------------------+-------------+------------+--------------------------------------------------------------------------------------------------+
|JD_s93wTCLp|6845479b07df3572368fc32c|2021-07-15   |2021-07-15  |NULL                                                                                              |
|JD_Z7yf1tu6|684547e670634603a66d8deb|2021-07-24   |2021-07-24  |Bachelor's degree in computer science or a related discipline required. Master's degree preferred.|
|JD_hAp1XnJZ|684547ed70634603a66d8def|2021-07-21   |2021-07-21  |NULL                                                                                              |
|JD_zsgGxd

In [19]:
from utils import edu_utils as U

# Write the level and field synonym tables to the Parquet paths that the mapping cells read back
for synonyms_path, synonyms in (
    ('data/education_level_synonyms.parquet', U.education_levels),
    ('data/education_field_synonyms.parquet', U.education_fields),
):
    U.save_education_synonyms(spark, synonyms_path, synonyms)


In [11]:
# Using rapidfuzz for fuzzy matching
from utils.edu_utils import determine_edu_mapping

# Bring the level synonym table to the driver and broadcast it so the UDF can read it
education_levels_list = spark.read.parquet('data/education_level_synonyms.parquet').collect()
edu_levels = spark.sparkContext.broadcast(education_levels_list)

# Wrap the shared mapping helper in a string-returning UDF bound to the level table
map_education_level = F.udf(
    lambda text: determine_edu_mapping(text, edu_levels.value),
    T.StringType(),
)

# Derive the mapped level from the raw requirement text and inspect the result
df_education = df_education.withColumn(
    'education_level',
    map_education_level(F.col('required_education')),
)
df_education.show(10, truncate=False)
pyspark_df_info(df_education)

+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|job_id     |_id                     |snapshot_date|job_snapshot|required_education                                                                                                                      |education_level  |
+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|JD_Z7yf1tu6|684547e670634603a66d8deb|2021-07-24   |2021-07-24  |Bachelor's degree in computer science or a related discipline required. Master's degree preferred.                                      |Bachelor's Degree|
|JD_RrDNNvMz|684547f470634603a66d8df4|2021-07-13   |2021-07-13  |Bachelor's degree in Business Administration, Infor

In [12]:
# Using rapidfuzz for fuzzy matching
from utils.edu_utils import determine_edu_mapping

# Bring the field synonym table to the driver and broadcast it so the UDF can read it
education_fields_list = spark.read.parquet('data/education_field_synonyms.parquet').collect()
edu_fields = spark.sparkContext.broadcast(education_fields_list)

# Wrap the shared mapping helper in a string-returning UDF bound to the field table
map_education_field = F.udf(
    lambda text: determine_edu_mapping(text, edu_fields.value),
    T.StringType(),
)

# Derive the mapped field from the raw requirement text and inspect the result
df_education = df_education.withColumn(
    'education_field',
    map_education_field(F.col('required_education')),
)
df_education.show(10, truncate=False)
pyspark_df_info(df_education)

+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+-----------------+---------------------+
|job_id     |_id                     |snapshot_date|job_snapshot|required_education                                                                                                                      |education_level  |education_field      |
+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+-----------------+---------------------+
|JD_Z7yf1tu6|684547e670634603a66d8deb|2021-07-24   |2021-07-24  |Bachelor's degree in computer science or a related discipline required. Master's degree preferred.                                      |Bachelor's Degree|Computer Science & IT|
|JD_RrDNNvMz|684547f47063460

## Education Level Parsing

In [19]:
# Reference table of education levels: (name, ordinal scale, description, lower-case synonyms).
# NOTE(review): very short synonyms such as "ba", "ma" or "ms" may also occur inside
# unrelated words when a partial/fuzzy scorer is used — confirm they do not cause false matches.
education_level_rows = [
    (
        "High School", 1, "Completion of high school or equivalent",
        ["high school", "ged", "secondary", "college", "diploma"],
    ),
    (
        "Associate's Degree", 2, "Completion of a two-year degree program or certificate",
        ["associate's", "a.a", "a.s", "a.a.s", "certificate"],
    ),
    (
        "Bachelor's Degree", 3, "Completion of a undergraduate degree program",
        [
            "bachelor", "bachelor's degree", "bachelor's in", "undergraduate degree",
            "ba", "bs", "bsc", "bba", "beng",
        ],
    ),
    (
        "Master's Degree", 4, "Completion of a postgraduate degree program",
        [
            "master", "master's degree", "master's in", "postgraduate degree",
            "ma", "ms", "msc", "mba", "m.eng",
        ],
    ),
    (
        "Doctorate", 5, "Completion of a doctoral degree program",
        ["phd", "md", "jd", "doctor", "doctorate", "doctorate degree", "doctoral degree"],
    ),
]

# Explicit schema: name and scale are mandatory, description and synonyms are nullable
education_level_schema = T.StructType([
    T.StructField('level_name', T.StringType(), False),
    T.StructField('level_scale', T.IntegerType(), False),
    T.StructField('level_description', T.StringType(), True),
    T.StructField('level_references', T.ArrayType(T.StringType()), True),
])

df_education_levels = spark.createDataFrame(data=education_level_rows, schema=education_level_schema)

df_education_levels.show(5, truncate=False)
pyspark_df_info(df_education_levels)

# Persist as Parquet rather than CSV, replacing any previous version of the table
simplified_levels_path = 'data/education_level_synonyms_simplified.parquet'
df_education_levels.write.mode('overwrite').parquet(simplified_levels_path)

# Read the table back from Parquet (keeps the schema, including the array column)
df_education_levels = spark.read.parquet(simplified_levels_path)
pyspark_df_info(df_education_levels)

+------------------+-----------+------------------------------------------------------+------------------------------------------------------------------------------------------+
|level_name        |level_scale|level_description                                     |level_references                                                                          |
+------------------+-----------+------------------------------------------------------+------------------------------------------------------------------------------------------+
|High School       |1          |Completion of high school or equivalent               |[high school, ged, secondary, college, diploma]                                           |
|Associate's Degree|2          |Completion of a two-year degree program or certificate|[associate's, a.a, a.s, a.a.s, certificate]                                               |
|Bachelor's Degree |3          |Completion of a undergraduate degree program          |[bachelor, bachelo

In [None]:
# Collect the simplified level table on the driver and broadcast the resulting rows
simplified_levels_path = 'data/education_level_synonyms_simplified.parquet'
education_levels_list = spark.read.parquet(simplified_levels_path).collect()
broadcasted_education_levels = spark.sparkContext.broadcast(education_levels_list)

# Sanity check: number of broadcast rows and a peek at the first two
levels_on_driver = broadcasted_education_levels.value
print(f"Broadcasted education levels: {len(levels_on_driver)}")
print(f"Broadcasted education levels sample: \n{levels_on_driver[:2]}")

Broadcasted education levels: 5
Broadcasted education levels sample: 
[Row(level_name="Associate's Degree", level_scale=2, level_description='Completion of a two-year degree program or certificate', level_references=["associate's", 'a.a', 'a.s', 'a.a.s', 'certificate']), Row(level_name="Bachelor's Degree", level_scale=3, level_description='Completion of a undergraduate degree program', level_references=['bachelor', "bachelor's degree", "bachelor's in", 'undergraduate degree'])]


In [None]:
# Using rapidfuzz for fuzzy matching
from rapidfuzz import fuzz, process

# Load the simplified level rows onto the driver, then broadcast them for use inside the UDF
simplified_levels_path = 'data/education_level_synonyms_simplified.parquet'
education_levels_list = spark.read.parquet(simplified_levels_path).collect()
broadcasted_education_levels = spark.sparkContext.broadcast(education_levels_list)

# For each entry in df_education, determine the education level by matching the
# required_education text against the level_references of every known level
def determine_education_level(education_input: str) -> Optional[int]:
    """Map a free-text education requirement to a numeric education level.

    Each broadcast level is scored by its best-matching synonym using
    ``fuzz.partial_token_set_ratio``; the level with the highest score wins.
    Levels are visited in ascending ``level_scale`` order and a level only
    replaces the current best on a strictly higher score, so equal scores
    resolve to the lowest level. This keeps the result independent of the
    order in which the rows were read from Parquet and matches the notebook's
    stated assumption of choosing the lowest level referred to.

    Args:
        education_input: Raw requirement text, or None.

    Returns:
        The ``level_scale`` of the best-scoring level, ``None`` when the input
        is None, or ``0`` when no synonym scores above zero.
    """
    if education_input is None:
        return None

    # Lower-case once; the synonyms in the reference table are lower-case
    query = education_input.lower()

    best_score = 0
    best_match_level = 0
    # Ascending scale + strict '>' below => ties go to the lowest level
    levels_by_scale = sorted(broadcasted_education_levels.value, key=lambda row: row.level_scale)
    for row in levels_by_scale:
        # level_references is nullable in the schema; treat a missing list as "no synonyms"
        match_result = process.extractOne(
            query,
            row.level_references or [],
            scorer=fuzz.partial_token_set_ratio,
        )
        if match_result and match_result[1] > best_score:
            best_score, best_match_level = match_result[1], row.level_scale

    return best_match_level


# Wrap the matcher in an integer-returning UDF and compute the level for every row
education_level_udf = F.udf(determine_education_level, T.IntegerType())
df_education = df_education.withColumn(
    'education_level',
    education_level_udf(F.col('required_education')),
)
df_education.show(10, truncate=False)
pyspark_df_info(df_education)

+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------+
|job_id     |_id                     |snapshot_date|job_snapshot|required_education                                                                                                                      |education_level|
+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------+
|JD_Z7yf1tu6|684547e670634603a66d8deb|2021-07-24   |2021-07-24  |Bachelor's degree in computer science or a related discipline required. Master's degree preferred.                                      |3              |
|JD_RrDNNvMz|684547f470634603a66d8df4|2021-07-13   |2021-07-13  |Bachelor's degree in Business Administration, Information T

In [23]:
# Aggregate once instead of filtering (and re-running the Python UDF) five times
level_counts = {
    row['education_level']: row['count']
    for row in df_education.groupBy('education_level').count().collect()
}
# Report every level on the 1-5 scale, printing 0 for levels that never occur
for level in range(1, 6):
    print(f"Count for education level {level}: {level_counts.get(level, 0)}")


Count for education level 1: 78
Count for education level 2: 30
Count for education level 3: 3639
Count for education level 4: 348
Count for education level 3: 3639
Count for education level 4: 348
Count for education level 5: 283
Count for education level 5: 283


## Certification Parsing

In [17]:
# Start again from the silver data and keep only jobs that list at least one certification
df = spark.read.parquet('data/silver/*.parquet')
has_certifications = F.col('jd_certifications').isNotNull() & (F.size('jd_certifications') > 0)
df_certifications = (
    df
    .select('job_id', 'required_education', 'jd_certifications')
    .where(has_certifications)
)

pyspark_df_info(df_certifications)
df_certifications.show(5, truncate=False)


Total entries: 1778
Data columns (total 3 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   job_id                    1778               string         
1   required_education        1403               string         
2   jd_certifications         1778               array<string>  


+-----------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|job_id     |required_education                                 |jd_certifications                                                                                                                                                         |
+-----------+---------------------------------------------------+---------------------------------------------------------------------------------

In [29]:
# One row per individual certification instead of one array per job
df_exploded_certs = df_certifications.withColumn("certification", F.explode("jd_certifications"))

# Distinct certification names, used for both the count and the listing below
unique_certifications = df_exploded_certs.select("certification").distinct()
unique_certifications_count = unique_certifications.count()

print(f"Number of unique certifications: {unique_certifications_count}")

# List up to 150 of the distinct certification names without truncation
print("Sample of unique certifications:")
unique_certifications.show(150, truncate=False)

Number of unique certifications: 143
Sample of unique certifications:
+---------------------------------------------------------------------------------+
|certification                                                                    |
+---------------------------------------------------------------------------------+
|Airflow                                                                          |
|Ansible                                                                          |
|Quickbooks Certified                                                             |
|CSA (ServiceNow Certified System Administrator)                                  |
|EA                                                                               |
|AWS Certifications                                                               |
|MLFlow                                                                           |
|Salesforce Certification                                                         |
|CPA (

In [4]:
from utils import edu_utils as U

# Write the certification category table to the Parquet path that the mapping cell reads back
certification_categories_path = 'data/certification_categories.parquet'
U.save_education_synonyms(spark, certification_categories_path, U.certification_categories)


In [19]:
# Using rapidfuzz for fuzzy matching
from utils.edu_utils import determine_edu_mapping

# Bring the certification category table to the driver and broadcast it so the UDF can read it
certification_list = spark.read.parquet('data/certification_categories.parquet').collect()
cert_categories = spark.sparkContext.broadcast(certification_list)

# Wrap the shared mapping helper in a string-returning UDF bound to the category table
map_certification_category = F.udf(
    lambda certs: determine_edu_mapping(education_input=certs, mapping=cert_categories.value),
    T.StringType(),
)

# NOTE(review): the result is a certification category but the column is named
# 'education_field' — confirm whether that name is intended.
df_certifications = df_certifications.withColumn(
    'education_field',
    map_certification_category(F.col('jd_certifications')),
)
df_certifications.show(10, truncate=False)
pyspark_df_info(df_certifications)

+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
|job_id     |required_education                                                                                                                                                      |jd_certifications                                                                                                                                                         |education_field           |
+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------

## All together:

In [22]:
# imports
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import udf

from utils.edu_utils import determine_edu_mapping

# Re-read the original DataFrame so this cell does not depend on earlier transformations
df = spark.read.parquet('data/silver/*.parquet')

# Broadcast the education levels, fields, and certification categories
edu_levels = spark.sparkContext.broadcast(spark.read.parquet('data/education_level_synonyms.parquet').collect())
edu_fields = spark.sparkContext.broadcast(spark.read.parquet('data/education_field_synonyms.parquet').collect())
cert_categories = spark.sparkContext.broadcast(spark.read.parquet('data/certification_categories.parquet').collect())

# One string-returning UDF per lookup table; each reads its table from the broadcast variable
map_edu_level = udf(lambda x: determine_edu_mapping(x, edu_levels.value), StringType())
map_edu_field = udf(lambda x: determine_edu_mapping(x, edu_fields.value), StringType())
map_cert_field = udf(lambda x: determine_edu_mapping(x, cert_categories.value), StringType())

# Count certifications with the native size() function instead of a Python UDF.
# size(NULL) is not 0 (it is -1 or NULL depending on Spark settings), so map NULL to 0 explicitly.
no_of_certs = (
    F.when(F.col("jd_certifications").isNotNull(), F.size("jd_certifications"))
    .otherwise(F.lit(0))
)

# Apply the mappings, then drop the raw columns they were derived from
df = (
    df
    .withColumn("required_edu_level", map_edu_level("required_education"))
    .withColumn("required_edu_field", map_edu_field("required_education"))
    .withColumn("required_cert_field", map_cert_field("jd_certifications"))
    .withColumn("no_of_certs", no_of_certs)
    .drop("required_education", "jd_certifications")
)

# Show the final DataFrame
df.select("_id", "required_edu_level", "required_edu_field", "required_cert_field", "no_of_certs").show(10, truncate=False)
pyspark_df_info(df)

+------------------------+------------------+---------------------+--------------------------+-----------+
|_id                     |required_edu_level|required_edu_field   |required_cert_field       |no_of_certs|
+------------------------+------------------+---------------------+--------------------------+-----------+
|6845479b07df3572368fc32c|NULL              |NULL                 |Others                    |0          |
|684547e670634603a66d8deb|Bachelor's Degree |Computer Science & IT|Others                    |0          |
|684547ed70634603a66d8def|NULL              |NULL                 |Others                    |0          |
|684547f370634603a66d8df3|NULL              |NULL                 |Others                    |0          |
|684547f470634603a66d8df4|Bachelor's Degree |Business & Management|Others                    |0          |
|684547fc70634603a66d8df9|Bachelor's Degree |Others               |Agile & Project Management|1          |
|684547ff70634603a66d8dfb|NULL       