## Processing of Job Descriptions Required Education

Goal:</br>
Transform the single variable 'required_education' to two variables 'required_education_level' 'required_education_field'

Assumtions:</br>
- Choose education level based on lowest level referred to (e.g. bachelor required, master preferred --> bachelor)
- something else?

In [15]:
# Initialisation
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T

from utils.spark_utils import pyspark_df_info

In [16]:
spark = SparkSession.builder \
    .appName("Silver Processing Education") \
    .master("local[*]") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [17]:
# Loading data 
df = spark.read.parquet('data/silver/*.parquet')
pyspark_df_info(df)


Total entries: 6241
Data columns (total 32 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   _id                       6241               string         
3   fit                       6241               string         
4   snapshot_date             6241               date           
5   fit_score                 6241               double         
6   company_name              3908               string         
7   role_title                6046               string         
8   employment_type           6196               string         
9   job_location              4075               string         
10  about_the_company         3664               string         
11  job_responsibilities      6241               array<string>  
12  required_hard_skills      6241  

In [18]:
# Filtering to relevant columns & removing null values
df_education = df['job_id', '_id', 'snapshot_date', 'job_snapshot', 'required_education']
df_education.show(5, truncate=False)
print(f"Total records: {df_education.count()}")

df_education = df_education.filter(F.col('required_education').isNotNull())
print(f"Records after filtering nulls: {df_education.count()}")

+-----------+------------------------+-------------+------------+--------------------------------------------------------------------------------------------------+
|job_id     |_id                     |snapshot_date|job_snapshot|required_education                                                                                |
+-----------+------------------------+-------------+------------+--------------------------------------------------------------------------------------------------+
|JD_s93wTCLp|6845479b07df3572368fc32c|2021-07-15   |2021-07-15  |NULL                                                                                              |
|JD_Z7yf1tu6|684547e670634603a66d8deb|2021-07-24   |2021-07-24  |Bachelor's degree in computer science or a related discipline required. Master's degree preferred.|
|JD_hAp1XnJZ|684547ed70634603a66d8def|2021-07-21   |2021-07-21  |NULL                                                                                              |
|JD_zsgGxd

In [19]:
df_education_levels = spark.createDataFrame(
    data=[
        ("High School", 1, "Completion of high school or equivalent", 
            [
                "high school", "ged", "secondary", "college", "diploma"
            ]
        ),
        ("Associate's Degree", 2, "Completion of a two-year degree program or certificate", 
            [
                "associate's", "a.a", "a.s", "a.a.s", "certificate"
            ]
        ),
        ("Bachelor's Degree", 3, "Completion of a undergraduate degree program", 
            [
                "bachelor", "bachelor's degree", "bachelor's in", "undergraduate degree",
                "ba", "bs", "bsc", "bba", "beng"
                
            ]
        ),
        ("Master's Degree", 4, "Completion of a postgraduate degree program", 
            [
                "master", "master's degree", "master's in", "postgraduate degree",
                "ma", "ms", "msc", "mba", "m.eng"
            ]
        ),
        ("Doctorate", 5, "Completion of a doctoral degree program", 
            [
                "phd", "md", "jd", "doctor", "doctorate", "doctorate degree", "doctoral degree"
            ]
        )
    ],
    schema=T.StructType([
        T.StructField('level_name', T.StringType(), False),
        T.StructField('level_scale', T.IntegerType(), False),
        T.StructField('level_description', T.StringType(), True),
        T.StructField('level_references', T.ArrayType(T.StringType()), True)
    ])
)

df_education_levels.show(5, truncate=False)
pyspark_df_info(df_education_levels)
# Save as Parquet instead of CSV
df_education_levels.write.mode('overwrite').parquet('data/education_level_synonyms_simplified.parquet')

# Read back from Parquet (preserves schema and array types)
df_education_levels = spark.read.parquet('data/education_level_synonyms_simplified.parquet')
pyspark_df_info(df_education_levels)

+------------------+-----------+------------------------------------------------------+------------------------------------------------------------------------------------------+
|level_name        |level_scale|level_description                                     |level_references                                                                          |
+------------------+-----------+------------------------------------------------------+------------------------------------------------------------------------------------------+
|High School       |1          |Completion of high school or equivalent               |[high school, ged, secondary, college, diploma]                                           |
|Associate's Degree|2          |Completion of a two-year degree program or certificate|[associate's, a.a, a.s, a.a.s, certificate]                                               |
|Bachelor's Degree |3          |Completion of a undergraduate degree program          |[bachelor, bachelo

In [None]:
education_levels_list = spark.read.parquet('data/education_level_synonyms_simplified.parquet').collect()
broadcasted_education_levels = spark.sparkContext.broadcast(education_levels_list)

print(f"Broadcasted education levels: {len(broadcasted_education_levels.value)}")
print(f"Broadcasted education levels sample: \n{broadcasted_education_levels.value[:2]}")

Broadcasted education levels: 5
Broadcasted education levels sample: 
[Row(level_name="Associate's Degree", level_scale=2, level_description='Completion of a two-year degree program or certificate', level_references=["associate's", 'a.a', 'a.s', 'a.a.s', 'certificate']), Row(level_name="Bachelor's Degree", level_scale=3, level_description='Completion of a undergraduate degree program', level_references=['bachelor', "bachelor's degree", "bachelor's in", 'undergraduate degree'])]


In [None]:
# Using rapidfuzz for fuzzy matching
from rapidfuzz import fuzz, process

# Collect and broadcast the education levels data
education_levels_list = spark.read.parquet('data/education_level_synonyms_simplified.parquet').collect()
broadcasted_education_levels = spark.sparkContext.broadcast(education_levels_list)

# For each entry in df_education, determine the education level by trying to match the required_education field with the level_references
def determine_education_level(education_input: str) -> int:
    if education_input is None:
        return None

    best_score = 0
    best_match_level = 0
    # Use rapidfuzz to find the best match
    for row in broadcasted_education_levels.value:
        match_result = process.extractOne(education_input.lower(), row.level_references, scorer=fuzz.partial_token_set_ratio)
        if match_result and match_result[1] > best_score:
            best_score, best_match_level = match_result[1], row.level_scale

    return best_match_level


# iterate through the DataFrame and apply the function
df_education = df_education.withColumn(
    'education_level',
    F.udf(determine_education_level, T.IntegerType())(F.col('required_education'))
)
df_education.show(10, truncate=False)
pyspark_df_info(df_education)

+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------+
|job_id     |_id                     |snapshot_date|job_snapshot|required_education                                                                                                                      |education_level|
+-----------+------------------------+-------------+------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------+
|JD_Z7yf1tu6|684547e670634603a66d8deb|2021-07-24   |2021-07-24  |Bachelor's degree in computer science or a related discipline required. Master's degree preferred.                                      |3              |
|JD_RrDNNvMz|684547f470634603a66d8df4|2021-07-13   |2021-07-13  |Bachelor's degree in Business Administration, Information T

In [23]:
for l in range(1, 6):
    c = df_education.filter(F.col('education_level') == l).count()
    print(f"Count for education level {l}: {c}")


Count for education level 1: 78
Count for education level 2: 30
Count for education level 3: 3639
Count for education level 4: 348
Count for education level 3: 3639
Count for education level 4: 348
Count for education level 5: 283
Count for education level 5: 283
