## Gold Transformation Education Features

In [9]:
# Starting Spark Session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Education Gold Processing") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Loading silver layer data 
silver_df = spark.read.parquet("datamart/silver/combined_resume_jd/*.parquet")

# Structure Silver Data
from utils.spark_utils import pyspark_df_info
pyspark_df_info(silver_df)
silver_df.show(5, truncate=False)


Total entries: 6241
Data columns (total 36 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   snapshot_date             6241               date           
3   fit                       6241               string         
4   company_name              3908               string         
5   role_title                6046               string         
6   about_the_company         3664               string         
7   job_responsibilities      6241               array<string>  
8   jd_soft_skills            6241               array<string>  
9   required_language_proficiencies 6241               array<string>  
10  job_snapshot              6241               date           
11  jd_hard_skills_general    6240               array<string>  
12  jd_hard_skills_specific   

## Education Level Matching

In [2]:
print("Distinct required_edu_level values:")
print(silver_df.select('required_edu_level').distinct().count())

for edu_level in silver_df.select('required_edu_level').distinct().collect():
    c = silver_df.filter(silver_df.required_edu_level == edu_level.required_edu_level).count()
    print(f"{edu_level.required_edu_level} : {c}")

print("Distinct highest_edu_level values:")
print(silver_df.select('edu_highest_level').distinct().count())

for edu_level in silver_df.select('edu_highest_level').distinct().collect():
    c = silver_df.filter(silver_df.edu_highest_level == edu_level.edu_highest_level).count()
    print(f"{edu_level.edu_highest_level} : {c}")

Distinct required_edu_level values:
7
High School : 170
Master's Degree : 1069
Bachelor's Degree : 2984
Associate's Degree : 23
Others : 81
Doctorate : 51
None : 0
Distinct highest_edu_level values:
7
High School : 395
Master's Degree : 2365
Bachelor's Degree : 2147
Associate's Degree : 426
Others : 296
Doctorate : 150
None : 0


In [3]:
# education level matching
# 1. Transform education level to numeric values
from pyspark.sql.types import BooleanType, FloatType, StructType, StructField
from pyspark.sql.functions import udf, col


scale = spark.sparkContext.broadcast(spark.read.parquet("datamart/references/education_level_synonyms.parquet").collect())

def map_edu_level(req: str, given: str, scale: list):
    if req is None:
        if given is None:
            return (None, None)
        else:
            # No requirement, but education provided.
            return (True, 0.75)

    # From here, req is not None.
    if given is None:
        return (False, 0.0) # Requirement, but no education provided.

    req_n_list = [row.group_scale for row in scale if row.group_name == req]
    given_n_list = [row.group_scale for row in scale if row.group_name == given]

    # If required or given education level is not in our reference scale, we can't compare.
    if not req_n_list or not given_n_list:
        return (False, 0.0)

    req_n = req_n_list[0]
    given_n = given_n_list[0]

    if given_n == req_n:
        return (True, 1.0)
    elif given_n > req_n:
        return (True, 0.75)
    else: # given_n < req_n
        return (False, 0.0)

EDU_TMP_SCHEMA = StructType([
    StructField("edu_match", BooleanType(), True),
    StructField("edu_score", FloatType(), True),
])


df = (
    silver_df
    .withColumn("tmp_edu_match", udf(lambda req, giv: map_edu_level(req, giv, scale.value), EDU_TMP_SCHEMA)(col("required_edu_level"), col("edu_highest_level")))
    .withColumn("edu_match", col("tmp_edu_match.edu_match"))
    .withColumn("edu_score", col("tmp_edu_match.edu_score"))
    .drop("tmp_edu_match")
)

print("Distinct edu_match values:")
for match in df.select('edu_match').distinct().collect():
    c = df.filter(df.edu_match == match.edu_match).count()
    print(f"{match.edu_match} : {c}")

print("Distinct edu_score values:")
for score in df.select('edu_score').distinct().collect():
    c = df.filter(df.edu_score == score.edu_score).count()
    print(f"{score.edu_score} : {c}")

df.select("required_edu_level", "edu_highest_level", "edu_match", "edu_score").show(10, truncate=False)


Distinct edu_match values:
True : 4570
False : 1521
None : 0
Distinct edu_score values:
0.75 : 3075
1.0 : 1495
None : 0
0.0 : 1521
+------------------+-----------------+---------+---------+
|required_edu_level|edu_highest_level|edu_match|edu_score|
+------------------+-----------------+---------+---------+
|NULL              |Master's Degree  |true     |0.75     |
|Bachelor's Degree |Master's Degree  |true     |0.75     |
|NULL              |Master's Degree  |true     |0.75     |
|NULL              |Master's Degree  |true     |0.75     |
|Bachelor's Degree |Others           |false    |0.0      |
|Bachelor's Degree |Doctorate        |true     |0.75     |
|NULL              |Bachelor's Degree|true     |0.75     |
|Master's Degree   |Bachelor's Degree|false    |0.0      |
|NULL              |Bachelor's Degree|true     |0.75     |
|NULL              |Others           |true     |0.75     |
+------------------+-----------------+---------+---------+
only showing top 10 rows


In [4]:
pyspark_df_info(df)


Total entries: 6241
Data columns (total 38 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   snapshot_date             6241               date           
3   fit                       6241               string         
4   company_name              3908               string         
5   role_title                6046               string         
6   about_the_company         3664               string         
7   job_responsibilities      6241               array<string>  
8   jd_soft_skills            6241               array<string>  
9   required_language_proficiencies 6241               array<string>  
10  job_snapshot              6241               date           
11  jd_hard_skills_general    6240               array<string>  
12  jd_hard_skills_specific   

## Education Field Matching

In [19]:
# Loading silver layer data 
df = spark.read.parquet("datamart/silver/combined_resume_jd/*.parquet")

# Structure Silver Data
from utils.spark_utils import pyspark_df_info
pyspark_df_info(df)
df.show(5, truncate=False)


Total entries: 6241
Data columns (total 36 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   snapshot_date             6241               date           
3   fit                       6241               string         
4   company_name              3908               string         
5   role_title                6046               string         
6   about_the_company         3664               string         
7   job_responsibilities      6241               array<string>  
8   jd_soft_skills            6241               array<string>  
9   required_language_proficiencies 6241               array<string>  
10  job_snapshot              6241               date           
11  jd_hard_skills_general    6240               array<string>  
12  jd_hard_skills_specific   

In [11]:
df['required_edu_field', 'edu_field'].show(10, truncate=False)

+---------------------+---------------------+
|required_edu_field   |edu_field            |
+---------------------+---------------------+
|NULL                 |Engineering (General)|
|Computer Science & IT|Computer Science & IT|
|NULL                 |Education & Training |
|NULL                 |Computer Science & IT|
|Business & Management|Engineering (General)|
|Others               |Engineering (General)|
|NULL                 |Others               |
|Finance & Accounting |Business & Management|
|NULL                 |Engineering (General)|
|NULL                 |Finance & Accounting |
+---------------------+---------------------+
only showing top 10 rows


In [12]:
df.select('required_edu_field').distinct().show(truncate=False)

+-------------------------------------+
|required_edu_field                   |
+-------------------------------------+
|Mathematics & Statistics             |
|Business & Management                |
|Computer Science & IT                |
|Architecture & Design                |
|Education & Training                 |
|Environmental & Agricultural Sciences|
|Others                               |
|Finance & Accounting                 |
|Arts & Creative Fields               |
|Economics                            |
|Engineering (General)                |
|NULL                                 |
+-------------------------------------+



In [13]:
from pyspark.sql import functions as F

def map_edu_field(req: str, given: str) -> bool:
    if req is None or req == "Others":
        return True
    elif req == given:
        return True
    else:
        return False

df = (
    df.withColumn("edu_field_match", F.udf(map_edu_field, BooleanType())(F.col("required_edu_field"), F.col("edu_field")))
)

print("Distinct edu_field_match values:")
for match in df.select('edu_field_match').distinct().collect():
    c = df.filter(df.edu_field_match == match.edu_field_match).count()
    print(f"{match.edu_field_match} : {c}")

Distinct edu_field_match values:
True : 3952
False : 2289


In [14]:
pyspark_df_info(df)


Total entries: 6241
Data columns (total 37 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   snapshot_date             6241               date           
3   fit                       6241               string         
4   company_name              3908               string         
5   role_title                6046               string         
6   about_the_company         3664               string         
7   job_responsibilities      6241               array<string>  
8   jd_soft_skills            6241               array<string>  
9   required_language_proficiencies 6241               array<string>  
10  job_snapshot              6241               date           
11  jd_hard_skills_general    6240               array<string>  
12  jd_hard_skills_specific   

## Cert matching

In [15]:
df.select('required_cert_categories', 'cert_categories').show(10, truncate=False)

+-----------------------------------------------+---------------+
|required_cert_categories                       |cert_categories|
+-----------------------------------------------+---------------+
|[]                                             |[]             |
|[]                                             |[]             |
|[]                                             |[]             |
|[]                                             |[]             |
|[]                                             |[]             |
|[Business Analysis, Agile & Project Management]|[]             |
|[ERP & Business Software]                      |[]             |
|[]                                             |[]             |
|[ERP & Business Software]                      |[]             |
|[]                                             |[]             |
+-----------------------------------------------+---------------+
only showing top 10 rows


In [22]:
def map_certification(req: list, given: list) -> bool:
    if req is None or not req or len(req) == 0:
        return True
    # from here, req is not None and has at least one element.
    if given is None or not given or len(given) == 0:
        return False
    # from here, req and given are not None and have at least one element.
    # if any given certification matches any required certification, return True.
    for r in req:
        if r in given:
            return True
    return False

df = (
    df.withColumn("cert_match", F.udf(map_certification, BooleanType())(F.col("required_cert_categories"), F.col("cert_categories")))
)

print("Distinct cert_match values:")
for match in df.select('cert_match').distinct().collect():
    c = df.filter(df.cert_match == match.cert_match).count()
    print(f"{match.cert_match} : {c}")

Distinct cert_match values:
True : 4534
False : 1707
