In [22]:
print('hello world')

hello world


In [23]:
# Starting Spark Session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Education Gold Processing") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
# Loading silver layer data 
silver_df = spark.read.parquet("datamart/silver/combined_resume_jd/*.parquet")

# Structure Silver Data
from utils.spark_utils import pyspark_df_info
pyspark_df_info(silver_df)
silver_df.show(5, truncate=False)


Total entries: 6241
Data columns (total 36 columns):
#   Column                    Non-Null Count     Dtype          
--- ------------------------- ------------------ ---------------
0   resume_id                 6241               string         
1   job_id                    6241               string         
2   snapshot_date             6241               date           
3   fit                       6241               string         
4   company_name              3908               string         
5   role_title                6046               string         
6   about_the_company         3664               string         
7   job_responsibilities      6241               array<string>  
8   jd_soft_skills            6241               array<string>  
9   required_language_proficiencies 6241               array<string>  
10  job_snapshot              6241               date           
11  jd_hard_skills_general    6240               array<string>  
12  jd_hard_skills_specific   

In [24]:
from pyspark.sql.functions import col, sum

# Assuming silver_df is your PySpark DataFrame
null_counts = silver_df.select(
    sum(col("required_edu_level").isNull().cast("int")).alias("null_count_required_edu_level"), # Alias is 'null_count_required_edu_level'
    sum(col("edu_highest_level").isNull().cast("int")).alias("null_count_highest_edu_level")
).collect()[0]

# --- This is the line that needs to be corrected ---
print(f"Null values in 'required_edu_level': {null_counts['null_count_required_edu_level']}")
# --- Make sure the key matches the alias exactly: ^ here

print(f"Null values in 'highest_edu_level': {null_counts['null_count_highest_edu_level']}")

Null values in 'required_edu_level': 1863
Null values in 'highest_edu_level': 462


In [25]:
print("Distinct required_edu_level values:")
print(silver_df.select('required_edu_level').distinct().count())



for edu_level in silver_df.select('required_edu_level').distinct().collect():
    c = silver_df.filter(silver_df.required_edu_level == edu_level.required_edu_level).count()
    print(f"{edu_level.required_edu_level} : {c}")

print("Distinct highest_edu_level values:")
print(silver_df.select('edu_highest_level').distinct().count())

for edu_level in silver_df.select('edu_highest_level').distinct().collect():
    c = silver_df.filter(silver_df.edu_highest_level == edu_level.edu_highest_level).count()
    print(f"{edu_level.edu_highest_level} : {c}")

Distinct required_edu_level values:
7
High School : 170
Master's Degree : 1069
Bachelor's Degree : 2984
Associate's Degree : 23
Others : 81
Doctorate : 51
None : 0
Distinct highest_edu_level values:
7
High School : 395
Master's Degree : 2365
Bachelor's Degree : 2147
Associate's Degree : 426
Others : 296
Doctorate : 150
None : 0


In [28]:
# Assuming you have a SparkSession named 'spark' and your DataFrame 'silver_df' is loaded.
# For example:
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("FeatureEngineering").getOrCreate()
# silver_df = spark.read.load(...) # Load your silver data

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType

# --- Step 1: Define the Ordinal Mapping ---
# This Python dictionary holds the ordinal mapping we've defined.
edu_level_mapping = {
    'Others': 0,
    'High School': 1,
    'Associate\'s Degree': 2,
    'Bachelor\'s Degree': 3,
    'Master\'s Degree': 4,
    'Doctorate': 5
}
# Note: 'None' is intentionally left out. It will become null, which we'll handle.

# --- Step 2: Create a Spark Mapping Expression ---
# We convert the Python dictionary into a Spark mapping expression by first flattening it
# into a list of [key1, value1, key2, value2, ...]. This is a more robust approach.
flat_map_list = [item for sublist in edu_level_mapping.items() for item in sublist]
mapping_expr = F.create_map([F.lit(x) for x in flat_map_list])

# --- Step 3: Add Ordinal Rank Columns to the DataFrame ---
# We create two new temporary columns that hold the integer rank for the candidate's
# education and the job's required education.
# The `mapping_expr[F.col(...)]` syntax performs the lookup.
gold_df = silver_df.withColumn(
    "highest_edu_rank",
    mapping_expr[F.col("edu_highest_level")].cast(IntegerType())
).withColumn(
    "required_edu_rank",
    mapping_expr[F.col("required_edu_level")].cast(IntegerType())
)

# --- Step 4: Create the 'edu_match_flag' Feature (Updated Null Logic) ---
# This feature indicates if the education level is a match, with special handling for nulls.
# - A candidate with a Doctorate (rank 5) is always considered a match (1).
# - If both levels are present, it's a match (1) only if they are exactly equal.
# - If both levels are present and not equal, it's not a match (0).
# - Otherwise (one is null and candidate is not a Doctorate), the outcome is null.
gold_df = gold_df.withColumn(
    "edu_match_flag",
    F.when(F.col("highest_edu_rank") == 5, 1)
     .when(F.col("highest_edu_rank") == F.col("required_edu_rank"), 1)
     .when(F.col("highest_edu_rank").isNotNull() & F.col("required_edu_rank").isNotNull(), 0)
     .otherwise(None)
)


# --- Step 5: Create the 'edu_score' Feature (Updated Null Logic) ---
# This implements the symmetric scoring logic with new exceptions for null values.
# - If candidate's education is null, score is null.
# - A Doctorate (rank 5) vs a null requirement is a perfect match (0.0).
# - A non-Doctorate vs a null requirement is an unknown match (null).
# - Otherwise, the score is calculated based on the rank difference.
max_rank_diff = float(max(edu_level_mapping.values()) - min(edu_level_mapping.values())) # This is 5.0

gold_df = gold_df.withColumn(
    "edu_score",
    F.when(
        F.col("highest_edu_rank").isNull(),
        None
    ).when(
        (F.col("highest_edu_rank") == 5) & F.col("required_edu_rank").isNull(),
        0.0
    ).when(
        F.col("required_edu_rank").isNull(),
        None
    ).otherwise(
        (F.col("highest_edu_rank") - F.col("required_edu_rank")) / max_rank_diff
    ).cast(FloatType())
)

# --- Step 6: Handle Null Values (Optional but Recommended) ---
# Our logic correctly produces nulls where education info was missing.
# For many models, you need to fill these. A common strategy is to fill with 0,
# assuming no information means no match.
# gold_df = gold_df.na.fill(value=0, subset=["edu_match_flag", "edu_score"])

# --- Step 7: Verify the Results and Clean Up ---
# Select the relevant columns to see the result of our transformations.
print("Verification of new education features:")
gold_df.select(
    "resume_id",
    "job_id",
    "edu_highest_level",
    "required_edu_level",
    "highest_edu_rank",
    "required_edu_rank",
    "edu_match_flag",
    "edu_score"
).show(20, truncate=False)

# Finally, you can drop the intermediate rank columns to keep the gold table clean.
final_gold_df = gold_df.drop("highest_edu_rank", "required_edu_rank")

print("\nFinal Gold DataFrame schema:")
final_gold_df.printSchema()


Verification of new education features:
+------------+-----------+-----------------+------------------+----------------+-----------------+--------------+---------+
|resume_id   |job_id     |edu_highest_level|required_edu_level|highest_edu_rank|required_edu_rank|edu_match_flag|edu_score|
+------------+-----------+-----------------+------------------+----------------+-----------------+--------------+---------+
|RES_s93wTCLp|JD_s93wTCLp|Master's Degree  |NULL              |4               |NULL             |NULL          |NULL     |
|RES_Z7yf1tu6|JD_Z7yf1tu6|Master's Degree  |Bachelor's Degree |4               |3                |0             |0.2      |
|RES_hAp1XnJZ|JD_hAp1XnJZ|Master's Degree  |NULL              |4               |NULL             |NULL          |NULL     |
|RES_zsgGxd2s|JD_zsgGxd2s|Master's Degree  |NULL              |4               |NULL             |NULL          |NULL     |
|RES_RrDNNvMz|JD_RrDNNvMz|Others           |Bachelor's Degree |0               |3           

In [None]:
"""
This script contains functions to generate gold-level education features for the 
resume-to-job-description matching project.
"""

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType

# --- Configuration: Define the Ordinal Mapping ---
# This dictionary holds the ordinal mapping for education levels.
# It's defined globally as it's a fixed business rule.
EDU_LEVEL_MAPPING = {
    'Others': 0,
    'High School': 1,
    'Associate\'s Degree': 2,
    'Bachelor\'s Degree': 3,
    'Master\'s Degree': 4,
    'Doctorate': 5
}

def create_education_features(df: DataFrame) -> DataFrame:
    """
    Generates gold-level education features from a silver-level DataFrame.

    This function takes a DataFrame and adds two new columns:
    1.  'edu_match_flag': A binary flag indicating if education levels are an exact match,
        with special handling for nulls and Doctorates.
    2.  'edu_score': A normalized score from -1.0 to 1.0 representing the alignment
        between the candidate's education and the job requirement.

    It then drops the original and intermediate columns used for the calculation.

    Args:
        df (DataFrame): The input silver-level Spark DataFrame.

    Returns:
        DataFrame: A new DataFrame with the added gold features and dropped columns.
    """
    # --- Step 1: Create a Spark Mapping Expression ---
    flat_map_list = [item for sublist in EDU_LEVEL_MAPPING.items() for item in sublist]
    mapping_expr = F.create_map([F.lit(x) for x in flat_map_list])

    # --- Step 2: Add Intermediate Ordinal Rank Columns ---
    # These temporary columns hold the integer rank for education levels.
    df_with_ranks = df.withColumn(
        "highest_edu_rank",
        mapping_expr[F.col("edu_highest_level")].cast(IntegerType())
    ).withColumn(
        "required_edu_rank",
        mapping_expr[F.col("required_edu_level")].cast(IntegerType())
    )

    # --- Step 3: Create the 'edu_match_flag' Feature ---
    df_with_flag = df_with_ranks.withColumn(
        "edu_match_flag",
        F.when(F.col("highest_edu_rank") == 5, 1)
         .when(F.col("highest_edu_rank") == F.col("required_edu_rank"), 1)
         .when(F.col("highest_edu_rank").isNotNull() & F.col("required_edu_rank").isNotNull(), 0)
         .otherwise(None)
    )

    # --- Step 4: Create the 'edu_score' Feature ---
    max_rank = max(EDU_LEVEL_MAPPING.values())
    min_rank = min(EDU_LEVEL_MAPPING.values())
    max_rank_diff = float(max_rank - min_rank)

    df_with_score = df_with_flag.withColumn(
        "edu_score",
        F.when(F.col("highest_edu_rank").isNull(), None)
         .when((F.col("highest_edu_rank") == 5) & F.col("required_edu_rank").isNull(), 0.0)
         .when(F.col("required_edu_rank").isNull(), None)
         .otherwise((F.col("highest_edu_rank") - F.col("required_edu_rank")) / max_rank_diff)
         .cast(FloatType())
    )

    # --- Step 5: Handle Null Values and Clean Up ---
    # Fill any remaining nulls in our new feature columns with 0.
    # Then, drop the columns as requested.
    final_df = df_with_score.na.fill(value=0, subset=["edu_match_flag", "edu_score"])

    columns_to_drop = [
        "edu_highest_level", "required_edu_level",
        "highest_edu_rank", "required_edu_rank"
    ]
    
    return final_df.drop(*columns_to_drop)

# --- Example Usage Block ---
if __name__ == '__main__':
    # This block demonstrates how to use the function.
    # It will only run when the script is executed directly.
    
    spark = SparkSession.builder \
        .appName("EducationFeatureEngineering") \
        .getOrCreate()

    # Create a sample DataFrame that mimics your silver_df structure
    sample_data = [
        ("resume1", "job1", "Doctorate", "Master's Degree"),   # Overqualified
        ("resume2", "job2", "Bachelor's Degree", "Bachelor's Degree"), # Exact match
        ("resume3", "job3", "High School", "Master's Degree"),   # Underqualified
        ("resume4", "job4", "Doctorate", None),                 # Null required, should be 1 / 0.0
        ("resume5", "job5", "Master's Degree", None),          # Null required, should be null/null -> 0/0
        ("resume6", "job6", None, "Bachelor's Degree"),        # Null highest, should be null/null -> 0/0
    ]
    columns = ["resume_id", "job_id", "edu_highest_level", "required_edu_level"]
    silver_df_sample = spark.createDataFrame(sample_data, columns)

    print("--- Sample Silver DataFrame ---")
    silver_df_sample.show()

    # Apply the feature engineering function
    gold_df_sample = create_education_features(silver_df_sample)

    print("\n--- Resulting Gold DataFrame ---")
    gold_df_sample.show()
    
    print("\n--- Gold DataFrame Schema ---")
    gold_df_sample.printSchema()

    spark.stop()


In [31]:
from pyspark.sql.functions import col, sum

# Assuming gold_df is your PySpark DataFrame
total_rows = gold_df.count()

null_counts = gold_df.select(
    sum(col("edu_match_flag").isNull().cast("int")).alias("null_count_edu_match_flag"),
    sum(col("edu_score").isNull().cast("int")).alias("null_count_edu_score")
).collect()[0]

print(f"Total rows in DataFrame: {total_rows}")
print(f"Null values in 'edu_match_flag': {null_counts['null_count_edu_match_flag']}")
print(f"Null values in 'edu_score': {null_counts['null_count_edu_score']}")

Total rows in DataFrame: 6241
Null values in 'edu_match_flag': 2137
Null values in 'edu_score': 2137


# Majors

In [33]:
print("Distinct required_edu_field values:")
print(silver_df.select('required_edu_field').distinct().count())



for edu_level in silver_df.select('required_edu_field').distinct().collect():
    c = silver_df.filter(silver_df.required_edu_field == edu_level.required_edu_field).count()
    print(f"{edu_level.required_edu_field} : {c}")

print("Distinct edu_field values:")
print(silver_df.select('edu_field').distinct().count())

for edu_level in silver_df.select('edu_field').distinct().collect():
    c = silver_df.filter(silver_df.edu_field == edu_level.edu_field).count()
    print(f"{edu_level.edu_field} : {c}")

Distinct required_edu_field values:
12
Mathematics & Statistics : 56
Business & Management : 318
Computer Science & IT : 965
Architecture & Design : 43
Education & Training : 109
Environmental & Agricultural Sciences : 8
Others : 1414
Finance & Accounting : 929
Arts & Creative Fields : 10
Economics : 14
Engineering (General) : 512
None : 0
Distinct edu_field values:
20
Marketing & Communications : 132
Mathematics & Statistics : 201
Business & Management : 1265
Humanities : 10
Computer Science & IT : 1290
Architecture & Design : 14
Education & Training : 81
Environmental & Agricultural Sciences : 136
Law & Legal Studies : 44
Interdisciplinary : 8
Others : 1035
Medicine & Health Sciences : 134
Finance & Accounting : 390
Biological Sciences : 16
Physical Sciences : 114
Social Sciences : 51
Economics : 42
Engineering (General) : 811
Arts & Creative Fields : 5
None : 0
