In [None]:
import re
import pandas as pd

In [None]:
from datasets import load_dataset

# Fetch the resume / job-description matching dataset from the Hugging Face
# Hub, then materialise both splits as pandas DataFrames.
ds = load_dataset("facehuggerapoorv/resume-jd-match")

train_df_origin, test_df_origin = (
    ds[split_name].to_pandas() for split_name in ("train", "test")
)

In [None]:
# Separate the job description and the resume contained in each raw text
def extract_JD_and_resume(text_series):
    """Split each raw text into its job description and resume parts.

    Every ``<<...>>`` delimited segment is located in a text (segments may
    span several lines); the first one is taken as the job description and
    the second one as the resume. Any further segments are ignored.

    Args:
        text_series: Iterable of raw texts (e.g. a pandas Series).

    Returns:
        A list with one ``[job_description, resume]`` pair per input text,
        in input order. A part that cannot be found is ``None``, so the
        result always stays aligned row-for-row with the input.
    """
    data = []
    for text in text_series:
        # Missing values (None / NaN) are not strings and cannot be searched.
        if isinstance(text, str):
            matches = re.findall(r'<<(.*?)>>', text, flags=re.DOTALL)
        else:
            matches = []
        # Pad so rows with fewer than two segments yield None placeholders
        # instead of raising IndexError and aborting the whole extraction.
        job_description, resume = (matches + [None, None])[:2]
        data.append([job_description, resume])
    return data
# Run the extraction over the raw "text" column of each split; the results
# are bound in the same order as the frames are listed (train, then test).
train_data, test_data = (
    extract_JD_and_resume(origin_df["text"])
    for origin_df in (train_df_origin, test_df_origin)
)

In [None]:
# Build one frame per split from the extracted pairs and attach the label
# column taken from the corresponding original frame.
train_df = pd.DataFrame(train_data, columns=["Job Description", "Resume"]).assign(
    Label=train_df_origin["label"]
)
test_df = pd.DataFrame(test_data, columns=["Job Description", "Resume"]).assign(
    Label=test_df_origin["label"]
)

In [None]:
# Restore punctuation with a pre-trained punctuation model, because some resumes lack punctuation.

from transformers import pipeline

# Token-classification pipeline around the pre-trained punctuation model.
# restore_many() reads the "word" and "entity_group" keys of every result
# item, which is the output shape of an aggregated token-classification run.
#
# NOTE(review): the pipeline appears to truncate inputs that exceed the
# tokenizer's maximum length unless a `stride` is configured; long resumes
# could therefore lose their tail when text is rebuilt from the returned
# tokens — confirm against the installed transformers version.
punct_model = pipeline(
    "token-classification",
    model="oliverguhr/fullstop-punctuation-multilang-large",
    aggregation_strategy="simple",
    # NOTE(review): device=0 presumably pins the first CUDA GPU, so this
    # cell likely fails on a CPU-only machine — confirm the target runtime.
    device=0,
    # Batch size used inside the pipeline; independent of the outer batching
    # done by process_series().
    batch_size=4
)

def restore_many(texts):
    """Restore punctuation for a batch of texts with ``punct_model``.

    Each text is rebuilt from the word groups returned by the model: a group
    labelled as period, comma or question mark is followed by that mark and a
    space, every other group by a single space.

    Args:
        texts: List of input strings passed to ``punct_model`` in one call.

    Returns:
        A list of rebuilt strings, one per model output, with surrounding
        whitespace stripped.
    """
    # Accept both spelled-out label names and the literal punctuation labels.
    # NOTE(review): the model card of the configured model lists its labels
    # as "0", ".", ",", "?", "-", ":" — with only the spelled-out names no
    # punctuation would ever be inserted. Verify against
    # punct_model.model.config.id2label. "-" and ":" are still treated as
    # "no punctuation", as before.
    suffix_by_label = {
        "PERIOD": ". ",
        ".": ". ",
        "COMMA": ", ",
        ",": ", ",
        "QUESTION": "? ",
        "?": "? ",
    }

    outputs = punct_model(texts)
    results = []
    for token_list in outputs:
        # Join once instead of growing a string with repeated "+=".
        pieces = [
            t["word"] + suffix_by_label.get(t["entity_group"], " ")
            for t in token_list
        ]
        results.append("".join(pieces).strip())
    return results

def process_series(series, batch_size=32):
    """Run punctuation restoration over a whole pandas Series.

    Missing values are replaced by empty strings, the values are cut into
    consecutive slices of at most ``batch_size`` items, and every slice is
    handed to ``restore_many``.

    Args:
        series: pandas Series of texts.
        batch_size: Maximum number of texts per ``restore_many`` call.

    Returns:
        A flat list of restored texts in the original order.
    """
    cleaned_texts = series.fillna("").tolist()
    slices = (
        cleaned_texts[start:start + batch_size]
        for start in range(0, len(cleaned_texts), batch_size)
    )
    return [restored for chunk in slices for restored in restore_many(chunk)]

# Restore punctuation in both text columns of both splits, overwriting the
# columns in place (train before test, job description before resume).
for _frame in (train_df, test_df):
    for _column in ("Job Description", "Resume"):
        _frame[_column] = process_series(_frame[_column])


In [None]:
def fix_spacing(text: str) -> str:
    """Normalize spacing using uppercase cues and punctuation rules.

    Rules are applied in order:

    1. Split camelCase / PascalCase boundaries ("camelCase" -> "camel Case",
       "XMLParser" -> "XML Parser").
    2. Add a space after ``. , ! ? ; :`` when a letter follows directly.
    3. Add a space before an opening bracket and after a closing bracket;
       a closing bracket directly followed by punctuation is left alone.
    4. Put spaces around ``&`` between word characters and around ``/`` or
       ``|`` when neither side already has whitespace.
    5. Add a space after a closing quote that is directly followed by a
       letter, without touching opening quotes or contractions.
    6. Collapse every run of two or more whitespace characters into a single
       space and strip both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Insert space between camelCase/PascalCase boundaries
    text = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", text)
    text = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", " ", text)

    # Ensure spacing after punctuation when followed by a letter
    text = re.sub(r"([.,!?;:])(?=[A-Za-z])", r"\1 ", text)

    # Add spacing around brackets/parentheses
    text = re.sub(r"(?<!\s)([\(\[\{])", r" \1", text)   # before opening
    # After closing, unless punctuation follows: "(x)," must not become "(x) ,"
    text = re.sub(r"([\)\]\}])(?![\s.,!?;:])", r"\1 ", text)

    # Add spacing around & symbol
    text = re.sub(r"(?<=\w)&(?=\w)", r" & ", text)

    # Add spacing around other special characters
    text = re.sub(r"(?<!\s)([/\|])(?!\s)", r" \1 ", text)

    # Fix missing space after a *closing* quote that runs into a letter.
    # A straight quote only closes when it follows a word character or
    # punctuation; otherwise it opens a quotation and must stay attached.
    text = re.sub(r'(?<=[\w.,!?;:])"(?=[A-Za-z])', '" ', text)
    # A right double quotation mark is always a closing quote.
    text = re.sub(r"”(?=[A-Za-z])", "” ", text)
    # A right single quotation mark between letters is an apostrophe
    # ("don’t", "John’s"); only treat it as a closing quote after punctuation.
    text = re.sub(r"(?<=[.,!?;:])’(?=[A-Za-z])", "’ ", text)

    # Collapse any run of two or more whitespace characters into one space
    text = re.sub(r"\s{2,}", " ", text)

    # Trim edges
    return text.strip()
