# Resume Text Extraction & Deep Cleaning

### Text extraction using the PyMuPDF package

In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.6


In [2]:
import fitz
import re

def _span_text(span):
    """Return the text of one span from a "dict" or "rawdict" extraction.

    "dict" spans carry a ready-made "text" string; "rawdict" spans instead carry
    a "chars" list whose items hold one character each under the key "c".
    """
    text = span.get("text")
    if text is None:
        text = "".join(char.get("c", "") for char in span.get("chars", ()))
    return text


def _extract_page_text(page):
    """Return the output of the first extraction strategy that yields content.

    Strategies are tried in order (plain text, blocks, rawdict spans, XML, HTML)
    and are mutually exclusive, so markup is only returned when none of the
    text-based strategies produced anything. Returns "" if every strategy is empty.
    """
    # Standard text extraction (most reliable for many resumes)
    plain = page.get_text("text", sort=True)
    if plain and plain.strip():
        return plain

    # Block extraction: index 4 of each block tuple is the block's text
    blocks = page.get_text("blocks")
    if blocks:
        block_text = "\n".join(b[4] for b in blocks if len(b) > 4)
        if block_text.strip():
            return block_text

    # Rawdict spans, one output line per extracted line
    raw = page.get_text("rawdict")
    if raw and "blocks" in raw:
        span_lines = []
        for block in raw["blocks"]:
            # Blocks without a "lines" entry contribute nothing
            for line in block.get("lines", ()):
                spans = [_span_text(span) for span in line.get("spans", ())]
                if spans:
                    span_lines.append(" ".join(spans))
        span_text = "\n".join(span_lines)
        if span_text.strip():
            return span_text

    # Markup fallbacks: XML first, HTML only if XML is empty. The markup is
    # returned raw; callers are expected to strip tags themselves.
    for mode in ("xml", "html"):
        markup = page.get_text(mode)
        if markup and markup.strip():
            return markup

    return ""


def extract_resume_text(pdf_path):
    """Extract the text of every page of a PDF and lightly normalize it.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The per-page results joined with newlines, with common bullet glyphs
        replaced by "-" and runs of blank lines collapsed to a single blank line.
        Pages for which no strategy yields content contribute nothing.
    """
    doc = fitz.open(pdf_path)
    result = []
    try:
        for page in doc:
            page_text = _extract_page_text(page)
            if page_text:
                result.append(page_text)
    finally:
        # Release the file handle even if extraction fails part-way through
        doc.close()

    # Normalize bullets & whitespace
    text = "\n".join(result)
    text = re.sub(r"[•●▪■◆▶►]", "-", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)

    return text


In [3]:
pdf_path = "Kas Kiatsukasem Resume.pdf"

text = extract_resume_text(pdf_path)

# Show only the first 1500 characters: enough to sanity-check the extraction
# without dumping the whole resume (and its contact details) into the output.
print("\n===== EXTRACTED TEXT PREVIEW =====\n")
print(text[:1500])
print("\n===================================\n")



===== EXTRACTED TEXT PREVIEW =====

                     Kas Kiatsukasem
          New York, USA - (646) 994-7154 - kasemsuk.k@columbia.edu - linkedin.com/in/kasemsuk/

                                                                                                                                                                                                                                                                                                                                                                                                                   ​
​                       AREAS OF EXPERTISE​    & CERTIFICATIONS
Recurring Revenue Ops Model ∙ Salesforce & Hubspot CRM ∙ Outreach ∙ Gong ∙ Tableau ∙ Zoominfo ∙ SQL ∙ Rstudio
           Revenue Architecture (Winning by Design) ∙ Intro to RevOps (Pavilion) ∙ SQL Essential Training

Revenue operations professional with 3+ years of experience in SaaS recurring revenue models and CRM analytics.
Skilled at translating business 

### Text Deep Cleaning

In [4]:
import re
import unicodedata

# Lower-case section titles that mark the start of real resume content.
_SECTION_HEADERS = (
    # Skills / Expertise
    "skills", "technical skills", "areas of expertise", "expertise",
    "core competencies", "competencies", "strengths",
    "tools", "certifications",
    # Experience
    "professional experience", "experience", "work experience",
    "employment history",
    # Education
    "education", "academic background", "academic history",
    # Projects
    "projects", "project experience", "relevant project experience",
    "academic projects",
    # Summary / Profile
    "summary", "professional summary", "profile",
    "about me", "overview", "objective",
)

# Matches a header at the start of a stripped, lower-cased line. The optional
# plural "s" plus the word boundary accept "Work Experiences" but reject prose
# such as "Experienced in ..." or "Profiled the ...".
_SECTION_HEADER_RE = re.compile(
    r"(?:"
    + "|".join(re.escape(h) for h in sorted(_SECTION_HEADERS, key=len, reverse=True))
    + r")s?\b"
)

# Lines that begin with one of these whole words are treated as contact lines.
_PII_LINE_RE = re.compile(
    r"(?:email|phone|linkedin|github|contact|address)\b", re.IGNORECASE
)

# Phone-number candidates: digits mixed with "-", parentheses, spaces or tabs.
# Newlines are excluded so a match can never merge two lines.
_PHONE_CANDIDATE_RE = re.compile(r"\+?\(?\d[\d\-\(\) \t]{7,}\d")

# A candidate with fewer digits than this is kept; "2019 - 2021" has only 8.
_MIN_PHONE_DIGITS = 9


def _redact_phone(match):
    """Blank out a phone candidate unless it has too few digits to be a phone."""
    candidate = match.group(0)
    digit_count = sum(ch.isdigit() for ch in candidate)
    return " " if digit_count >= _MIN_PHONE_DIGITS else candidate


def _is_section_header(line):
    """Return True when the line starts with a known section title."""
    return _SECTION_HEADER_RE.match(line.strip().lower()) is not None


def clean_resume_text(text):
    """Scrub contact details and layout noise from extracted resume text.

    Steps, in order:
      1. Unicode normalization (NFKC, so accented letters stay composed) and
         removal of zero-width characters.
      2. Removal of e-mail addresses, phone numbers, URLs, words starting with
         "linkedin"/"github", 5-digit / ZIP+4 numbers, and lines that begin
         with a contact keyword.
      3. Removal of markup tags and named entities, bullet and dash
         normalization, re-joining of words hyphenated across a line break,
         and whitespace collapsing.
      4. Dropping everything before the first section header. If no header is
         found, the text is kept whole instead of being discarded.
      5. Exactly one blank line before each later section header, at most one
         blank line in a row, and no leading/trailing whitespace on any line.

    Args:
        text: Raw resume text.

    Returns:
        The cleaned text as a single string.
    """
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"[\u200b\u200c\u200d\u2060\ufeff]", "", text)

    # --- PII scrubbing ---
    text = re.sub(r"\S+@\S+", " ", text)
    text = _PHONE_CANDIDATE_RE.sub(_redact_phone, text)
    text = re.sub(r"(https?:\/\/\S+|www\.\S+)", " ", text)
    text = re.sub(r"(linkedin|github)\S*", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"\b\d{5}(?:-\d{4})?\b", " ", text)
    text = "\n".join(
        line for line in text.split("\n") if not _PII_LINE_RE.match(line.strip())
    )

    # --- Markup left over from XML/HTML extraction ---
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"&[a-z]+;", " ", text)

    # --- Bullets, hyphenation, dashes ---
    text = re.sub(r"^[•●▪■◆▶►▸⦿⦾]\s*", "- ", text, flags=re.MULTILINE)
    text = re.sub(r"^-(\S)", r"- \1", text, flags=re.MULTILINE)
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
    text = text.replace("–", "-").replace("—", "-")

    # --- Horizontal whitespace ---
    text = text.replace("\t", " ")
    text = re.sub(r" {2,}", " ", text)

    # --- Drop the preamble before the first section header ---
    lines = text.split("\n")
    first_header = next(
        (idx for idx, line in enumerate(lines) if _is_section_header(line)), None
    )
    if first_header is not None:
        lines = lines[first_header:]
    # else: no recognizable header, so keep every line rather than return ""

    # --- Blank-line layout ---
    result = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Allow at most one blank line in a row, and none at the very top
            if result and result[-1] != "":
                result.append("")
            continue
        # Exactly one blank line before every header except the first line
        if _is_section_header(line) and result and result[-1] != "":
            result.append("")
        result.append(stripped)

    return "\n".join(result).strip()


In [5]:
cleaned_text = clean_resume_text(text)

# Show only the first 1500 characters so the output stays a short preview
print("\n===== CLEANED TEXT PREVIEW =====\n")
print(cleaned_text[:1500])
print("\n=================================\n")


===== CLEANED TEXT PREVIEW =====

AREAS OF EXPERTISE & CERTIFICATIONS
Recurring Revenue Ops Model ∙ Salesforce & Hubspot CRM ∙ Outreach ∙ Gong ∙ Tableau ∙ Zoominfo ∙ SQL ∙ Rstudio
 Revenue Architecture (Winning by Design) ∙ Intro to RevOps (Pavilion) ∙ SQL Essential Training

Revenue operations professional with 3+ years of experience in SaaS recurring revenue models and CRM analytics.
Skilled at translating business requirements into technical solutions, optimizing GTM tools, and enabling data-driven
processes that strengthen operations and accelerate revenue growth.

Education
Columbia University December 2025
Master of Science in Applied Analytics
Relevant coursework: Generative AI modeling, Natural Language Processing (NLP) and Large Language Models (LLMs),
Database design (SQL), Applied Analytic Framework (RStudio), Data Science Research
San Francisco State University December 2021
Bachelor of Science in Business Administration, concentration in Decision Sciences

Work Experience

### Save cleaned text as a CSV file

In [6]:
import pandas as pd
# Write a one-row, one-column ("cleaned_text") table; the index is left out of the file.
pd.DataFrame({"cleaned_text": [cleaned_text]}).to_csv(
    "Kas_cleaned_resume.csv",
    index=False,
)