# Combined Job Posting Data Deep Cleaning

In [3]:
import pandas as pd
import os

# Relative path: resolved against the kernel's current working directory, so the
# CSV must sit next to wherever the notebook kernel was started.
path = "Jobs_data.csv"

# Load the raw job postings into a DataFrame. Despite its name, `job_text` is the
# whole table; the later cells add text columns to it.
job_text = pd.read_csv(path)

# Quick sanity check: (rows, columns), then the first five records.
print(job_text.shape)
job_text.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Jobs_data.csv'

In [None]:
def build_job_text(row):
    """
    Assemble one posting's free-text fields into a single newline-separated string.

    Fields are emitted in a fixed order: "Job Title", "Responsibilities",
    "Job Description", "skills", then "Experience" (which is prefixed with
    "Experience required: "). A field is skipped when it is absent from the
    row or its value is null. "Benefits" is intentionally not included, since
    it can add noise.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row.get(key)`` and ``row[key]``, such as a
        DataFrame row (``pd.Series``) or a plain dict.

    Returns
    -------
    str
        The non-null fields joined with "\\n"; "" when every field is missing.
    """
    # (column name, text placed in front of the value), in output order.
    field_prefixes = (
        ("Job Title", ""),
        ("Responsibilities", ""),
        ("Job Description", ""),
        ("skills", ""),
        ("Experience", "Experience required: "),
    )

    return "\n".join(
        prefix + str(row[column])
        for column, prefix in field_prefixes
        if pd.notnull(row.get(column))
    )


In [None]:
# Run build_job_text on every row (axis=1 passes each row as a Series) and store
# the combined, uncleaned text in a new "job_text_raw" column.
job_text["job_text_raw"] = job_text.apply(build_job_text, axis=1)

In [None]:
import re
import unicodedata
import pandas as pd

def clean_job_text(text):
    """
    Clean one job posting's text while keeping its line structure.

    Processing order (each step feeds the next):

    1. Strip HTML tags, then decode HTML entities (``&amp;`` -> ``&``,
       ``&#39;`` -> ``'``). This runs first so that URL/email removal cannot
       tear a tag in half and make the tag regex swallow real content.
    2. Unicode-normalize to NFKC (accented letters stay composed, non-breaking
       spaces become plain spaces) and drop zero-width characters.
    3. Remove emails, phone-like digit runs, and URLs.
    4. Remove whole words that start with a job-board / boilerplate term
       (linkedin, glassdoor, indeed, monster, career, company).
    5. Normalize en/em dashes to "-" and bullet glyphs to "- ".
    6. Collapse tabs and repeated spaces.
    7. Trim every line and allow at most one consecutive blank line.

    Parameters
    ----------
    text : Any
        Raw posting text. Any non-string input (None, NaN, numbers) yields "".

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace removed.
    """
    # Imported locally so the function works regardless of which import cells
    # have been executed in the kernel.
    import html

    if not isinstance(text, str):
        return ""

    # 1) HTML markup. Tags go before entity decoding so that an escaped
    #    "&lt;" in the prose is not mistaken for the start of a tag.
    text = re.sub(r"<[^>]+>", " ", text)
    text = html.unescape(text)

    # 2) Unicode normalize + remove zero-width characters (done after entity
    #    decoding so characters produced by e.g. "&nbsp;" / "&#8203;" are covered).
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"[\u200b\u200c\u200d\u2060\ufeff]", "", text)

    # 3) Remove emails, phone numbers, URLs
    text = re.sub(r"\S+@\S+", " ", text)                          # emails
    # NOTE(review): this also matches other long digit runs such as
    # "2019 - 2023"; tighten if year or salary ranges must be kept.
    text = re.sub(r"\+?\d[\d\-\s\(\)]{7,}\d", " ", text)          # phone numbers
    text = re.sub(r"(https?:\/\/\S+|www\.\S+)", " ", text)        # URLs

    # 4) Remove job-board / boilerplate words. The leading \b keeps the match
    #    from starting mid-word (so "accompany" is left intact).
    # NOTE(review): "career" and "company" are ordinary words in postings;
    # confirm that dropping them everywhere is really intended.
    text = re.sub(r"\b(?:linkedin|glassdoor|indeed|monster|career|company)\S*",
                  " ", text, flags=re.IGNORECASE)

    # 5) Normalize dashes first, so a line that starts with an en/em dash is
    #    treated as a bullet by the spacing rule below.
    text = text.replace("–", "-").replace("—", "-")
    text = re.sub(r"[•●▪■◆▶►▸⦿⦾]", "- ", text)
    # Insert the missing space after a line-leading "-" bullet. The lookahead
    # skips runs like "---" so separator lines are not mangled.
    text = re.sub(r"^([ \t]*)-(?=[^\s\-])", r"\1- ", text, flags=re.MULTILINE)

    # 6) Compact spaces
    text = text.replace("\t", " ")
    text = re.sub(r" {2,}", " ", text)

    # 7) Trim each line, then collapse runs of blank lines to a single one
    #    (three or more consecutive newlines means two or more blank lines).
    stripped_lines = [line.strip() for line in text.split("\n")]
    text = re.sub(r"\n{3,}", "\n\n", "\n".join(stripped_lines))

    return text.strip()


In [None]:
# Clean every posting: run clean_job_text on each "job_text_raw" value and keep
# the result in a separate "job_text_cleaned" column (the raw column is untouched).
# Requires the earlier cells that load `job_text` and build "job_text_raw".
job_text["job_text_cleaned"] = job_text["job_text_raw"].apply(clean_job_text)

NameError: name 'job_text' is not defined

In [None]:
import textwrap
import numpy as np

# Spot-check: print the final non-blank cleaned posting, re-flowed to 100 columns.
# Note that textwrap.fill joins the posting's lines into one wrapped paragraph.
s = (
    job_text["job_text_cleaned"]
    .replace(r'^\s*$', np.nan, regex=True)  # empty / whitespace-only -> NaN
    .dropna()
)
if not s.empty:
    last = s.iloc[-1]
    print("---- Last cleaned job text (wrapped, width=100) ----")
    print(textwrap.fill(last, width=100))
else:
    print("No cleaned job text available")


---- Last cleaned job text (wrapped, width=100) ----
Business Development Consultant (Travel Partnerships) About the job CloudofGoods.com Role: Business
Development Consultant (Part-Time) Location: Florida (Orlando, Miami, Port Canaveral), Los Angeles,
Anaheim, Las Vegas, Atlanta, Seattle Commitment: Up to 20 hours/week About Cloud of Goods Cloud of
Goods is the largest equipment rental marketplace in the U.S., delivering mobility scooters,
wheelchairs, strollers, cribs, and other travel gear to 300+ U.S. cities and select international
destinations. We make travel easier and more accessible for elderly travelers, families with
children, and those who need equipment rentals at their travel destinations. Role Overview We are
seeking part-time Business Development Consultants in key U.S. markets to establish partnerships
with travel industry organizations - including hotels, attractions, cruise lines, conference
centers, travel agencies, and tour operators. The role is focused on outboun

### Save cleaned job dataset

In [None]:
import pandas as pd

# Save the full dataframe (original columns plus the added text columns).
# index=False keeps the row index out of the file, so reloading it does not
# produce an extra "Unnamed: 0" column.
job_text.to_csv("cleaned_job_data.csv", index=False)
print(f"Saved {len(job_text)} jobs to cleaned_job_data.csv")

### Read cleaned job data

In [None]:
# Reload the file written by the save step. This rebinds `path`, which the load
# cell at the top of the notebook set to "Jobs_data.csv".
path = "cleaned_job_data.csv"
job_data = pd.read_csv(path)
# Display only the first row as a quick check that the round trip worked.
job_data.head(1)

Unnamed: 0,job_text_cleaned
0,Digital Marketing Specialist\nManage and grow ...
1,Web Developer\nDesign and code user interfaces...
2,Operations Manager\nEstablish and enforce qual...
3,"Network Engineer\nDesign, configure, and optim..."
4,Event Manager\nSpecialize in conference and co...
5,Software Tester\nTest software applications an...
6,"Teacher\nPlan and deliver engaging lessons, ad..."
7,UX/UI Designer\nCreate visually appealing user...
8,"UX/UI Designer\nWork on interaction design, de..."
9,Wedding Planner\nOffer expert advice and guida...
