# Combined Job Posting Data Deep Cleaning

In [1]:
import pandas as pd
import os

path = "Jobs_data.csv"

# Load CSV
job_text = pd.read_csv(path)

# Quick preview
print(job_text.shape)
job_text.head()


  job_text = pd.read_csv(path)


(1646539, 13)


Unnamed: 0,Job Id,Job Title,Company,Job Description,Company Link,Job Link,location,Country,Salary Range,Experience,Benefits,skills,Responsibilities
0,1089843540111562,Digital Marketing Specialist,Icahn Enterprises,Social Media Managers oversee an organizations...,,,Douglas,Isle of Man,$59K-$99K,5 to 15 Years,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ..."
1,398454096642776,Web Developer,PNC Financial Services Group,Frontend Web Developers design and implement u...,,,Ashgabat,Turkmenistan,$56K-$116K,2 to 12 Years,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ..."
2,481640072963533,Operations Manager,United Services Automobile Assn.,Quality Control Managers establish and enforce...,,,Macao,"Macao SAR, China",$61K-$104K,0 to 12 Years,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...
3,688192671473044,Network Engineer,Hess,"Wireless Network Engineers design, implement, ...",,,Porto-Novo,Benin,$65K-$91K,4 to 11 Years,"{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo..."
4,117057806156508,Event Manager,Cairn Energy,A Conference Manager coordinates and manages c...,,,Santiago,Chile,$64K-$87K,1 to 12 Years,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...


In [2]:
def build_job_text(row):
    parts = []

    if pd.notnull(row.get("Job Title")):
        parts.append(str(row["Job Title"]))

    if pd.notnull(row.get("Responsibilities")):
        parts.append(str(row["Responsibilities"]))

    if pd.notnull(row.get("Job Description")):
        parts.append(str(row["Job Description"]))

    if pd.notnull(row.get("skills")):
        parts.append(str(row["skills"]))

    if pd.notnull(row.get("Experience")):
        parts.append("Experience required: " + str(row["Experience"]))

    # Optional: include benefits (but can add noise)
    # if pd.notnull(row.get("Benefits")):
    #    parts.append(str(row["Benefits"]))

    return "\n".join(parts)


In [3]:
job_text["job_text_raw"] = job_text.apply(build_job_text, axis=1)

In [4]:
import re
import unicodedata
import pandas as pd

def clean_job_text(text):
    """
    Cleans job posting text:
    - Removes emails, phone numbers, URLs
    - Removes HTML tags & entities
    - Normalizes bullets and whitespace
    - Collapses extra blank lines
    - Preserves actual content like skills, responsibilities, requirements
    """

    if not isinstance(text, str):
        return ""

    # 1) Unicode normalize + remove zero-width characters
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[\u200b\u200c\u200d\u2060\ufeff]", "", text)

    # 2) Remove emails, phone numbers, URLs
    text = re.sub(r"\S+@\S+", " ", text)                          # emails
    text = re.sub(r"\+?\d[\d\-\s\(\)]{7,}\d", " ", text)          # phone numbers
    text = re.sub(r"(https?:\/\/\S+|www\.\S+)", " ", text)        # URLs

    # Remove names like: linkedin jobs, glassdoor jobs, etc.
    text = re.sub(r"(linkedin|glassdoor|indeed|monster|career|company)\S*",
                  " ", text, flags=re.IGNORECASE)

    # 3) Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"&[a-z]+;", " ", text)

    # 4) Normalize bullet points
    text = re.sub(r"[•●▪■◆▶►▸⦿⦾]", "- ", text)
    text = re.sub(r"^-(\S)", r"- \1", text, flags=re.MULTILINE)

    # 5) Normalize dashes
    text = text.replace("–", "-").replace("—", "-")

    # 6) Compact spaces
    text = text.replace("\t", " ")
    text = re.sub(r" {2,}", " ", text)

    # 7) Collapse multiple blank lines (allow max 1)
    lines = [line.strip() for line in text.split("\n")]
    final_lines = []
    blank_seen = False

    for line in lines:
        if line == "":
            if not blank_seen:
                final_lines.append("")
            blank_seen = True
        else:
            final_lines.append(line)
            blank_seen = False

    text = "\n".join(final_lines)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


In [5]:
# Clean all job postings
job_text["job_text_cleaned"] = job_text["job_text_raw"].apply(clean_job_text)

In [6]:
import textwrap
import numpy as np

# Get last non-empty cleaned job text and print wrapped
s = job_text["job_text_cleaned"].replace(r'^\s*$', np.nan, regex=True).dropna()
if s.empty:
    print("No cleaned job text available")
else:
    last = s.iloc[-1]
    print("---- Last cleaned job text (wrapped, width=100) ----")
    print(textwrap.fill(last, width=100))


---- Last cleaned job text (wrapped, width=100) ----
Business Development Consultant (Travel Partnerships) About the job CloudofGoods.com Role: Business
Development Consultant (Part-Time) Location: Florida (Orlando, Miami, Port Canaveral), Los Angeles,
Anaheim, Las Vegas, Atlanta, Seattle Commitment: Up to 20 hours/week About Cloud of Goods Cloud of
Goods is the largest equipment rental marketplace in the U.S., delivering mobility scooters,
wheelchairs, strollers, cribs, and other travel gear to 300+ U.S. cities and select international
destinations. We make travel easier and more accessible for elderly travelers, families with
children, and those who need equipment rentals at their travel destinations. Role Overview We are
seeking part-time Business Development Consultants in key U.S. markets to establish partnerships
with travel industry organizations - including hotels, attractions, cruise lines, conference
centers, travel agencies, and tour operators. The role is focused on outboun

### Save cleaned job dataset

In [7]:
import pandas as pd

# Save the full dataframe with the cleaned column added
job_text.to_csv("cleaned_job_data.csv", index=False)
print(f"Saved {len(job_text)} jobs to cleaned_job_data.csv")

Saved 1646539 jobs to cleaned_job_data.csv


### Read cleaned job data

In [8]:
path = "cleaned_job_data.csv"
job_data = pd.read_csv(path)
job_data.head(1)

  job_data = pd.read_csv(path)


Unnamed: 0,Job Id,Job Title,Company,Job Description,Company Link,Job Link,location,Country,Salary Range,Experience,Benefits,skills,Responsibilities,job_text_raw,job_text_cleaned
0,1089843540111562,Digital Marketing Specialist,Icahn Enterprises,Social Media Managers oversee an organizations...,,,Douglas,Isle of Man,$59K-$99K,5 to 15 Years,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Digital Marketing Specialist\nManage and grow ...,Digital Marketing Specialist\nManage and grow ...
