In [11]:
import re
import pandas as pd
import spacy

In [12]:
# Load the spaCy pipeline "en_core_web_lg"; later cells use it to lemmatize bullets
# and to filter out stop words and punctuation.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Load the raw resume dump: trim surrounding whitespace from every line and
# discard the lines that end up empty.
with open('../data/30k_resume_files.txt', 'r', encoding="Windows-1252") as f:
    # filter(None, ...) drops the empty strings left after stripping blank lines
    lines = list(filter(None, map(str.strip, f)))

In [14]:
# Break every line into its ':::'-separated fields. All fields are kept here;
# the bullet-extraction cell below re-splits the lines and reads the third field.
parsed = [record.split(":::") for record in lines]

In [15]:
# Bullet extraction: build one {"resume_id", "raw_bullet"} record per bullet.
# resume_id is the position of the line within `lines`.
extracted = []
for resume_id, line in enumerate(lines):
    fields = line.split(":::")
    # Only lines with at least three ':::'-separated fields are processed;
    # the third field is the text that gets split into bullets.
    if len(fields) >= 3:
        # Bullets are introduced by either "•" or "*". The chunk before the
        # first marker is skipped ([1:]).
        chunks = re.split(r"[•\*]", fields[2])[1:]
        extracted.extend(
            {"resume_id": resume_id, "raw_bullet": chunk.strip()}
            for chunk in chunks
            if chunk.strip()  # ignore chunks that are only whitespace
        )

In [16]:
# One row per extracted bullet; the columns come from the record keys
# ("resume_id", "raw_bullet").
df = pd.DataFrame(extracted)
# Preview the first rows.
df.head()

Unnamed: 0,resume_id,raw_bullet
0,0,Responsible for administering and maintaining ...
1,0,Analyzes the current database environment to d...
2,0,Monitors and troubleshoots production environm...
3,0,Creates and maintains documentation for DBA st...
4,0,Ensures that all code changes made in the prod...


In [17]:
# Preview the last rows (shows the highest row index and resume_id extracted).
df.tail()

Unnamed: 0,resume_id,raw_bullet
1027753,29782,Produce 24 pixel-perfect websites built from t...
1027754,29782,Design elegant UIs and develop 20+ websites fo...
1027755,29782,"Design, develop, and maintain Joomla, WordPres..."
1027756,29782,"Utilize HTML5, CSS3, jQuery to develop UX Focu..."
1027757,29782,Vet and scope budget compliant projects that e...


In [18]:
# Cleaning & normalization
# For every extracted bullet, produce a {"resume_id", "clean_bullet"} record in
# the same order as `extracted`.

# a) remove HTML tags, then b) collapse whitespace runs to a single space.
#    This is a lazy generator, so the normalized strings are produced as spaCy
#    consumes them rather than all at once.
normalized_texts = (
    re.sub(r"\s+", " ", re.sub(r"<[^>]+>", "", item["raw_bullet"])).strip()
    for item in extracted
)

# c) spaCy processing
#    nlp.pipe() streams the texts through the pipeline in batches and yields
#    the Docs in input order, which avoids the per-call overhead of running
#    nlp(text) once for each bullet. Because order is preserved, zip() pairs
#    every Doc with the record it came from.
#    NOTE(review): only lemmas, is_stop and is_punct are read below; disabling
#    unused pipeline components (e.g. NER) might speed this up further —
#    confirm it does not change the lemmas before doing so.
cleaned = []
for item, doc in zip(extracted, nlp.pipe(normalized_texts)):
    # Keep the lowercased lemma of every token that is neither a stop word
    # nor punctuation.
    tokens = [tok.lemma_.lower() for tok in doc if not tok.is_stop and not tok.is_punct]
    cleaned.append({
        "resume_id": item["resume_id"],
        "clean_bullet": " ".join(tokens)
    })

In [20]:
# One row per cleaned bullet; the columns come from the record keys
# ("resume_id", "clean_bullet").
df_clean = pd.DataFrame(cleaned)
# Preview the first rows.
df_clean.head()

Unnamed: 0,resume_id,clean_bullet
0,0,responsible administer maintain 150 database s...
1,0,analyze current database environment determine...
2,0,monitor troubleshoot production environment id...
3,0,create maintain documentation dba standard ope...
4,0,ensure code change production environment sox ...


In [None]:
# Attach the cleaned text next to the raw text. The assignment aligns on the
# row index: `cleaned` was built by iterating `extracted` one-to-one, so both
# frames have the same default index and row i of df_clean describes row i of df.
# Note that this mutates `df` in place (it gains a third column).
df['clean_bullet'] = df_clean['clean_bullet']
df.head()

Unnamed: 0,resume_id,raw_bullet,clean_bullet
0,0,Responsible for administering and maintaining ...,responsible administer maintain 150 database s...
1,0,Analyzes the current database environment to d...,analyze current database environment determine...
2,0,Monitors and troubleshoots production environm...,monitor troubleshoot production environment id...
3,0,Creates and maintains documentation for DBA st...,create maintain documentation dba standard ope...
4,0,Ensures that all code changes made in the prod...,ensure code change production environment sox ...


In [None]:
# Save the combined DataFrame (resume_id, raw_bullet, clean_bullet) as CSV in
# the data folder; index=False leaves the row index out of the file.
df.to_csv('../data/cleaned_bullets.csv', index=False)