In [3]:
import re
import pandas as pd
import spacy

In [4]:
nlp = spacy.load("en_core_web_lg")

In [5]:
with open('../data/30k_resume_files.txt', 'r', encoding="Windows-1252") as f:
    lines = [line.strip() for line in f if line.strip()]

In [None]:
# Data Extraction from text document
extracted = []
for resume_id, line in enumerate(lines):
    parts = line.split(":::")
    if len(parts) < 3:
        continue
    section = parts[2]
    # Split on both "•" and "*" markers
    raw_segments = re.split(r"[•\*]", section)
    for seg in raw_segments[1:]:
        bullet = seg.strip()
        if bullet:
            extracted.append({
                "resume_id": resume_id,
                "raw_bullet": bullet
            })

In [None]:
df_raw = pd.DataFrame(extracted)
df_raw.head(20)

Unnamed: 0,resume_id,raw_bullet
0,0,Responsible for administering and maintaining ...
1,0,Analyzes the current database environment to d...
2,0,Monitors and troubleshoots production environm...
3,0,Creates and maintains documentation for DBA st...
4,0,Ensures that all code changes made in the prod...
5,0,Analyses and migrates data using ETL into SQL ...
6,0,Works closely with infrastructure team for pat...
7,0,Upgrades servers as required from SQL Server 2...
8,0,Completes database administration maintenance ...
9,0,Provides 24/7 on call support as needed.


In [26]:
df.tail(20)

Unnamed: 0,resume_id,raw_bullet
960713,29782,Ensure coverage of all aspects of application ...
960714,29782,"Establish and evolve technology frameworks, pl..."
960715,29782,Encourage team engagement in constant skillset...
960716,29782,Design and build websites and components with ...
960717,29782,Integrate new features into complex existing p...
960718,29782,Drive and manage a wide variety of requests fr...
960719,29782,Inspire others to grow in their fields for the...
960720,29782,"Develop and maintain over 80 web presences, 67..."
960721,29782,Collaborate creatively with team members on a ...
960722,29782,Maintain all IT services for multi-directional...


In [29]:
# Cleaning & normalization
cleaned = []
for item in extracted:
    text = item["raw_bullet"]
    # a) remove HTML tags
    text = re.sub(r"<[^>]+>", "", text)
    # b) collapse whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # c) spaCy processing
    doc = nlp(text)
    tokens = [tok.lemma_.lower() for tok in doc if not tok.is_stop and not tok.is_punct]
    cleaned_text = " ".join(tokens)
    cleaned.append({
        "resume_id": item["resume_id"],
        "clean_bullet": cleaned_text
    })

In [32]:
df_clean = pd.DataFrame(cleaned)
df_clean.head(20)

Unnamed: 0,resume_id,clean_bullet
0,0,responsible administer maintain 150 database s...
1,0,analyze current database environment determine...
2,0,monitor troubleshoot production environment id...
3,0,create maintain documentation dba standard ope...
4,0,ensure code change production environment sox ...
5,0,analysis migrate datum etl sql server database...
6,0,work closely infrastructure team patching hard...
7,0,upgrade server require sql server 2005 sql ser...
8,0,complete database administration maintenance p...
9,0,provide 24/7 support need


In [33]:
df_clean.tail(20)

Unnamed: 0,resume_id,clean_bullet
1027738,29782,ensure coverage aspect application development...
1027739,29782,establish evolve technology framework platform...
1027740,29782,encourage team engagement constant skillset gr...
1027741,29782,design build website component php mysql javas...
1027742,29782,integrate new feature complex exist platform l...
1027743,29782,drive manage wide variety request customer cow...
1027744,29782,inspire grow field betterment career team comp...
1027745,29782,develop maintain 80 web presence 67 wordpress ...
1027746,29782,collaborate creatively team member wide array ...
1027747,29782,maintain service multi directional scaling int...


In [None]:
# Save to CSV in data folders
df_raw.to_csv("../data/raw_bullets.csv", index=False)
df_clean.to_csv("../data/clean_bullets.csv", index=False)