Resume parsing notebook.

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
import re
import html
from pathlib import Path
from tqdm.auto import tqdm
from IPython.display import display

tqdm.pandas()  # enable progress bars


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to CSV
csv_path = r"C:\Users\abanu\Documents\T-IQ\data\raw\resumes\Resume\Resume.csv"

# Load CSV
df = pd.read_csv(csv_path)

# Normalize column names
df.columns = [c.strip() for c in df.columns]

# Ensure canonical columns exist
for col in ['Resume_str','Resume_html','Category']:
    if col not in df.columns:
        df[col] = None

# Display top rows
display(df.head(3))

# Quick stats
df['len_text'] = df['Resume_str'].astype(str).map(len)
print(df['len_text'].describe())


Unnamed: 0,employee_id,resume_text,resume_html,resume_category,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 162,Unnamed: 163,Unnamed: 164,Unnamed: 165,Unnamed: 166,Unnamed: 167,Unnamed: 168,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,


count    2710.0
mean        4.0
std         0.0
min         4.0
25%         4.0
50%         4.0
75%         4.0
max         4.0
Name: len_text, dtype: float64


In [3]:
from bs4 import BeautifulSoup

# Clean HTML to plain text
def clean_resume_text(text):
    if not isinstance(text, str):
        return ""
    
    # Use BeautifulSoup to remove HTML
    text = BeautifulSoup(text, "lxml").get_text(separator=" ")
    
    # Remove leftover HTML fragments
    text = re.sub(r"</?\w+[^>]*>", " ", text)
    
    # Remove bullets and weird characters
    text = re.sub(r"[\•\●\►\▪\□\·]", " ", text)
    
    # Normalize whitespace
    text = " ".join(text.split())
    
    return text.strip()

# Create clean_text column
df['clean_text'] = df['Resume_html'].fillna(df['Resume_str']).map(clean_resume_text)


In [4]:
# Make sure you have installed the model once in your venv:
# !python -m spacy download en_core_web_sm

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model loaded successfully.")
except OSError:
    print("spaCy model not found. Please run:")
    print("    !python -m spacy download en_core_web_sm")
    raise


spaCy model loaded successfully.


In [5]:
# Regex patterns
email_re = re.compile(r'([A-Za-z0-9\._%+\-]+@[A-Za-z0-9\.\-]+\.[A-Za-z]{2,})', re.I)
phone_re = re.compile(r'(\+?\d[\d\-\s\(\)]{6,}\d)')
linkedin_re = re.compile(r'(https?://(?:www\.)?linkedin\.com/[^\s,;]+)', re.I)
github_re = re.compile(r'(https?://(?:www\.)?github\.com/[^\s,;]+)', re.I)

def extract_contacts(text):
    if not isinstance(text, str):
        return {'emails': [], 'phones': [], 'linkedin': [], 'github': []}
    
    emails = list(dict.fromkeys(email_re.findall(text)))
    phones = []
    for p in phone_re.findall(text):
        digits = re.sub(r'\D', '', p)
        if 7 <= len(digits) <= 15:
            phones.append(p.strip())
    phones = list(dict.fromkeys(phones))
    
    linkedin = list(dict.fromkeys(linkedin_re.findall(text)))
    github = list(dict.fromkeys(github_re.findall(text)))
    
    return {'emails': emails, 'phones': phones, 'linkedin': linkedin, 'github': github}

# Apply contact extraction
df['contacts'] = df['clean_text'].progress_map(extract_contacts)


100%|██████████| 2710/2710 [00:00<00:00, 48987.26it/s]


In [6]:
# Name extraction: spaCy + regex fallback
name_line_re = re.compile(r'^[A-Z][a-z]+(?:[\s\-][A-Z][a-z]+){1,3}$')

def extract_name(text):
    if not isinstance(text, str) or not text.strip():
        return None
    
    snippet = text[:600]  # first 600 chars
    # spaCy NER
    doc = nlp(snippet)
    persons = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]
    if persons:
        return persons[0]
    
    # First line heuristic
    lines = snippet.split("\n")
    if lines:
        first = lines[0].strip()
        if 2 <= len(first.split()) <= 4 and name_line_re.match(first):
            return first
    return None

# Apply name extraction
df['name'] = df['clean_text'].progress_map(extract_name)


100%|██████████| 2710/2710 [00:00<00:00, 434086.84it/s]


In [7]:
# Summary
total = len(df)
rows_with_name = df['name'].notna().sum()
rows_with_email = df['contacts'].map(lambda c: len(c.get('emails',[]))>0).sum()
rows_with_phone = df['contacts'].map(lambda c: len(c.get('phones',[]))>0).sum()

print(f"Total resumes: {total}")
print(f"Resumes with name: {rows_with_name} ({rows_with_name/total:.1%})")
print(f"Resumes with ≥1 email: {rows_with_email} ({rows_with_email/total:.1%})")
print(f"Resumes with ≥1 phone: {rows_with_phone} ({rows_with_phone/total:.1%})")

# Show examples
cols_to_show = ['ID','name','contacts','Category','clean_text']
cols_to_show = [c for c in cols_to_show if c in df.columns]

display(df.loc[df['name'].notna() | df['contacts'].map(lambda c: len(c.get('emails',[]))>0), cols_to_show].head(10))
display(df.loc[df['name'].isna() & df['contacts'].map(lambda c: len(c.get('emails',[]))==0), cols_to_show].head(10))

# Save parsed CSV
out_path = Path(r"C:\Users\abanu\Documents\T-IQ\data\processed\resumes_parsed.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print("Saved parsed resumes to:", out_path)


Total resumes: 2710
Resumes with name: 0 (0.0%)
Resumes with ≥1 email: 0 (0.0%)
Resumes with ≥1 phone: 0 (0.0%)


Unnamed: 0,name,contacts,Category,clean_text


Unnamed: 0,name,contacts,Category,clean_text
0,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
1,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
2,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
3,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
4,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
5,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
6,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
7,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
8,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,
9,,"{'emails': [], 'phones': [], 'linkedin': [], '...",,


Saved parsed resumes to: C:\Users\abanu\Documents\T-IQ\data\processed\resumes_parsed.csv


In [8]:
from bs4 import BeautifulSoup
import re

def clean_resume_html(text):
    if not isinstance(text, str):
        return ""
    
    # Use BeautifulSoup to get text
    soup = BeautifulSoup(text, "lxml")
    text = soup.get_text(separator=" ")
    
    # Remove leftover HTML fragments
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"&nbsp;", " ", text)
    
    # Remove extra whitespace and special characters
    text = re.sub(r"[\•\●\►\▪\□\·]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    return text.strip()

# Rebuild clean_text column (prefer Resume_html if it exists)
df['clean_text'] = df['Resume_html'].fillna(df['Resume_str']).map(clean_resume_html)

# Quick check
display(df[['clean_text']].head(5))


Unnamed: 0,clean_text
0,
1,
2,
3,
4,


In [9]:
# Contacts
df['contacts'] = df['clean_text'].progress_map(extract_contacts)

# Names
df['name'] = df['clean_text'].progress_map(extract_name)


100%|██████████| 2710/2710 [00:00<00:00, 236650.58it/s]
100%|██████████| 2710/2710 [00:00<00:00, 390563.30it/s]


In [10]:
df['clean_text'].head(10).tolist()


['', '', '', '', '', '', '', '', '', '']

In [11]:
# How many names and emails were extracted
print("Names detected:", df['name'].notna().sum())
print("Resumes with ≥1 email:", df['contacts'].map(lambda c: len(c.get('emails',[]))>0).sum())
print("Resumes with ≥1 phone:", df['contacts'].map(lambda c: len(c.get('phones',[]))>0).sum())

# Show a few examples
display(df[['ID','name','contacts','Category','clean_text']].head(10))


Names detected: 0
Resumes with ≥1 email: 0
Resumes with ≥1 phone: 0


KeyError: "['ID'] not in index"

In [12]:
print(df.columns.tolist())


['employee_id', 'resume_text', 'resume_html', 'resume_category', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 6

In [13]:
df = df.rename(columns={'employee_id': 'ID',
                        'resume_text': 'Resume_str',
                        'resume_html': 'Resume_html',
                        'resume_category': 'Category'})


In [14]:
df = df[[c for c in df.columns if not c.startswith('Unnamed:')]]


In [15]:
print(df.columns.tolist())
# Expected: ['ID','Resume_str','Resume_html','Category','len_text','clean_text','contacts','name']


['ID', 'Resume_str', 'Resume_str', 'Resume_html', 'Resume_html', 'Category', 'Category', 'Resume_str', 'Resume_str', 'Resume_html', 'Resume_html', 'Category', 'Category', 'len_text', 'clean_text', 'contacts', 'name']


In [17]:
# Keep only the **first occurrence** of each useful column
cols_to_keep = ['ID', 'Resume_str', 'Resume_html', 'Category', 'len_text', 'clean_text', 'contacts', 'name']

# Deduplicate columns: take the first occurrence
seen = set()
new_cols = []
for c in df.columns:
    if c in cols_to_keep and c not in seen:
        new_cols.append(c)
        seen.add(c)

df = df[new_cols]

# Reset column order to expected
df = df[['ID', 'Resume_str', 'Resume_html', 'Category', 'len_text', 'clean_text', 'contacts', 'name']]

# Confirm
print(df.columns.tolist())


['ID', 'Resume_str', 'Resume_str', 'Resume_str', 'Resume_str', 'Resume_html', 'Resume_html', 'Resume_html', 'Resume_html', 'Category', 'Category', 'Category', 'Category', 'len_text', 'clean_text', 'contacts', 'name']


In [18]:
# List of essential columns
essential_cols = ['ID', 'Resume_str', 'Resume_html', 'Category', 'len_text', 'clean_text', 'contacts', 'name']

# Keep only the first occurrence of each essential column
seen = set()
cols_to_keep = []
for c in df.columns:
    if c in essential_cols and c not in seen:
        cols_to_keep.append(c)
        seen.add(c)

df = df[cols_to_keep]

# Check the result
print(df.columns.tolist())


['ID', 'Resume_str', 'Resume_str', 'Resume_str', 'Resume_str', 'Resume_html', 'Resume_html', 'Resume_html', 'Resume_html', 'Category', 'Category', 'Category', 'Category', 'len_text', 'clean_text', 'contacts', 'name']


In [19]:
import pandas as pd

# Load CSV with automatic duplicate handling
csv_path = r"C:\Users\abanu\Documents\T-IQ\data\raw\resumes\Resume\Resume.csv"
df = pd.read_csv(csv_path, mangle_dupe_cols=True)

# Inspect columns
print(df.columns.tolist())


TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'

In [20]:
import pandas as pd

csv_path = r"C:\Users\abanu\Documents\T-IQ\data\raw\resumes\Resume\Resume.csv"

# Load CSV
df = pd.read_csv(csv_path)

# Inspect columns
print(df.columns.tolist())


['employee_id', 'resume_text', 'resume_html', 'resume_category', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 6

In [21]:
# Keep the first occurrence of each essential column
essential_cols = ['employee_id', 'resume_text', 'resume_html', 'resume_category']

seen = set()
cols_to_keep = []
for c in df.columns:
    if c in essential_cols and c not in seen:
        cols_to_keep.append(c)
        seen.add(c)

df = df[cols_to_keep]

# Rename to standard names
df = df.rename(columns={
    'employee_id': 'ID',
    'resume_text': 'Resume_str',
    'resume_html': 'Resume_html',
    'resume_category': 'Category'
})

# Add downstream columns if missing
for c in ['len_text','clean_text','contacts','name']:
    if c not in df.columns:
        df[c] = None

# Confirm final columns
print(df.columns.tolist())


['ID', 'Resume_str', 'Resume_html', 'Category', 'len_text', 'clean_text', 'contacts', 'name']


In [22]:
from bs4 import BeautifulSoup
import re

def clean_resume_html(text):
    if not isinstance(text, str):
        return ""
    soup = BeautifulSoup(text, "lxml")
    text = soup.get_text(separator=" ")
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"&nbsp;", " ", text)
    text = re.sub(r"[\•\●\►\▪\□\·]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

df['clean_text'] = df['Resume_html'].fillna(df['Resume_str']).map(clean_resume_html)


In [23]:
import re
from tqdm import tqdm
tqdm.pandas()

def extract_contacts(text):
    contacts = {
        'emails': re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}", text),
        'phones': re.findall(r"\+?\d[\d\s\-]{7,}\d", text),
        'linkedin': re.findall(r"(https?://www\.linkedin\.com/in/[a-zA-Z0-9_-]+)", text)
    }
    return contacts

df['contacts'] = df['clean_text'].progress_map(extract_contacts)


100%|██████████| 2710/2710 [00:01<00:00, 1663.41it/s]


In [24]:
import spacy

# Download model if not already
# !python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

def extract_name(text):
    if not text: 
        return None
    doc = nlp(text[:1000])  # limit to first 1000 chars
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    # fallback: take first line with 2-4 words
    lines = text.splitlines()
    for line in lines:
        line = line.strip()
        if 2 <= len(line.split()) <= 4:
            return line
    return None

df['name'] = df['clean_text'].progress_map(extract_name)


100%|██████████| 2710/2710 [01:13<00:00, 36.98it/s] 


In [25]:
# How many rows have names, emails, phones
print("Names detected:", df['name'].notna().sum())
print("Resumes with ≥1 email:", df['contacts'].map(lambda c: len(c['emails'])>0).sum())
print("Resumes with ≥1 phone:", df['contacts'].map(lambda c: len(c['phones'])>0).sum())

# Show a few examples
display(df[['ID','name','contacts','Category','clean_text']].head(10))


Names detected: 1727
Resumes with ≥1 email: 18
Resumes with ≥1 phone: 442


Unnamed: 0,ID,name,contacts,Category,clean_text
0,16852973,Highlights Focused,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMINI...
1,22323967,Served,"{'emails': [], 'phones': [], 'linkedin': []}",HR,"HR SPECIALIST, US HR OPERATIONS Summary Versat..."
2,33176873,ASHHRA,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR DIRECTOR Summary Over 20 years experience i...
3,27018550,,"{'emails': [], 'phones': [], 'linkedin': []}",HR,"HR SPECIALIST Summary Dedicated, Driven, and D..."
4,17812897,Skill Highlights,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR MANAGER Skill Highlights HR SKILLS HR Depar...
5,11592605,Maintained,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR GENERALIST Summary Dedicated and focused Ad...
6,25824789,Mandated Training,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR MANAGER Summary HUMAN RESOURCES MANAGER Ext...
7,15375009,"management, vendor","{'emails': [], 'phones': [], 'linkedin': []}",HR,HR MANAGER Professional Summary Senior HR prof...
8,11847784,,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR SPECIALIST Summary Possess 15+ years of exp...
9,32896934,,"{'emails': [], 'phones': [], 'linkedin': []}",HR,HR CLERK Summary Translates business vision in...


In [26]:
output_path = r"C:\Users\abanu\Documents\T-IQ\data\processed\resumes_parsed.csv"
df.to_csv(output_path, index=False)
print("✅ Processed CSV saved.")


✅ Processed CSV saved.
