notebook 6 - pesume parsing

Create a clean candidate feature table by:

Using already-processed resume features

Aligning IDs correctly

Merging safely with HRMS

Producing a model-ready dataset

In [12]:
import pandas as pd
import numpy as np
import re
from pathlib import Path


In [13]:
BASE_PATH = Path(r"C:\Users\abanu\Documents\t_iq_hr\data\processed")

RESUMES_PATH = BASE_PATH / "Resumes_cleaned.csv"
HRMS_PATH = BASE_PATH / "HRMS_cleaned.csv"


In [14]:
resumes_df = pd.read_csv(RESUMES_PATH)
hrms_df = pd.read_csv(HRMS_PATH)

print(resumes_df.shape)
print(hrms_df.shape)


(2481, 11)
(10000, 11)


In [15]:
def standardize_emp_id(series, prefix="EMP", pad=4):
    return (
        series
        .astype(str)
        .str.extract(r'(\d+)')[0]
        .astype(float)
        .astype('Int64')
        .apply(lambda x: f"{prefix}{str(x).zfill(pad)}" if pd.notna(x) else None)
    )


In [16]:
hrms_df['EmployeeID'] = standardize_emp_id(hrms_df['EmployeeID'])


KeyError: 'EmployeeID'

In [17]:
print(hrms_df.columns.tolist())


['employee_id', 'name', 'department', 'job_role', 'location', 'current_salary', 'satisfaction_score', 'engagement_score', 'num_skills', 'years_at_company', 'trainings_count']


In [18]:
# Rename employee_id → EmployeeID
hrms_df.rename(columns={'employee_id': 'EmployeeID'}, inplace=True)

# Standardize ID format
hrms_df['EmployeeID'] = standardize_emp_id(hrms_df['EmployeeID'])

print(hrms_df[['EmployeeID']].head())


  EmployeeID
0    EMP0001
1    EMP0002
2    EMP0003
3    EMP0004
4    EMP0005


In [19]:
print(hrms_df['EmployeeID'].isna().sum())
print(hrms_df['EmployeeID'].nunique())


0
10000


In [20]:
print(resumes_df.columns.tolist())
print(resumes_df.shape)
resumes_df.head(3)


['employee_id', 'Resume_str', 'resume_html', 'Category', 'resume_len', 'resume_clean', 'num_words', 'resume_word_count', 'resume_sent_count', 'Category_enc', 'num_sentences']
(2481, 11)


Unnamed: 0,employee_id,Resume_str,resume_html,Category,resume_len,resume_clean,num_words,resume_word_count,resume_sent_count,Category_enc,num_sentences
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,5442,hr administratormarketing associate hr adminis...,674,674.0,26,25,26
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,5572,hr specialist us hr operations summary versati...,708,708.0,26,25,26
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,7720,hr director summary over 20 years experience i...,1017,1017.0,40,25,40


In [21]:
resumes_df.rename(columns={'employee_id': 'CandidateID'}, inplace=True)


In [22]:
def standardize_candidate_id(series, prefix="CAND", pad=5):
    return (
        series
        .astype(str)
        .str.extract(r'(\d+)')[0]
        .astype(float)
        .astype('Int64')
        .apply(lambda x: f"{prefix}{str(x).zfill(pad)}" if pd.notna(x) else None)
    )


In [23]:
resumes_df['CandidateID'] = standardize_candidate_id(resumes_df['CandidateID'])


In [24]:
resumes_df[['CandidateID']].head()


Unnamed: 0,CandidateID
0,CAND16852973.0
1,CAND22323967.0
2,CAND33176873.0
3,CAND27018550.0
4,CAND17812897.0


In [25]:
resumes_df['cleaned_resume'] = resumes_df['resume_clean'].astype(str)


In [26]:
resumes_df[['CandidateID']].head()


Unnamed: 0,CandidateID
0,CAND16852973.0
1,CAND22323967.0
2,CAND33176873.0
3,CAND27018550.0
4,CAND17812897.0


In [27]:
# Reset index to guarantee sequence
resumes_df = resumes_df.reset_index(drop=True)

# Assign clean sequential Candidate IDs
resumes_df['CandidateID'] = [
    f"CAND{str(i+1).zfill(5)}" for i in range(len(resumes_df))
]

resumes_df[['CandidateID']].head()


Unnamed: 0,CandidateID
0,CAND00001
1,CAND00002
2,CAND00003
3,CAND00004
4,CAND00005


In [29]:
resumes_df['cleaned_resume'] = resumes_df['resume_clean'].astype(str)

resumes_df[['CandidateID', 'cleaned_resume']].head(2)


Unnamed: 0,CandidateID,cleaned_resume
0,CAND00001,hr administratormarketing associate hr adminis...
1,CAND00002,hr specialist us hr operations summary versati...


In [30]:
SKILLS = [
    'python', 'sql', 'excel', 'machine learning', 'deep learning',
    'nlp', 'pandas', 'numpy', 'scikit-learn',
    'tensorflow', 'pytorch', 'power bi', 'tableau',
    'hr', 'recruitment', 'payroll', 'talent management'
]

def extract_skills(text):
    text = text.lower()
    return sorted({skill for skill in SKILLS if skill in text})


In [31]:
resumes_df['skills'] = resumes_df['cleaned_resume'].apply(extract_skills)
resumes_df['num_skills'] = resumes_df['skills'].apply(len)

resumes_df[['CandidateID', 'skills', 'num_skills']].head()


Unnamed: 0,CandidateID,skills,num_skills
0,CAND00001,"[hr, payroll]",2
1,CAND00002,"[hr, recruitment]",2
2,CAND00003,"[excel, hr, payroll, recruitment]",4
3,CAND00004,"[excel, hr, payroll]",3
4,CAND00005,"[excel, hr, payroll, recruitment]",4


In [32]:
import re

def extract_experience(text):
    match = re.search(r'(\d+)\s+years?', text)
    return int(match.group(1)) if match else 0


In [33]:
resumes_df['experience_years'] = resumes_df['cleaned_resume'].apply(extract_experience)

resumes_df[['CandidateID', 'experience_years']].head()


Unnamed: 0,CandidateID,experience_years
0,CAND00001,15
1,CAND00002,0
2,CAND00003,20
3,CAND00004,20
4,CAND00005,0


In [34]:
resume_features = resumes_df[
    ['CandidateID', 'Category', 'num_skills', 'experience_years']
].copy()

resume_features.head()


Unnamed: 0,CandidateID,Category,num_skills,experience_years
0,CAND00001,HR,2,15
1,CAND00002,HR,2,0
2,CAND00003,HR,4,20
3,CAND00004,HR,3,20
4,CAND00005,HR,4,0


In [35]:
from pathlib import Path

OUT_PATH = Path(r"C:\Users\abanu\Documents\t_iq_hr\data\processed")

resume_features.to_csv(OUT_PATH / "resume_features.csv", index=False)
resumes_df.to_csv(OUT_PATH / "Candidates_features.csv", index=False)

print("✅ Notebook-06 completed successfully")


✅ Notebook-06 completed successfully
