# 04_skill_embeddings.ipynb
**Objective:** Extract skills from resumes and generate embeddings for each candidate.
This notebook is a continuation of the Talent Optimization project and uses the cleaned resumes dataset from `03_resume_parsing.ipynb`.


In [1]:
# Standard imports
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()

# Load parsed resumes
csv_path = r"C:\Users\abanu\Documents\T-IQ\data\processed\resumes_parsed.csv"
df = pd.read_csv(csv_path)

# Inspect
print(f"Rows: {len(df)}")
print(df.columns.tolist())
df.head(3)


Rows: 2710
['ID', 'Resume_str', 'Resume_html', 'Category', 'len_text', 'clean_text', 'contacts', 'name']


Unnamed: 0,ID,Resume_str,Resume_html,Category,len_text,clean_text,contacts,name
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,,HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMINI...,"{'emails': [], 'phones': [], 'linkedin': []}",Highlights Focused
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,,"HR SPECIALIST, US HR OPERATIONS Summary Versat...","{'emails': [], 'phones': [], 'linkedin': []}",Served
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,,HR DIRECTOR Summary Over 20 years experience i...,"{'emails': [], 'phones': [], 'linkedin': []}",ASHHRA


In [2]:
# Example skill set (expand as needed)
skills_list = [
    'Python', 'SQL', 'Excel', 'Power BI', 'Tableau', 'Machine Learning',
    'Deep Learning', 'NLP', 'R', 'Java', 'C++', 'Communication',
    'Leadership', 'HR', 'Recruitment', 'Project Management'
]

# Compile regex pattern for faster matching
skills_pattern = re.compile(r'\b(' + '|'.join(skills_list) + r')\b', flags=re.IGNORECASE)

def extract_skills(text):
    if not isinstance(text, str):
        return []
    return list(set(match.group(0).title() for match in skills_pattern.finditer(text)))

# Apply to clean_text
df['skills'] = df['clean_text'].progress_map(extract_skills)

# Quick check
df[['ID','name','skills']].head(10)


100%|██████████| 2710/2710 [00:01<00:00, 1860.70it/s]


Unnamed: 0,ID,name,skills
0,16852973,Highlights Focused,"[Leadership, Hr]"
1,22323967,Served,"[Recruitment, Project Management, Communicatio..."
2,33176873,ASHHRA,"[Recruitment, Leadership, Hr, Project Manageme..."
3,27018550,,"[Communication, Excel, Hr]"
4,17812897,Skill Highlights,"[Recruitment, Leadership, Hr, Project Manageme..."
5,11592605,Maintained,"[Recruitment, Excel, Hr]"
6,25824789,Mandated Training,"[Leadership, Project Management, Hr]"
7,15375009,"management, vendor","[Recruitment, Project Management, Communicatio..."
8,11847784,,"[Recruitment, Leadership, Communication, Hr, P..."
9,32896934,,"[Recruitment, R, Hr]"


In [1]:
from sentence_transformers import SentenceTransformer

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Test embedding
sample = "Python, SQL, Machine Learning"
embedding = model.encode(sample)
print("Embedding vector length:", len(embedding))


  from .autonotebook import tqdm as notebook_tqdm


Embedding vector length: 384


In [2]:
df['skills_embedding'] = df['skills_str'].progress_map(lambda x: model.encode(x))


NameError: name 'df' is not defined

In [3]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Load processed resumes with skills
csv_path = r"C:\Users\abanu\Documents\T-IQ\data\processed\resumes_skills.csv"
df = pd.read_csv(csv_path)

# Optional: recreate 'skills_str' column
df['skills_str'] = df['skills'].fillna('').astype(str)

# Re-import model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\abanu\\Documents\\T-IQ\\data\\processed\\resumes_skills.csv'

In [4]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

csv_path = r"C:\Users\abanu\Documents\T-IQ\data\processed\resumes_parsed.csv"
df = pd.read_csv(csv_path)

# Recreate skills_str column (empty for now)
df['skills_str'] = ''


In [5]:
df['skills_embedding'] = df['skills_str'].progress_map(lambda x: model.encode(x))


100%|██████████| 2710/2710 [00:30<00:00, 88.39it/s] 


In [6]:
df[['ID','name','skills','skills_str']].head(10)


KeyError: "['skills'] not in index"

In [7]:
import re
from tqdm import tqdm
tqdm.pandas()

# Define your skill set
skills_list = [
    'Python', 'SQL', 'Excel', 'Power BI', 'Tableau', 'Machine Learning',
    'Deep Learning', 'NLP', 'R', 'Java', 'C++', 'Communication',
    'Leadership', 'HR', 'Recruitment', 'Project Management'
]

# Compile regex for faster matching
skills_pattern = re.compile(r'\b(' + '|'.join(skills_list) + r')\b', flags=re.IGNORECASE)

# Function to extract skills
def extract_skills(text):
    if not isinstance(text, str):
        return []
    return list(set(match.group(0).title() for match in skills_pattern.finditer(text)))

# Apply to clean_text
df['skills'] = df['clean_text'].progress_map(extract_skills)

# Optional: create skills_str for embeddings
df['skills_str'] = df['skills'].apply(lambda x: ', '.join(x) if x else '')

# Verify
df[['ID','name','skills','skills_str']].head(10)


100%|██████████| 2710/2710 [00:01<00:00, 1406.39it/s]


Unnamed: 0,ID,name,skills,skills_str
0,16852973,Highlights Focused,"[Hr, Leadership]","Hr, Leadership"
1,22323967,Served,"[Hr, Recruitment, Communication, Project Manag...","Hr, Recruitment, Communication, Project Manage..."
2,33176873,ASHHRA,"[Project Management, Excel, Leadership, Hr, Re...","Project Management, Excel, Leadership, Hr, Rec..."
3,27018550,,"[Hr, Excel, Communication]","Hr, Excel, Communication"
4,17812897,Skill Highlights,"[Project Management, Excel, Leadership, Hr, Re...","Project Management, Excel, Leadership, Hr, Rec..."
5,11592605,Maintained,"[Hr, Excel, Recruitment]","Hr, Excel, Recruitment"
6,25824789,Mandated Training,"[Hr, Project Management, Leadership]","Hr, Project Management, Leadership"
7,15375009,"management, vendor","[Hr, Recruitment, Communication, Project Manag...","Hr, Recruitment, Communication, Project Manage..."
8,11847784,,"[Communication, Project Management, Excel, Lea...","Communication, Project Management, Excel, Lead..."
9,32896934,,"[Hr, R, Recruitment]","Hr, R, Recruitment"


In [8]:
from sentence_transformers import SentenceTransformer

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each resume
df['skills_embedding'] = df['skills_str'].progress_map(lambda x: model.encode(x))


100%|██████████| 2710/2710 [00:37<00:00, 73.01it/s]


In [9]:
import pickle

# Save CSV without embeddings
df.drop(columns=['skills_embedding']).to_csv(r"C:\Users\abanu\Documents\T-IQ\data\processed\resumes_skills.csv", index=False)

# Save embeddings separately
with open(r"C:\Users\abanu\Documents\T-IQ\data\processed\skills_embeddings.pkl", "wb") as f:
    pickle.dump(df['skills_embedding'].tolist(), f)

print("✅ Skills CSV and embeddings saved successfully.")


✅ Skills CSV and embeddings saved successfully.
