In [2]:
import pandas as pd
import spacy
import re

nlp = spacy.load("en_core_web_sm")

In [2]:
df = pd.read_csv("data/dataset_resume/Resume/Resume.csv")

In [3]:
keywords = [
    "experience", "project", "management", "responsibility",
    "work history", "work experience", "job description",
    "role", "tasks", "positions", "certification", "abilities",
    "technical skills", "summary", "profile", "accomplishments"
]

In [4]:
print("Missing Values")
print(df.isnull().sum())

Missing Values
ID             0
Resume_str     0
Resume_html    0
Category       0
dtype: int64


In [5]:

df = df.drop_duplicates(subset="ID")
df

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR
...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION


In [6]:
def clean_html(text):
    text = re.sub(r"<[^>]+>", " ", str(text))
    return re.sub(r"\s+", " ", text).strip()

In [60]:
def extract_relevant_sentences(text):
    if pd.isna(text): return []
    text = clean_html(text)
    doc = nlp(text)

    
    keywords = ["experience", "skills", "expertise", "knowledge", "familiar", "proficient"]
    relevant_sentences = []

    for sent in doc.sents:
        sentence_text = sent.text.strip()
        sentence_lower = sentence_text.lower()
        for keyword in keywords:
            if keyword in sentence_lower:
                relevant_sentences.append(sentence_text)
                break
    return relevant_sentences

In [8]:
df.loc[0:3, "Relevant Sentences"] = df.loc[0:3, "Resume_html"].apply(extract_relevant_sentences)

In [9]:
print(df["Relevant Sentences"].head())

df["Relevant Sentences"].to_csv("relevant.csv",index=False)

0    [HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1    [HR SPECIALIST, US HR OPERATIONS Summary Versa...
2    [HR DIRECTOR Summary Over 20 years experience ...
3    [HR SPECIALIST Summary Dedicated, Driven, and ...
4                                                  NaN
Name: Relevant Sentences, dtype: object


In [10]:
df.loc[4:50, "Relevant Sentences"] = df.loc[4:50, "Resume_html"].apply(extract_relevant_sentences)

In [11]:
print(df["Relevant Sentences"].head(10))

df["Relevant Sentences"].to_csv("relevant.csv",index=False)

0    [HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1    [HR SPECIALIST, US HR OPERATIONS Summary Versa...
2    [HR DIRECTOR Summary Over 20 years experience ...
3    [HR SPECIALIST Summary Dedicated, Driven, and ...
4    [HR MANAGER Skill Highlights HR SKILLS HR Depa...
5    [HR GENERALIST Summary Dedicated and focused A...
6    [HR MANAGER Summary HUMAN RESOURCES MANAGER Ex...
7    [HR MANAGER Professional Summary Senior HR pro...
8    [HR SPECIALIST Summary Possess 15+ years of ex...
9    [HR CLERK Summary Translates business vision i...
Name: Relevant Sentences, dtype: object


In [12]:
df

Unnamed: 0,ID,Resume_str,Resume_html,Category,Relevant Sentences
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST, US HR OPERATIONS Summary Versa..."
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR DIRECTOR Summary Over 20 years experience ...
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST Summary Dedicated, Driven, and ..."
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Skill Highlights HR SKILLS HR Depa...
...,...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,


In [13]:
batch_size = 50
for start in range(51, len(df), batch_size):
    end = min(start + batch_size, len(df))
    print(f"Processing {start} to {end - 1}")

    df.loc[start:end - 1, "Relevant Sentences"] = df.loc[start:end - 1, "Resume_html"].apply(extract_relevant_sentences)


Processing 51 to 100
Processing 101 to 150
Processing 151 to 200
Processing 201 to 250
Processing 251 to 300
Processing 301 to 350
Processing 351 to 400
Processing 401 to 450
Processing 451 to 500
Processing 501 to 550
Processing 551 to 600
Processing 601 to 650
Processing 651 to 700
Processing 701 to 750
Processing 751 to 800
Processing 801 to 850
Processing 851 to 900
Processing 901 to 950
Processing 951 to 1000
Processing 1001 to 1050
Processing 1051 to 1100
Processing 1101 to 1150
Processing 1151 to 1200
Processing 1201 to 1250
Processing 1251 to 1300
Processing 1301 to 1350
Processing 1351 to 1400
Processing 1401 to 1450
Processing 1451 to 1500
Processing 1501 to 1550
Processing 1551 to 1600
Processing 1601 to 1650
Processing 1651 to 1700
Processing 1701 to 1750
Processing 1751 to 1800
Processing 1801 to 1850
Processing 1851 to 1900
Processing 1901 to 1950
Processing 1951 to 2000
Processing 2001 to 2050
Processing 2051 to 2100
Processing 2101 to 2150
Processing 2151 to 2200
Proces

In [14]:
df.head(50)

Unnamed: 0,ID,Resume_str,Resume_html,Category,Relevant Sentences
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST, US HR OPERATIONS Summary Versa..."
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR DIRECTOR Summary Over 20 years experience ...
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST Summary Dedicated, Driven, and ..."
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Skill Highlights HR SKILLS HR Depa...
5,11592605,HR GENERALIST Summary Dedic...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR GENERALIST Summary Dedicated and focused A...
6,25824789,HR MANAGER Summary HUMAN RES...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Summary HUMAN RESOURCES MANAGER Ex...
7,15375009,HR MANAGER Professional Summa...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Professional Summary Senior HR pro...
8,11847784,HR SPECIALIST Summary Posses...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR SPECIALIST Summary Possess 15+ years of ex...
9,32896934,HR CLERK Summary Translates ...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR CLERK Summary Translates business vision i...


In [15]:
df.to_csv(r"D:\Project7\Career-Link\ml_model\data\Resume_processed.csv", index=False)
print("New CSV successfully saved.")


New CSV successfully saved.


In [33]:
skills_domain = [
    "python", "java", "c++", "html", "css", "javascript", "react", "angular",
    "sql", "mongodb", "postgresql", "aws", "azure", "google cloud",
    "network administration", "cybersecurity", "sdlc", "git", "svn", "windows",
    "linux", "macos", "technical support", "devops", "patient care",
    "electronic health records", "ehr", "pharmacology", "hipaa", "diagnostic testing",
    "patient education", "budgeting & forecasting", "investment analysis",
    "risk management", "financial reporting", "auditing", "market research", "data entry",
    "bookkeeping", "accounts payable", "accounts receivable", "payroll processing",
    "customer relationship management", "statistical analysis", "data visualization",
    "tableau", "power bi", "predictive modeling", "machine learning", "data cleaning",
    "report generation", "business intelligence", "a/b testing", "excel", "active listening",
    "empathy", "troubleshooting", "complaint resolution", "product knowledge",
    "ticketing systems", "phone etiquette", "email communication", "live chat support",
    "service recovery", "digital marketing", "seo", "sem", "smm", "content marketing",
    "email marketing", "campaign management", "brand management", "public relations",
    "advertising", "google ads", "facebook ads", "google analytics", "copywriting",
    "hr", "spss", "sas", "hypothesis testing", "regression analysis", "anova",
    "probability theory", "experimental design", "data interpretation",
    "sampling techniques", "quantitative research", "statistical modeling",
    "project planning", "resource allocation", "team leadership",
    "stakeholder management", "agile methodologies", "scrum", "gantt charts", "jira",
    "asana", "trello", "verbal communication", "written communication",
    "presentation skills", "public speaking", "interpersonal skills",
    "technical writing", "critical thinking", "analytical skills",
    "root cause analysis", "decision making", "creative solutions",
    "strategic thinking", "resourcefulness", "motivation", "mentoring", "delegation",
    "performance management", "coaching", "strategic vision", "time management",
    "prioritization", "multitasking", "attention to detail", "record keeping",
    "meeting coordination", "filing systems", "workflow optimization", "r",
    "photoshop", "adobe photoshop", "final cut pro", "illustrator", "microsoft office",
    "figma", "accounting", "client relations", "data analysis", "customer service",
    "marketing", "statistics", "project management", "financial analysis",
    "communication", "problem-solving", "programming", "python", "java", "c++",
    "nlp", "tensorflow", "pytorch", "analytic skills", "leadership", "teamwork",
    "collaboration", "debugging", "testing", "agile", "databases", "networking",
    "cloud computing", "project coordination", "decision making", "conflict resolution",
    "creative thinking", "research", "report writing", "self-motivated", "self-starter",
    "photoshop", "wordpress"
]


In [68]:
def add_skills_to_sentences(sentences):
    matched= []
    text_lower = " ".join(sentences).lower()
    for skill in skills_domain:
        skill_lower = skill.lower()
        if re.search(rf'\b{re.escape(skill_lower)}\b', text_lower):
            matched.append(skill)
    return matched

df_reader = pd.read_csv("data/dataset_resume/Resume/Resume.csv",chunksize=100)


for i,chunk in enumerate(df_reader):
    print(f"Processing chunk {i+1}")
    
    # Extract relevant sentences from Resume_html
    chunk["Relevant Sentences"] = chunk["Resume_html"].apply(extract_relevant_sentences)   

    # Extract skills from relevant sentences
    chunk["matched_skills"] = chunk["Relevant Sentences"].apply(add_skills_to_sentences)

    # Merge both into raw_data column
    chunk["raw_data"] = chunk.apply(lambda row: (row["Relevant Sentences"], row["matched_skills"]), axis=1)

    # Save to CSV (first chunk with header, rest without)
    chunk[["raw_data"]].to_csv("relevant.csv", mode="a", index=False, header=(i==0))


    

    


Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
500    [Skills Word programs (including Excel and Acc...
501    [Skills Superb sales professional Store planni...
502    [CLINICAL SERVICE ADVOCATE Professional Summar...
503    [University of Phoenix 2011 Bachelor of Scienc...
504    [CUSTOMER SERVICE ADVOCATE Summary Enthusiasti...
Name: Relevant Sentences, dtype: object
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25


In [32]:
import ast

df= pd.read_csv("relevant.csv")

def cleaned_text(raw):
    
# raw =df["raw_data"][800]
    parsed = ast.literal_eval(raw)

    cleaned_sentences = [re.sub(r'\\', '', s) for s in parsed[0]]
    cleaned_skills = [re.sub(r'\\', '', s) for s in parsed[1]]

    cleaned_tuple = (cleaned_sentences, cleaned_skills)
    return cleaned_tuple
# print(cleaned_tuple)

df["raw_data"] = df["raw_data"].apply(cleaned_text)
df["raw_data"].to_csv("raw_data.csv")


In [33]:
df["raw_data"][800]

(['Summary Enthusiastic customer service/telesales representative with in-depth knowledge of sales, account management and training.',
  'Highlights High customer service standards Dedicated to process improvement Strong problem solving ability Strong organizational skills Active listening skills Seasoned in conflict resolution Energetic work attitude Adaptive team player Self-motivated Excellent communication skills Natural leader Thrives under pressure Fast learning Customer friendly Coordination skills Experience Manager / Fitness Instructor January 2014 to Current Company Name － City , State Cultivated positive relationships with participants by interacting with them during group fitness classes.',
  'Cocktail Waitress July 2013 to October 2013 Company Name － City , State Maintained updated knowledge of all menu items, specials, liquor brands, beers and non-alcoholic selections.',
  'Continually provided exceptional service to customers by being friendly, knowledgeable and accommod

In [40]:

df = pd.read_csv("raw_data.csv",names=["raw_data"])


def cleaned_text(raw):
    try:
        # Only parse if raw is a string
        if isinstance(raw, str):
            parsed = ast.literal_eval(raw)
        else:
            return ("", [])  # fallback for non-string rows

        # Clean slashes
        cleaned_sentences = [re.sub(r'\\', '', s) for s in parsed[0]]
        cleaned_skills = [re.sub(r'\\', '', s) for s in parsed[1]]

        # Join sentences and remove single/double quotes
        joined_sentences = " ".join(cleaned_sentences)
        joined_sentences = re.sub(r"[\"']", '', joined_sentences)

        return (joined_sentences, cleaned_skills)
    
    except (ValueError, SyntaxError, TypeError) as e:
        print(f"Skipping bad row: {raw} — {e}")
        return ("", [])

# Apply to 'raw_data' column
df["raw_data"] = df["raw_data"].apply(cleaned_text)

# Save cleaned output
df["raw_data"].to_csv("raw_data.csv", index=False)

    
               


Skipping bad row: raw_data — malformed node or string on line 1: <ast.Name object at 0x0000025DB3D41910>


In [43]:
print(df["raw_data"][0])

('HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMINISTRATOR Summary Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management. Experience HR Administrator/Marketing Associate HR Administrator Dec 2013 to Current Company Name － City , State Helps to develop policies, directs and coordinates activities such as employment, compensation, labor relations, benefits, training, and employee services. Required to have organizational and analytical skills as well as computer skills, knowledge of medical terminology and procedures, statistics, billing standards, data analysis and laws regarding medical billing. studies 1998 Sainte Genevieve Senior High － City , State Awarded American Shrubel Leadership Scholarship to Jefferson College Skills Accounting, ads, advertising, analytical skills, benefits, billing, budgeting, clients, Customer Service, data analysis, delivery, documentation, employee relations, financial management, government relations, 

In [47]:
import pandas as pd
import ast

# Read the cleaned CSV
df = pd.read_csv("raw_data.csv")

# Create list to hold the tuples
raw_data = []

for i, row in df.iterrows():
    try:
        value = row.iloc[0]  # ✅ safe and future-proof
        if isinstance(value, str):
            parsed = ast.literal_eval(value)
            raw_data.append(parsed)
    except Exception as e:
        print(f"Error at row {i}: {e}")

# Preview output
print(raw_data[1:3])  # Show first 2 items


[('HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMINISTRATOR Summary Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management. Experience HR Administrator/Marketing Associate HR Administrator Dec 2013 to Current Company Name － City , State Helps to develop policies, directs and coordinates activities such as employment, compensation, labor relations, benefits, training, and employee services. Required to have organizational and analytical skills as well as computer skills, knowledge of medical terminology and procedures, statistics, billing standards, data analysis and laws regarding medical billing. studies 1998 Sainte Genevieve Senior High － City , State Awarded American Shrubel Leadership Scholarship to Jefferson College Skills Accounting, ads, advertising, analytical skills, benefits, billing, budgeting, clients, Customer Service, data analysis, delivery, documentation, employee relations, financial management, government relations,

In [54]:
def create_training_example( text, skill_list):
    entities = []
    text_lower = text.lower()

    for skill in skill_list:
        skill_lower = skill.lower()
        start = text_lower.find(skill_lower)
        if start != -1:
            end = start + len(skill)
            entities.append((start, end, "SKILL"))

    return (text, {"entities": entities})


Train_data = [
    create_training_example(text, skills)
    for text, skills in raw_data
]

print(Train_data[800])

('GROUP FITNESS INSTRUCTOR Executive Summary To obtain a position as an experienced Training and Development professional with strong leadership and relationship-building skills. Core Qualifications Team Building Team Leadership Communication Skills Planning Organizational Skills Professional Experience Group Fitness Instructor March 2014 to April 2014 Company Name An 8 week course getting trained in fitness classes to instruct group exercises for the on campus gym. Texas Tech Languages Speak and read basic Spanish Skills basic, Coach, Human Resource, Leadership, read, Spanish, Teaching, Time Management', {'entities': [(211, 226, 'SKILL'), (593, 608, 'SKILL'), (227, 240, 'SKILL'), (132, 142, 'SKILL')]})


In [61]:
import spacy
from spacy.training.example import Example
from spacy.training import offsets_to_biluo_tags
from spacy.tokens import DocBin
import random

for loops in range(2):
    random.shuffle(Train_data)
    losses = {}
    for text, annotations in Train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    # print(f"Loop {loops+1} - Loss: {losses['ner']:.4f}")

model_dir = "skill_ner_model"
nlp.to_disk(model_dir)
print(f"\nModel saved to: {model_dir}")


ValueError: [E103] Trying to set conflicting doc.ents: '(253, 274, 'SKILL')' and '(261, 274, 'SKILL')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.

In [35]:
batch_size = 100

for start in range(4, len(df), batch_size):  # adjust range as needed
    end = min(start + batch_size, len(df))
    print(f"Processing skills for resumes {start} to {end - 1}")

    matched_skills_list = []

    for row in df["Relevant Sentences"]:
        sentences = " ".join(row)
        matched_skills = add_skills_to_sentences(sentences)
        matched_skills_list.append(match_skills)
print(matched_skills_list)

    
  

Processing skills for resumes 4 to 103


KeyError: 'Relevant Sentences'

In [36]:
import pprint
#pprint.pprint(df.loc[0, "Relevant Sentences"])
df.to_csv("raw_data.csv",index=False)

In [26]:
dg = pd.read_csv("raw_data.csv")
print(dg["Relevant Sentences"][800])

['excel', 'active listening', 'hr', 'r', 'customer service', 'communication', 'conflict resolution', 'self-motivated']


In [25]:

import pprint

pprint.pprint(df['Relevant Sentences'][800])

['Summary Enthusiastic customer service/telesales representative with in-depth '
 'knowledge of sales, account management and training.',
 'Highlights High customer service standards Dedicated to process improvement '
 'Strong problem solving ability Strong organizational skills Active listening '
 'skills Seasoned in conflict resolution Energetic work attitude Adaptive team '
 'player Self-motivated Excellent communication skills Natural leader Thrives '
 'under pressure Fast learning Customer friendly Coordination skills '
 'Experience Manager / Fitness Instructor January 2014 to Current Company Name '
 '－ City , State Cultivated positive relationships with participants by '
 'interacting with them during group fitness classes.',
 'Monitored guest for intoxication and immediately reported concerns to '
 'management.',
 'Maintained a positive working relationship with fellow staff and management.',
 ['excel',
  'active listening',
  'hr',
  'r',
  'customer service',
  'communication'