In [1]:
import html
import re

import pandas as pd
import spacy

# Load the `en_core_web_sm` spaCy pipeline once; extract_relevant_sentences
# calls it to split each resume into sentences.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the resume dataset; the path is relative to the current working directory.
df = pd.read_csv("data/dataset_resume/Resume/Resume.csv")

In [3]:
# Lower-case substrings searched for in each sentence by
# extract_relevant_sentences; a sentence is kept if it contains any of them.
keywords = [
    # work history and duties
    "experience",
    "project",
    "management",
    "responsibility",
    "work history",
    "work experience",
    "job description",
    "role",
    "tasks",
    "positions",
    # qualifications and overview sections
    "certification",
    "abilities",
    "technical skills",
    "summary",
    "profile",
    "accomplishments",
]

In [4]:
# Print a heading followed by the number of null cells in each column.
print("Missing Values", df.isna().sum(), sep="\n")

Missing Values
ID             0
Resume_str     0
Resume_html    0
Category       0
dtype: int64


In [5]:
# Keep the first row for each resume ID, then renumber the rows 0..n-1.
# Later cells address rows through df.loc with integer label ranges; without
# the reset, every dropped duplicate would leave a gap in the labels, and any
# row whose label is >= len(df) would never be reached.
df = df.drop_duplicates(subset="ID").reset_index(drop=True)
df

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR
...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION


In [6]:
def clean_html(text):
    """Return the visible text of an HTML fragment as a single-spaced string.

    Args:
        text: HTML markup; non-string values are converted with ``str()``.

    Returns:
        The input with tags removed, character references (``&amp;``,
        ``&nbsp;``, ``&#39;`` ...) decoded, and every run of whitespace
        collapsed to one space, with no leading or trailing whitespace.
    """
    # Tags are replaced by a space so words in adjacent elements do not fuse.
    without_tags = re.sub(r"<[^>]+>", " ", str(text))
    # Decode entities only after tag removal: an escaped "&lt;b&gt;" is literal
    # text and must not be mistaken for a tag and stripped.
    decoded = html.unescape(without_tags)
    # \s is Unicode-aware for str patterns, so the no-break space produced by
    # "&nbsp;" is collapsed together with ordinary whitespace.
    return re.sub(r"\s+", " ", decoded).strip()

In [7]:
def extract_relevant_sentences(text):
    """Return the sentences of an HTML resume that mention any keyword.

    Args:
        text: Resume HTML, or a missing value (``None`` / ``NaN``).

    Returns:
        A list of stripped sentence strings, in document order, whose
        lower-cased text contains at least one entry of the module-level
        ``keywords`` list as a substring. Missing or empty input gives ``[]``.
    """
    if pd.isna(text):
        return []
    cleaned = clean_html(text)
    if not cleaned:
        # Nothing is left once the markup is stripped; skip the spaCy pipeline.
        return []
    relevant = []
    for sent in nlp(cleaned).sents:
        sentence = sent.text
        # Lower-case once per sentence instead of once per keyword.
        lowered = sentence.lower()
        if any(k in lowered for k in keywords):
            relevant.append(sentence.strip())
    return relevant

In [8]:
# Trial run on the rows labelled 0-3 (.loc slices include both endpoints).
df.loc[0:3, "Relevant Sentences"] = (
    df.loc[0:3, "Resume_html"]
    .apply(extract_relevant_sentences)
)

In [9]:
# Preview the first five entries of the new column.
print(df["Relevant Sentences"].head(n=5))

# Write the column on its own, without the index, to relevant.csv.
df["Relevant Sentences"].to_csv("relevant.csv", index=False)

0    [HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1    [HR SPECIALIST, US HR OPERATIONS Summary Versa...
2    [HR DIRECTOR Summary Over 20 years experience ...
3    [HR SPECIALIST Summary Dedicated, Driven, and ...
4                                                  NaN
Name: Relevant Sentences, dtype: object


In [10]:
# Extend the run to the rows labelled 4-50 (both endpoints included).
df.loc[4:50, "Relevant Sentences"] = (
    df.loc[4:50, "Resume_html"]
    .apply(extract_relevant_sentences)
)

In [11]:
# Preview the first ten entries of the column.
print(df["Relevant Sentences"].head(n=10))

# Overwrite relevant.csv with the column as it stands now (no index).
df["Relevant Sentences"].to_csv("relevant.csv", index=False)

0    [HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1    [HR SPECIALIST, US HR OPERATIONS Summary Versa...
2    [HR DIRECTOR Summary Over 20 years experience ...
3    [HR SPECIALIST Summary Dedicated, Driven, and ...
4    [HR MANAGER Skill Highlights HR SKILLS HR Depa...
5    [HR GENERALIST Summary Dedicated and focused A...
6    [HR MANAGER Summary HUMAN RESOURCES MANAGER Ex...
7    [HR MANAGER Professional Summary Senior HR pro...
8    [HR SPECIALIST Summary Possess 15+ years of ex...
9    [HR CLERK Summary Translates business vision i...
Name: Relevant Sentences, dtype: object


In [12]:
# Display the frame; rows that have not been processed yet have no value in
# "Relevant Sentences".
df

Unnamed: 0,ID,Resume_str,Resume_html,Category,Relevant Sentences
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST, US HR OPERATIONS Summary Versa..."
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR DIRECTOR Summary Over 20 years experience ...
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST Summary Dedicated, Driven, and ..."
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Skill Highlights HR SKILLS HR Depa...
...,...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,


In [13]:
batch_size = 50
# Rows labelled 0-50 were filled in by the earlier cells, so continue with every
# label above 50. Batches are cut from the actual index labels rather than from
# range(len(df)): .loc is label-based, so positional bounds silently skip rows
# whenever the labels are not exactly 0..len(df)-1 (e.g. after rows are dropped).
remaining_labels = df.index[df.index > 50]
for offset in range(0, len(remaining_labels), batch_size):
    batch_labels = remaining_labels[offset:offset + batch_size]
    print(f"Processing {batch_labels[0]} to {batch_labels[-1]}")

    df.loc[batch_labels, "Relevant Sentences"] = df.loc[batch_labels, "Resume_html"].apply(extract_relevant_sentences)


Processing 51 to 100
Processing 101 to 150
Processing 151 to 200
Processing 201 to 250
Processing 251 to 300
Processing 301 to 350
Processing 351 to 400
Processing 401 to 450
Processing 451 to 500
Processing 501 to 550
Processing 551 to 600
Processing 601 to 650
Processing 651 to 700
Processing 701 to 750
Processing 751 to 800
Processing 801 to 850
Processing 851 to 900
Processing 901 to 950
Processing 951 to 1000
Processing 1001 to 1050
Processing 1051 to 1100
Processing 1101 to 1150
Processing 1151 to 1200
Processing 1201 to 1250
Processing 1251 to 1300
Processing 1301 to 1350
Processing 1351 to 1400
Processing 1401 to 1450
Processing 1451 to 1500
Processing 1501 to 1550
Processing 1551 to 1600
Processing 1601 to 1650
Processing 1651 to 1700
Processing 1701 to 1750
Processing 1751 to 1800
Processing 1801 to 1850
Processing 1851 to 1900
Processing 1901 to 1950
Processing 1951 to 2000
Processing 2001 to 2050
Processing 2051 to 2100
Processing 2101 to 2150
Processing 2151 to 2200
Proces

In [14]:
# Spot-check the first 50 rows, including the new "Relevant Sentences" column.
df.head(50)

Unnamed: 0,ID,Resume_str,Resume_html,Category,Relevant Sentences
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMIN...
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST, US HR OPERATIONS Summary Versa..."
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR DIRECTOR Summary Over 20 years experience ...
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[HR SPECIALIST Summary Dedicated, Driven, and ..."
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Skill Highlights HR SKILLS HR Depa...
5,11592605,HR GENERALIST Summary Dedic...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR GENERALIST Summary Dedicated and focused A...
6,25824789,HR MANAGER Summary HUMAN RES...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Summary HUMAN RESOURCES MANAGER Ex...
7,15375009,HR MANAGER Professional Summa...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR MANAGER Professional Summary Senior HR pro...
8,11847784,HR SPECIALIST Summary Posses...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR SPECIALIST Summary Possess 15+ years of ex...
9,32896934,HR CLERK Summary Translates ...,"<div class=""fontsize fontface vmargins hmargin...",HR,[HR CLERK Summary Translates business vision i...


In [16]:
# Save under data/ using a path relative to the working directory, like the
# read_csv call that loaded the dataset; the previous absolute D:\ path only
# resolved on one machine.
# NOTE(review): assumes the notebook is run from the ml_model directory, so this
# is the same file as before - confirm.
# NOTE(review): "Relevant Sentences" holds Python lists, which CSV can only
# store as text; a reader of this file will need to parse them back.
df.to_csv("data/Resume_processed.csv", index=False)
print("New CSV successfully saved.")


New CSV successfully saved.
