In [2]:
from pathlib import Path

import pandas as pd

# Folder that holds the project CSVs. Defined once so the notebook can be
# pointed at a different copy of the data by editing a single line.
# (The previous job-description path contained a doubled backslash inside a
# raw string, i.e. a literal `C:\\Users`, which did not match the resumes path.)
DATA_DIR = Path(r'C:\Users\User\Desktop\Python Programming\NLP\group_project\NLP_project')

# Load the two input tables used by the cells below.
job_description = pd.read_csv(DATA_DIR / 'job_descriptions.csv')
resumes = pd.read_csv(DATA_DIR / 'resumes.csv')

In [7]:
# Print the column labels of each input table, resumes first.
for frame in (resumes, job_description):
    print(frame.columns)

Index(['ID', 'Professional_Summary', 'Work_Experience', 'Projects',
       'Certifications', 'Education', 'Skills', 'resume_text'],
      dtype='object')
Index(['ID', 'Company_Overview', 'Responsibilities', 'Required_Qualifications',
       'Preferred_Qualifications'],
      dtype='object')


## Keyword extraction
### TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build one free-text field per resume by joining its sections with single
# spaces; missing sections (NaN) are replaced by empty strings first.
resumes['resume_text'] = (
    resumes['Professional_Summary'].fillna('') + ' ' +
    resumes['Work_Experience'].fillna('') + ' ' +
    resumes['Projects'].fillna('') + ' ' +
    resumes['Certifications'].fillna('') + ' ' +
    resumes['Education'].fillna('') + ' ' +
    resumes['Skills'].fillna('')
)

# Same treatment for the job postings.
job_description['job_description'] = (
    job_description['Company_Overview'].fillna('') + ' ' +
    job_description['Responsibilities'].fillna('') + ' ' +
    job_description['Required_Qualifications'].fillna('') + ' ' +
    job_description['Preferred_Qualifications'].fillna('')
)

# Plain-string views of the combined text, used as vectorizer input.
resumes_clean = resumes['resume_text'].astype(str)
jobs_clean = job_description['job_description'].astype(str)

# Upper bound on the vocabulary size kept for each corpus.
TFIDF_MAX_FEATURES = 100

# One vectorizer per corpus. Re-fitting a single vectorizer on the jobs would
# discard the resume vocabulary, leaving no fitted object whose feature order
# matches the columns of `tfidf_resumes`.
resume_vectorizer = TfidfVectorizer(stop_words='english', max_features=TFIDF_MAX_FEATURES)
tfidf_resumes = resume_vectorizer.fit_transform(resumes_clean)
resume_keywords = resume_vectorizer.get_feature_names_out()

job_vectorizer = TfidfVectorizer(stop_words='english', max_features=TFIDF_MAX_FEATURES)
tfidf_jobs = job_vectorizer.fit_transform(jobs_clean)
job_keywords = job_vectorizer.get_feature_names_out()

# Backward compatibility: this cell used to leave `vectorizer` fitted on the
# job descriptions, so keep that name pointing at the job vectorizer.
vectorizer = job_vectorizer



In [3]:
import yake
kw_extractor = yake.KeywordExtractor(top=5)


def _join_text_columns(frame, columns):
    """Concatenate the named columns row-wise, separated by single spaces.

    Missing values are replaced with '' before joining.
    """
    combined = frame[columns[0]].fillna('')
    for column in columns[1:]:
        combined = combined + ' ' + frame[column].fillna('')
    return combined


def _keyword_phrases(text):
    """Return only the phrases (scores dropped) that `kw_extractor` finds in `text`."""
    return [phrase for phrase, _score in kw_extractor.extract_keywords(text)]


# Sections that make up the combined text of a resume / a job posting, in join order.
resume_sections = [
    'Professional_Summary', 'Work_Experience', 'Projects',
    'Certifications', 'Education', 'Skills',
]
job_sections = [
    'Company_Overview', 'Responsibilities',
    'Required_Qualifications', 'Preferred_Qualifications',
]

resumes['resume_text'] = _join_text_columns(resumes, resume_sections)
job_description['job_description'] = _join_text_columns(job_description, job_sections)

# Plain-string views of the combined text.
resumes_clean = resumes['resume_text'].astype(str)
jobs_clean = job_description['job_description'].astype(str)

# One list of keyword phrases per row: jobs first, then resumes.
job_keywords_yake = jobs_clean.apply(_keyword_phrases)
resume_keywords_yake = resumes_clean.apply(_keyword_phrases)

def find_missing_keywords(job_kw, resume_kw):
    """Return the job keywords that do not appear among the resume keywords.

    Comparison is by exact (case-sensitive) equality. The result keeps the
    order in which keywords first appear in ``job_kw`` and contains each
    keyword at most once, so repeated runs produce identical output (a plain
    ``list(set(...))`` yields an arbitrary order that can change between runs).

    Parameters:
        job_kw: iterable of hashable keywords extracted from a job description.
        resume_kw: iterable of hashable keywords extracted from a resume.

    Returns:
        list of keywords present in ``job_kw`` but absent from ``resume_kw``.
    """
    present = set(resume_kw)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [kw for kw in dict.fromkeys(job_kw) if kw not in present]

# Example of a one-to-one comparison: job i is compared with resume i.
# Pairing is positional (zip) rather than by index label, so it does not raise
# KeyError when there are fewer resumes than jobs or when either Series has a
# non-default index; iteration stops at the shorter of the two.
for i, (job_kw, resume_kw) in enumerate(zip(job_keywords_yake, resume_keywords_yake)):
    missing = find_missing_keywords(job_kw, resume_kw)
    print(f"Missing in Resume {i} for Job {i}: {missing}")

Missing in Resume 0 for Job 0: ['Finance company focused', 'Mentor junior staff', 'Organic empowering architecture', 'digital marketing specialist', 'Implement data-driven approaches']
Missing in Resume 1 for Job 1: ['customer loyalty Stay', 'Centralized reciprocal customer', 'Implement data-driven approaches', 'Manufacturing company focused', 'Lead cross-functional teams']
Missing in Resume 2 for Job 2: ['Universal dynamic forecast', 'dynamic forecast Implement', 'Implement data-driven approaches', 'Creative company focused', 'forecast Implement data-driven']
Missing in Resume 3 for Job 3: ['Centralized asynchronous focus', 'Mentor junior staff', 'Hospitality company focused', 'Implement data-driven approaches', 'group Mentor junior']
Missing in Resume 4 for Job 4: ['Synchronized homogeneous system', 'Mentor junior staff', 'technical support specialist', 'Creative company focused', 'Lead cross-functional teams']
Missing in Resume 5 for Job 5: ['machine learning engineer', 'Education c

In [4]:
# Missing keywords for each (job i, resume i) pair. Pairing is positional so it
# does not depend on the Series' index labels and cannot raise KeyError when
# there are fewer resumes than jobs; it stops at the shorter of the two.
missing_per_job = [
    find_missing_keywords(job_kw, resume_kw)
    for job_kw, resume_kw in zip(job_keywords_yake, resume_keywords_yake)
]

output_df = pd.DataFrame({
    # Keep only the IDs of the jobs that were actually compared, so both
    # columns have the same length.
    'job_id': job_description['ID'].iloc[:len(missing_per_job)],
    'missing_keywords': missing_per_job
})

# Persist the summary next to the notebook, without the row index.
output_df.to_csv("missing_keywords_summary.csv", index=False)
