In [1]:
import pandas as pd

resumes_df = pd.read_csv("../data/raw/resumes.csv")
jobs_df = pd.read_csv("../data/raw/jobs.csv")

In [2]:
jobs_df["Title"] = jobs_df["Title"].fillna("Unknown Title")

In [3]:
print(jobs_df.isnull().sum())

JobID                0
Title                0
ExperienceLevel      0
YearsOfExperience    0
Skills               0
Responsibilities     0
Keywords             0
dtype: int64


In [4]:
jobs_df["job_text"] = (
    jobs_df["Title"] + " " +
    jobs_df["Skills"] + " " +
    jobs_df["Responsibilities"] + " " +
    jobs_df["Keywords"]
)


In [5]:
print(jobs_df["job_text"][0])

.NET Developer C#; VB.NET basics; .NET Framework; .NET Core fundamentals; ASP.NET; MVC; HTML; CSS; JavaScript basics; SQL Server; Entity Framework basics; LINQ; Visual Studio; Git; Unit Testing basics Assist in coding and debugging applications; Learn and apply .NET Framework and Core fundamentals; Support team in building ASP.NET MVC web applications; Write basic SQL queries and work with Entity Framework; Collaborate with peers to solve issues; Participate in code reviews for learning; Follow best practices in coding; Work with version control (Git) .NET; C#; ASP.NET MVC; Entity Framework; SQL Server; LINQ; Visual Studio; Unit Testing


In [6]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

def clean_resume(text):
    text = text.lower()
    text = re.sub(r"\S+@\S+", " ", text)          # emails
    text = re.sub(r"\+?\d[\d -]{8,}\d", " ", text) # phone numbers
    text = re.sub(r"http\S+", " ", text)          # urls
    text = re.sub(r"\s+", " ", text)
    
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(tokens)

def clean_job(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(tokens)


In [7]:
resumes_df["clean_text"] = resumes_df["Resume"].apply(clean_resume)
jobs_df["clean_text"] = jobs_df["job_text"].apply(clean_job)

In [8]:
print(resumes_df.iloc[0]["clean_text"][:500])
print(jobs_df.iloc[0]["clean_text"][:500])


skill programming language python panda numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm naã¯ve baye knn random forest decision tree boost technique cluster analysis word embed sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural net database visualization mysql sqlserver cassandra hbase elasticsearch d3.js dc.js plotly kibana matplotlib ggplot tableau regular expression html css angular 6 logstas
.net developer c vb.net basic .net framework .net core fundamental asp.net mvc html css javascript basic sql server entity framework basic linq visual studio git unit test basic assist cod debug application learn apply .net framework core fundamental support team build asp.net mvc web application write basic sql query work entity framework collaborate peer solve issue participate code review learning follow good practice coding work version control git .net c asp.net mvc entity framework sql se

In [None]:
import os

os.makedirs("../data/processed/", exist_ok=True)

resumes_df[["Category", "clean_text"]].to_csv(
    "../data/processed/resumes_clean.csv", index=False
)

jobs_df[["Title", "clean_text"]].to_csv(
    "../data/processed/jobs_clean.csv", index=False
)
