# This is the part where the data handling pipeline is created.
- Here the data will be converted into JSON format for spaCy training.
- Pandas will be used to clean and prepare the data.
- The data will then be passed through a pretrained spaCy model, and the NER model will be created.

## Step 1. Importing Libraries/Frameworks
- Format the datasets into one common format (JSON).
- Save them in DocBin format.
- Then feed them to an NER model (spaCy).

In [3]:
import json 
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import chardet
import ftfy
from functools import reduce
import re

### Create a new directory to save the prepared data.

In [4]:
# Folder (relative to the current working directory) that receives every
# cleaned dataset written by the cells below.
out_dir = Path("NER_ready_data")
# exist_ok keeps the cell re-runnable once the folder already exists.
out_dir.mkdir(exist_ok=True, parents=True)

## Step 2. Handling Datasets

### 2.1 Preparing CSV data.

#### Loading datasets

- Since each of them has a unique structure and naming, I will open them one by one.

#### Global Functions to make the data more consistent.

In [39]:
def lowercase(data):
    """Load a CSV file and return it with every cell lowercased as text.

    Args:
        data: Path (or any other source accepted by ``pd.read_csv``) of a
            UTF-8 encoded CSV file.

    Returns:
        A DataFrame in which every non-missing cell has been converted to
        ``str`` and lowercased. Missing cells stay missing.
    """
    df = pd.read_csv(data, encoding="utf-8")
    for column in df.columns:
        # astype(str) would turn a missing cell into the literal text "nan",
        # which then looks like real content downstream; remember where the
        # gaps were and restore them after lowercasing.
        missing = df[column].isna()
        df[column] = df[column].astype(str).str.lower().mask(missing)
    return df

def text_standardizer(data):
    """Replace degree abbreviations in *data* with one canonical spelling.

    The previous body used lists as dictionary keys, which raises
    ``TypeError`` (lists are unhashable), and never touched *data*.

    Args:
        data: A string, a pandas Series, or a pandas DataFrame. Non-string
            values (including missing cells) are passed through unchanged.

    Returns:
        The same kind of object as *data*, with every whole-word alias
        replaced by its canonical label. Matching is case-sensitive; the
        aliases are lowercase.
    """
    # Aliases grouped by canonical label. The empty-string alias from the
    # earlier draft is dropped: it would match at every position.
    bachelors = ["bs", "bsc"]
    masters = []  # TODO: no aliases for "master's" have been listed yet.
    mapping = {
        alias: canonical
        for aliases, canonical in ((bachelors, "bachelor's"), (masters, "master's"))
        for alias in aliases
    }
    if not mapping:
        return data

    # Longest alias first, so a longer alias is never shadowed by a shorter
    # one that happens to be its prefix.
    alternatives = "|".join(
        re.escape(alias) for alias in sorted(mapping, key=len, reverse=True)
    )
    pattern = re.compile(rf"\b(?:{alternatives})\b")

    def _standardize(value):
        if not isinstance(value, str):
            return value
        return pattern.sub(lambda found: mapping[found.group(0)], value)

    if isinstance(data, pd.DataFrame):
        return data.apply(lambda column: column.map(_standardize))
    if isinstance(data, pd.Series):
        return data.map(_standardize)
    return _standardize(data)

# Lowercase every prepared dataset found in the output folder.
path_of_clean_data = os.path.join(os.getcwd(), "NER_ready_data")
print(len(os.listdir(path_of_clean_data)))
# Walk the dataset<N>.csv files that actually exist instead of deriving their
# names from the raw entry count: any extra file in the folder, or a skipped
# number, would otherwise point at a CSV that is not there.
# The lowercased frames are kept (keyed by file name) rather than discarded.
lowercased_datasets = {
    csv_path.name: lowercase(csv_path)
    for csv_path in sorted(Path(path_of_clean_data).glob("dataset*.csv"))
}

5


#### Dataset 1.

In [4]:
# Dataset 1: keep only the IT-related categories and normalise column names.
data_path = os.path.join(os.getcwd(), "ahmedheakl_resume_atlas.csv")
it_jobs_list = [
    "Blockchain",
    "Data Science",
    "Database",
    "DevOps",
    "DotNet Developer",
    "ETL Developer",
    "Information Technology",
    "Java Developer",
    "Network Security Engineer",
    "Python Developer",
    "React Developer",
    "SAP Developer",
    "SQL Developer",
    "Web Designing",
]
df = (
    pd.read_csv(data_path, index_col='Unnamed: 0')
    .loc[lambda frame: frame['Category'].isin(it_jobs_list)]
    .reset_index(drop=True)
    .rename(columns={"Category": "category", "Text": "text"})
)
# Author's note: this dataset contains no empty values, so no further
# cleaning is done before spaCy training.
file_path = out_dir / "dataset1.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

#### Dataset 2.

In [None]:
# Dataset 2: load the raw CSV from the working directory; the unnamed first
# column of the file is used as the index.
data_path = os.path.join(os.getcwd(), "InferencePrince555_Resume_data.csv")
df = pd.read_csv(
    data_path,
    index_col='Unnamed: 0',
)

# Leading run of capital letters/whitespace = job title; the rest = CV body.
# DOTALL lets the body span line breaks; without it "(.*)" stops at the first
# newline and the remainder of a multi-line resume is silently dropped.
_TITLE_AND_BODY = re.compile(r'(^[A-Z\s]+)\s(.*)', re.DOTALL)


def splitter(row):
    """Split a raw resume string into ``(title, cv_text)``.

    Args:
        row: The raw cell value. Missing values are treated as an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        A ``(title, cv)`` tuple of stripped strings when the text starts with
        an upper-case title, otherwise ``(None, stripped_text)``.
    """
    if not isinstance(row, str):
        row = '' if pd.isna(row) else str(row)
    match = _TITLE_AND_BODY.match(row)
    if match is None:
        return None, row.strip()
    return match.group(1).strip(), match.group(2).strip()

# Pick the resume column: prefer 'Resume_test', then 'Resume', otherwise fall
# back to the first column of the frame.
source_col = next(
    (name for name in ('Resume_test', 'Resume') if name in df.columns),
    df.columns[0],
)
series = df[source_col].fillna('').astype(str)

# Split every resume once and build the two-column frame in a single step.
# Wrapping each result in its own pd.Series inside apply() allocates one
# Series per row, which is very slow on a large resume dump.
df = pd.DataFrame(
    [splitter(resume) for resume in series],
    index=series.index,
    columns=["category", "text"],
)
# Raw upper-case title (as produced by splitter) -> clean display name.
# Only the titles listed here are treated as IT jobs and kept.
mapped_vals = {
    "SOFTWARE DEVELOPER": "Software Developer",
    "TECHNOLOGY PROJECT AND PRODUCT MANAGER": "Technology Project and Product Manager",
    "LEAD SENIOR SAP AUDITOR": "Lead Senior SAP Auditor",
    "IT COMPLIANCE AUDITOR": "IT Compliance Auditor",
    "I T SUPPORT TECHNICIAN SPECIALIST": "IT Support Technician / Specialist",
    "SENIOR NETWORK SECURITY ENGINEER": "Senior Network Security Engineer",
    "SOFTWARE ENGINEER": "Software Engineer",
    "GAME DESIGN INTERN": "Game Design Intern",
    "IT MANAGER": "IT Manager",
    "SOFTWARE SUPPORT SPECIALIST": "Software Support Specialist",
    "DATASTAGE ETL DEVELOPER": "DataStage ETL Developer",
    "SENIOR ARCHITECT MDM": "Senior Architect MDM",
    "DATA ANALYST": "Data Analyst",
    "SOFTWARE ENGINEERING CO OP": "Software Engineering Co-op",
    "JAVA INTERN": "Java Intern",
    "OPERATIONS TECHNICIAN": "Operations Technician",
    "SR NETWORK ENGINEER": "Senior Network Engineer",
    "IT SUPPORT OFFICER": "IT Support Officer",
    "TEAM LEAD SENIOR ANALYST": "Team Lead Senior Analyst",
    "REGIONAL IT MANAGER": "Regional IT Manager",
    "QA TEST ANALYST": "QA Test Analyst",
    "SOFTWARE QUALITY ASSURANCE ANALYST II": "Software Quality Assurance Analyst II",
    "IT": "IT",
    "QA QC MANAGER": "QA/QC Manager",
    "IT CONSULTANT": "IT Consultant",
    "WEB DEVELOPER": "Web Developer",
    "DATABASE ADMINISTRATOR DATABASE": "Database Administrator",
    "SENIOR JAVA DEVELOPER SENIOR": "Senior Java Developer",
    "ITDIGITAL SOLUTIONS PROJECT MANAGER": "IT Digital Solutions Project Manager",
    "PYTHON DEVELOPER": "Python Developer",
    "SAP SECURITY SPECIALIST GBS SAP": "SAP Security Specialist",
    "UI": "UI",
    "SYSYTEM ADMINISTRATOR SYSYTEM": "Systems Administrator",
    "IT SERVICE DESK TECHNICIAN IT SERVICE DESK TECHNICIAN IT SERVICE DESK TECHNICIAN FISHER INVESTMENTS": "IT Service Desk Technician",
    "SYSTEMS ENGINEER": "Systems Engineer",
    "SR SOFTWARE DEVELOPER EMPOWER SOFTWARE SR": "Senior Software Developer",
    "WEB DEVLOPER INTERN WEB DEVLOPER INTERN": "Web Developer Intern",
    "CYBER SECURITY ANALYST CYBER": "Cyber Security Analyst",
    "INFORMATION SECURITY ENGINEER INFORMATION": "Information Security Engineer",
    "JAVA WEB SERVICESSOA DEVELOPER": "Java Web Services SOA Developer",
    "JAVA DEVELOPER": "Java Developer",
    "PYTHON AUTOMATION ENGINEER": "Python Automation Engineer",
    "SAP TEACHING ASSISTANT SAP TEACHING ASSISTANT SAP TEACHING ASSISTANT": "SAP Teaching Assistant",
    "SR SOFTWARE DEVELOPER SR SOFTWARE": "Senior Software Developer",
    "FRONT END DEVELOPER UX DESIGNER": "Front End Developer / UX Designer",
    "FULL STACK JAVA DEVELOPER FULL STACK": "Full Stack Java Developer",
    "SYSTEMS ADMINISTRATOR": "Systems Administrator",
    "UX DESIGNER FRONT END DEVELOPER UX DESIGNER": "UX Designer / Front End Developer",
    "INFORMATION SECURITY ANALYST INFORMATION": "Information Security Analyst",
    "JUNIOR SYSTEMS ADMINISTRATOR JUNIOR": "Junior Systems Administrator",
    "NETWORK ENGINEER": "Network Engineer",
    "TACTICAL DATA SYSTEMS ADMINISTRATOR TACTICAL DATA": "Tactical Data Systems Administrator",
    "FRONT END DEVELOPER": "Front End Developer",
    "IT SECURITY ENGINEER": "IT Security Engineer",
    "ITOFFICE SUPPORT SPECIALIST OVATION TECHNOLOGY GROUP": "IT Office Support Specialist",
    "SENIOR APPLICATIONS DEVELOPER SENIOR APPLICATIONS": "Senior Applications Developer",
    "IT CONSULTANT DEVELOPER": "IT Consultant / Developer",
    "SENIOR SYSTEMS ADMINISTRATOR SENIOR SYSTEMS": "Senior Systems Administrator",
    "PRODUCT SUPPORT ENGINEER PRODUCT SUPPORT ENGINEER PRODUCT SUPPORT ENGINER": "Product Support Engineer",
    "IT PROJECT MANAGER": "IT Project Manager",
    "DATABASE ADMINISTRATOR": "Database Administrator",
    "SR IT PROJECT MANAGER SCRUM MASTER SR": "Senior IT Project Manager / Scrum Master",
    "IT PROJECT ANALYST": "IT Project Analyst",
    "IOS": "iOS Developer",
    "FREELANCE WEB DEVELOPER FREELANCE": "Freelance Web Developer",
    "ASSOCIATE NETWORK ADMINISTRATOR ASSOCIATE NETWORK": "Associate Network Administrator",
    "WEB MANAGER": "Web Manager",
    "RESEARCH DATABASE ADMINISTRATOR RESEARCH": "Research Database Administrator",
    "IT SECURITY ANALYST": "IT Security Analyst",
    "SENIOR WEB DEVELOPER SENIOR WEB": "Senior Web Developer",
    "IT MANAGER IT MANAGER WEB DEVELOPER": "IT Manager / Web Developer",
    "SCRUM MASTER AGILE SCRUM MASTER AGILE SCRUM MASTER AGILE": "Scrum Master (Agile)",
    "SYSTEMS ANALYST": "Systems Analyst",
    "FRONT END WEB DEVELOPER": "Front End Web Developer",
    "REMOTE SQL SERVER DATABASE ADMINISTRATOR REMOTE SQL SERVER": "Remote SQL Server Database Administrator",
    "SR PENETRATION TESTER SR PENETRATION TESTER": "Senior Penetration Tester",
    "SENIOR INFORMATION TECHNOLOGY IT PROJECT MANAGER SENIOR INFORMATION TECHNOLOGY": "Senior IT Project Manager",
    "SENIOR SOFTWARE DEVELOPER SENIOR": "Senior Software Developer",
    "IT QA": "IT QA Analyst",
    "DESKTOP ADMINISTRATOR DESKTOP": "Desktop Administrator",
    "SENIOR FRONT END WEB DEVELOPER SENIOR": "Senior Front End Web Developer",
    "STAFF UX DESIGNER STAFF UX DESIGNER UX DESIGNER MICROSOFT POWERAPPS": "Staff UX Designer",
}
# The accepted raw titles are exactly the mapping's keys; deriving the list
# keeps the two from drifting apart.
it_job_list = list(mapped_vals)

# Filter on the RAW titles first, then rename. Mapping first and filtering
# afterwards compared the cleaned names against the upper-case list, so every
# row except the unchanged "IT" and "UI" ones was dropped.
df = df[df["category"].isin(it_job_list)].copy()
df["category"] = df["category"].map(mapped_vals)

# Save cleaned dataset to NER_ready_data
file_path = out_dir / "dataset2.csv"
df.to_csv(file_path, index=False, encoding='utf-8')

#### Dataset 3. 

In [6]:
# Dataset 3: peek at the first raw bytes to check the encoding, then repair
# mis-encoded text with ftfy and normalise the column names.
p = os.path.join(os.getcwd(), "Sachinkelenjaguri_resume_Dataset.csv")
with open(p, "rb") as fh:
    head = fh.read(200)
has_nul = b"\x00" in head

print("First Bytes:", head)
print("has utf-8 BOM:", head.startswith(b"\xef\xbb\xbf"))
print("Contains NUL bytes:", has_nul)

df = pd.read_csv(
    "Sachinkelenjaguri_resume_Dataset.csv",
    encoding="utf-8",
    low_memory=False,
    index_col='Unnamed: 0',
)
# Both text columns go through ftfy.fix_text after being cast to str.
df['Category'] = df['Category'].astype(str).map(ftfy.fix_text)
df["Resume"] = df["Resume"].astype(str).map(ftfy.fix_text)
df = df.rename(columns={"Category": "category", "Resume": "text"})

data_path = out_dir / "dataset3.csv"
df.to_csv(data_path, index=False, encoding='utf-8')

First Bytes: b',Category,Resume\n0,Data Science,"Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, Na\xc3\x83\xc2\xafve Ba'
has utf-8 BOM: False
Contains NUL bytes: False


### 2.2 Handling data from the directories.

#### Dataset 4.

In [9]:
# Dataset 4: load the CSV and discard its 'Label' column.
dataset_path = os.path.join(os.getcwd(), "wahib04/multilabel-resume-dataset/versions/1/data.csv")
df = (
    pd.read_csv(dataset_path, encoding='utf-8')
    .drop(columns='Label')
    .reset_index(drop=True)
)

def splitter(row):
    """Split a resume string at its first hyphen into ``(title, text)``.

    Missing values are treated as an empty string and other non-string values
    are converted with ``str``. When the text contains no hyphen the title is
    ``None`` and the whole stripped text is returned as the body.
    """
    if isinstance(row, str):
        text = row
    elif pd.isna(row):
        text = ''
    else:
        text = str(row)

    head, hyphen, tail = text.partition('-')
    if not hyphen:
        return None, text.strip()
    return head.strip(), tail.strip()
series = df["Resume"]
# Split every resume once and build the two-column frame in a single step.
# Wrapping each result in its own pd.Series inside apply() allocates one
# Series per row, which is very slow on a large dataset.
df = pd.DataFrame(
    [splitter(resume) for resume in series],
    index=series.index,
    columns=["category", "text"],
)

data_path = out_dir / "dataset4.csv"
df.to_csv(data_path, index=False, encoding="utf-8")

#### Dataset 5

In [11]:
# Dataset 5 is a folder of six CSV tables; list its contents, then load them.
dataset_path = os.path.join(os.getcwd(), "suriyaganesh/resume-dataset-structured/versions/2")
print(os.listdir(dataset_path))

# Resolve all six file paths first, in table order...
df1_path, df2_path, df3_path, df4_path, df5_path, df6_path = (
    os.path.join(dataset_path, file_name)
    for file_name in (
        "01_people.csv",
        "02_abilities.csv",
        "03_education.csv",
        "04_experience.csv",
        "05_person_skills.csv",
        "06_skills.csv",
    )
)

# ...then read them in that same order.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (df1_path, df2_path, df3_path, df4_path, df5_path, df6_path)
)

def _join_unique(values):
    """Return the distinct non-null entries of *values*, sorted and comma-joined."""
    return ', '.join(sorted(set(values.dropna().astype(str))))


# People joined with their abilities, with the contact columns removed.
# (merged_1 is not referenced again in the visible part of the notebook.)
merged_1 = (
    df1.merge(df2, on="person_id", how="inner")
    .drop(columns=["email", "phone", "linkedin"])
    .reset_index(drop=True)
)

# Education, experience and person-skills chained together on person_id.
rest_dfs = [df3, df4, df5]
merged_2 = (
    reduce(lambda left, right: pd.merge(left, right, on="person_id", how="inner"), rest_dfs)
    .drop(columns="location_x")
    .reset_index(drop=True)
)

merged_3 = merged_2.merge(df6, on="skill", how="inner")

# A missing program is replaced by a sentinel text.
merged_3["program"] = merged_3["program"].fillna("Not attended to University")

# Collapse to one row per person; each column becomes a de-duplicated list.
merged_reduced = (
    merged_3.groupby(["person_id"])
    .agg({
        "program": _join_unique,
        "title": _join_unique,
        "firm": _join_unique,
        "skill": _join_unique,
    })
    .reset_index()
)

# Render one resume sentence per candidate. The wording depends on whether
# the aggregated program column is exactly the "no university" sentinel.
attended_university = merged_reduced["program"] != "Not attended to University"
candidate_intro = "Candidate " + merged_reduced["person_id"].astype(str)
work_history = (
    merged_reduced["title"]
    + ", at the following companies: " + merged_reduced["firm"]
    + ", has skills: " + merged_reduced["skill"]
)
merged_reduced["resume"] = np.where(
    attended_university,
    candidate_intro
    + ", has completed " + merged_reduced["program"]
    + ", and worked in the following positions: " + work_history,
    candidate_intro
    + ", has not attended university, and worked in the following positions: "
    + work_history,
)

# rename() returns a new frame, so its result has to be kept. Previously it
# was discarded and the CSV was written with the "title"/"resume" headers
# instead of the "category"/"text" headers used by the other datasets.
final_set = merged_reduced[["title", "resume"]].rename(
    columns={"title": "category", "resume": "text"}
)

final_path = out_dir / "dataset5.csv"
final_set.to_csv(final_path, index=False, encoding="utf-8")

['05_person_skills.csv', '03_education.csv', '06_skills.csv', '04_experience.csv', '02_abilities.csv', '01_people.csv']
