# This notebook builds the data-handling pipeline.
- The data will be converted into JSON format for spaCy training.
- Pandas will be used to clean and prepare the data.
- The prepared data will then be passed through a pretrained spaCy model to build the NER model.

## Step 1. Importing Libraries/Frameworks
- Convert all datasets into one common format (JSON).
- Save the result in DocBin format.
- Then feed it to an NER model (spaCy).

In [13]:
import json 
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import chardet
import ftfy
from functools import reduce
import re

### Create a directory to save the prepared data.

In [14]:
# Folder that receives every cleaned dataset; it is created on demand and
# an already existing folder is accepted as-is.
out_dir = Path("NER_ready_data")
os.makedirs(out_dir, exist_ok=True)

## Step 2. Handling Datasets

### 2.1 Preparing CSV data.

#### Loading datasets

- Since each dataset has its own structure and naming, I will open them one by one.

#### Global Functions to make the data more consistent.

In [8]:
def lowercase(data):
    """Return a copy of a dataset with every value lower-cased.

    Each column is converted to text and lower-cased. Missing cells stay
    missing instead of being turned into the literal string ``"nan"``.

    Args:
        data: Path of a UTF-8 encoded CSV file, or an already loaded
            ``pandas.DataFrame`` (which is copied, never modified).

    Returns:
        A ``pandas.DataFrame`` with the same columns as the input.
    """
    if isinstance(data, pd.DataFrame):
        df = data.copy()
    else:
        df = pd.read_csv(data, encoding="utf-8")
    for column in df.columns:
        # Remember the gaps first: astype(str) would render NaN as "nan".
        present = df[column].notna()
        lowered = df[column].astype(str).str.lower()
        df[column] = lowered.where(present)
    return df

def text_standardizer(data):
    """Replace degree abbreviations with one canonical spelling.

    Whole-word occurrences of the known variants (currently ``bs`` and
    ``bsc``) are rewritten to ``bachelor's``. Matching is case-sensitive and
    uses lower-case variants.

    Args:
        data: A string, a ``pandas.Series`` or a ``pandas.DataFrame``.
            Non-string cells (numbers, NaN, None) are left untouched.

    Returns:
        An object of the same kind as *data* with the variants replaced.

    Raises:
        TypeError: If *data* is not a str, Series or DataFrame.
    """
    bachelors = ["bs", "bsc"]
    masters = []  # TODO: no master's-degree variants have been listed yet.

    # Lists are unhashable and cannot be dict keys, so the groups are
    # flattened into a variant -> canonical lookup. Empty variants are
    # skipped because an empty pattern would match at every word boundary.
    groups = ((bachelors, "bachelor's"), (masters, "master's"))
    mapping = {
        variant: canonical
        for variants, canonical in groups
        for variant in variants
        if variant
    }
    if not mapping:
        return data

    # Longest variants first so "bsc" is tried before its prefix "bs".
    alternatives = "|".join(
        re.escape(variant) for variant in sorted(mapping, key=len, reverse=True)
    )
    pattern = re.compile(r"\b(?:" + alternatives + r")\b")

    def _standardize(value):
        if not isinstance(value, str):
            return value
        return pattern.sub(lambda match: mapping[match.group(0)], value)

    if isinstance(data, str):
        return _standardize(data)
    if isinstance(data, pd.DataFrame):
        return data.apply(lambda column: column.map(_standardize))
    if isinstance(data, pd.Series):
        return data.map(_standardize)
    raise TypeError(
        f"text_standardizer expects str, Series or DataFrame, got {type(data).__name__}"
    )

# Lower-case every prepared dataset found in the output folder.
path_of_clean_data = os.path.join(os.getcwd(), "NER_ready_data")
print(len(os.listdir(path_of_clean_data)))

# Walk the files that actually exist instead of guessing names from the
# entry count: an extra entry or a gap in the numbering would otherwise
# point at a missing file. The results are kept so they can be used later.
lowercased_datasets = {}
for file_name in sorted(os.listdir(path_of_clean_data)):
    if re.fullmatch(r"dataset\d+\.csv", file_name) is None:
        continue
    lowercased_datasets[file_name] = lowercase(
        os.path.join(path_of_clean_data, file_name)
    )

5
           category                                               text
0        blockchain  ukrainian development western management middl...
1        blockchain  mobilunity ukrainian development western manag...
2        blockchain  ron phoenix blockchain developer 15491295157 d...
3        blockchain  first last blockchain developer burlington ver...
4        blockchain  ron phoenix blockchain developer 15491295157 d...
...             ...                                                ...
3843  web designing  jessica claire montgomery street san francisco...
3844  web designing  jessica claire montgomery street san francisco...
3845  web designing  summary jessica claire 100 montgomery st 10th ...
3846  web designing  jessica claire montgomery street san francisco...
3847  web designing  websites portfolios profiles professional summ...

[3848 rows x 2 columns]
                                             Resume_test
0      accountant professional summary results orient...
1      

#### Dataset 1.

In [4]:
# Dataset 1: keep only the IT-related categories and store them as dataset1.csv.
data_path = os.path.join(os.getcwd(), "ahmedheakl_resume_atlas.csv")
it_jobs_list = [
    "Blockchain",
    "Data Science",
    "Database",
    "DevOps",
    "DotNet Developer",
    "ETL Developer",
    "Information Technology",
    "Java Developer",
    "Network Security Engineer",
    "Python Developer",
    "React Developer",
    "SAP Developer",
    "SQL Developer",
    "Web Designing",
]

df = pd.read_csv(data_path, index_col="Unnamed: 0")
# Exploratory look at the available categories; the result is not stored.
df["Category"].unique()

is_it_job = df["Category"].isin(it_jobs_list)
df = (
    df[is_it_job]
    .reset_index(drop=True)
    .rename(columns={"Category": "category", "Text": "text"})
)

# Author's note: this dataset contains no empty values, so no further
# cleaning is needed before spaCy training.
file_path = out_dir / "dataset1.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

#### Dataset 2.

In [None]:
# Dataset 2: load the raw CSV, using its "Unnamed: 0" column as the index.
data_path = os.path.join(
    os.getcwd(),
    "InferencePrince555_Resume_data.csv",
)
df = pd.read_csv(data_path, index_col="Unnamed: 0")

# A leading run of upper-case letters/whitespace (the job title), one
# whitespace separator, then the remaining text. DOTALL lets the remainder
# span line breaks; without it everything after the first newline of the
# body would be dropped.
_TITLE_BODY_RE = re.compile(r'(^[A-Z\s]+)\s(.*)', re.DOTALL)


def splitter(row):
    """Split a raw resume string into ``(title, body)``.

    The title is the leading block of upper-case words; the body is
    everything after it, including any further lines.

    Args:
        row: The raw cell value. Missing values (None/NaN) are treated as an
            empty string; any other non-string is converted with ``str``.

    Returns:
        A ``(title, body)`` tuple of stripped strings, or ``(None, text)``
        when the text does not start with an upper-case title.
    """
    # Ensure we operate on a string (handles NaN/None).
    if not isinstance(row, str):
        row = '' if pd.isna(row) else str(row)

    match = _TITLE_BODY_RE.match(row)
    if match is None:
        return None, row.strip()
    title, cv = match.groups()
    return title.strip(), cv.strip()

# Pick the column holding the raw resume text: prefer "Resume_test", then
# "Resume", otherwise fall back to the first column of the frame.
source_col = next(
    (name for name in ('Resume_test', 'Resume') if name in df.columns),
    df.columns[0],
)
# Missing entries become empty strings before splitting.
series = df[source_col].fillna('').astype(str)
# splitter() yields a (title, body) pair per row; expand the pairs into
# the two new columns.
df[["category", "text"]] = series.map(splitter).apply(pd.Series)

# Quick preview (only rendered when it is the last expression of a cell).
df.head()
df = df[["category", "text"]]
# Inspect the extracted titles (result is not stored).
df["category"].unique().tolist()
it_job_list = ['SOFTWARE DEVELOPER', 'TECHNOLOGY PROJECT AND PRODUCT MANAGER']
# Saving the cleaned dataset to NER_ready_data is currently disabled:
# file_path = out_dir / "dataset2.csv"
# df.to_csv(file_path, index=False, encoding='utf-8')

['ACCOUNTANT',
 'STAFF ACCOUNTANT',
 'SENIOR ACCOUNTANT',
 'FINANCIAL ACCOUNTANT',
 'CORPORATE ACCOUNTANT',
 'ACCOUNTANT II',
 'PROJECT ACCOUNTANT',
 'CONTRACT ACCOUNTANT',
 'ACCOUNTANT I',
 'SUPERVISOR ACCOUNTANT',
 'ACCOUNTANT III',
 'GENERAL ACCOUNTANT',
 'VOLUNTEER ACCOUNTANT',
 'INVESTMENT ACCOUNTANT',
 'PAYROLL ACCOUNTANT',
 'PRINCIPAL ACCOUNTANT',
 'ASSISTANT ACCOUNTANT',
 'STAFF ACCOUNTANT TAM BUI',
 'LEAD ACCOUNTANT',
 'ACCOUNTANT HELPER',
 None,
 'BILLING ACCOUNTANT',
 'ASSOCIATE CLAIM PROCESSOR MEMBER SERVICE ADVOCATE',
 'SKILLS',
 'PERSONAL BANKER SAFE',
 'QUALIFICATION',
 'CLIENT ADVOCATE ESCALATION SPECIALIST',
 'CUSTOMER SERVICE ADVOCATE',
 'SERVICE ADVOCATE IV CLINICAL SUPPORT',
 'CASE MANAGER OUTREACH ADVOCATE',
 'CUSTOMER CARE ADVOCATE',
 'SENIOR CUSTOMER SERVICE ADVOCATE',
 'FAMILY AND COMMUNITY ADVOCATE',
 'BUSINESS ADVOCATE BANKER',
 'PATIENT HEALTH ADVOCATE',
 'DOMESTIC VIOLENCE COUNSELOR ADVOCATE',
 'CHILD PERFORMER WELFARE ADVOCATE',
 'REVENUE CYCLE ADVOCATE',
 

#### Dataset 3. 

In [6]:
# Dataset 3: inspect the raw bytes, repair garbled text with ftfy and store
# the result as dataset3.csv.
p = os.path.join(os.getcwd(), "Sachinkelenjaguri_resume_Dataset.csv")
# Peek at the first 200 bytes to look for a UTF-8 BOM or NUL bytes.
with open(p, "rb") as fh:
    head = fh.read(200)
print("First Bytes:", head)
print("has utf-8 BOM:", head.startswith(b"\xef\xbb\xbf"))

has_nul = b"\x00" in head
print("Contains NUL bytes:", has_nul)

# Parse the very file that was inspected above rather than a second,
# separately spelled relative path.
df = pd.read_csv(p, encoding="utf-8", low_memory=False, index_col='Unnamed: 0')
for column in ("Category", "Resume"):
    # Remember the gaps first: astype(str) would turn NaN into the literal
    # text "nan", which would then be written to the output file.
    present = df[column].notna()
    repaired = df[column].astype(str).apply(ftfy.fix_text)
    df[column] = repaired.where(present)

df.rename(columns={"Category": "category", "Resume": "text"}, inplace=True)

data_path = out_dir / "dataset3.csv"
df.to_csv(data_path, index=False, encoding='utf-8')

First Bytes: b',Category,Resume\n0,Data Science,"Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, Na\xc3\x83\xc2\xafve Ba'
has utf-8 BOM: False
Contains NUL bytes: False


### 2.2 Handling data from the directories.

#### Dataset 4.

In [6]:
# Dataset 4: drop the "Label" column and store the rest as dataset4.csv.
dataset_path = os.path.join(os.getcwd(), "wahib04/multilabel-resume-dataset/versions/1/data.csv")
df = (
    pd.read_csv(dataset_path, encoding='utf-8')
    .drop(columns='Label')
    .reset_index(drop=True)
)

data_path = out_dir / "dataset4.csv"
df.to_csv(data_path, index=False, encoding="utf-8")

#### Dataset 5

In [20]:
# Dataset 5: combine the structured per-person tables into one generated
# sentence per candidate and store the result as dataset5.csv.
dataset_path = os.path.join(os.getcwd(), "suriyaganesh/resume-dataset-structured/versions/2")
print(os.listdir(dataset_path))

# Build the six table paths, then load them in the same order.
table_files = (
    "01_people.csv",
    "02_abilities.csv",
    "03_education.csv",
    "04_experience.csv",
    "05_person_skills.csv",
    "06_skills.csv",
)
df1_path, df2_path, df3_path, df4_path, df5_path, df6_path = (
    os.path.join(dataset_path, file_name) for file_name in table_files
)
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(table_path)
    for table_path in (df1_path, df2_path, df3_path, df4_path, df5_path, df6_path)
)

# People joined with abilities, without the contact columns.
# NOTE(review): merged_1 is not used again in the visible code - confirm
# whether it is still needed.
merged_1 = (
    df1.merge(df2, on="person_id", how="inner")
    .drop(columns=["email", "phone", "linkedin"])
    .reset_index(drop=True)
)

# Chain inner joins over education, experience and person-skills.
rest_dfs = [df3, df4, df5]
merged_2 = rest_dfs[0]
for table in rest_dfs[1:]:
    merged_2 = pd.merge(merged_2, table, on="person_id", how="inner")
# "location_x" is presumably the suffixed duplicate of a "location" column
# present in more than one table - verify against the CSV headers.
merged_2 = merged_2.drop(columns="location_x").reset_index(drop=True)

# Keep only rows whose skill also appears in the skills table.
merged_3 = merged_2.merge(df6, on="skill", how="inner")

merged_3["program"] = merged_3["program"].fillna("Not attended to University")


def _join_unique(values):
    """Return the distinct non-null values as a sorted, comma-separated string."""
    return ', '.join(sorted(set(values.dropna().astype(str))))


# Collapse the joined rows to one row per person.
merged_reduced = (
    merged_3.groupby(["person_id"])
    .agg({column: _join_unique for column in ("program", "title", "firm", "skill")})
    .reset_index()
)

# Assemble the sentence; both variants share the same opening and ending.
attended = merged_reduced["program"] != "Not attended to University"
candidate = "Candidate " + merged_reduced["person_id"].astype(str)
career = (
    merged_reduced["title"]
    + ", at the following companies: " + merged_reduced["firm"]
    + ", has skills: " + merged_reduced["skill"]
)
merged_reduced["resume"] = np.where(
    attended,
    candidate
    + ", has completed " + merged_reduced["program"]
    + ", and worked in the following positions: " + career,
    candidate
    + ", has not attended university, and worked in the following positions: "
    + career,
)

final_set = merged_reduced[["person_id", "resume"]]

final_path = out_dir / "dataset5.csv"
final_set.to_csv(final_path, index=False, encoding="utf-8")

['05_person_skills.csv', '03_education.csv', '06_skills.csv', '04_experience.csv', '02_abilities.csv', '01_people.csv']
