# This is the part where the data handling pipeline is created.
- Here the data will be converted into JSON format for spaCy training.
- Pandas will be used for cleaning and preparing the data.
- The data will then be passed through a pretrained spaCy model, and the NER model will be created.

## Step 1. Importing Libraries/Frameworks
- Formatting of the datasets into one format (JSON).
- Save it in DocBin format.
- Then feed it to an NER model (spaCy).

In [8]:
import json 
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import chardet
import ftfy

### Create a new directory to save the prepared data.

In [14]:
# Folder that collects every cleaned dataset written by the cells below.
out_dir = Path("NER_ready_data")
# Build the folder together with any missing parents; an existing folder is reused.
Path.mkdir(out_dir, exist_ok=True, parents=True)

## Step 2. Handling Datasets

### 2.1 Preparing CSV data.

#### Loading datasets

- Since each of them has a unique structure and naming, I will open them one by one.

#### Dataset 1.

In [4]:
# Dataset 1: keep only the rows whose 'Category' is one of the IT job titles below.
it_jobs_list = [
    "Blockchain",
    "Data Science",
    "Database",
    "DevOps",
    "DotNet Developer",
    "ETL Developer",
    "Information Technology",
    "Java Developer",
    "Network Security Engineer",
    "Python Developer",
    "React Developer",
    "SAP Developer",
    "SQL Developer",
    "Web Designing",
]

# The CSV is looked up in the current working directory; the stored index
# column ('Unnamed: 0') is used as the DataFrame index.
data_path = os.path.join(os.getcwd(), "ahmedheakl_resume_atlas.csv")
df = pd.read_csv(data_path, index_col="Unnamed: 0")

# Exploratory look at the available categories; the result is not stored.
df["Category"].unique()

# Select the IT rows and renumber the survivors from 0 in a single step.
df = df[df["Category"].isin(it_jobs_list)].reset_index(drop=True)

# Author's note: this dataset contains no empty values, so no further
# cleaning is done before spaCy training.
file_path = out_dir / "dataset1.csv"
df.to_csv(file_path, encoding="utf-8", index=False)

#### Dataset 2.

In [15]:
# Dataset 2: load the CSV from the current working directory, using the
# stored index column ('Unnamed: 0') as the DataFrame index.
data_path = os.path.join(os.getcwd(), "InferencePrince555_Resume_data.csv")
df = pd.read_csv(data_path, index_col="Unnamed: 0")

# Remove the 'input' and 'instruction' columns, then renumber rows from 0.
df = df.drop(["input", "instruction"], axis="columns")
df = df.reset_index(drop=True)

# Author's note: this dataset has no empty values, so it is exported as-is.
file_path = out_dir / "dataset2.csv"
df.to_csv(file_path, encoding="utf-8", index=False)

#### Dataset 3. 

In [19]:
# Dataset 3: inspect the raw bytes, repair mis-encoded text, then export.
p = os.path.join(os.getcwd(), "Sachinkelenjaguri_resume_Dataset.csv")

# Peek at the first 200 raw bytes to spot encoding problems before parsing.
with open(p, "rb") as fh:
    head = fh.read(200)
print("First Bytes:", head)
print("has utf-8 BOM:", head.startswith(b"\xef\xbb\xbf"))

# NUL bytes in the sample would typically point to a non-UTF-8 encoding
# such as UTF-16; only the 200-byte sample is checked, not the whole file.
has_nul = b"\x00" in head
print("Contains NUL bytes:", has_nul)

# Read through the same path that was inspected above so the byte check and
# the parsed data are guaranteed to refer to one file.
df = pd.read_csv(p, encoding="utf-8", low_memory=False, index_col="Unnamed: 0")


def _repair_text(value):
    """Return *value* as text with mis-encoded characters repaired by ftfy."""
    return ftfy.fix_text(str(value))


# The byte sample shows double-encoded characters (e.g. in "Naïve"), so both
# text columns are run through ftfy. na_action="ignore" leaves missing cells
# as NaN instead of turning them into the literal string "nan", which would
# otherwise be exported as if it were real resume/category text.
df["Category"] = df["Category"].map(_repair_text, na_action="ignore")
df["Resume"] = df["Resume"].map(_repair_text, na_action="ignore")

data_path = out_dir / "dataset3.csv"
df.to_csv(data_path, index=False, encoding="utf-8")

First Bytes: b',Category,Resume\n0,Data Science,"Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, Na\xc3\x83\xc2\xafve Ba'
has utf-8 BOM: False
Contains NUL bytes: False


### 2.2 Handling data from the directories.

In [None]:
# Dataset 4: the CSV lives in a nested download folder below the current
# working directory.
# NOTE(review): the TypeError recorded under this cell involves a `-` between
# a str and a PosixPath, which this code never performs -- presumably stale
# output from an earlier version of the cell; re-run to confirm.
dataset_path = os.path.join(os.getcwd(), "wahib04/multilabel-resume-dataset/versions/1/data.csv")
df = pd.read_csv(dataset_path, encoding="utf-8")

# Discard the 'Label' column, then renumber the rows from 0.
df = df.drop("Label", axis="columns")
df = df.reset_index(drop=True)

data_path = out_dir / "dataset4.csv"
df.to_csv(data_path, encoding="utf-8", index=False)

TypeError: unsupported operand type(s) for -: 'str' and 'PosixPath'