In [1]:
from sklearn.model_selection import train_test_split

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [3]:
# Load the annotated CV dataset. The encoding is pinned to UTF-8 so that
# parsing does not depend on the platform's default locale encoding.
with open('../../datasets/dataset.json', 'r', encoding='utf-8') as file:
    cv_data = json.load(file)

In [4]:
# Number of entries in the loaded dataset.
len(cv_data)

1014

In [None]:
# Peek at the first entry to check its structure.
cv_data[0]

In [6]:
# Expand ../../config/base-config.cfg into a fully populated training config,
# written to ../../config/config.cfg.
!python -m spacy init fill-config ../../config/base-config.cfg ../../config/config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../../config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:

# 70/30 train/evaluation split. The seed is fixed so that re-running the
# notebook reproduces the same partition (and therefore comparable scores).
train, test = train_test_split(cv_data, test_size=0.3, random_state=42)

In [8]:
# Sanity check: sizes of the two partitions.
len(train), len(test)

(709, 305)

In [10]:
def get_spacy_doc(file, data):
    """Convert character-offset NER annotations into a spaCy ``DocBin``.

    Args:
        file: Writable text stream used as an error log. One line is written
            for every annotation whose ``start >= end`` and for every span
            whose creation raises ``ValueError``.
        data: Iterable of ``(text, annot)`` pairs, where ``annot`` is a
            mapping whose optional ``'entities'`` value is a sequence of
            ``(start, end, label)`` character-offset triples.

    Returns:
        A ``DocBin`` containing one ``Doc`` per input pair, with ``doc.ents``
        set to the accepted entity spans.

    An annotation is dropped when its offsets are empty/inverted, when it
    shares a character with an already *accepted* entity of the same text,
    when ``char_span`` raises ``ValueError``, or when ``char_span`` returns
    ``None`` (called here with ``alignment_mode='strict'``).
    """
    nlp = spacy.blank("en")
    db = DocBin()

    for text, annot in tqdm(data, desc="Processing data"):
        doc = nlp.make_doc(text)
        ents = []
        # Character offsets already covered by an accepted entity of this doc.
        claimed_chars = set()

        for start, end, label in annot.get('entities', []):
            # Reject empty or inverted offsets and record them in the log.
            if start >= end:
                file.write(f"Invalid indices: start={start}, end={end}, text={text}\n")
                continue

            # Skip annotations that overlap an entity we already accepted.
            char_range = range(start, end)
            if not claimed_chars.isdisjoint(char_range):
                continue

            try:
                span = doc.char_span(start, end, label=label, alignment_mode='strict')
            except ValueError as e:
                # Handle span creation errors
                file.write(f"Error creating span: {e}\n")
                continue

            if span is None:
                # No usable span for these offsets; nothing is reserved, so a
                # later annotation covering the same characters can still be
                # accepted.
                continue

            # Reserve the characters only once the span is actually kept.
            # Reserving them earlier let a discarded annotation block valid
            # ones that happened to overlap it.
            claimed_chars.update(char_range)
            ents.append(span)

        # Set entity spans for the document
        doc.ents = ents
        db.add(doc)

    return db

In [13]:
# Conversion problems are written to a dedicated log file. get_spacy_doc()
# calls file.write() on bad annotations, which fails on a read-only handle,
# so the log is opened for writing -- and it is a separate file so that the
# dataset itself is never opened in a mode that could truncate it.
with open('../../datasets/spacy_conversion_errors.log', 'w', encoding='utf-8') as file:

    db = get_spacy_doc(file, train)
    db.to_disk('../../datasets/train.spacy')

    db = get_spacy_doc(file, test)
    db.to_disk('../../datasets/test.spacy')

Processing data: 100%|██████████| 709/709 [00:01<00:00, 367.31it/s]
Processing data: 100%|██████████| 305/305 [00:00<00:00, 361.40it/s]


In [15]:
# Train the pipeline from the filled config: train.spacy is the training
# corpus, test.spacy is passed as the dev corpus, and results are saved under
# ../../datasets/output.
# NOTE(review): the dev corpus here is the only held-out split, so the scores
# printed during training are presumably also what selects model-best --
# confirm whether a separate final test set is wanted.
!python -m spacy train ../../config/config.cfg --output ../../datasets/output --paths.train ../../datasets/train.spacy --paths.dev ../../datasets/test.spacy

[38;5;2m✔ Created output directory: ../../datasets/output[0m
[38;5;4mℹ Saving to output directory: ../../datasets/output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    136.90    0.00    0.00    0.00    0.00
  0     200       6729.96  15860.79   36.26   50.68   28.22    0.36
  0     400       1077.62   6718.59   52.14   51.39   52.91    0.52
  0     600       8557.33   5948.50   60.65   65.14   56.73    0.61
  1     800       3567.67   5417.40   66.10   67.08   65.16    0.66
  1    1000      10452.46   4871.54   69.03   73.06   65.43    0.69
  1    1200       3794.29   4272

In [29]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's extracted text, joined with no
        separator. A page whose extraction yields nothing contributes an
        empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join from failing if a page's extraction yields
        # None instead of a string.
        return "".join(page.extract_text() or "" for page in reader.pages)

# Example usage: extract the raw text of one sample CV. `extracted_text` is
# the input to the trained pipeline in the inference cell further down.
pdf_path = "../../datasets/test/Zia-ur-rehman.pdf"
extracted_text = extract_text_from_pdf(pdf_path)

In [30]:
# Inspect the raw extracted text before running the model on it.
extracted_text

"Objective\nTechnichal\nExpertise\nProjects\n10/2022  — 01/2023 Work Experience\n08/2020  — 08/2024 Education\nCertificationsEnthusiastic CS student with a passion for web development and a strong foundation in javascript. Seeking opportunities to apply\nmy coding skills, creativity , and problem-solving abilities to contribute to dynamic projects and enhance user experiences. Eager to\ncollaborate with forward-thinking teams and continue to expand my expertise in the world of technology .\nProblem Solving\nProject Management\nJavascript Developer\nReact js\nNode js\nDatabase (MongoDB)\nVersion Control (git, GitHub)\nAdelite (MERN) advertising website to list products around globe\nBlogger (MERN) Blog community \nCoin Bounce App (MERN)\nE-commerce Store (MERN)\nQuiz App (React)\nTennis Game (React)\nWeb Developer Intern\nWorked as web developer intern at smart set. There I worked with senior developer and explore new technologies and enhanced\nmy logical building and development skills

In [31]:
# Load the best checkpoint written by the training run and apply it to the
# text extracted from the sample CV.
nlp = spacy.load("../../datasets/output/model-best")
doc = nlp(extracted_text)

# One output line per predicted entity: its text, an arrow, then its label.
for ent in doc.ents:
    print(ent.text, "  ->>>>>>>  ", ent.label_)

CertificationsEnthusiastic CS student with a passion for web development and a strong foundation in javascript. Seeking opportunities to apply
my coding skills, creativity , and problem-solving abilities to contribute to dynamic projects and enhance user experiences. Eager to
collaborate with forward-thinking teams and continue to expand my expertise in the world of technology .
Problem Solving   ->>>>>>>   CERTIFICATION
Project Management   ->>>>>>>   SKILLS
Javascript Developer   ->>>>>>>   SKILLS
React js   ->>>>>>>   SKILLS
Node js   ->>>>>>>   SKILLS
Database (MongoDB)   ->>>>>>>   SKILLS
E-commerce Store (MERN)   ->>>>>>>   SKILLS
Quiz App (React)   ->>>>>>>   SKILLS
Web Developer Intern   ->>>>>>>   WORKED AS
Frontend Development (PFTP)   ->>>>>>>   WORKED AS
Programming Competition (Softec '23)   ->>>>>>>   CERTIFICATION
Python Programming (Hacker Rank) 03174557957   ->>>>>>>   CERTIFICATION
Bs Computer Science   ->>>>>>>   COMPANIES WORKED AT
