In [7]:
import spacy
import pickle
import random

In [9]:
train_data = pickle.load(open('train_data.pkl', 'rb'))
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [18]:
import spacy
import pickle
import random
from spacy.training import Example
import warnings

# Load Training Data
train_data = pickle.load(open('train_data.pkl', 'rb'))

# Load blank English model
nlp = spacy.blank('en')

# ✅ Function to Remove Overlapping Entities
def remove_overlapping_entities(entities):
    sorted_entities = sorted(entities, key=lambda x: x[0])  # Sort by start offset
    filtered_entities = []
    last_end = -1

    for start, end, label in sorted_entities:
        if start >= last_end:  # No overlap
            filtered_entities.append((start, end, label))
            last_end = end  # Update last end position
        else:
            print(f"⚠️ Removed Overlapping Entity: {(start, end, label)}")  # Debugging

    return filtered_entities

# ✅ Function to Check Misaligned Entities
def check_alignment(nlp, text, entities):
    doc = nlp.make_doc(text)
    tags = spacy.training.offsets_to_biluo_tags(doc, entities)
    
    if '-' in tags:
        print(f"⚠️ Misaligned entities detected in text: {text[:50]}...")

# ✅ Function to Train Model
def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)

    # Add labels to NER
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])

    # ✅ Preprocess Training Data (Fix Issues)
    for i, (text, annotations) in enumerate(train_data):
        # Remove overlapping entities
        annotations["entities"] = remove_overlapping_entities(annotations["entities"])
        
        # Check for misaligned entities
        check_alignment(nlp, text, annotations["entities"])

    # Train Model
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # Train only NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print(f"🚀 Starting iteration {itn}")
            random.shuffle(train_data)
            losses = {}

            for text, annotations in train_data:
                try:
                    # Convert (text, annotation) into spaCy Example object
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    
                    # Train with corrected format
                    nlp.update(
                        [example],  # Batch of examples
                        drop=0.2,   # Dropout - prevent overfitting
                        sgd=optimizer,  # Optimizer
                        losses=losses
                    )
                except Exception as e:
                    print(f"❌ Error during training: {e}")

            print(f"📉 Losses: {losses}")

# Train Model
train_model(train_data)



⚠️ Removed Overlapping Entity: (1209, 1215, 'Companies worked at')
⚠️ Removed Overlapping Entity: (1417, 1423, 'Companies worked at')
⚠️ Removed Overlapping Entity: (1696, 1702, 'Companies worked at')
⚠️ Removed Overlapping Entity: (1749, 1755, 'Companies worked at')
⚠️ Misaligned entities detected in text: Govardhana K Senior Software Engineer  Bengaluru, ...
⚠️ Misaligned entities detected in text: Harini Komaravelli Test Analyst at Oracle, Hyderab...
⚠️ Misaligned entities detected in text: Hartej Kathuria Data Analyst Intern - Oracle Retai...
⚠️ Misaligned entities detected in text: Ijas Nizamuddin Associate Consultant - State Stree...
⚠️ Misaligned entities detected in text: Imgeeyaul Ansari java developer  Pune, Maharashtra...
⚠️ Misaligned entities detected in text: Jay Madhavi Navi Mumbai, Maharashtra - Email me on...
⚠️ Misaligned entities detected in text: Jitendra Babu FI/CO Consultant in Tech Mahindra - ...
⚠️ Misaligned entities detected in text: Jyotirbindu Patnaik Associ



⚠️ Misaligned entities detected in text: Khushboo Choudhary Developer  Noida, Uttar Pradesh...
⚠️ Misaligned entities detected in text: kimaya sonawane Thane, Maharashtra - Email me on I...
⚠️ Misaligned entities detected in text: Koushik Katta Devops  Hyderabad, Telangana - Email...
⚠️ Misaligned entities detected in text: Kowsick Somasundaram Certified Network Associate T...
⚠️ Misaligned entities detected in text: Lakshika Neelakshi Senior Systems Engineer - Infos...
⚠️ Misaligned entities detected in text: Madas Peddaiah Anantapur, Andhra Pradesh - Email m...
⚠️ Misaligned entities detected in text: Madhuri Sripathi Banglore, Karnataka, Karnataka - ...
⚠️ Removed Overlapping Entity: (3535, 3541, 'Companies worked at')
⚠️ Removed Overlapping Entity: (3714, 3720, 'Companies worked at')
⚠️ Misaligned entities detected in text: Mahesh Vijay Bengaluru, Karnataka - Email me on In...
⚠️ Misaligned entities detected in text: Manisha Bharti Software Automation Engineer  Pune,...
⚠️ Removed 



⚠️ Misaligned entities detected in text: Ramesh HP CES ASSOCIATE CONSULTANT  Bangalore, Kar...
⚠️ Misaligned entities detected in text: Ramya. P Hyderabad, Telangana - Email me on Indeed...
⚠️ Removed Overlapping Entity: (4231, 4238, 'Companies worked at')
⚠️ Misaligned entities detected in text: R Arunravi Functional Consultant / WM Lead - SAP E...
⚠️ Misaligned entities detected in text: Ravi Shankar Working as Escalation Engineer with M...
⚠️ Misaligned entities detected in text: Ravi Shivgond Bidar, Karnataka - Email me on Indee...
⚠️ Removed Overlapping Entity: (13, 34, 'Designation')
⚠️ Removed Overlapping Entity: (370, 391, 'Designation')
⚠️ Misaligned entities detected in text: Roshan Sinha Application Developer - SAP ABAP  Kol...
⚠️ Removed Overlapping Entity: (2528, 2532, 'College Name')
⚠️ Misaligned entities detected in text: Sai Dhir - Email me on Indeed: indeed.com/r/Sai-Dh...
⚠️ Misaligned entities detected in text: Sai Patha Mule ESB Integration Developer - Cisco S...
⚠



⚠️ Misaligned entities detected in text: Shubham Mittal System Engineer - Infosys Limited  ...
⚠️ Misaligned entities detected in text: Sivaganesh Selvakumar DevOps Consultant with Infos...
⚠️ Removed Overlapping Entity: (941, 947, 'Companies worked at')
⚠️ Removed Overlapping Entity: (1077, 1120, 'Email Address')
⚠️ Removed Overlapping Entity: (1198, 1204, 'Companies worked at')
⚠️ Removed Overlapping Entity: (1319, 1325, 'Companies worked at')
⚠️ Removed Overlapping Entity: (1794, 1800, 'Companies worked at')
⚠️ Misaligned entities detected in text: Snehal Jadhav Mumbai, Maharashtra - Email me on In...
⚠️ Removed Overlapping Entity: (3939, 3948, 'Companies worked at')
⚠️ Misaligned entities detected in text: Soumya Balan IT SUPPORT  Sulthan Bathery, Kerala, ...
⚠️ Misaligned entities detected in text: Soumya Balan Soumya Balan - BE Computer Science - ...
⚠️ Removed Overlapping Entity: (1865, 1868, 'Skills')
⚠️ Removed Overlapping Entity: (2058, 2074, 'Skills')
⚠️ Removed Overlapping 



⚠️ Misaligned entities detected in text: Vikas Singh Chandigarh, Chandigarh - Email me on I...
⚠️ Removed Overlapping Entity: (7777, 7784, 'Years of Experience')
⚠️ Removed Overlapping Entity: (7795, 7802, 'Years of Experience')
⚠️ Removed Overlapping Entity: (7810, 7817, 'Years of Experience')
⚠️ Removed Overlapping Entity: (7829, 7836, 'Years of Experience')
⚠️ Misaligned entities detected in text: Yasothai Jayaramachandran Lead Engineer - Automati...
⚠️ Misaligned entities detected in text: Yathishwaran P Maximo Consultant - Infosys Limited...
⚠️ Misaligned entities detected in text: Yogi Pesaru Developer - Infosys Limited  Hyderabad...
⚠️ Misaligned entities detected in text: Anurag Asthana Pune, Maharashtra - Email me on Ind...
⚠️ Misaligned entities detected in text: Syed Sadath ali Coimbatore - Email me on Indeed: i...
⚠️ Removed Overlapping Entity: (10, 33, 'Designation')
⚠️ Misaligned entities detected in text: Nida Khan Tech Support Executive - Teleperformance...
⚠️ Misaligne



⚠️ Misaligned entities detected in text: Viny Khandelwal Self-employed in Family Business -...
⚠️ Misaligned entities detected in text: amarjyot sodhi Voice and Accent Trainer :Masters i...
⚠️ Misaligned entities detected in text: Sameer Kujur Orrisha - Email me on Indeed: indeed....
⚠️ Misaligned entities detected in text: Zaheer Uddin Technical Project Manager  Hyderabad,...
⚠️ Misaligned entities detected in text: Abdul B Arabic Language supporter (Content Analyst...
⚠️ Misaligned entities detected in text: Bike Rally Chief Coordinator of LEAR  Palghat, Ker...
⚠️ Removed Overlapping Entity: (15, 34, 'Designation')
⚠️ Removed Overlapping Entity: (4708, 4717, 'Companies worked at')
⚠️ Misaligned entities detected in text: Girish Acharya Technical Architect & Sr. Software ...
⚠️ Misaligned entities detected in text: Asha Subbaiah (Microsoft Partner Readiness Operati...




⚠️ Misaligned entities detected in text: Divesh Singh Bengaluru, Karnataka - Email me on In...
⚠️ Misaligned entities detected in text: Ramesh chokkala Telangana - Email me on Indeed: in...
⚠️ Misaligned entities detected in text: Ganesh AlalaSundaram A Dev-Test Professional with ...
⚠️ Misaligned entities detected in text: Srinu Naik Ramavath anymore job  Serilingampalle, ...
⚠️ Removed Overlapping Entity: (0, 4, 'Location')
⚠️ Removed Overlapping Entity: (1576, 1580, 'Location')
⚠️ Misaligned entities detected in text: Puneet Bhandari SAP SD lead - Microsoft IT  Pune, ...
⚠️ Misaligned entities detected in text: Aarti Pimplay Operations Center Shift Manager (OCS...
⚠️ Misaligned entities detected in text: Bangalore Tavarekere Volunteer Contestant, Yappon ...
⚠️ Misaligned entities detected in text: Avani Priya - Email me on Indeed: indeed.com/r/Ava...
⚠️ Misaligned entities detected in text: Sanand Pal SQL and MSBI Developer with experience ...
⚠️ Misaligned entities detected in text



⚠️ Misaligned entities detected in text: Laya A Cluster HR Manager - Velammal New  Chennai,...
⚠️ Removed Overlapping Entity: (8133, 8136, 'Degree')
⚠️ Misaligned entities detected in text: Vishwanath P Senior Executive (MIS & Audit) - Job ...
⚠️ Misaligned entities detected in text: Hemil Bhavsar Jr. ASP.NET Developer in True Vision...
⚠️ Misaligned entities detected in text: Siddhartha Chetri 7 years of experience in IT Netw...
⚠️ Misaligned entities detected in text: Pratik Vaidya Pune, Maharashtra - Email me on Inde...
⚠️ Misaligned entities detected in text: Ramakrishna Rao DevOps Consultant - Tech Mahindra ...
⚠️ Misaligned entities detected in text: Keshav Dhawale 3 TCS Security guard Access Control...
⚠️ Misaligned entities detected in text: Praveen Bhaskar Program Manager (Software Delivery...
⚠️ Misaligned entities detected in text: Gunjan Nayyar Hoshiarpur, Punjab - Email me on Ind...
⚠️ Misaligned entities detected in text: Rupesh Reddy Technology Consultant - EIT Services 



⚠️ Misaligned entities detected in text: Debasish Dasgupta Trainer-Finacle-Core Banking Sol...
⚠️ Misaligned entities detected in text: Suresh Kanagala Architecture SharePoint/Office 365...
⚠️ Misaligned entities detected in text: Jaspreet Kaur Oceanic Consultants as a HR Executiv...
⚠️ Misaligned entities detected in text: Somanath Behera Associate, Cognizant technology So...
⚠️ Misaligned entities detected in text: Ashish Indoriya Sr. Systems Engineer at Infosys Li...
⚠️ Misaligned entities detected in text: Dilliraja Baskaran Tamil Nadu - Email me on Indeed...
⚠️ Misaligned entities detected in text: Deepika S Test Engineer - Infosys Ltd  - Email me ...
⚠️ Misaligned entities detected in text: Jacob Philip Kottayam, Kerala - Email me on Indeed...
⚠️ Misaligned entities detected in text: Yogesh Ghatole Engineer / Electrical Supervisor, S...
⚠️ Misaligned entities detected in text: Ajay Elango Software Engineer  Bangalore City, Kar...
⚠️ Misaligned entities detected in text: Shaik Taz



⚠️ Misaligned entities detected in text: Sridevi H Bangalore, Karnataka - Email me on Indee...
⚠️ Misaligned entities detected in text: Raktim Podder 6+ Exp in banking operations and cre...
⚠️ Misaligned entities detected in text: Pavithra M "Infosys" internship  Bengaluru, Karnat...
⚠️ Misaligned entities detected in text: shrikant desai Working as accountant @ infosys  Pu...
⚠️ Misaligned entities detected in text: Kiran Kumar I Having 2.1 years of Experience in IT...
⚠️ Misaligned entities detected in text: Chaban kumar Debbarma Tripura - Email me on Indeed...
⚠️ Misaligned entities detected in text: Akash Gulhane Microsoft Certified System Engineer ...
⚠️ Misaligned entities detected in text: K. Siddharth System Administrator (Server) Microso...
⚠️ Misaligned entities detected in text: Shivam Rathi Microsoft technology Associate (MTA) ...
⚠️ Removed Overlapping Entity: (1258, 1262, 'Location')
⚠️ Misaligned entities detected in text: Nitin Verma Assisting Microsoft Partners - Excha

In [24]:
nlp.to_disk('nlp_model')

In [25]:
nlp_model = spacy.load('nlp_model')

In [26]:
train_data[0][0]

'Raja Chandra Mouli Cuddapah, Andhra Pradesh - Email me on Indeed: indeed.com/r/Raja-Chandra- Mouli/445cbf3eb0a361cd  Willing to relocate to: Cuddapah, Andhra Pradesh - Vijayawada, Andhra Pradesh - Visakhapatnam, Andhra Pradesh  WORK EXPERIENCE  ms office  Microsoft -  Cuddapah, Andhra Pradesh -  May 2018 to Present  EDUCATION  BSc,Mecs,2nd year completed in Computer science  ards collage kadapa -  Cuddapah, Andhra Pradesh  May 2018 to June 2019  SKILLS  ms office, internet,java (Less than 1 year)  CERTIFICATIONS/LICENSES  Degree,BSc(MECs) 2nd year  May 2018 to Present  ADDITIONAL INFORMATION  3-101 KC Narayana Street,new madavaram(v),Vontimitta(M),Kadapa(D)  https://www.indeed.com/r/Raja-Chandra-Mouli/445cbf3eb0a361cd?isid=rex-download&ikw=download-top&co=IN https://www.indeed.com/r/Raja-Chandra-Mouli/445cbf3eb0a361cd?isid=rex-download&ikw=download-top&co=IN'

In [27]:
doc = nlp_model(train_data[0][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Raja Chandra Mouli
LOCATION                      - Cuddapah
EMAIL ADDRESS                 - indeed.com/r/Raja-Chandra- Mouli/445cbf3eb0a361cd
LOCATION                      - Cuddapah
COMPANIES WORKED AT           - Microsoft
LOCATION                      - Cuddapah
COLLEGE NAME                  - ards collage kadapa
LOCATION                      - Cuddapah
SKILLS                        - ms office, internet,java (Less than 1 year)
