In [1]:
!pip list

Package                      Version
---------------------------- -----------
absl-py                      2.1.0
asttokens                    2.4.1
astunparse                   1.6.3
blinker                      1.8.1
certifi                      2024.2.2
charset-normalizer           3.3.2
click                        8.1.7
colorama                     0.4.6
comm                         0.2.2
contourpy                    1.2.1
cycler                       0.12.1
debugpy                      1.8.1
decorator                    5.1.1
dlib                         19.24.1
et-xmlfile                   1.1.0
executing                    2.0.1
face-recognition             1.3.0
face_recognition_models      0.3.0
facenet                      1.0.5
Flask                        3.0.3
Flask-Cors                   4.0.1
flatbuffers                  24.3.25
fonttools                    4.51.0
gast                         0.5.4
google-pasta                 0.2.0
grpcio                       1.63.0
h5


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip install transformers torch pandas numpy ipython

In [None]:
import pandas as pd
import re
import json
from transformers import RobertaTokenizer, RobertaForTokenClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import numpy as np
from IPython.display import display, HTML
import pandas as pd
from transformers import pipeline
import json
# Load the resume data
df = pd.read_csv('./Resume.csv')
print(f"Total number of resumes loaded: {len(df)}")
display(df.head())

Total number of resumes loaded: 2484


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [None]:
SECTION_PATTERNS = {
    'education': r'Education\s*(?:.*?)(?=Experience|Skills|$)',
    'experience': r'Experience\s*(?:.*?)(?=Education|Skills|$)',
    'skills': r'Skills\s*(?:.*?)(?=Education|Experience|$)',
    'contact': r'(?:Phone|Email|Address):\s*([^\n]+)',
    'name': r'^([A-Z\s]+(?:\/[A-Z\s]+)*)',
}

In [None]:
def extract_sections(text):
    sections = {}

    # Extract sections using regex
    for section_name, pattern in SECTION_PATTERNS.items():
        matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
        if matches:
            sections[section_name] = matches[0].strip()

    # Clean and structure education information
    if 'education' in sections:
        education_data = []
        education_matches = re.finditer(
            r'(\d{4})\s*(.*?):\s*(.*?)\s*([^,]+)(?:,\s*([^,]+))?',
            sections['education']
        )
        for match in education_matches:
            education_data.append({
                'year': match.group(1),
                'degree': match.group(2),
                'major': match.group(3),
                'institution': match.group(4),
                'location': match.group(5) if match.group(5) else ''
            })
        sections['education'] = education_data

    # Structure work experience
    if 'experience' in sections:
        experience_data = []
        experience_matches = re.finditer(
            r'(\d{2}\/\d{4}|\d{2}\-\d{4})\s*to\s*(\w+\s*\d{4}|\w+|\d{2}\/\d{4}|\d{2}\-\d{4})\s*(.*?)\s*(?:Company Name|$)',
            sections['experience']
        )
        for match in experience_matches:
            experience_data.append({
                'start_date': match.group(1),
                'end_date': match.group(2),
                'title': match.group(3).strip()
            })
        sections['experience'] = experience_data

    # Clean skills section
    if 'skills' in sections:
        skills_list = re.split(r',\s*', sections['skills'])
        sections['skills'] = [skill.strip() for skill in skills_list if skill.strip()]

    return sections

In [None]:
parsed_resumes = []
for idx, row in df.iterrows():
    try:
        sections = extract_sections(row['Resume_str'])
        sections['raw_text'] = row['Resume_str']
        sections['id'] = row['ID']
        parsed_resumes.append(sections)
    except Exception as e:
        print(f"Error processing resume {idx}: {str(e)}")

# Save parsed data to JSON
with open('parsed_resumes.json', 'w') as f:
    json.dump(parsed_resumes, f, indent=2)


In [None]:
def display_parsed_resume(resume_dict):
    print(f"\nResume ID: {resume_dict['id']}")
    print("\n=== Education ===")
    if 'education' in resume_dict:
        for edu in resume_dict['education']:
            print(f"Year: {edu['year']}")
            print(f"Degree: {edu['degree']}")
            print(f"Major: {edu['major']}")
            print(f"Institution: {edu['institution']}")
            print(f"Location: {edu['location']}\n")

    print("\n=== Experience ===")
    if 'experience' in resume_dict:
        for exp in resume_dict['experience']:
            print(f"Title: {exp['title']}")
            print(f"Period: {exp['start_date']} to {exp['end_date']}\n")

    print("\n=== Skills ===")
    if 'skills' in resume_dict:
        print(", ".join(resume_dict['skills']))

    print("\n=== Name ===")
    if 'name' in resume_dict:
        print(resume_dict['name'])

    print("\n=== Contact ===")
    if 'contact' in resume_dict:
        print(resume_dict['contact'])
    print("\n" + "="*50)

# Display the first 3 parsed resumes
print("Displaying first 3 parsed resumes:")
for resume in parsed_resumes[:3]:
    display_parsed_resume(resume)

# Calculate some statistics
print("\nExtraction Statistics:")
total_resumes = len(parsed_resumes)
education_count = sum(1 for r in parsed_resumes if 'education' in r)
experience_count = sum(1 for r in parsed_resumes if 'experience' in r)
skills_count = sum(1 for r in parsed_resumes if 'skills' in r)
name_count = sum(1 for r in parsed_resumes if 'name' in r)
contact_count = sum(1 for r in parsed_resumes if 'contact' in r)

Displaying first 3 parsed resumes:

Resume ID: 16852973

=== Education ===

=== Experience ===

=== Skills ===
skills as well as computer skills, knowledge of medical terminology and procedures, statistics, billing standards, data analysis and laws regarding medical billing.         Assistant General Manager     Jun 2010   to   Dec 2010      Company Name   －   City, State     Performed duties including but not limited to, budgeting and financial management, accounting, human resources, payroll and purchasing.  Established and maintained close working relationships with all departments of the hotel to ensure maximum operation, productivity, morale and guest service.  Handled daily operations and reported directly to the corporate office.  Hired and trained staff on overall objectives and goals with an emphasis on high customer service.  Marketing and Advertising, working on public relations with the media, government and local businesses and Chamber of Commerce.         Executive Suppor

In [None]:
stats_df = pd.DataFrame({
    'Section': ['Education', 'Experience', 'Skills', 'Name', 'Contact'],
    'Count': [education_count, experience_count, skills_count, name_count, contact_count],
    'Percentage': [
        round(education_count/total_resumes * 100, 2),
        round(experience_count/total_resumes * 100, 2),
        round(skills_count/total_resumes * 100, 2),
        round(name_count/total_resumes * 100, 2),
        round(contact_count/total_resumes * 100, 2)
    ]
})

display(HTML(stats_df.to_html(index=False)))

# Most common skills analysis
all_skills = []
for resume in parsed_resumes:
    if 'skills' in resume:
        all_skills.extend(resume['skills'])

skills_freq = pd.Series(all_skills).value_counts().head(20)
print("\nTop 20 Most Common Skills:")
display(skills_freq)

Section,Count,Percentage
Education,2462,99.11
Experience,2429,97.79
Skills,2459,98.99
Name,2484,100.0
Contact,9,0.36



Top 20 Most Common Skills:


Unnamed: 0,count
Excel,246
skills,155
clients,137
PowerPoint,127
sales,114
client,111
Word,108
policies,108
budget,107
inventory,106


In [None]:
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



In [None]:
# Previous imports and code remain the same up to the ResumeDataset class...

def mark_entity_positions(text, entity, labels, entity_type):
    """
    Mark the positions of entities in the text with their corresponding label IDs.

    Args:
        text (str): The full text of the resume
        entity (str): The entity text to find
        labels (list): List of current labels
        entity_type (str): Type of entity (education, experience, etc.)
    """
    words = text.split()
    entity_words = entity.split()

    for i in range(len(words) - len(entity_words) + 1):
        if [w.lower() for w in words[i:i+len(entity_words)]] == [w.lower() for w in entity_words]:
            for j in range(len(entity_words)):
                labels[i+j] = get_label_id(entity_type)

def get_label_id(entity_type):
    """
    Convert entity type to numeric label ID.
    """
    label_map = {
        'education': 1,
        'experience': 2,
        'skills': 3,
        'contact': 4,
        'name': 5
    }
    return label_map.get(entity_type, 0)

def prepare_training_data(parsed_resumes):
    """
    Prepare training data for the model by converting parsed resumes to texts and labels.

    Args:
        parsed_resumes (list): List of dictionaries containing parsed resume information

    Returns:
        tuple: (texts, labels) where texts is a list of resume texts and labels is a list of label sequences
    """
    texts = []
    labels = []

    for resume in parsed_resumes:
        text = resume['raw_text']
        current_labels = [0] * len(text.split())  # Initialize all tokens as 'O' (outside)

        for entity_type, content in resume.items():
            if entity_type not in ['raw_text', 'id']:
                if isinstance(content, list):
                    for item in content:
                        if isinstance(item, dict):
                            for value in item.values():
                                if value:  # Check if value is not empty
                                    mark_entity_positions(text, str(value), current_labels, entity_type)
                        else:
                            if item:  # Check if item is not empty
                                mark_entity_positions(text, str(item), current_labels, entity_type)
                else:
                    if content:  # Check if content is not empty
                        mark_entity_positions(text, str(content), current_labels, entity_type)

        texts.append(text)
        labels.append(current_labels)

    return texts, labels

# Now we can prepare the training data
print("\nPreparing training data...")
texts, labels = prepare_training_data(parsed_resumes)
print(f"Prepared {len(texts)} resumes for training")

# Let's look at some statistics about our labels
label_counts = {
    'O': 0,  # Outside
    'Education': 0,
    'Experience': 0,
    'Skills': 0,
    'Contact': 0,
    'Name': 0
}

for label_seq in labels:
    for label in label_seq:
        if label == 0:
            label_counts['O'] += 1
        elif label == 1:
            label_counts['Education'] += 1
        elif label == 2:
            label_counts['Experience'] += 1
        elif label == 3:
            label_counts['Skills'] += 1
        elif label == 4:
            label_counts['Contact'] += 1
        elif label == 5:
            label_counts['Name'] += 1

print("\nLabel distribution:")
total_tokens = sum(label_counts.values())
for label, count in label_counts.items():
    percentage = (count / total_tokens) * 100
    print(f"{label}: {count} tokens ({percentage:.2f}%)")

# Display a sample of labeled text for verification
def display_labeled_text_sample(texts, labels, sample_idx=0):
    print("\nSample of labeled text (first 200 tokens):")
    words = texts[sample_idx].split()
    labels_sample = labels[sample_idx]

    label_names = {
        0: 'O',
        1: 'EDU',
        2: 'EXP',
        3: 'SKILL',
        4: 'CONTACT',
        5: 'NAME'
    }

    for word, label in zip(words[:200], labels_sample[:200]):
        print(f"{word} [{label_names[label]}]", end=' ')

# Display a sample of the labeled text
display_labeled_text_sample(texts, labels)


Preparing training data...
Prepared 2484 resumes for training

Label distribution:
O: 1835273 tokens (91.07%)
Education: 53604 tokens (2.66%)
Experience: 11948 tokens (0.59%)
Skills: 105626 tokens (5.24%)
Contact: 2958 tokens (0.15%)
Name: 5924 tokens (0.29%)

Sample of labeled text (first 200 tokens):
HR [NAME] ADMINISTRATOR/MARKETING [NAME] ASSOCIATE [NAME] HR [NAME] ADMINISTRATOR [NAME] Summary [NAME] Dedicated [NAME] Customer [NAME] Service [NAME] Manager [NAME] with [NAME] 15+ [O] years [O] of [O] experience [O] in [O] Hospitality [O] and [O] Customer [SKILL] Service [SKILL] Management. [O] Respected [O] builder [O] and [O] leader [O] of [O] customer-focused [O] teams; [O] strives [O] to [O] instill [O] a [O] shared, [O] enthusiastic [O] commitment [O] to [O] customer [O] service. [O] Highlights [O] Focused [O] on [O] customer [O] satisfaction [O] Team [O] management [O] Marketing [SKILL] savvy [O] Conflict [O] resolution [O] techniques [O] Training [O] and [O] development [O] Sk

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
import pandas as pd

In [None]:
# Load the data
with open('parsed_resumes.json', 'r') as f:
    data = json.load(f)
print(f"Loaded {len(data)} resumes")

Loaded 2484 resumes


In [None]:
LABEL_MAP = {
    'O': 0,  # Outside any entity
    'B-NAME': 1,
    'I-NAME': 2,
    'B-EDU': 3,
    'I-EDU': 4,
    'B-EXP': 5,
    'I-EXP': 6,
    'B-SKILL': 7,
    'I-SKILL': 8
}

In [None]:
INVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}

# Create Dataset class
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Adjust labels to match tokenized input
        words = text.split()
        word_ids = encoding.word_ids()
        labels = [-100] * len(word_ids)

        current_word_idx = -1
        for i, word_idx in enumerate(word_ids):
            if word_idx is not None and word_idx != current_word_idx:
                current_word_idx = word_idx
                if current_word_idx < len(label):
                    labels[i] = label[current_word_idx]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels)
        }


In [None]:
def prepare_training_data(data):
    texts = []
    labels = []

    for resume in data:
        text = resume['raw_text']
        words = text.split()
        current_labels = [0] * len(words)  # Initialize all as 'O'

        # Process NAME entities
        if 'name' in resume and resume['name']:
            name_words = resume['name'].split()
            for i in range(len(words) - len(name_words) + 1):
                if words[i:i+len(name_words)] == name_words:
                    current_labels[i] = LABEL_MAP['B-NAME']
                    for j in range(1, len(name_words)):
                        current_labels[i+j] = LABEL_MAP['I-NAME']

        # Process EDUCATION entities
        if 'education' in resume and resume['education']:
            for edu in resume['education']:
                for field, value in edu.items():
                    if value:
                        edu_words = str(value).split()
                        for i in range(len(words) - len(edu_words) + 1):
                            if words[i:i+len(edu_words)] == edu_words:
                                current_labels[i] = LABEL_MAP['B-EDU']
                                for j in range(1, len(edu_words)):
                                    current_labels[i+j] = LABEL_MAP['I-EDU']

        # Process SKILLS entities
        if 'skills' in resume and resume['skills']:
            for skill in resume['skills']:
                skill_words = skill.split()
                for i in range(len(words) - len(skill_words) + 1):
                    if words[i:i+len(skill_words)] == skill_words:
                        current_labels[i] = LABEL_MAP['B-SKILL']
                        for j in range(1, len(skill_words)):
                            current_labels[i+j] = LABEL_MAP['I-SKILL']

        texts.append(text)
        labels.append(current_labels)

    return texts, labels

print("Preparing training data...")
texts, labels = prepare_training_data(data)

Preparing training data...


In [None]:
# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")


Training samples: 1987
Validation samples: 497


In [None]:
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

model = RobertaForTokenClassification.from_pretrained(
    'roberta-base',
    num_labels=len(LABEL_MAP)
)

# Create datasets and dataloaders
train_dataset = ResumeDataset(train_texts, train_labels, tokenizer)
val_dataset = ResumeDataset(val_texts, val_labels, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
total_steps = len(train_dataloader) * 3
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Using device: cuda


In [None]:
# Training function
def train_model(train_dataloader, model, optimizer, scheduler, device, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        for batch_idx, batch in enumerate(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

            if (batch_idx + 1) % 50 == 0:
                print(f"Batch {batch_idx+1}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_dataloader)
        print(f"Average loss for epoch {epoch+1}: {avg_loss:.4f}")

# Train the model
print("Starting training...")
train_model(train_dataloader, model, optimizer, scheduler, device)

Starting training...

Epoch 1/5
Batch 50, Loss: 0.3690
Batch 100, Loss: 0.1650
Batch 150, Loss: 1.2275
Batch 200, Loss: 0.2208
Batch 250, Loss: 1.0188
Batch 300, Loss: 0.2519
Batch 350, Loss: 0.3557
Batch 400, Loss: 0.3155
Batch 450, Loss: 1.0877
Average loss for epoch 1: 0.4043

Epoch 2/5
Batch 50, Loss: 0.6754
Batch 100, Loss: 0.1663
Batch 150, Loss: 0.6030
Batch 200, Loss: 0.3775
Batch 250, Loss: 0.2530
Batch 300, Loss: 0.1791
Batch 350, Loss: 0.2074
Batch 400, Loss: 0.5070
Batch 450, Loss: 0.1557
Average loss for epoch 2: 0.3375

Epoch 3/5
Batch 50, Loss: 0.2133
Batch 100, Loss: 0.2532
Batch 150, Loss: 0.2265
Batch 200, Loss: 0.4277
Batch 250, Loss: 0.2913
Batch 300, Loss: 0.3324
Batch 350, Loss: 0.3215
Batch 400, Loss: 0.2322
Batch 450, Loss: 0.2403
Average loss for epoch 3: 0.3091

Epoch 4/5
Batch 50, Loss: 0.4180
Batch 100, Loss: 0.1613
Batch 150, Loss: 0.2192
Batch 200, Loss: 0.4737
Batch 250, Loss: 0.1741
Batch 300, Loss: 0.1895
Batch 350, Loss: 0.1432
Batch 400, Loss: 0.3839


In [None]:
# Save the model
model.save_pretrained('./resume_parser_model')
tokenizer.save_pretrained('./resume_parser_model')
print("Model saved to ./resume_parser_model")


Model saved to ./resume_parser_model


In [None]:
import json
import random

# Load and test resumes with debug prints
def load_test_resumes(json_file_path='parsed_resumes.json', num_samples=5):
    print(f"\nLoading resumes from {json_file_path}...")

    with open(json_file_path, 'r') as f:
        resumes = json.load(f)

    print(f"Total resumes loaded: {len(resumes)}")

    # Randomly sample resumes
    sampled_resumes = random.sample(resumes, min(num_samples, len(resumes)))

    print(f"\nSampled {num_samples} resumes for testing")
    print("Sample IDs:", [resume.get('id', 'No ID') for resume in sampled_resumes])

    # Print example of first resume
    print("\nExample of first sampled resume:")
    first_resume = sampled_resumes[0]
    print("Available fields:", list(first_resume.keys()))
    print("\nFirst 200 characters:")
    print(first_resume['raw_text'][:200])

    test_texts = [resume['raw_text'] for resume in sampled_resumes]
    return test_texts, sampled_resumes

In [None]:
def analyze_and_display_results(resume_text, predictions, resume_id=None):
    """
    Analyze predictions and display results as structured sentences
    """
    print(f"\nAnalyzing Resume {resume_id if resume_id else ''}")
    print("=" * 50)

    # Group predictions by entity type
    entities = {
        'NAME': [],
        'EDU': [],
        'SKILL': [],
        'EXP': []
    }

    current_entity = None
    current_text = []

    for word, label in predictions:
        if label != 'O':  # Any non-Outside label
            entity_type = label.split('-')[-1]  # Extract entity type
            if entity_type in entities:
                if label.startswith('B-') or not current_text:
                    if current_text:
                        entities[current_entity].append(' '.join(current_text))
                    current_entity = entity_type
                    current_text = [word]
                else:
                    current_text.append(word)
        elif current_text:
            if current_entity:
                entities[current_entity].append(' '.join(current_text))
            current_entity = None
            current_text = []

    if current_text and current_entity:
        entities[current_entity].append(' '.join(current_text))

    # Generate proper sentences
    results_summary = []
    if entities['NAME']:
        results_summary.append(f"The candidate's name is {', '.join(entities['NAME'])}.")
    if entities['EDU']:
        results_summary.append(f"They have educational qualifications in {', '.join(entities['EDU'])}.")
    if entities['SKILL']:
        results_summary.append(f"Their skills include {', '.join(entities['SKILL'])}.")
    if entities['EXP']:
        results_summary.append(f"They have professional experience in {', '.join(entities['EXP'])}.")

    if not results_summary:
        results_summary.append("No significant information could be extracted from this resume.")

    # Display structured output
    print("\nExtracted Information:")
    print("-" * 20)
    for sentence in results_summary:
        print(sentence)

    print("\nDebug Statistics:")
    print(f"Total predictions processed: {len(predictions)}")
    print("Entity counts:", {key: len(val) for key, val in entities.items()})
    print("\n" + "=" * 50)

    return entities


def predict_resume(text, model, tokenizer, device):
    """
    Enhanced prediction function with better debugging
    """
    encoding = tokenizer(
        text,
        truncation=True,
        padding=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=2)

        # Get prediction probabilities
        probs = torch.nn.functional.softmax(outputs.logits, dim=2)
        max_probs, _ = torch.max(probs, dim=2)

    # Convert predictions to labels with confidence check
    pred_labels = []
    word_ids = encoding.word_ids(0)
    words = text.split()
    current_word = None
    confidence_threshold = 0.5  # Adjust this threshold as needed

    for i, (word_id, pred, conf) in enumerate(zip(word_ids, predictions[0], max_probs[0])):
        if word_id is not None and word_id != current_word:
            current_word = word_id
            if current_word < len(words):
                if conf.item() > confidence_threshold:
                    pred_label = INVERSE_LABEL_MAP[pred.item()]
                else:
                    pred_label = 'O'  # Mark as outside if confidence is low
                pred_labels.append((words[current_word], pred_label))

    return pred_labels

In [None]:
# Execute testing
print("Starting resume testing...")
test_texts, sampled_resumes = load_test_resumes()

print("\nTesting model predictions...")
for idx, (test_text, original_resume) in enumerate(zip(test_texts, sampled_resumes), 1):
    print(f"\nProcessing resume {idx}/5")
    print(f"Resume ID: {original_resume.get('id', 'No ID')}")
    print(f"Text length: {len(test_text)} characters")

    predictions = predict_resume(test_text, model, tokenizer, device)
    results = analyze_and_display_results(
        test_text,
        predictions,
        resume_id=original_resume.get('id', f'Resume {idx}')
    )

    print("\nComparison:")
    print("Original sections:", list(original_resume.keys()))
    print("Extracted sections:", [k for k, v in results.items() if v])

Starting resume testing...

Loading resumes from parsed_resumes.json...
Total resumes loaded: 2484

Sampled 5 resumes for testing
Sample IDs: [41523474, 16861758, 26622051, 18589927, 13264796]

Example of first sampled resume:
Available fields: ['education', 'experience', 'skills', 'name', 'raw_text', 'id']

First 200 characters:
         HR EXECUTIVE       Summary     Dual specialization in the domain of Human Resource Management and Finance.       Highlights          Human resources management  People-oriented      New emplo

Testing model predictions...

Processing resume 1/5
Resume ID: 41523474
Text length: 5504 characters

Analyzing Resume 41523474

Extracted Information:
--------------------
No significant information could be extracted from this resume.

Debug Statistics:
Total predictions processed: 336
Entity counts: {'NAME': 0, 'EDU': 0, 'SKILL': 0, 'EXP': 0}


Comparison:
Original sections: ['education', 'experience', 'skills', 'name', 'raw_text', 'id']
Extracted sections: [

In [None]:
# Load the dataset
# Assuming your data is in a CSV file format
data_path = './Resume.csv'  # Replace with your dataset file path
df = pd.read_csv(data_path)

# Preprocessing: Convert Resume_str into a clean format for NLP processing
def preprocess_text(text):
    # Replace special characters, extra whitespaces, and normalize
    text = text.replace("Â", "").replace("ï¼", "-")
    text = ' '.join(text.split())
    return text
df['Cleaned_Resume'] = df['Resume_str'].apply(preprocess_text)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [None]:



# Load pretrained model pipeline for information extraction
# Here, we're using a QA pipeline as an example; you can customize as needed
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Define questions for key information extraction
questions = {
    "work_experience": "What is the work experience mentioned?",
    "skills": "What are the skills mentioned?",
    "education": "What is the education background?",
}

# Function to extract details using the QA pipeline

In [None]:
# Define the function to extract details using the QA pipeline
def extract_details(resume_text, questions):
    extracted_info = {}
    for key, question in questions.items():
        # Use the QA pipeline to get the answer to each question from the resume text
        result = qa_pipeline(question=question, context=resume_text)
        extracted_info[key] = result['answer']

        # Print the extracted detail for each question
        print(f"Question: {question}")
        print(f"Answer: {result['answer']}")
        print("-" * 50)

    return extracted_info

In [None]:
import pandas as pd
import re
from transformers import pipeline

# Load the NER pipeline with a RoBERTa model
ner_pipeline = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",  # Replace with a RoBERTa NER model
    grouped_entities=True
)

# Define functions for extracting specific details
def extract_contact_info(text):
    email = re.findall(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text)
    phone = re.findall(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b", text)
    return {
        "email": email[0] if email else None,
        "phone": phone[0] if phone else None
    }

def extract_ner_details(text):
    ner_results = ner_pipeline(text)
    print("NER Results:", ner_results)  # Log intermediate NER output for debugging
    entities = {
        "education": [],
        "skills": [],
        "experience": []
    }
    for entity in ner_results:
        label = entity['entity_group']
        word = entity['word']
        if label.lower() == 'education':
            entities["education"].append(word)
        elif label.lower() == 'skills':
            entities["skills"].append(word)
        elif label.lower() == 'experience':
            entities["experience"].append(word)
    return entities

def process_resume(text):
    contact_info = extract_contact_info(text)
    ner_details = extract_ner_details(text)

    extracted_details = {
        "name": None,
        "contact_info": contact_info,
        "education": " ".join(ner_details["education"]),
        "skills": list(set(ner_details["skills"])),
        "experience": " ".join(ner_details["experience"])
    }
    return extracted_details

# Load and process the CSV file
resume_data = pd.read_csv("./Resume.csv")

if 'Resume_str' not in resume_data.columns:
    raise ValueError("The CSV file must have a 'Resume_str' column containing the resume text.")

processed_resumes = []
for index, row in resume_data.head(5).iterrows():
    resume_text = row['Resume_str']
    extracted_data = process_resume(resume_text)
    processed_resumes.append(extracted_data)
    # Print the extracted data
    print(f"Resume {index + 1}:")
    for key, value in extracted_data.items():
        print(f"  {key}: {value}")
    print()

# Save the processed data to a CSV file
output_df = pd.DataFrame(processed_resumes)
output_df.to_csv("Extracted_Resume_Details.csv", index=False)

print("Extraction complete for the first 5 resumes. Results saved to Extracted_Resume_Details.csv.")


config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


NER Results: [{'entity_group': 'ORG', 'score': 0.8811511, 'word': ' Missouri DOT', 'start': 563, 'end': 575}, {'entity_group': 'ORG', 'score': 0.9922519, 'word': ' IHG', 'start': 624, 'end': 627}, {'entity_group': 'ORG', 'score': 0.7221176, 'word': ' Hilton Worldwide', 'start': 675, 'end': 691}, {'entity_group': 'MISC', 'score': 0.9750762, 'word': ' Hilton OnQ', 'start': 801, 'end': 811}, {'entity_group': 'MISC', 'score': 0.9349847, 'word': ' Micros', 'start': 817, 'end': 823}, {'entity_group': 'MISC', 'score': 0.9805248, 'word': ' Opera PMS', 'start': 827, 'end': 836}, {'entity_group': 'MISC', 'score': 0.9866514, 'word': ' Fidelio', 'start': 841, 'end': 848}, {'entity_group': 'MISC', 'score': 0.9608848, 'word': ' OPERA', 'start': 852, 'end': 857}, {'entity_group': 'MISC', 'score': 0.8179652, 'word': ' Reservation System', 'start': 861, 'end': 879}, {'entity_group': 'MISC', 'score': 0.9722882, 'word': 'ORS', 'start': 881, 'end': 884}, {'entity_group': 'MISC', 'score': 0.96265316, 'word

In [None]:
# Apply the extraction only on the first 10 entries
df_subset = df.head(10)  # Select the first 10 entries
df_extracted = df_subset['Cleaned_Resume'].apply(lambda x: extract_details(x, questions))
df_extracted = pd.json_normalize(df_extracted)

# Combine extracted data with the original subset
final_df = pd.concat([df_subset, df_extracted], axis=1)

# Save the final subset dataframe
output_path = 'processed_resumes_subset.csv'
final_df.to_csv(output_path, index=False)

print(f"Processed data for the first 10 entries saved to {output_path}")

Question: What is the work experience mentioned?
Answer: 15+ years of experience in Hospitality and Customer Service Management
--------------------------------------------------
Question: What are the skills mentioned?
Answer: Accounting, ads, advertising, analytical skills
--------------------------------------------------
Question: What is the education background?
Answer: 15+ years of experience in Hospitality and Customer Service Management
--------------------------------------------------
Question: What is the work experience mentioned?
Answer: Versatile media professional with background in Communications, Marketing, Human Resources and Technology
--------------------------------------------------
Question: What are the skills mentioned?
Answer: Communications, Marketing, Human Resources and Technology
--------------------------------------------------
Question: What is the education background?
Answer: Communications, Marketing, Human Resources and Technology
-----------------

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Question: What is the education background?
Answer: State Develped Supervisory Education
--------------------------------------------------
Question: What is the work experience mentioned?
Answer: Education and Training Certificate of Completion *Business Administration 1997
--------------------------------------------------
Question: What are the skills mentioned?
Answer: Type 50 wpm and 10-Key by touch
--------------------------------------------------
Question: What is the education background?
Answer: Business Administration 1997
--------------------------------------------------
Question: What is the work experience mentioned?
Answer: Fanatics core values
--------------------------------------------------
Question: What are the skills mentioned?
Answer: HR MANAGER
--------------------------------------------------
Question: What is the education background?
Answer: Bachelor of Business Administration
--------------------------------------------------
Question: What is the work exp