In [3]:
from transformers import pipeline
import torch
from collections import defaultdict
import pandas as pd
import json
import pandas as pd

In [2]:
def create_ner_pipelines(model_paths):
    """–°–æ–∑–¥–∞–µ—Ç NER –ø–∞–π–ø–ª–∞–π–Ω—ã –¥–ª—è –≤—Å–µ—Ö –º–æ–¥–µ–ª–µ–π"""
    pipelines = {}
    
    for group_name, path in model_paths.items():
        print(f"–°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è {group_name} –º–æ–¥–µ–ª–∏...")
        
        ner_pipe = pipeline(
            "ner",
            model=path,
            tokenizer=path,
            aggregation_strategy="first",
            stride=64,
            device= "mps" if torch.backends.mps.is_available() else "cpu", # –¥–ª—è Apple Silicon
        )
        
        pipelines[group_name] = ner_pipe
    
    return pipelines

def process_text_with_pipelines(text, pipelines):
    """–û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ—Ç —Ç–µ–∫—Å—Ç –≤—Å–µ–º–∏ –ø–∞–π–ø–ª–∞–π–Ω–∞–º–∏ –∏ –æ–±—ä–µ–¥–∏–Ω—è–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã"""
    all_entities = []
    
    for group_name, ner_pipe in pipelines.items():
        try:
            entities = ner_pipe(text)
            
            # –î–æ–±–∞–≤–ª—è–µ–º —Å—É—â–Ω–æ—Å—Ç–∏ —Å –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–µ–π –æ –≥—Ä—É–ø–ø–µ
            for entity in entities:
                entity['model_group'] = group_name
                all_entities.append(entity)
                
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –≤ –º–æ–¥–µ–ª–∏ {group_name}: {e}")
    
    return all_entities

def process_all_resumes_pipeline(resume_texts, model_paths):
    """–û–±—Ä–∞–±–æ—Ç–∫–∞ –≤—Å–µ—Ö —Ä–µ–∑—é–º–µ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º pipeline"""
    
    print("–°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–æ–≤...")
    pipelines = create_ner_pipelines(model_paths)
    
    all_results = []
    
    print(f"–û–±—Ä–∞–±–æ—Ç–∫–∞ {len(resume_texts)} —Ä–µ–∑—é–º–µ —Å –≤—Å—Ç—Ä–æ–µ–Ω–Ω—ã–º–∏ —Å–∫–æ–ª—å–∑—è—â–∏–º–∏ –æ–∫–Ω–∞–º–∏")
    
    for i, text in enumerate(resume_texts):
        if i % 10 == 0:
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ {i}/{len(resume_texts)} —Ä–µ–∑—é–º–µ")
        
        if not text or str(text).strip() == "":
            all_results.append({
                'text': text,
                'entities': [],
                'entity_count': 0,
                'overall_confidence': 0.0
            })
            continue
            
        try:
            entities = process_text_with_pipelines(str(text), pipelines)
            
            # –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ–º –≤ —Å–æ–≤–º–µ—Å—Ç–∏–º—ã–π —Ñ–æ—Ä–º–∞—Ç 
            formatted_entities = []
            for entity in entities:
                formatted_entities.append({
                    'start': entity['start'],
                    'end': entity['end'],
                    'label': entity['entity_group'],
                    'text': entity['word'],
                    'confidence': float(entity['score']),  
                    'model_group': entity['model_group']
                })
            
            # –í—ã—á–∏—Å–ª—è–µ–º –æ–±—â—É—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å –ø–æ –¥–æ–∫—É–º–µ–Ω—Ç—É
            overall_confidence = 0.0
            if formatted_entities:
                overall_confidence = sum(e['confidence'] for e in formatted_entities) / len(formatted_entities)
            
            all_results.append({
                'text': text,
                'entities': formatted_entities,
                'entity_count': len(formatted_entities),
                'overall_confidence': overall_confidence
            })
            
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ —Å —Ä–µ–∑—é–º–µ {i}: {e}")
            all_results.append({
                'text': text,
                'entities': [],
                'entity_count': 0,
                'overall_confidence': 0.0
            })
    
    # –°–æ—Ä—Ç–∏—Ä—É–µ–º –ø–æ —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏
    all_results_sorted = sorted(all_results, key=lambda x: x['overall_confidence'], reverse=True)
    
    print(f"–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: –æ—Ç {all_results_sorted[-1]['overall_confidence']:.3f} –¥–æ {all_results_sorted[0]['overall_confidence']:.3f}")
    
    return all_results_sorted

def export_to_label_studio_pipeline(predictions, output_file="label_studio_pipeline.json"):
    """–≠–∫—Å–ø–æ—Ä—Ç –≤ Label Studio —Å confidence scores"""
    label_studio_tasks = []
    
    for i, pred in enumerate(predictions):
        text = pred['text']
        entities = pred['entities']
        
        # –°–æ–∑–¥–∞–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –¥–ª—è Label Studio
        results = []
        for entity in entities:
            result = {
                "value": {
                    "start": entity['start'],
                    "end": entity['end'], 
                    "text": entity['text'],
                    "labels": [entity['label']]
                },
                "id": f"pred_{i}_{entity['start']}_{entity['end']}",
                "from_name": "label",
                "to_name": "text", 
                "type": "labels",
                "score": entity['confidence']  # —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å –¥–ª—è –∫–∞–∂–¥–æ–π —Å—É—â–Ω–æ—Å—Ç–∏
            }
            results.append(result)
        
        # –°–æ–∑–¥–∞–µ–º –∑–∞–¥–∞—á—É –¥–ª—è Label Studio
        task = {
            "data": {
                "text": str(text),
                "id": i + 1,
                "score": pred['overall_confidence'] # –û–±—â–∞—è —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å –ø–æ –¥–æ–∫—É–º–µ–Ω—Ç—É 
            },
            "predictions": [{
                "result": results,
                "model_version": "pipeline-3-model-ensemble",
                "score": pred['overall_confidence']  # –û–±—â–∞—è —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å –ø–æ –¥–æ–∫—É–º–µ–Ω—Ç—É 
            }]
        }
        
        label_studio_tasks.append(task)
    
    # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ —Ñ–∞–π–ª
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(label_studio_tasks, f, ensure_ascii=False, indent=2)
    
    print(f"–≠–∫—Å–ø–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–æ {len(label_studio_tasks)} –∑–∞–¥–∞—á")
    return label_studio_tasks

def save_predictions_csv_pipeline(predictions, output_file="predictions_pipeline.csv"):
    """–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤ CSV —Å confidence scores"""
    rows = []
    for i, pred in enumerate(predictions):
        text_preview = str(pred['text'])[:100] + "..." if len(str(pred['text'])) > 100 else str(pred['text'])
        
        entities_str = ""
        if pred['entities']:
            entities_list = [f"{e['text']} ({e['label']}:{e['confidence']:.3f})" for e in pred['entities']]
            entities_str = "; ".join(entities_list)
        
        rows.append({
            'sort_order': i + 1,
            'text_preview': text_preview,
            'entities_found': entities_str,
            'entity_count': pred['entity_count'],
            'overall_confidence': f"{pred['overall_confidence']:.3f}"  # –æ–±—â–∞—è —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å
        })
    
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"CSV —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {output_file}")

def analyze_confidence(predictions):
    """–ê–Ω–∞–ª–∏–∑ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è confidence scores"""
    all_confidences = []
    for pred in predictions:
        for entity in pred['entities']:
            all_confidences.append(entity['confidence'])
    
    if all_confidences:
        print(f"–ê–Ω–∞–ª–∏–∑ confidence scores:")
        print(f"–°—Ä–µ–¥–Ω–µ–µ: {sum(all_confidences) / len(all_confidences):.3f}")
        print(f"–ú–µ–¥–∏–∞–Ω–∞: {sorted(all_confidences)[len(all_confidences)//2]:.3f}")
        print(f"Min: {min(all_confidences):.3f}")
        print(f"Max: {max(all_confidences):.3f}")
        
        # –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –¥–∏–∞–ø–∞–∑–æ–Ω–∞–º
        ranges = [(0, 0.5), (0.5, 0.7), (0.7, 0.9), (0.9, 1.0)]
        for r_min, r_max in ranges:
            count = sum(1 for c in all_confidences if r_min <= c < r_max)
            print(f"{r_min:.1f}-{r_max:.1f}: {count} —Å—É—â–Ω–æ—Å—Ç–µ–π ({count/len(all_confidences)*100:.1f}%)")

# –ì–ª–∞–≤–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è
def full_pipeline(resume_texts, model_paths):
    print("–ó–∞–ø—É—Å–∫ –ø–∞–π–ø–ª–∞–π–Ω–∞...")
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å pipeline
    all_results = process_all_resumes_pipeline(resume_texts, model_paths)
    
    # –ê–Ω–∞–ª–∏–∑ –∫–∞—á–µ—Å—Ç–≤–∞
    analyze_confidence(all_results)
    
    # –≠–∫—Å–ø–æ—Ä—Ç
    export_to_label_studio_pipeline(all_results, "final_pipeline_predictions.json")
    save_predictions_csv_pipeline(all_results, "predictions_pipeline.csv")
    print("–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –º–æ–¥–µ–ª–µ–π —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã")



In [5]:
import pandas as pd

FOLDER_PATH = '../models/'
resumes_df = pd.read_csv('../datasets/test_batch_resumes.csv', index_col=0)
unlabeled_resumes = resumes_df['description'].dropna().tolist()
model_paths = {
    'group1': FOLDER_PATH + 'model_1_final',
    'group2': FOLDER_PATH + 'model_2_final', 
    'group3': FOLDER_PATH + 'model_3_final'
}

final_results = full_pipeline(unlabeled_resumes, model_paths)

üöÄ –ó–∞–ø—É—Å–∫ –Ω–æ–≤–æ–≥–æ –ø–∞–π–ø–ª–∞–π–Ω–∞ —Å –≤—Å—Ç—Ä–æ–µ–Ω–Ω—ã–º–∏ —Å–∫–æ–ª—å–∑—è—â–∏–º–∏ –æ–∫–Ω–∞–º–∏...
üîÑ –°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–æ–≤...
üîÑ –°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è group1 –º–æ–¥–µ–ª–∏...


Device set to use mps
Device set to use mps


üîÑ –°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è group2 –º–æ–¥–µ–ª–∏...


Device set to use mps


üîÑ –°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è group3 –º–æ–¥–µ–ª–∏...
üöÄ –û–±—Ä–∞–±–æ—Ç–∫–∞ 100 —Ä–µ–∑—é–º–µ —Å –≤—Å—Ç—Ä–æ–µ–Ω–Ω—ã–º–∏ —Å–∫–æ–ª—å–∑—è—â–∏–º–∏ –æ–∫–Ω–∞–º–∏
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 0/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 10/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 20/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 30/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 40/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 50/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 60/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 70/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 80/100 —Ä–µ–∑—é–º–µ
üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 90/100 —Ä–µ–∑—é–º–µ
‚úÖ –û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–≤–µ—Ä—à–µ–Ω–∞!
üìà –£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: –æ—Ç 0.882 –¥–æ 1.000
üìä –ê–Ω–∞–ª–∏–∑ confidence scores:
   –°—Ä–µ–¥–Ω–µ–µ: 0.953
   –ú–µ–¥–∏–∞–Ω–∞: 1.000
   Min: 0.291
   Max: 1.000
   0.0-0.5: 89 —Å—É—â–Ω–æ—Å—Ç–µ–π (1.0%)
   0.5-0.7: 452 —Å—É—â–Ω–æ—Å—Ç–µ–π (5.2%)
   0.7-0.9: 692 —Å—É—â–Ω–æ—Å—Ç–µ–π (7.9%)
   0.9-1.0: 7466 —Å—É—â–Ω–æ—Å—Ç–µ

In [6]:
# import pandas as pd

# FOLDER_PATH = '/Users/artemzmailov/Desktop/–í–ö–†/Learning/models_307_resumes_stride_64/'
# resumes_df = pd.read_csv('/Users/artemzmailov/Desktop/–í–ö–†/test_batch_resumes.csv', index_col=0)
# unlabeled_resumes = resumes_df['description'].dropna().tolist()
# model_paths = {
#     'group1': FOLDER_PATH + 'model_1_final',
# }

# final_results = full_pipeline_new(unlabeled_resumes, model_paths)

In [28]:
test_df = pd.read_json('../hh_ru_API_parser/resumes_json/for_label_studio/label_studio_all_117.json')
test_df['text'] = test_df['data'].apply(lambda x: x['text'])
text_lst = test_df['text'].to_list()
FOLDER_PATH = '../models/'
model_paths = {
    'group1': FOLDER_PATH + 'model_1_final',
    'group2': FOLDER_PATH + 'model_2_final', 
    'group3': FOLDER_PATH + 'model_3_final'
}

final_results = full_pipeline(text_lst, model_paths)

–ó–∞–ø—É—Å–∫ –ø–∞–π–ø–ª–∞–π–Ω–∞...
–°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–æ–≤...
–°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è group1 –º–æ–¥–µ–ª–∏...


Device set to use mps
Device set to use mps


–°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è group2 –º–æ–¥–µ–ª–∏...


Device set to use mps


–°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è group3 –º–æ–¥–µ–ª–∏...
–û–±—Ä–∞–±–æ—Ç–∫–∞ 117 —Ä–µ–∑—é–º–µ —Å –≤—Å—Ç—Ä–æ–µ–Ω–Ω—ã–º–∏ —Å–∫–æ–ª—å–∑—è—â–∏–º–∏ –æ–∫–Ω–∞–º–∏
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 0/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 10/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 20/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 30/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 40/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 50/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 60/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 70/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 80/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 90/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100/117 —Ä–µ–∑—é–º–µ
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 110/117 —Ä–µ–∑—é–º–µ
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: –æ—Ç 0.832 –¥–æ 0.988
–ê–Ω–∞–ª–∏–∑ confidence scores:
–°—Ä–µ–¥–Ω–µ–µ: 0.944
–ú–µ–¥–∏–∞–Ω–∞: 0.999
Min: 0.311
Max: 1.000
0.0-0.5: 97 —Å—É—â–Ω–æ—Å—Ç–µ–π (0.6%)
0.5-0.7: 1009 —Å—É—â–Ω–æ—Å—Ç–µ–π (6.7%)
0.7-0.9: 1504 —Å—É—â–Ω–æ—Å—Ç–µ–π (10.0%)
0.9-1.0: 12424 —Å—É—â–Ω–æ—Å—Ç–µ–π (82.5%)
–≠–∫—Å–ø–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–æ 117 –∑–∞–