In [6]:
from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional, Literal
from transformers import AutoModel, AutoTokenizer
import torch
from PIL import Image
import json
import re

# --- Pydantic Models for Structured Output ---
# Contact details of the candidate.
# Only `full_name` is required; every other field is optional and defaults to
# None when it is not supplied.
class PersonalInfo(BaseModel):
    full_name: str = Field(..., description="Candidate's full name")
    email: Optional[str] = Field(None, description="Contact email")
    phone: Optional[str] = Field(None, description="Phone number")
    location: Optional[str] = Field(None, description="Current location")
    linkedin: Optional[str] = Field(None, description="LinkedIn URL")
    github: Optional[str] = Field(None, description="GitHub URL")

# One education record.
# `degree` only accepts the lowercase level names listed in the Literal; any
# other value fails validation. `start_year` is required, `end_year` may be
# None when no graduation year is given.
class EducationEntry(BaseModel):
    institution: str = Field(..., description="School/university name")
    degree: Literal["bachelor", "master", "phd", "diploma", "certificate", "associate", "other"] = Field(..., description="Degree level")
    field_of_study: str = Field(..., description="Major/specialization")
    start_year: int = Field(..., description="Start year")
    end_year: Optional[int] = Field(None, description="Graduation year")

# One employment record.
# Dates are stored as plain strings: the MM/YYYY format is only stated in the
# field descriptions and is not enforced by any validator here.
# `end_date` is optional and defaults to None.
class WorkExperienceEntry(BaseModel):
    company: str = Field(..., description="Employer name")
    position: str = Field(..., description="Job title")
    start_date: str = Field(..., description="Start date (MM/YYYY)")
    end_date: Optional[str] = Field(None, description="End date (MM/YYYY or 'Present')")
    responsibilities: List[str] = Field(..., description="Key achievements")

# One technical skill.
# `category` is free text; `proficiency` must be one of the five lowercase
# levels listed in the Literal, otherwise validation fails.
class SkillEntry(BaseModel):
    name: str = Field(..., description="Skill name")
    category: str = Field(..., description="Skill category")
    proficiency: Literal["beginner", "intermediate", "advanced", "expert", "native"] = Field(..., description="Proficiency level")

# Root model of a complete CV analysis: extracted data plus the job-fit
# evaluation. Every field is required. `match_score` is constrained to the
# closed interval [0, 1] through `ge=0, le=1`.
# NOTE(review): documented with `#` comments on purpose - pydantic presumably
# copies a class docstring into the generated JSON schema as "description",
# which would alter any prompt built from that schema; confirm before
# converting these comments to a docstring.
class CVAnalysisResult(BaseModel):
    personal_info: PersonalInfo = Field(..., description="Personal details")
    education: List[EducationEntry] = Field(..., description="Education history")
    work_experience: List[WorkExperienceEntry] = Field(..., description="Work experience")
    technical_skills: List[SkillEntry] = Field(..., description="Technical skills")
    soft_skills: List[str] = Field(..., description="Soft skills")
    summary: str = Field(..., description="Professional summary")
    match_score: float = Field(..., ge=0, le=1, description="Job match score (0-1)")
    strengths: List[str] = Field(..., description="Candidate strengths for this role")
    improvement_areas: List[str] = Field(..., description="Areas needing improvement")

# --- Advanced MiniCPM-O 2.6 Integration with Language Control ---
class ProfessionalCVAnalyzer:
    """Analyse a CV image against a job description with MiniCPM-o 2.6.

    The analyzer sends the CV image together with a system prompt (rules,
    output language, job description and the JSON schema of
    ``CVAnalysisResult``) to the model, extracts the JSON object from the
    reply and validates it into a ``CVAnalysisResult``.  When the reply is not
    parseable or does not validate, the model is asked once to repair it.
    """

    SUPPORTED_LANGUAGES = ["en", "fr", "es", "de", "ar", "zh", "ja", "ru"]

    # Local checkpoint used when no ``model_id`` is passed explicitly.
    DEFAULT_MODEL_ID = "/teamspace/studios/this_studio/.cache/modelscope/hub/models/OpenBMB/MiniCPM-o-2_6"

    def __init__(self, model_id: Optional[str] = None):
        """Load the model and tokenizer.

        Args:
            model_id: Path or hub id of the checkpoint; defaults to
                ``DEFAULT_MODEL_ID``.
        """
        self.model, self.tokenizer = self.initialize_model(model_id)

    def initialize_model(self, model_id: Optional[str] = None):
        """Initialize MiniCPM-O 2.6 with multimodal capabilities.

        Args:
            model_id: Path or hub id of the checkpoint; defaults to
                ``DEFAULT_MODEL_ID``.

        Returns:
            A ``(model, tokenizer)`` tuple; the model is in eval mode on CUDA.

        NOTE(review): the run recorded in this notebook fails during
        construction with ``AttributeError: 'Resampler' object has no
        attribute '_initialize_weights'``.  Presumably the installed
        transformers version does not match the model's remote code - confirm
        against the version the model card pins.
        """
        if model_id is None:
            model_id = self.DEFAULT_MODEL_ID

        # trust_remote_code=True runs the modelling code shipped with the
        # checkpoint, so only point this at a checkpoint you trust.
        model = AutoModel.from_pretrained(
            model_id,
            trust_remote_code=True,
            attn_implementation='sdpa',  # sdpa or flash_attention_2
            torch_dtype=torch.bfloat16,
            init_vision=True,
            init_audio=True,
            init_tts=True
        )
        model = model.eval().cuda()

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True
        )

        return model, tokenizer

    @staticmethod
    def _schema_json() -> str:
        """Return the JSON schema of ``CVAnalysisResult`` as indented text."""
        # pydantic v2 exposes model_json_schema(); schema_json() is the v1 API
        # and is deprecated in v2, so prefer the new call when it exists.
        if hasattr(CVAnalysisResult, "model_json_schema"):
            return json.dumps(CVAnalysisResult.model_json_schema(), indent=2)
        return CVAnalysisResult.schema_json(indent=2)

    def create_prompt(self, job_description: str, language: str = "en") -> list:
        """Build the system message that drives the analysis.

        Args:
            job_description: Free-text job requirements embedded in the prompt.
            language: Output language code; must be in ``SUPPORTED_LANGUAGES``.

        Returns:
            A list holding a single ``{"role": "system", ...}`` message.

        Raises:
            ValueError: If ``language`` is not supported.
        """
        if language not in self.SUPPORTED_LANGUAGES:
            raise ValueError(f"Unsupported language. Choose from: {', '.join(self.SUPPORTED_LANGUAGES)}")

        schema = self._schema_json()

        # Language-specific instructions
        language_instructions = {
            "en": "Output language: English",
            "fr": "Langue de sortie: Français",
            "es": "Idioma de salida: Español",
            "de": "Ausgabesprache: Deutsch",
            "ar": "لغة الإخراج: العربية",
            "zh": "输出语言: 中文",
            "ja": "出力言語: 日本語",
            "ru": "Язык вывода: Русский"
        }

        return [
            {
                "role": "system",
                "content": (
                    "## EXPERT PROFILE ##\n"
                    "Senior CV Analyst | Fortune 500 Recruitment Expert | Multilingual Specialist\n\n"

                    "## CORE MISSION ##\n"
                    "Extract CV data and evaluate job fit with 99.8% accuracy\n\n"

                    "## OPERATIONAL RULES ##\n"
                    "1. OCR EXTRACTION: Digitize all CV elements with pixel-perfect accuracy\n"
                    "2. STRUCTURED OUTPUT: Generate VALID JSON matching schema exactly\n"
                    "3. LANGUAGE CONTROL: All text output must be in specified language\n"
                    "4. JOB MATCHING: Critical evaluation against requirements\n"
                    "5. DATA NORMALIZATION:\n"
                    "   - Dates: MM/YYYY\n"
                    "   - Skills: Infer proficiency from context\n"
                    "   - Scores: Objective 0-1 scale\n"
                    "6. OUTPUT FORMAT: JSON between ```json markers\n\n"

                    "## LANGUAGE DIRECTIVE ##\n"
                    f"{language_instructions[language]}\n\n"

                    "## JOB DESCRIPTION ##\n"
                    f"{job_description}\n\n"

                    "## OUTPUT SCHEMA ##\n"
                    f"{schema}\n\n"

                    "## EXECUTION PROTOCOL ##\n"
                    "1. Perform high-accuracy OCR\n"
                    "2. Extract structured data\n"
                    "3. Analyze job fit\n"
                    "4. Generate localized JSON output"
                )
            }
        ]

    def extract_json(self, response: str) -> dict:
        """Pull the first JSON object out of a model reply.

        Tries, in order: a fenced ```json block, any fenced block, and the
        widest ``{...}`` span in the text.  A candidate is accepted only when
        it parses to a JSON *object*; arrays and scalars are skipped because
        callers unpack the result as keyword arguments.

        Args:
            response: Raw text returned by the model.

        Returns:
            The parsed JSON object.

        Raises:
            json.JSONDecodeError: If no candidate parses and the model-based
                repair fallback cannot produce valid JSON either.
        """
        # (regex, group holding the JSON text)
        candidates = (
            (r'```json(.*?)```', 1),  # Explicit JSON marker
            (r'```(.*?)```', 1),      # Generic code block
            (r'\{.*\}', 0),           # Raw JSON object
        )

        for pattern, group in candidates:
            match = re.search(pattern, response, re.DOTALL)
            if match is None:
                continue
            json_str = match.group(group).strip()
            # A generic fence may still carry the "json" language tag.
            if json_str.startswith('json\n'):
                json_str = json_str[len('json\n'):]
            try:
                parsed = json.loads(json_str)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        # Final fallback: ask the model to repair its own output
        return self.ai_json_repair(response)

    def ai_json_repair(self, response: str) -> dict:
        """Ask the model to rewrite ``response`` as valid JSON and parse it.

        Args:
            response: Text that could not be parsed as JSON.

        Returns:
            The parsed JSON value of the repaired reply.

        Raises:
            json.JSONDecodeError: If the repaired reply is still not JSON.
        """
        # System instruction first, then the text to repair.
        repair_prompt = [
            {"role": "system", "content": "Transform this text into valid JSON matching the schema. Return ONLY valid JSON."},
            {"role": "user", "content": response},
        ]
        fixed = self.model.chat(
            msgs=repair_prompt,
            tokenizer=self.tokenizer,
            temperature=0.1
        )
        try:
            return json.loads(fixed)
        except json.JSONDecodeError:
            # Last resort: parse the span from the first '{' to the last '}'.
            # If that is not JSON either, the error propagates to the caller.
            start = fixed.find('{')
            end = fixed.rfind('}') + 1
            return json.loads(fixed[start:end])

    def analyze(self, cv_image_path: str, job_description: str, language: str = "en") -> "CVAnalysisResult":
        """Run the end-to-end multilingual CV analysis.

        Args:
            cv_image_path: Path of the CV image file.
            job_description: Free-text job requirements.
            language: Output language code; must be in ``SUPPORTED_LANGUAGES``.

        Returns:
            The validated analysis result.

        Raises:
            ValueError: If ``language`` is not supported (checked before the
                image is opened or the model is called).
        """
        # Validate language
        if language not in self.SUPPORTED_LANGUAGES:
            raise ValueError(f"Unsupported language '{language}'. Valid options: {', '.join(self.SUPPORTED_LANGUAGES)}")

        # Load CV image; convert() yields an independent copy, so the file
        # handle can be released immediately.
        with Image.open(cv_image_path) as source_image:
            cv_image = source_image.convert('RGB')

        # Prepare multimodal input with language control
        messages = self.create_prompt(job_description, language)
        messages.append({
            "role": "user",
            "content": [
                cv_image,
                "Generate professional CV analysis:"
            ]
        })

        # Call MiniCPM-O 2.6 with precision tuning
        response = self.model.chat(
            msgs=messages,
            tokenizer=self.tokenizer,
            sampling=True,
            temperature=0.3,  # Balance creativity and accuracy
            max_new_tokens=1800,
            top_p=0.95,
            repetition_penalty=1.1
        )

        # Extract first, then validate: only schema violations are routed to
        # the correction step, parsing failures propagate unchanged.
        json_data = self.extract_json(response)
        try:
            return CVAnalysisResult(**json_data)
        except ValidationError as e:
            return self.handle_validation_error(json_data, e)

    def handle_validation_error(self, data: dict, error: "ValidationError") -> "CVAnalysisResult":
        """Ask the model to correct JSON that failed schema validation.

        Args:
            data: The parsed JSON object that failed validation.
            error: The validation error raised for ``data``.

        Returns:
            The result built from the corrected JSON.  A second validation
            failure is not retried and propagates to the caller.
        """
        error_details = str(error)
        # ensure_ascii=False keeps non-ASCII text (e.g. Arabic or Chinese
        # output) readable in the prompt instead of \uXXXX escapes.
        invalid_json = json.dumps(data, indent=2, ensure_ascii=False)
        fix_prompt = [
            {"role": "system", "content": "Correct this JSON to strictly match the schema. Return ONLY valid JSON."},
            {"role": "user", "content": f"Invalid JSON:\n{invalid_json}\n\nErrors:\n{error_details}"},
        ]
        fixed = self.model.chat(
            msgs=fix_prompt,
            tokenizer=self.tokenizer,
            sampling=True,
            temperature=0.1
        )
        fixed_json = self.extract_json(fixed)
        return CVAnalysisResult(**fixed_json)


In [17]:
import torch
from PIL import Image
from modelscope import AutoModel, AutoTokenizer

# Quick smoke test of the checkpoint: load it, then ask one question about a
# single image.
# The directory below is the local ModelScope cache path of the downloaded
# checkpoint; running the model download command shows it.

my_model_dir = "/teamspace/studios/this_studio/.cache/modelscope/hub/models/OpenBMB/MiniCPM-o-2_6"

# Load with the vision, audio and TTS sub-modules enabled, in bfloat16.
# NOTE(review): the output recorded for this cell is
# "AttributeError: 'Resampler' object has no attribute '_initialize_weights'".
# Presumably it is raised while the model's remote code is being loaded,
# possibly because of a library version mismatch - confirm against the
# versions required by the model card.
model = AutoModel.from_pretrained(
    my_model_dir,
    trust_remote_code=True,
    attn_implementation='sdpa', # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=True,
    init_audio=True,
    init_tts=True
)
# Inference only: switch to eval mode and move the model to the GPU.
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(my_model_dir, trust_remote_code=True)

# In addition to vision-only mode, tts processor and vocos also needs to be initialized
model.init_tts()

# The image travels inside the message content, so `image=None` is passed to
# chat() below.
image = Image.open('1131w-uHRaEYx8dVI (1).webp').convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image, question]}]

res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print("the response of the model is : ",res)

AttributeError: 'Resampler' object has no attribute '_initialize_weights'

In [7]:

# --- Professional Usage Interface ---
if __name__ == "__main__":
    # Demo run: load the analyzer, ask for an output language, analyse one CV
    # image against a sample job description and print a short report.
    print("🚀 Launching Enterprise CV Analyzer v2.0")
    print("⚙️ Initializing MiniCPM-O 2.6 with multimodal capabilities...")
    analyzer = ProfessionalCVAnalyzer()
    
    job_desc = """
    Senior AI Engineer Requirements:
    - 5+ years ML production experience
    - Expertise in Python, PyTorch, TensorFlow
    - Cloud deployment (AWS/Azure/GCP)
    - PhD/MS in Computer Science
    - Publications in top AI conferences
    - Fluent English communication
    """
    
    print("\n🌍 Language Options:", ", ".join(analyzer.SUPPORTED_LANGUAGES))
    # Normalise the typed code so that "EN" or " en " is accepted; empty or
    # whitespace-only input falls back to English.
    target_language = input("Select output language (default: en): ").strip().lower() or "en"
    
    print(f"\n🔍 Analyzing CV in {target_language.upper()}...")
    result = analyzer.analyze(
        cv_image_path="1131w-uHRaEYx8dVI (1).webp",
        job_description=job_desc,
        language=target_language
    )
    
    # Professional result presentation
    print("\n✅ ANALYSIS COMPLETE")
    print(f"📌 Candidate: {result.personal_info.full_name}")
    print(f"⭐ Match Score: {result.match_score:.0%}")
    print(f"📧 Contact: {result.personal_info.email or 'Not provided'}")
    
    print("\n🎓 Education:")
    for edu in result.education:
        print(f"- {edu.degree.capitalize()} in {edu.field_of_study}, {edu.institution} ({edu.start_year}-{edu.end_year or 'Present'})")
    
    # Only the first five skills are listed to keep the report short.
    print("\n💻 Technical Skills:")
    for skill in result.technical_skills[:5]:
        print(f"- {skill.name} ({skill.proficiency})")
    
    # Only the first three strengths are listed.
    print("\n🌟 Key Strengths:")
    for strength in result.strengths[:3]:
        print(f"- {strength}")
    
    print("\n📈 Improvement Areas:")
    for area in result.improvement_areas:
        print(f"- {area}")
    
    print("\n💼 Professional Summary:")
    print(result.summary)

🚀 Launching Enterprise CV Analyzer v2.0
⚙️ Initializing MiniCPM-O 2.6 with multimodal capabilities...


AttributeError: 'Resampler' object has no attribute '_initialize_weights'