# In [None]:
import os
import json
import sqlite3
from pathlib import Path
from dotenv import load_dotenv
import google.generativeai as genai
import PyPDF2

# Load environment variables from the training workspace's .env file.
env_path = "/home/labuser/VSCODE_training/.env"
load_dotenv(env_path)

# The Gemini client cannot be configured without a key, so stop right here
# when the environment does not provide one.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    print("Error: GEMINI_API_KEY not found in .env file")
    # `exit` is a convenience object injected by the `site` module for
    # interactive sessions and is not guaranteed to exist; raising SystemExit
    # terminates with the same status code in every execution context.
    raise SystemExit(1)

genai.configure(api_key=api_key)

def _build_cv_schema():
    """Assemble the JSON-schema style declaration of ``extract_cv_data``."""

    def strings(*names):
        # A fresh {"type": "string"} dict per field, so no two entries alias.
        return {field: {"type": "string"} for field in names}

    def record(fields, required=None):
        # Object node; the "required" key is only present when one is given.
        node = {"type": "object", "properties": fields}
        if required is not None:
            node["required"] = required
        return node

    def listing(element):
        # Array node whose elements follow the `element` schema.
        return {"type": "array", "items": element}

    properties = {
        "personal_info": record(
            strings("name", "email", "phone", "location", "linkedin", "github"),
            ["name"],
        ),
        "professional_summary": {"type": "string"},
        "experience": listing(
            record(
                strings(
                    "job_title",
                    "company",
                    "location",
                    "start_date",
                    "end_date",
                    "description",
                ),
                ["job_title", "company"],
            )
        ),
        "education": listing(
            record(
                strings("degree", "field", "institution", "graduation_year", "gpa"),
                ["degree", "institution"],
            )
        ),
        "skills": listing({"type": "string"}),
        "certifications": listing(record(strings("name", "issuer", "date"))),
        "languages": listing({"type": "string"}),
    }

    return {
        "name": "extract_cv_data",
        "description": "Extract structured data from a CV/Resume",
        "parameters": record(
            properties,
            ["personal_info", "experience", "education", "skills"],
        ),
    }


# Schema of the CV extraction function, in function-calling declaration form.
CV_SCHEMA = _build_cv_schema()

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Pages are appended in reader order with no separator. Any failure while
    opening or reading the file is printed and results in an empty string
    rather than an exception.
    """
    collected = ""
    try:
        with open(pdf_path, 'rb') as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                collected = collected + page.extract_text()
    except Exception as e:
        # Partial text is discarded: a failed read yields "" as a whole.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
    return collected

def parse_cv_with_function_calling(cv_text: str) -> dict:
    """Parse CV text into a structured dict via Gemini function calling.

    A single ``extract_cv_data`` function is declared to the model and the
    arguments of the first function call found in the response are returned.
    The ``required`` lists of the declaration mirror the module-level
    ``CV_SCHEMA`` constant.

    Args:
        cv_text: Plain text of the CV; it is embedded verbatim in the prompt.

    Returns:
        The function-call arguments as a dict, or ``{}`` when the response
        contains no function call or the response cannot be processed.
        Errors raised by the ``generate_content`` request itself are not
        caught here and propagate to the caller.
    """
    model = genai.GenerativeModel("gemini-2.0-flash-lite")

    # NOTE(review): confirm that `genai.types` in the installed SDK version
    # exposes Tool, FunctionDeclaration, Schema and Type as used below.
    types = genai.types

    def string_fields(*names):
        # One STRING schema per field name, in declaration order.
        return {name: types.Schema(type=types.Type.STRING) for name in names}

    def object_schema(fields, required=None):
        # `required` is only passed when there is something to require, so
        # objects made solely of optional fields carry no such attribute.
        extra = {"required": required} if required else {}
        return types.Schema(type=types.Type.OBJECT, properties=fields, **extra)

    def array_schema(items):
        return types.Schema(type=types.Type.ARRAY, items=items)

    parameters = object_schema(
        {
            "personal_info": object_schema(
                string_fields(
                    "name", "email", "phone", "location", "linkedin", "github"
                ),
                required=["name"],
            ),
            "professional_summary": types.Schema(type=types.Type.STRING),
            "experience": array_schema(
                object_schema(
                    string_fields(
                        "job_title",
                        "company",
                        "location",
                        "start_date",
                        "end_date",
                        "description",
                    ),
                    required=["job_title", "company"],
                )
            ),
            "education": array_schema(
                object_schema(
                    string_fields(
                        "degree", "field", "institution", "graduation_year", "gpa"
                    ),
                    required=["degree", "institution"],
                )
            ),
            "skills": array_schema(types.Schema(type=types.Type.STRING)),
            "certifications": array_schema(
                object_schema(string_fields("name", "issuer", "date"))
            ),
            "languages": array_schema(types.Schema(type=types.Type.STRING)),
        },
        required=["personal_info", "experience", "education", "skills"],
    )

    tools = [
        types.Tool(
            function_declarations=[
                types.FunctionDeclaration(
                    name="extract_cv_data",
                    description="Extract structured data from CV/Resume",
                    parameters=parameters,
                )
            ]
        )
    ]

    prompt = f"""Extract all relevant information from this CV and structure it according to the schema.
    
CV Content:
{cv_text}

Please extract all the information and provide it in the structured format."""

    response = model.generate_content(prompt, tools=tools)

    # Only the first candidate is inspected; the first part carrying a
    # function call wins.
    try:
        parts = response.candidates[0].content.parts
        for part in parts or ():
            # Parts without a function call (e.g. plain text) are skipped.
            function_call = getattr(part, 'function_call', None)
            if function_call:
                cv_data = type(function_call).to_dict(function_call)
                return cv_data.get('args', {})
    except Exception as e:
        print(f"Error processing function call: {e}")

    return {}

def create_database(db_path: str = "cv_database.db"):
    """Create the SQLite tables that hold parsed CV data, if they are missing.

    One ``resumes`` row represents a CV; the ``experience``, ``education``,
    ``skills``, ``certifications`` and ``languages`` tables each reference it
    through ``resume_id``. Every statement uses ``IF NOT EXISTS``, so calling
    this on an existing database leaves its tables untouched.

    Args:
        db_path: Path of the SQLite database file to create or open.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    table_statements = (
        '''
        CREATE TABLE IF NOT EXISTS resumes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            email TEXT,
            phone TEXT,
            location TEXT,
            linkedin TEXT,
            github TEXT,
            professional_summary TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS experience (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            resume_id INTEGER NOT NULL,
            job_title TEXT,
            company TEXT,
            location TEXT,
            start_date TEXT,
            end_date TEXT,
            description TEXT,
            FOREIGN KEY (resume_id) REFERENCES resumes(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS education (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            resume_id INTEGER NOT NULL,
            degree TEXT,
            field TEXT,
            institution TEXT,
            graduation_year TEXT,
            gpa TEXT,
            FOREIGN KEY (resume_id) REFERENCES resumes(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS skills (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            resume_id INTEGER NOT NULL,
            skill TEXT,
            FOREIGN KEY (resume_id) REFERENCES resumes(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS certifications (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            resume_id INTEGER NOT NULL,
            name TEXT,
            issuer TEXT,
            date TEXT,
            FOREIGN KEY (resume_id) REFERENCES resumes(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS languages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            resume_id INTEGER NOT NULL,
            language TEXT,
            FOREIGN KEY (resume_id) REFERENCES resumes(id)
        )
        ''',
    )

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for statement in table_statements:
            cursor.execute(statement)
        conn.commit()
    finally:
        # Release the database handle even when a statement fails.
        conn.close()

def insert_cv_data_to_database(cv_data: dict, db_path: str = "cv_database.db"):
    """Insert one parsed CV into the SQLite database as a single transaction.

    A row is written to ``resumes`` first; its id is then used as
    ``resume_id`` for the rows written to ``experience``, ``education``,
    ``skills``, ``certifications`` and ``languages``. Sections that are
    missing, ``None`` or empty contribute no rows.

    Args:
        cv_data: Parsed CV with the keys ``personal_info``,
            ``professional_summary``, ``experience``, ``education``,
            ``skills``, ``certifications`` and ``languages``.
        db_path: Path of the SQLite database file; its tables must exist.

    Returns:
        The id of the new ``resumes`` row, or ``None`` when anything fails
        (the error is printed and the transaction is rolled back).
    """
    conn = sqlite3.connect(db_path)

    def section(key):
        # A section may be absent or explicitly None; both mean "no entries".
        return cv_data.get(key) or []

    try:
        cursor = conn.cursor()

        personal_info = cv_data.get('personal_info') or {}
        cursor.execute('''
            INSERT INTO resumes (name, email, phone, location, linkedin, github, professional_summary)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', (
            personal_info.get('name'),
            personal_info.get('email'),
            personal_info.get('phone'),
            personal_info.get('location'),
            personal_info.get('linkedin'),
            personal_info.get('github'),
            cv_data.get('professional_summary')
        ))

        # Capture the id now: later statements overwrite cursor.lastrowid.
        resume_id = cursor.lastrowid

        cursor.executemany('''
            INSERT INTO experience (resume_id, job_title, company, location, start_date, end_date, description)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', [
            (
                resume_id,
                exp.get('job_title'),
                exp.get('company'),
                exp.get('location'),
                exp.get('start_date'),
                exp.get('end_date'),
                exp.get('description'),
            )
            for exp in section('experience')
        ])

        cursor.executemany('''
            INSERT INTO education (resume_id, degree, field, institution, graduation_year, gpa)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', [
            (
                resume_id,
                edu.get('degree'),
                edu.get('field'),
                edu.get('institution'),
                edu.get('graduation_year'),
                edu.get('gpa'),
            )
            for edu in section('education')
        ])

        cursor.executemany('''
            INSERT INTO skills (resume_id, skill)
            VALUES (?, ?)
        ''', [(resume_id, skill) for skill in section('skills')])

        cursor.executemany('''
            INSERT INTO certifications (resume_id, name, issuer, date)
            VALUES (?, ?, ?, ?)
        ''', [
            (resume_id, cert.get('name'), cert.get('issuer'), cert.get('date'))
            for cert in section('certifications')
        ])

        cursor.executemany('''
            INSERT INTO languages (resume_id, language)
            VALUES (?, ?)
        ''', [(resume_id, lang) for lang in section('languages')])

        conn.commit()
        return resume_id

    except Exception as e:
        # Boundary handler: report, undo the partial CV, signal failure.
        print(f"Error inserting data: {e}")
        conn.rollback()
        return None
    finally:
        conn.close()

def process_cv_folder(cv_folder: str = "cv_folder", db_path: str = "cv_database.db"):
    """Parse every ``*.pdf`` in *cv_folder* and store the results.

    For each PDF, in sorted path order: extract its text, parse it with
    ``parse_cv_with_function_calling``, write the parsed data to
    ``<pdf stem>_parsed.json`` in the current working directory, and insert
    it into the database. A file that fails at any step is reported and
    skipped; the remaining files are still processed.

    Args:
        cv_folder: Directory that contains the CV PDFs.
        db_path: Path of the SQLite database file; tables are created on
            demand before any file is processed.

    Returns:
        None. Progress and failures are printed.
    """
    cv_folder_path = Path(cv_folder)

    if not cv_folder_path.exists():
        print(f"CV folder not found: {cv_folder}")
        return

    # Create database
    create_database(db_path)

    # Sorted so the processing order, and with it the assigned resume ids,
    # does not depend on filesystem enumeration order.
    pdf_files = sorted(cv_folder_path.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {cv_folder}")
        return

    print(f"Found {len(pdf_files)} CV files. Processing...\n")

    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file.name}")

        # Extract text from PDF
        cv_text = extract_text_from_pdf(str(pdf_file))

        if not cv_text:
            print("  ❌ Failed to extract text\n")
            continue

        # Parse CV using function calling. The model request can raise
        # (network, quota, invalid request); one failing file must not
        # abort the rest of the batch.
        try:
            cv_data = parse_cv_with_function_calling(cv_text)
        except Exception as e:
            print(f"  ❌ Failed to parse CV: {e}\n")
            continue

        if not cv_data or 'personal_info' not in cv_data:
            print("  ❌ Failed to parse CV\n")
            continue

        # Save JSON (written even if the database insert below fails)
        json_filename = pdf_file.stem + "_parsed.json"
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(cv_data, f, indent=2, ensure_ascii=False)

        # Insert into database
        resume_id = insert_cv_data_to_database(cv_data, db_path)

        if resume_id:
            print(f"  ✓ Successfully parsed and stored (ID: {resume_id})")
            print(f"  ✓ JSON saved: {json_filename}")
        else:
            print("  ❌ Failed to store in database")

        print()

if __name__ == "__main__":
    # Script entry point: run the batch over the default folder and database
    # file, then announce completion.
    process_cv_folder("cv_folder", "cv_database.db")
    print("CV processing completed!")

# Cell output: SyntaxError: incomplete input (3036922849.py, line 159)

# In [2]:
def parse_cv_with_function_calling(cv_text: str) -> dict:
    """Extract structured CV data from *cv_text* through a Gemini tool call.

    The model is offered one function, ``extract_cv_data``, whose parameter
    schema describes the CV fields. The arguments of the first function call
    in the first response candidate are returned as a dict; ``{}`` is returned
    when no function call is present or the response cannot be processed.
    """
    model = genai.GenerativeModel("gemini-2.0-flash-lite")

    # Bind the SDK constructors once; the declaration below is built from them.
    tool_cls = genai.types.Tool
    declaration_cls = genai.types.FunctionDeclaration
    schema_cls = genai.types.Schema
    kind = genai.types.Type

    def texts(*labels):
        # Mapping of field name -> STRING schema, in the order given.
        return {label: schema_cls(type=kind.STRING) for label in labels}

    def obj(props, **extra):
        # OBJECT schema; `extra` carries `required` only where it is declared.
        return schema_cls(type=kind.OBJECT, properties=props, **extra)

    def seq(element):
        # ARRAY schema with the given element schema.
        return schema_cls(type=kind.ARRAY, items=element)

    cv_fields = {
        "personal_info": obj(
            texts("name", "email", "phone", "location", "linkedin", "github"),
            required=["name"],
        ),
        "professional_summary": schema_cls(type=kind.STRING),
        "experience": seq(
            obj(
                texts(
                    "job_title",
                    "company",
                    "location",
                    "start_date",
                    "end_date",
                    "description",
                ),
                required=["job_title", "company"],
            )
        ),
        "education": seq(
            obj(
                texts("degree", "field", "institution", "graduation_year", "gpa"),
                required=["degree", "institution"],
            )
        ),
        "skills": seq(schema_cls(type=kind.STRING)),
        "certifications": seq(obj(texts("name", "issuer", "date"))),
        "languages": seq(schema_cls(type=kind.STRING)),
    }

    declaration = declaration_cls(
        name="extract_cv_data",
        description="Extract structured data from CV/Resume",
        parameters=obj(
            cv_fields,
            required=["personal_info", "experience", "education", "skills"],
        ),
    )
    tools = [tool_cls(function_declarations=[declaration])]

    prompt = f"""Extract all relevant information from this CV and structure it according to the schema.
    
CV Content:
{cv_text}

Please extract all the information and provide it in the structured format."""

    response = model.generate_content(prompt, tools=tools)

    # Walk the parts of the first candidate and return the first function
    # call's arguments; anything that goes wrong here is reported, not raised.
    try:
        parts = response.candidates[0].content.parts
        if parts:
            for part in parts:
                call = getattr(part, 'function_call', None)
                if not call:
                    continue
                return type(call).to_dict(call).get('args', {})
    except Exception as e:
        print(f"Error processing function call: {e}")

    return {}