# In [None]:
import os
import json
import sys
import time
import subprocess
from collections import defaultdict

# --- 0. Pre-flight Checks and Installations ---
def install_if_missing(package, import_name=None):
    """Install *package* with pip unless its module can already be imported.

    Args:
        package: Distribution name handed to ``pip install``.
        import_name: Dotted module name used for the import probe. When
            omitted it is derived from ``package`` by replacing hyphens with
            underscores, because a hyphenated distribution name (for example
            ``sentence-transformers``) is never a valid module name and would
            otherwise trigger a reinstall on every run.

    Raises:
        subprocess.CalledProcessError: If the pip subprocess exits with a
            non-zero status (re-raised after printing a hint).
    """
    import importlib

    if not import_name:
        import_name = package.replace("-", "_")
    try:
        importlib.import_module(import_name)
    except ImportError:
        print(f"Installing {package}...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        except subprocess.CalledProcessError as e:
            print(f"üõë Failed to install {package}. Please install it manually. Error: {e}")
            raise
        # Let the import system notice files created after interpreter start-up.
        importlib.invalidate_caches()
        print(f"‚úÖ {package} installed successfully.")

# Install all required dependencies quietly.
# The second argument is the importable module name; it is given whenever it
# differs from the pip distribution name, so the "already installed?" probe
# can succeed and pip is not re-run on every execution.
install_if_missing("google-generativeai", "google.generativeai")
install_if_missing("sentence-transformers", "sentence_transformers")
install_if_missing("scikit-learn", "sklearn")
install_if_missing("tqdm")

import numpy as np
import google.generativeai as genai
from google.api_core import exceptions
from sentence_transformers import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive, userdata
from tqdm import tqdm

# --- 1. Core Processing Functions (from previous step) ---

def setup_environment(gemini_api_key: str, esco_hierarchy_path: str):
    """
    Loads all models, API clients, and reference data.
    This should be run only once per session.

    Args:
        gemini_api_key: API key passed to ``genai.configure``.
        esco_hierarchy_path: Path to a JSON file holding a list of skill
            records; each record is read for its 'id', 'level' and
            'levelName' keys.

    Returns:
        A dict with the keys 'gemini_pos_model', 'ranking_model',
        'skill_map', 'l3_skill_list', 'l3_id_list', 'all_l1_ids' and
        'l3_skill_embeddings', or None when any setup step fails (the
        failure is printed rather than raised).
    """
    print("--- üöÄ Starting Environment Setup ---")
    assets = {}

    # 1.1 Configure Gemini API
    print("1. Configuring Gemini API...")
    try:
        genai.configure(api_key=gemini_api_key)
        assets['gemini_pos_model'] = genai.GenerativeModel('models/gemini-2.5-flash')
        print("   ‚úÖ Gemini API configured successfully.")
    except Exception as e:
        print(f"   üõë ERROR: Could not configure Gemini API: {e}")
        return None

    # 1.2 Load Ranking Model
    print("2. Loading CrossEncoder ranking model...")
    try:
        assets['ranking_model'] = CrossEncoder("vaibhav-ceew/onet-msmacroL6")
        print("   ‚úÖ Ranking model loaded.")
    except Exception as e:
        print(f"   üõë ERROR: Could not load ranking model: {e}")
        return None

    # 1.3 Load and Process ESCO Skill Hierarchy
    print("3. Loading and processing ESCO skill hierarchy...")
    try:
        with open(esco_hierarchy_path, 'r', encoding='utf-8') as f:
            esco_skills = json.load(f)

        assets['skill_map'] = {skill['id']: skill for skill in esco_skills}
        l3_skills = [s for s in esco_skills if s['level'] == 3]
        assets['l3_skill_list'] = [s['levelName'] for s in l3_skills]
        assets['l3_id_list'] = [s['id'] for s in l3_skills]
        assets['all_l1_ids'] = sorted([s['id'] for s in esco_skills if s['level'] == 1])
        print(f"   ‚úÖ Loaded {len(esco_skills)} skills. Found {len(assets['l3_skill_list'])} L3 skills.")

    except Exception as e:
        print(f"   üõë ERROR: Could not process ESCO hierarchy file '{esco_hierarchy_path}': {e}")
        return None

    l3_list = assets['l3_skill_list']
    if not l3_list:
        # Without L3 skills the embedding matrix would be empty and every
        # later similarity lookup would fail, so stop here with a clear error.
        print(f"   üõë ERROR: No level-3 skills found in '{esco_hierarchy_path}'; nothing to embed.")
        return None

    # 1.4 Generate Embeddings for all L3 skills
    print("4. Generating embeddings for all L3 skills (this may take a while)...")
    l3_skill_embeddings = []
    batch_size = 90

    try:
        with tqdm(total=len(l3_list), desc="Embedding L3 Skills") as pbar:
            for i in range(0, len(l3_list), batch_size):
                batch = l3_list[i:i + batch_size]
                time.sleep(1) # Respect API rate limits
                result = genai.embed_content(
                    model="models/embedding-001",
                    content=batch,
                    task_type="RETRIEVAL_DOCUMENT"
                )
                l3_skill_embeddings.extend(result['embedding'])
                pbar.update(len(batch))
    except Exception as e:
        print(f"   üõë An API error occurred during embedding generation: {e}")
        return None

    if len(l3_skill_embeddings) != len(l3_list):
        # Row i of the matrix must describe l3_list[i]; a count mismatch would
        # silently map tasks to the wrong skill (or index out of range).
        print(f"   üõë ERROR: Expected {len(l3_list)} embeddings but received {len(l3_skill_embeddings)}.")
        return None

    assets['l3_skill_embeddings'] = np.array(l3_skill_embeddings)
    print(f"   ‚úÖ Generated {len(l3_skill_embeddings)} embeddings.")

    print("--- ‚úÖ Environment Setup Complete ---")
    return assets

def _get_pos_from_task(task_text, model, *, max_retries=3, retry_delay=2, request_delay=0.5):
    """Calls Gemini to transform a task into a structured statement.

    Args:
        task_text: The raw task sentence to rewrite.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.
        request_delay: Seconds to pause before each request to avoid hitting
            rate limits too fast.

    Returns:
        The stripped model reply, or the sentinel string
        "Error: Transformation Failed" when every attempt raised or produced
        an empty reply.
    """
    prompt = f"""Transform the sentence "{task_text}" into a simple Root verb + Object + Context/Condition structure, eliminating any unnecessary information. If more than two verbs, keep the most important one. Don't use short forms by your own, be precise, specific, and concise."""
    delay = retry_delay
    for attempt in range(max_retries):
        try:
            time.sleep(request_delay)
            statement = model.generate_content(prompt).text.strip()
        except Exception:
            # Best-effort: any API or response-parsing failure simply counts
            # as a failed attempt and is retried below.
            statement = ""
        if statement:
            return statement
        if attempt < max_retries - 1:
            time.sleep(delay)
            delay *= 2
    return "Error: Transformation Failed"

def _match_task_to_skill(pos_text, assets):
    """Return ``(l1_id, l2_id, l3_entry)`` for the L3 skill most similar to *pos_text*."""
    embedding_result = genai.embed_content(
        model="models/embedding-001",
        content=pos_text,
        task_type="RETRIEVAL_QUERY"
    )
    similarities = cosine_similarity([embedding_result['embedding']], assets['l3_skill_embeddings'])[0]
    best_idx = np.argmax(similarities)
    l3_id = assets['l3_id_list'][best_idx]
    l3_entry = [
        {"id": l3_id, "l3": assets['l3_skill_list'][best_idx]},
        {"cos_sim": round(float(similarities[best_idx]), 4)},
    ]

    # Walk L3 -> L2 -> L1 through 'parentId'; both ancestors are reported only
    # when the whole chain resolves.
    l1_id = l2_id = None
    skill_map = assets['skill_map']
    l3_node = skill_map.get(l3_id)
    if l3_node and l3_node.get('parentId'):
        parent_id = l3_node['parentId']
        l2_node = skill_map.get(parent_id)
        if l2_node and l2_node.get('parentId'):
            l2_id, l1_id = parent_id, l2_node['parentId']
    return l1_id, l2_id, l3_entry


def _build_skill_hierarchy(processed_tasks, all_l1_ids):
    """Group fully mapped tasks by L1/L2/L3 id and sum their weights per level."""
    grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for task in processed_tasks:
        if task.get('l1_id') and task.get('l2_id') and task.get('l3'):
            grouped[task['l1_id']][task['l2_id']][task['l3'][0]['id']].append(
                {"task": task['task'], "norm_w": task['norm_w']}
            )

    l1_entries = []
    # Every known L1 id gets an entry, even when no task landed under it.
    for l1_id in all_l1_ids:
        l1_children, l1_total_weight = [], 0
        for l2_id, l3_groups in grouped.get(l1_id, {}).items():
            l2_children, l2_total_weight = [], 0
            for l3_id, l3_tasks in l3_groups.items():
                l3_total_weight = sum(t['norm_w'] for t in l3_tasks)
                l2_total_weight += l3_total_weight
                l2_children.append({"l3id": l3_id, "norm_w_sum_of_all_tasks_under_this_l3": l3_total_weight, "tasks": l3_tasks})
            l1_total_weight += l2_total_weight
            l1_children.append({"l2id": l2_id, "norm_w_sum_of_all_tasks_under_this_l2": l2_total_weight, "children": l2_children})
        l1_entries.append({"l1id": l1_id, "norm_w_sum_of_all_tasks_under_this_l1": l1_total_weight, "children": l1_children})

    return sorted(l1_entries, key=lambda x: x['norm_w_sum_of_all_tasks_under_this_l1'], reverse=True)


def process_job_profile(job_data: dict, assets: dict):
    """
    Processes a single job JSON object through the entire pipeline.

    Steps: score every task against the job title/description with the
    ranking model, keep the 40 best and normalise their scores into weights,
    rewrite each kept task with Gemini, match it to the closest L3 skill by
    cosine similarity, and aggregate the weights into an L1/L2/L3 hierarchy.

    Args:
        job_data: Job record; the 'tasks', 'JOB TITLE' and 'JOB_DESCRIPTION'
            keys are read. The input is not modified.
        assets: Dict providing 'ranking_model', 'gemini_pos_model',
            'skill_map', 'l3_id_list', 'l3_skill_list',
            'l3_skill_embeddings' and 'all_l1_ids'.

    Returns:
        A copy of ``job_data`` whose 'tasks' is replaced by the processed task
        list and which gains a 'skill_hierarchy' list sorted by descending L1
        weight. Every processed task carries 'l1_id', 'l2_id' and 'l3'
        (None/None/[] when no mapping was possible).
    """
    # JSON round-trip gives an independent deep copy of the input record.
    processed_data = json.loads(json.dumps(job_data))
    original_tasks = processed_data.get("tasks", [])

    if not original_tasks or not isinstance(original_tasks, list):
        processed_data['tasks'] = []
        processed_data['skill_hierarchy'] = []
        return processed_data

    # Part 1: Task Re-ranking and Weighting
    job_context = f"{processed_data.get('JOB TITLE', '')} {processed_data.get('JOB_DESCRIPTION', '')}"
    model_inputs = [(job_context, task) for task in original_tasks]
    scores = assets['ranking_model'].predict(model_inputs, show_progress_bar=False)
    top_tasks = sorted(zip(original_tasks, scores), key=lambda x: x[1], reverse=True)[:40]
    total_score = sum([float(score) for _, score in top_tasks])
    processed_tasks = []
    if total_score > 0:
        processed_tasks = [
            {"task": task, "norm_w": float(score) / total_score}
            for task, score in top_tasks
        ]

    # Part 2, 3, 4: POS, Skill Matching, and Hierarchy Mapping
    for task_obj in processed_tasks:
        pos_text = _get_pos_from_task(task_obj['task'], assets['gemini_pos_model'])
        task_obj['pos'] = pos_text
        match = (None, None, [])
        # Only the "Error:" prefix marks a failed transformation; a substring
        # test would also reject valid statements that mention the word "Error".
        if not pos_text.startswith("Error:"):
            try:
                match = _match_task_to_skill(pos_text, assets)
            except Exception:
                # Best-effort: a failed embedding/lookup leaves the task unmapped.
                match = (None, None, [])
        # Always write all three keys so every task has the same schema.
        task_obj['l1_id'], task_obj['l2_id'], task_obj['l3'] = match

    processed_data['tasks'] = processed_tasks

    # Part 5: Create Skill Hierarchy
    processed_data['skill_hierarchy'] = _build_skill_hierarchy(processed_tasks, assets['all_l1_ids'])
    return processed_data

# --- 2. Main Execution Block for Batch Processing ---

def main():
    """Main function to run the batch processing job.

    Reads the Gemini key from the Colab secret 'GEMINI_KEY', mounts Google
    Drive, determines which input JSON files have no output file yet, runs the
    one-time environment setup, and processes each remaining file with
    ``process_job_profile``. Per-file failures are reported and skipped; they
    do not abort the batch. Returns None in every case.
    """

    # --- Configuration ---
    try:
        GEMINI_API_KEY = userdata.get('GEMINI_KEY')
    except Exception as e:
        print(f"üõë ERROR: Could not retrieve 'GEMINI_KEY' from Colab secrets: {e}")
        print("   Please follow the instructions to add the key and restart the runtime.")
        return

    ESCO_FILE = "esco_skill_hierarchy.json"

    # --- Mount Google Drive ---
    print("\n--- Mounting Google Drive ---")
    drive.mount('/content/drive')
    print("‚úÖ Google Drive mounted successfully.")

    # Define Drive paths
    base_drive_path = "/content/drive/My Drive"
    input_folder_path = os.path.join(base_drive_path, "Processed_JSON_Files_with_Tasks_NQR_new_25sep")
    output_folder_path = os.path.join(base_drive_path, "dataset_correct_2_adb")

    # Create output directory if it doesn't exist
    os.makedirs(output_folder_path, exist_ok=True)
    print(f"Input folder:  {input_folder_path}")
    print(f"Output folder: {output_folder_path}")

    # Check for ESCO file
    if not os.path.exists(ESCO_FILE):
        print(f"üõë ERROR: The ESCO hierarchy file '{ESCO_FILE}' was not found in the Colab root directory.")
        print("   Please upload the file and try again.")
        return

    # --- Identify Files to Process (and skip processed ones) ---
    # Done before the expensive setup so that a missing input folder or an
    # empty work list costs no model downloads or embedding API calls.
    print("\n--- Identifying files to process ---")
    try:
        all_input_files = [f for f in os.listdir(input_folder_path) if f.endswith(".json")]
        # Only finished ".json" outputs count as processed; leftover temporary
        # files from an interrupted run must not be mistaken for results.
        processed_files = {f for f in os.listdir(output_folder_path) if f.endswith(".json")}

        files_to_process = [f for f in all_input_files if f not in processed_files]

        print(f"Found {len(all_input_files)} total files in source folder.")
        print(f"Found {len(processed_files)} already processed files to skip.")
        print(f"‚û°Ô∏è  Processing {len(files_to_process)} new files.")

    except FileNotFoundError:
        print(f"üõë ERROR: Input directory not found at '{input_folder_path}'. Please check the path.")
        return

    if not files_to_process:
        print("\nüéâ All files are already processed. Nothing to do!")
        return

    # --- ONE-TIME SETUP ---
    processing_assets = setup_environment(
        gemini_api_key=GEMINI_API_KEY,
        esco_hierarchy_path=ESCO_FILE
    )
    if not processing_assets:
        print("üõë Halting execution due to setup failure.")
        return

    # --- Main Processing Loop ---
    print(f"\n--- Starting processing loop for {len(files_to_process)} files ---")

    failed_files = []
    for filename in tqdm(files_to_process, desc="Overall Progress"):
        input_filepath = os.path.join(input_folder_path, filename)
        output_filepath = os.path.join(output_folder_path, filename)
        temp_filepath = output_filepath + ".tmp"

        try:
            # 1. Read the input JSON file
            with open(input_filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # 2. Call the main processing function
            final_result = process_job_profile(
                job_data=data,
                assets=processing_assets
            )

            # 3. Save the output JSON file.
            # Serialise first and publish via rename: a crash or disconnect
            # mid-write must not leave a truncated file under the final name,
            # because the next run would skip it as already processed.
            payload = json.dumps(final_result, indent=2, ensure_ascii=False)
            with open(temp_filepath, 'w', encoding='utf-8') as f:
                f.write(payload)
            os.replace(temp_filepath, output_filepath)

        except json.JSONDecodeError:
            failed_files.append(filename)
            print(f"   - ‚ö†Ô∏è Skipping {filename} due to JSON decoding error.")
        except Exception as e:
            failed_files.append(filename)
            print(f"   - üõë An unexpected error occurred with {filename}: {e}. Skipping file.")

    if failed_files:
        print(f"\n\n--- ‚ö†Ô∏è Finished: {len(failed_files)} of {len(files_to_process)} files were skipped due to errors. ---")
    else:
        print("\n\n--- üéâüéâüéâ All new files processed successfully! üéâüéâüéâ ---")
    print(f"Results are saved in: {output_folder_path}")

# Script entry point: start the batch job only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
