In [1]:
import google.generativeai as genai
import pandas as pd
import yaml
import os
import json
import time
import dotenv
from pathlib import Path
from tqdm.notebook import tqdm # Usar la versión de tqdm para notebooks

# --- API KEY CONFIGURATION ---

try:
    # Load variables from a local .env file (if any) into the process environment.
    dotenv.load_dotenv()
    API_KEY = os.getenv("GEMINI_API_KEY")

    # An unset or empty key must be reported as an error. The previous extra check
    # ('API_KEY' not in locals()) could never be true once API_KEY had been assigned
    # above, so a missing key was silently passed on to genai.configure().
    if not API_KEY:
        raise ValueError("API Key no encontrada. Configúrala como variable de entorno o directamente en la celda.")

    genai.configure(api_key=API_KEY)
    print("API de Gemini configurada correctamente.")
except Exception as e:
    # Best-effort reporting: the cell prints the problem instead of raising.
    print(f"Error de configuración: {e}")

API de Gemini configurada correctamente.


In [3]:
# Project root, taken as the parent of the kernel's current working directory.
# NOTE(review): this assumes the notebook is launched from a first-level subfolder
# of the repository — confirm, since a different working directory changes every path below.
project_root = Path.cwd().parent 

def load_yaml_config(path):
    """Parse the UTF-8 YAML file at ``path`` and return the loaded object.

    Uses ``yaml.safe_load``, so only plain YAML data types are constructed.
    """
    with open(path, encoding='utf-8') as config_file:
        parsed_config = yaml.safe_load(config_file)
    return parsed_config

def load_and_format_prompt(template_path, cv_text, offer_text):
    """Read a prompt template from disk and fill in the offer and CV texts.

    The template is read as UTF-8 and rendered with ``str.format``; it must use the
    placeholders ``{text_of_job_posting}`` and ``{text_of_cv}``. Any other literal
    brace in the template has to be doubled, as ``str.format`` requires.

    Args:
        template_path: Path of the template file.
        cv_text: Text substituted for ``{text_of_cv}``.
        offer_text: Text substituted for ``{text_of_job_posting}``.

    Returns:
        The rendered prompt string.
    """
    with open(template_path, encoding='utf-8') as template_file:
        raw_template = template_file.read()

    substitutions = {
        'text_of_job_posting': offer_text,
        'text_of_cv': cv_text,
    }
    return raw_template.format(**substitutions)

# --- SELECT THE EXPERIMENT TO RUN ---
PROMPTS_CONFIG_FILE = project_root / 'config' / 'experiment_prompts.yaml'
prompt_name_to_test = "P-07_zero_shot_gemini_friendly"

# An empty YAML file loads as None; normalise to a dict so the lookup below is well defined.
prompts_config = load_yaml_config(PROMPTS_CONFIG_FILE) or {}

# Fail fast with the list of valid names instead of a bare KeyError.
if prompt_name_to_test not in prompts_config:
    raise KeyError(
        f"Experimento '{prompt_name_to_test}' no encontrado en '{PROMPTS_CONFIG_FILE}'. "
        f"Disponibles: {list(prompts_config)}"
    )

chosen_prompt_config = prompts_config[prompt_name_to_test]
# The 'path' entry of the experiment is joined onto the project root.
prompt_template_path = project_root / chosen_prompt_config['path']

# Detect a missing template here rather than in the middle of the request loop.
if not prompt_template_path.is_file():
    raise FileNotFoundError(f"No se encontró la plantilla de prompt: '{prompt_template_path}'")

print(f"Experimento seleccionado: '{prompt_name_to_test}'")
print(f"Usando plantilla de prompt: '{prompt_template_path}'")

Experimento seleccionado: 'P-07_zero_shot_gemini_friendly'
Usando plantilla de prompt: 'c:\Users\zveng\Desktop\TFG\report\prompts\P-07_zero_shot_gemini_friendly.txt'


In [4]:
# Data locations.
PROCESSED_DATA_DIR = project_root / 'data' / '01_processed'
TEST_SETS_DIR = project_root / 'data' / '02_test_sets'

print("Cargando datos...")

# The pairs file defines WHICH (offer, CV) combinations get processed; ids are read as strings.
pairs_csv_path = TEST_SETS_DIR / 'test_set_pairs_to_annotate.csv'
pairs_df = pd.read_csv(pairs_csv_path, dtype={'job_id': str, 'candidate_id': str})

# Offers and CVs provide the texts; each frame is indexed by its string id column.
offers_csv_path = PROCESSED_DATA_DIR / 'offers_processed.csv'
offers_df = pd.read_csv(offers_csv_path, index_col='job_id', dtype={'job_id': str})

cvs_csv_path = PROCESSED_DATA_DIR / 'cvs_processed.csv'
cvs_df = pd.read_csv(cvs_csv_path, index_col='candidate_id', dtype={'candidate_id': str})

# id -> text lookup tables for fast access inside the request loop.
offer_descriptions = offers_df['description']
cv_responsibilities = cvs_df['responsibilities']
offer_texts = offer_descriptions.to_dict()
cv_texts = cv_responsibilities.to_dict()

total_pairs = len(pairs_df)
print(f"Datos cargados. Se procesarán {total_pairs} pares en total (en la ejecución completa).")
print("Ejemplo de los primeros pares a procesar:")
display(pairs_df.head())

Cargando datos...
Datos cargados. Se procesarán 360 pares en total (en la ejecución completa).
Ejemplo de los primeros pares a procesar:


Unnamed: 0,job_id,candidate_id,category,score,annotator_id,justification
0,3905283972,cand_3284,,,,
1,3886899287,cand_922,,,,
2,3905688343,cand_2338,,,,
3,3887569985,cand_7879,,,,
4,3890892626,cand_708,,,,


In [9]:
# --- SMOKE-TEST CONFIGURATION ---
# Number of leading pairs sent to the model in this trial run.
NUM_SAMPLES_TO_TEST = 2

# Positional slice of the first NUM_SAMPLES_TO_TEST rows of the pairs table.
small_batch_df = pairs_df.iloc[:NUM_SAMPLES_TO_TEST]

print(f"Iniciando prueba con {NUM_SAMPLES_TO_TEST} pares...")

# Gemini model configuration.
MODEL_NAME = "gemini-2.5-pro"
generation_config = genai.GenerationConfig(temperature=0.1)

# System prompt, sent to the model verbatim. It is NOT passed through str.format(),
# so the JSON schema is written with single braces: doubled "{{ }}" escapes would
# reach the model literally and misrepresent the expected output shape.
SYSTEM_INSTRUCTION = """As a senior technical recruiter, analyze the provided Job Posting and Candidate CV. Your goal is to determine the candidate's fit for the role based on the following action-oriented scale.

SCORING SCALE
- 95.0 (MUST INTERVIEW): Compelling evidence of a strong fit. A priority to contact.
- 70.0 (PROMISING FIT): Good alignment and potential. Justifies a screening call.
- 45.0 (BORDERLINE): Significant gaps, but a noteworthy quality makes them worth keeping on file. Not a fit for this role now.
- 15.0 (NO FIT): Fundamentally misaligned or lacks non-negotiable requirements.

INSTRUCTIONS
1.  Look for transferable skills and conceptual similarities, not just keyword matching.
2.  Do not penalize for missing minor "nice-to-have" skills.
3.  Based on your analysis, provide a structured JSON response.

Your entire response must be a single, valid JSON object. Do not include any text before or after the JSON block.

```json
{
  "category": "string (one of: MUST INTERVIEW, PROMISING FIT, BORDERLINE, NO FIT, using the exact text from the scale)",
  "score": "float (the corresponding numeric score: 95.0, 70.0, 45.0, or 15.0)",
  "justification": {
    "strengths": [
      "string (List of points where the CV aligns with the job's core requirements)"
    ],
    "concerns_and_gaps": [
      "string (List of significant gaps or missing non-negotiable requirements)"
    ],
    "potential": [
      "string (List of transferable skills or inferred strengths that suggest future potential)"
    ],
    "final_summary": "string (A one-sentence summary justifying the final score)"
  }
}"""

model = genai.GenerativeModel(
    MODEL_NAME,
    generation_config=generation_config,
    system_instruction=SYSTEM_INSTRUCTION,
)

# Pause between consecutive requests, in seconds.
# NOTE(review): presumably chosen to stay under the API rate limit — confirm the quota.
REQUEST_DELAY_SECONDS = 1.1


def _is_usable_text(value):
    """Return True only for non-blank strings.

    Missing CSV cells load as float NaN, which is truthy, so a plain truthiness
    check would let them through and the model would be prompted with "nan".
    """
    return isinstance(value, str) and bool(value.strip())


def _extract_json_text(raw_text):
    """Strip an optional Markdown code fence (```json or bare ```) around a model reply."""
    unfenced = raw_text.strip().removeprefix("```json").removeprefix("```").removesuffix("```")
    return unfenced.strip()


results = []

for _row_idx, row in tqdm(small_batch_df.iterrows(), total=len(small_batch_df), desc="Probando Pares"):
    job_id, candidate_id = row['job_id'], row['candidate_id']
    offer_text, cv_text = offer_texts.get(job_id), cv_texts.get(candidate_id)

    # Skip pairs whose offer or CV text is absent (unknown id, NaN or blank).
    if not (_is_usable_text(offer_text) and _is_usable_text(cv_text)):
        print(f"Saltando par por datos faltantes (Job: {job_id}, Candidate: {candidate_id})")
        continue

    prompt = load_and_format_prompt(prompt_template_path, cv_text, offer_text)

    # Reset per pair so the error branch never prints a previous pair's response
    # (or hits a NameError when the very first request fails).
    response = None
    try:
        response = model.generate_content(prompt)
        result_json = json.loads(_extract_json_text(response.text))
        result_json.update({'job_id': job_id, 'candidate_id': candidate_id})
        results.append(result_json)
        print(f"\n✅ Éxito en par (Job: {job_id})")
        print(f"Respuesta: {response}")
    except Exception as e:
        # Best-effort per pair: record the failure and move on to the next one.
        print(f"\n❌ Error en par (Job: {job_id}, Candidate: {candidate_id}): {e}")
        error_data = {'job_id': job_id, 'candidate_id': candidate_id, 'error': str(e)}
        if response is not None:
            print(f"Respuesta: {response}")
        results.append(error_data)

    time.sleep(REQUEST_DELAY_SECONDS)

print("\n\n--- Resultados de la Prueba ---")
# Pretty-print each collected record as indented JSON, keeping non-ASCII characters readable.
for record in results:
    rendered_record = json.dumps(record, indent=2, ensure_ascii=False)
    print(rendered_record)

Iniciando prueba con 2 pares...


Probando Pares:   0%|          | 0/2 [00:00<?, ?it/s]


✅ Éxito en par (Job: 3905283972)
Respuesta: response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "```json\n{\n  \"category\": \"NO FIT\",\n  \"score\": 15.0,\n  \"justification\": {\n    \"strengths\": [\n      \"The candidate's listed skills, such as 'Construction', 'Project Coordination', and 'Quality Assurance', indicate experience within the correct industry.\"\n    ],\n    \"concerns_and_gaps\": [\n      \"The CV completely lacks any mention of AutoCAD, CAD software, drafting, or 2D shop drawings, which are the core, non-negotiable requirements of the role.\",\n      \"There is no evidence to support the required 'at least one year of work experience using AutoCAD software in a professional setting'.\",\n      \"The skills provided are managerial and operational (e.g., 'Supervision', 'Cost Control', 'Esti

In [12]:
# List the models exposed by the API.
print("Listado de modelos disponibles:")
models = genai.list_models()
# The loop variable is deliberately not called `model`: that name holds the
# configured GenerativeModel instance and must not be overwritten here.
for available_model in models:
    print(f"- {available_model}")

Listado de modelos disponibles:
- Model(name='models/embedding-gecko-001',
      base_model_id='',
      version='001',
      display_name='Embedding Gecko',
      description='Obtain a distributed representation of a text.',
      input_token_limit=1024,
      output_token_limit=1,
      supported_generation_methods=['embedText', 'countTextTokens'],
      temperature=None,
      max_temperature=None,
      top_p=None,
      top_k=None)
- Model(name='models/gemini-1.5-pro-latest',
      base_model_id='',
      version='001',
      display_name='Gemini 1.5 Pro Latest',
      description=('Alias that points to the most recent production (non-experimental) release '
                   'of Gemini 1.5 Pro, our mid-size multimodal model that supports up to 2 '
                   'million tokens.'),
      input_token_limit=2000000,
      output_token_limit=8192,
      supported_generation_methods=['generateContent', 'countTokens'],
      temperature=1.0,
      max_temperature=2.0,
      top_p