In [None]:
# Instalar/verificar paquetes necesarios
!pip show azure-ai-ml scikit-learn pandas numpy pyarrow


In [None]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import os
import pickle
import joblib
from datetime import datetime

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Azure ML
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Model, Data
from azure.ai.ml.constants import AssetTypes

print("‚úÖ Librer√≠as importadas correctamente")


In [None]:
# Configure the Azure ML client.
# Try the default credential chain first and fall back to an interactive browser login.
try:
    credential = DefaultAzureCredential()
    # Request a token right away so a broken credential chain fails here, not on first use.
    credential.get_token("https://management.azure.com/.default")
except Exception as auth_error:
    # Deliberate best-effort fallback, but report why the default chain was rejected
    # instead of discarding the error silently.
    print(
        f"DefaultAzureCredential no disponible ({type(auth_error).__name__}: {auth_error}); "
        "usando InteractiveBrowserCredential"
    )
    credential = InteractiveBrowserCredential()

# Connect to the workspace (MLClient.from_config reads the workspace details from a config file)
ml_client = MLClient.from_config(credential=credential)
print(f"‚úÖ Conectado al workspace: {ml_client.workspace_name}")


In [None]:
# Load the resume data from the pre-extracted features CSV.
# The path is relative to the notebook's working directory.
data_path = "../data/processed/features_extracted.csv"
df = pd.read_csv(data_path)

# Report shape, column names and the class balance of the 'apto' target column
print(f"üìä Datos cargados: {df.shape}")
print(f"üìã Columnas: {list(df.columns)}")
print(f"üéØ Distribuci√≥n de target 'apto':")
print(df['apto'].value_counts())

# Show a preview of the first rows
print("\nüìã Preview de los datos:")
display(df.head())


In [None]:
# Preparar features para el modelo
def _count_list_items(raw_value):
    """Count the non-empty entries of a list-like cell.

    Accepts a list serialized as text (e.g. "['python', 'sql']") or an actual
    list/tuple. Anything else (NaN, None, numbers) counts as 0.
    """
    if isinstance(raw_value, (list, tuple)):
        return sum(1 for item in raw_value if item is not None and str(item).strip())
    if not isinstance(raw_value, str):
        return 0
    # Split on commas inside the brackets and ignore entries that are empty once
    # quotes and whitespace are removed, so '[]' yields 0.
    entries = raw_value.strip('[]').split(',')
    return sum(
        1 for entry in entries
        if entry.strip().replace("'", "").replace('"', '').strip()
    )


def prepare_features(df):
    """Prepare the features used to train the model.

    Works on a copy of ``df``; the input frame is not modified.

    Args:
        df: DataFrame containing the numeric and categorical columns listed
            below, the ``skills`` and ``certifications`` columns, and zero or
            more columns whose name starts with ``languages.``.

    Returns:
        A tuple ``(df_processed, numeric_features, categorical_features)``:
        the processed copy of ``df`` with the derived columns added, the list
        of numeric feature names (base columns, ``num_skills``,
        ``num_certifications`` and one ``<language column>_has`` flag per
        language column) and the list of categorical feature names.

    Raises:
        KeyError: if one of the columns accessed here is missing from ``df``.
    """
    df_processed = df.copy()

    # Fill missing values in the categorical columns that are filled explicitly.
    # NOTE(review): 'education_level' and 'work_authorization' are not filled here;
    # confirm they never contain missing values.
    for column in ('discipline', 'gender', 'age_range'):
        df_processed[column] = df_processed[column].fillna('Unknown')

    numeric_features = [
        'years_total_experience', 'years_skill_main', 'num_promotions',
        'avg_tenure_months', 'gap_months_last5y'
    ]

    categorical_features = [
        'education_level', 'discipline', 'work_authorization',
        'gender', 'age_range'
    ]

    # Language columns are reduced to binary "has a value / has no value" flags.
    language_features = [col for col in df.columns if col.startswith('languages.')]
    for lang_col in language_features:
        df_processed[f'{lang_col}_has'] = df_processed[lang_col].notna().astype(int)

    # Number of skills and certifications, counted from the list-like cells.
    df_processed['num_skills'] = df['skills'].apply(_count_list_items)
    df_processed['num_certifications'] = df['certifications'].apply(_count_list_items)

    # Register the derived columns as numeric features.
    numeric_features.extend(['num_skills', 'num_certifications'])
    numeric_features.extend([f'{col}_has' for col in language_features])

    return df_processed, numeric_features, categorical_features

# Prepare the data: processed frame plus the numeric and categorical feature name lists
df_processed, numeric_features, categorical_features = prepare_features(df)

# Summarize which features will be used and the size of the processed dataset
print(f"üîß Features num√©ricas ({len(numeric_features)}): {numeric_features}")
print(f"üè∑Ô∏è Features categ√≥ricas ({len(categorical_features)}): {categorical_features}")
print(f"üìä Dataset procesado: {df_processed.shape}")


In [None]:
# Build the feature matrix X and the target vector y.
# Column order is numeric features first, then categorical features.
feature_columns = numeric_features + categorical_features
X = df_processed[feature_columns]
y = df_processed['apto']

print(f"üìä Features (X): {X.shape}")
print(f"üéØ Target (y): {y.shape}")
print(f"üìà Distribuci√≥n del target:")
print(y.value_counts().sort_index())

# Split into train and test: 20% held out, fixed seed, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nüìä Divisi√≥n de datos:")
print(f"  üöÇ Train: {X_train.shape[0]} muestras")
print(f"  üß™ Test: {X_test.shape[0]} muestras")


In [None]:
# Candidate classifiers; every one is trained behind the same preprocessing recipe
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Fitted pipelines and their test-set results, keyed by model name
trained_models = {}
model_results = {}

for name, model in models.items():
    print(f"\nüöÇ Entrenando {name}...")

    # Build a fresh preprocessor for each pipeline. A single shared instance would be
    # re-fitted in place by every pipeline, so all stored pipelines would point at the
    # same transformer object. After the loop `preprocessor` is still a transformer
    # fitted on X_train, as before.
    preprocessor = ColumnTransformer(
        transformers=[
            # Standardize the numeric columns
            ('num', StandardScaler(), numeric_features),
            # One-hot encode the categorical columns
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
        ]
    )

    # Full pipeline: preprocessing followed by the classifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict labels and class probabilities on the held-out test set
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Store the fitted pipeline and its results
    trained_models[name] = pipeline
    model_results[name] = {
        'accuracy': accuracy,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    print(f"‚úÖ {name} - Accuracy: {accuracy:.4f}")

print("\nüèÜ Resumen de modelos:")
for name, results in model_results.items():
    print(f"  {name}: {results['accuracy']:.4f}")


In [None]:
# Pick the model with the highest test accuracy (on a tie, the first one trained wins)
best_model_name = max(model_results, key=lambda candidate: model_results[candidate]['accuracy'])
best_model = trained_models[best_model_name]

# Pull the winner's stored results once and read both values from them
best_entry = model_results[best_model_name]
best_accuracy = best_entry['accuracy']
y_pred_best = best_entry['y_pred']

print(f"ü•á Mejor modelo: {best_model_name} (Accuracy: {best_accuracy:.4f})")

# Detailed report for the winning model
print(f"\nüìä Reporte de clasificaci√≥n para {best_model_name}:")
print(classification_report(y_test, y_pred_best))

print(f"\nüî• Matriz de confusi√≥n:")
print(confusion_matrix(y_test, y_pred_best))


In [None]:
# Create the directory that holds the model artifacts
os.makedirs("model_artifacts", exist_ok=True)

# Name and path of the best model; the name embeds the lower-cased algorithm name
model_name = f"resume_classifier_{best_model_name.lower()}"
model_path = f"model_artifacts/{model_name}.pkl"

# Persist the fitted pipeline with joblib
joblib.dump(best_model, model_path)

# Also save descriptive metadata next to the model
model_info = {
    'model_type': best_model_name,
    'accuracy': best_accuracy,
    'feature_columns': feature_columns,
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'classes': best_model.classes_.tolist(),
    'training_date': datetime.now().isoformat(),
    'data_shape': df_processed.shape
}

# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell
import json
with open(f"model_artifacts/{model_name}_info.json", 'w') as f:
    json.dump(model_info, f, indent=2)

print(f"üíæ Modelo guardado en: {model_path}")
print(f"üìã Informaci√≥n del modelo guardada en: {model_name}_info.json")


In [None]:
# Registrar modelo en Azure ML
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

# Describe the model asset to register.
# The whole "model_artifacts" directory is uploaded, not just the file for this model.
# NOTE(review): the Responsible AI components used later in this notebook are documented
# as requiring an MLflow-format model; this registers a CUSTOM_MODEL. Confirm the RAI
# pipeline accepts it.
model_azure = Model(
    path="model_artifacts",
    type=AssetTypes.CUSTOM_MODEL,
    name=model_name,
    description=f"Resume classification model using {best_model_name} - Accuracy: {best_accuracy:.4f}",
    # NOTE(review): the version is hard-coded; confirm how a re-run with an existing
    # name/version should behave
    version="1",
    tags={
        "algorithm": best_model_name,
        "accuracy": str(best_accuracy),
        "data_type": "resume_classification",
        "framework": "scikit-learn"
    }
)

# Register the model in the workspace and report the registered name and version
registered_model = ml_client.models.create_or_update(model_azure)
print(f"‚úÖ Modelo registrado en Azure ML: {registered_model.name} v{registered_model.version}")
print(f"üéØ Accuracy del modelo: {best_accuracy:.4f}")


In [None]:
# Prepare the datasets for RAI (they must include only the features the model uses).
# RAI needs the data in the original format, with the same columns the model expects.

# Build the datasets: model features plus the target as the last column
df_train_rai = pd.concat([X_train, y_train], axis=1)
df_test_rai = pd.concat([X_test, y_test], axis=1)

print(f"üìä Dataset RAI Train: {df_train_rai.shape}")
print(f"üìä Dataset RAI Test: {df_test_rai.shape}")
print(f"üéØ Columna target: 'apto'")

# Report the total number of missing values in each dataset
print(f"\nüîç Valores faltantes en train: {df_train_rai.isnull().sum().sum()}")
print(f"üîç Valores faltantes en test: {df_test_rai.isnull().sum().sum()}")

# Show a preview
print("\nüìã Preview RAI Train data:")
display(df_train_rai.head())


In [None]:
# Create the output directories and convert the data to Parquet (required for RAI)
os.makedirs("rai-train-data", exist_ok=True)
os.makedirs("rai-test-data", exist_ok=True)

# Convert the pandas DataFrames to Arrow tables
table_train = pa.Table.from_pandas(df_train_rai)
table_test = pa.Table.from_pandas(df_test_rai)

# Write one Parquet file per dataset; Parquet format version "1.0" is requested explicitly
pq.write_table(table_train, "rai-train-data/data.parquet", version="1.0")
pq.write_table(table_test, "rai-test-data/data.parquet", version="1.0")

print("‚úÖ Datos convertidos a formato Parquet para RAI")


In [None]:
# Write the MLTable definition files (required for RAI).

# MLTable definition for the training data: read every Parquet file in the folder
mltable_train = '''
type: mltable
paths:
  - pattern: ./*.parquet
transformations:
  - read_parquet
'''

# The test data uses exactly the same definition text
mltable_test = mltable_train

# One MLTable file per data folder, training folder first
for _mltable_dir, _mltable_spec in (
    ("rai-train-data", mltable_train),
    ("rai-test-data", mltable_test),
):
    with open(f"{_mltable_dir}/MLTable", "w") as f:
        f.write(_mltable_spec)

print("‚úÖ Archivos MLTable creados para RAI")


In [None]:
# Names for the data assets, derived from the model name
train_data_name = f"{model_name}_train_rai"
test_data_name = f"{model_name}_test_rai"
# NOTE(review): the version is hard-coded; confirm how a re-run with an existing
# name/version should behave
data_version = "1"

# Register the training data folder as an MLTable data asset
train_data = Data(
    path="rai-train-data/",
    type=AssetTypes.MLTABLE,
    description=f"RAI training data para {model_name}",
    name=train_data_name,
    version=data_version,
)
ml_client.data.create_or_update(train_data)

# Register the test data folder as an MLTable data asset
test_data = Data(
    path="rai-test-data/",
    type=AssetTypes.MLTABLE,
    description=f"RAI test data para {model_name}",
    name=test_data_name,
    version=data_version,
)
ml_client.data.create_or_update(test_data)

print(f"‚úÖ Datasets RAI registrados:")
print(f"  üìä Train: {train_data_name}")
print(f"  üìä Test: {test_data_name}")


In [None]:
# Connect to the "azureml" registry to fetch the RAI components.
# The same credential, subscription and resource group as the workspace client are reused.
registry_name = "azureml"
ml_client_registry = MLClient(
    credential=credential,
    subscription_id=ml_client.subscription_id,
    resource_group_name=ml_client.resource_group_name,
    registry_name=registry_name,
)

# Fetch the RAI components
label = "latest"

# The constructor is looked up by the "latest" label
rai_constructor_component = ml_client_registry.components.get(
    name="rai_tabular_insight_constructor", label=label
)

# Every other component is fetched at the constructor's version so they all match
version = rai_constructor_component.version
print(f"üì¶ Versi√≥n de componentes RAI: {version}")

rai_erroranalysis_component = ml_client_registry.components.get(
    name="rai_tabular_erroranalysis", version=version
)

rai_explanation_component = ml_client_registry.components.get(
    name="rai_tabular_explanation", version=version
)

rai_gather_component = ml_client_registry.components.get(
    name="rai_tabular_insight_gather", version=version
)

print("‚úÖ Componentes RAI obtenidos")


In [None]:
from azure.ai.ml import Input, dsl, Output
from azure.ai.ml.constants import AssetTypes
import uuid

# Model reference for the RAI components.
# It is built from the model that was actually registered, so name and version cannot
# drift from the registration cell (the version was previously hard-coded as "1").
expected_model_id = f"{registered_model.name}:{registered_model.version}"
azureml_model_id = f"azureml:{expected_model_id}"

# CHANGE this to the name of your compute cluster
compute_name = "aml-cluster"

# Pipeline that builds the Responsible AI dashboard for the registered model.
# Inputs: the target column name plus the train and test datasets.
# Output: a dict with a single "dashboard" entry, the gather step's dashboard.
# Explained with comments rather than a docstring so the function object stays unchanged.
@dsl.pipeline(
    compute=compute_name,
    description=f"RAI dashboard para {model_name}",
    experiment_name=f"RAI_insights_{model_name}",
)
def rai_pipeline_resume_classifier(target_column_name, train_data, test_data):
    # Build the RAI dashboard: the constructor takes the model and both datasets
    # NOTE(review): the model input is typed CUSTOM_MODEL; the RAI constructor is documented
    # as expecting an MLflow model. Confirm this input type works.
    create_rai_job = rai_constructor_component(
        title=f"RAI Dashboard - Resume Classifier ({best_model_name})",
        task_type="classification",
        model_info=expected_model_id,
        model_input=Input(type=AssetTypes.CUSTOM_MODEL, path=azureml_model_id),
        train_dataset=train_data,
        test_dataset=test_data,
        target_column_name=target_column_name,
    )
    create_rai_job.set_limits(timeout=300)

    # Add error analysis on top of the constructed dashboard
    error_job = rai_erroranalysis_component(
        rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,
    )
    error_job.set_limits(timeout=300)

    # Add model explanations on top of the constructed dashboard
    explanation_job = rai_explanation_component(
        rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,
        comment=f"Explicaciones para clasificador de curr√≠culums - {best_model_name}", 
    )
    explanation_job.set_limits(timeout=300)

    # Combine all insights into the final dashboard
    rai_gather_job = rai_gather_component(
        constructor=create_rai_job.outputs.rai_insights_dashboard,
        insight_3=error_job.outputs.error_analysis,
        insight_4=explanation_job.outputs.explanation,
    )
    rai_gather_job.set_limits(timeout=300)

    # The dashboard output is uploaded when the step finishes
    rai_gather_job.outputs.dashboard.mode = "upload"

    return {
        "dashboard": rai_gather_job.outputs.dashboard,
    }

print(f"‚úÖ Pipeline RAI definido para {model_name}")


In [None]:
from azure.ai.ml import Input

# Prepare the pipeline inputs: both registered MLTable assets, referenced by name and version
resume_train_input = Input(
    type="mltable",
    path=f"azureml:{train_data_name}:{data_version}",
    mode="download",
)

resume_test_input = Input(
    type="mltable",
    path=f"azureml:{test_data_name}:{data_version}",
    mode="download",
)

# Create the pipeline job, with 'apto' as the target column
insights_pipeline_job = rai_pipeline_resume_classifier(
    target_column_name="apto",
    train_data=resume_train_input,
    test_data=resume_test_input,
)

# Configure the output: upload the dashboard to the workspace blob store,
# under a random UUID folder so each run writes to its own path
rand_path = str(uuid.uuid4())
insights_pipeline_job.outputs.dashboard = Output(
    path=f"azureml://datastores/workspaceblobstore/paths/{rand_path}/dashboard/",
    mode="upload",
    type="uri_folder",
)

print("‚úÖ Pipeline RAI configurado y listo para ejecutar")
print(f"üéØ Modelo: {model_name} ({best_model_name})")
print(f"üìä Accuracy: {best_accuracy:.4f}")
print(f"üéØ Target: apto")


In [None]:
from azure.ai.ml.entities import PipelineJob
from IPython.core.display import HTML
from IPython.display import display
import time

def submit_and_wait(ml_client, pipeline_job, poll_interval_seconds: int = 30) -> PipelineJob:
    """Submit a pipeline job and block until it reaches a terminal state.

    Args:
        ml_client: Client used to submit the job and to poll its status.
        pipeline_job: The pipeline job to submit.
        poll_interval_seconds: Seconds to sleep between status checks.
            Defaults to 30, the interval that was previously hard-coded.

    Returns:
        The job as last fetched from the service, in its terminal state.

    Raises:
        AssertionError: if the service returns no job on submission.
    """
    # States after which the job is no longer polled
    terminal_states = ("Completed", "Failed", "Canceled", "NotResponding")

    created_job = ml_client.jobs.create_or_update(pipeline_job)
    assert created_job is not None

    print("üöÄ Pipeline RAI enviado. Puedes seguir el progreso en:")
    display(HTML(f'<a href="{created_job.studio_url}" target="_blank">{created_job.studio_url}</a>'))

    # Poll until the job finishes, printing every status that is observed
    while created_job.status not in terminal_states:
        time.sleep(poll_interval_seconds)
        created_job = ml_client.jobs.get(created_job.name)
        print(f"üìä Estado actual: {created_job.status}")

    if created_job.status == "Completed":
        print("‚úÖ Pipeline RAI completado exitosamente!")
        print("üéØ Ve al Azure ML Studio para ver tu dashboard RAI")
        print("üìç Ubicaci√≥n: Models > Responsible AI")
    else:
        print(f"‚ùå Pipeline termin√≥ con estado: {created_job.status}")

    return created_job

# RUN THE RAI PIPELINE
# Print a summary of what is about to be analyzed, then submit and wait for completion
print(f"üéØ Ejecutando RAI para el modelo: {model_name}")
print(f"ü§ñ Algoritmo: {best_model_name}")
print(f"üìä Accuracy: {best_accuracy:.4f}")
print(f"üéØ Target: apto (clasificaci√≥n de aptitud de candidatos)")
print("\n‚è≥ Esto puede tomar varios minutos...")

# Blocks until the pipeline reaches a terminal state
insights_job = submit_and_wait(ml_client, insights_pipeline_job)
