In [None]:
# Importar librerías
import json
import re
import nltk
import hashlib
import torch
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Download the NLTK resources the first time this runs in a new environment.
# NOTE(review): 'punkt' is presumably needed by nltk.word_tokenize and
# 'wordnet' by WordNetLemmatizer; newer NLTK releases may also require
# 'punkt_tab' -- confirm against the installed version.
# nltk.download('punkt')
# nltk.download('wordnet')

# Single lemmatizer instance, created once and reused by limpiar_texto.
lemmatizer = WordNetLemmatizer()

# Text-cleaning helpers. The patterns are compiled once at import time
# instead of being looked up on every call.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_DISALLOWED_CHARS_RE = re.compile(r'[^a-z0-9áéíóúüñç\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def limpiar_texto(texto):
    """Return a normalized, lemmatized version of *texto*.

    Processing steps, in order:

    1. Replace anything that looks like an HTML tag (``<...>``) with a space.
    2. Lowercase the text.
    3. Replace every character that is not ``a-z``, ``0-9``, one of
       ``áéíóúüñç`` or whitespace with a space.
    4. Collapse runs of whitespace and strip both ends.
    5. Tokenize with ``nltk.word_tokenize`` and lemmatize each token with
       the module-level ``lemmatizer``.

    Args:
        texto: Raw text (may contain HTML markup). ``None`` and the empty
            string are accepted and yield ``''``.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # JSON ``null`` fields arrive as None; treat them like empty text instead
    # of letting re.sub raise TypeError.
    if not texto:
        return ''

    texto_limpio = _HTML_TAG_RE.sub(' ', texto)
    texto_limpio = texto_limpio.lower()
    texto_limpio = _DISALLOWED_CHARS_RE.sub(' ', texto_limpio)
    texto_limpio = _WHITESPACE_RE.sub(' ', texto_limpio).strip()

    # Nothing survived the cleaning: skip the tokenizer entirely (it would
    # produce no tokens anyway).
    if not texto_limpio:
        return ''

    palabras = nltk.word_tokenize(texto_limpio)
    return ' '.join(lemmatizer.lemmatize(palabra) for palabra in palabras)

# Load the ZAP JSON report.
with open('ReporteZAP.json', 'r', encoding='utf-8') as f:
    zap_report = json.load(f)

# Only the first element under 'site' is processed.
site = zap_report['site'][0]
alerts = site['alerts']


def _resumir_alerta(alerta):
    """Build the cleaned summary record for a single ZAP alert dict."""
    # URI of the first reported instance, or '' when the alert has none.
    instancias = alerta.get('instances')
    primera_uri = instancias[0].get('uri', '') if instancias else ''

    # Keep only the part of 'riskdesc' that precedes the first '('.
    riesgo_desc = alerta.get('riskdesc', '')
    if riesgo_desc:
        riesgo = riesgo_desc.split('(')[0].strip()
    else:
        riesgo = ''

    return {
        'cweid': alerta.get('cweid'),
        'desc': limpiar_texto(alerta.get('desc', '')),
        'solution': limpiar_texto(alerta.get('solution', '')),
        'uri': primera_uri,
        'risk': riesgo,
    }


# One summary record per alert, in report order.
vuln_data = [_resumir_alerta(alerta) for alerta in alerts]

# Adapt the records to the DiverseVul format.
def _hash_corto(texto):
    """Return the first 8 hex characters of the MD5 digest of *texto* (UTF-8)."""
    return hashlib.md5(texto.encode('utf-8')).hexdigest()[:8]


# Project label: the site's '@host', else its '@name', else a fixed fallback.
project_name = site.get('@host') or site.get('@name') or 'Proyecto_ZAP'

diverse_data = []
for alert, vuln in zip(alerts, vuln_data):
    plugin_id = alert.get('pluginid', 'unknown')
    # Short identifier derived from project, plugin id and first-instance URI.
    unique_str = f"{project_name}_{plugin_id}_{vuln['uri']}"
    entrada = {
        'func': vuln['desc'],
        # Every record gets target 1 (presumably "vulnerable" in the
        # DiverseVul schema -- confirm against the dataset description).
        'target': 1,
        'cwe': vuln['cweid'],
        'project': project_name,
        # The ZAP plugin id stands in for the commit id.
        'commitID': str(plugin_id),
        'hash': _hash_corto(unique_str),
        # Size is the number of whitespace-separated words in the description.
        'size': len(vuln['desc'].split()),
        'message': vuln['solution'],
    }
    diverse_data.append(entrada)

# Tokenization with T5.
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Token budgets (both include the end-of-sequence token the T5 tokenizer
# appends). The label budget must leave room for a CWE id that splits into
# several SentencePiece pieces; a budget of 2 keeps only the first piece and
# silently truncates the rest of the id.
MAX_INPUT_LENGTH = 512
MAX_LABEL_LENGTH = 8

input_ids_list = []
attention_mask_list = []
label_ids_list = []

for item in diverse_data:
    text = item['func']
    # NOTE(review): a missing CWE becomes the literal label "None" here, and
    # ZAP presumably reports "-1" for alerts without a CWE -- decide whether
    # such records should be filtered out before training.
    label = str(item['cwe'])
    enc_in = tokenizer(
        text,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding='max_length',
    )
    enc_out = tokenizer(
        label,
        max_length=MAX_LABEL_LENGTH,
        truncation=True,
        padding='max_length',
    )
    input_ids_list.append(enc_in['input_ids'])
    attention_mask_list.append(enc_in['attention_mask'])
    # Replace padding positions with -100 so the loss ignores them.
    label_ids = [
        -100 if token_id == tokenizer.pad_token_id else token_id
        for token_id in enc_out['input_ids']
    ]
    label_ids_list.append(label_ids)

# Explicit integer dtype: without it an empty dataset would produce float
# tensors, which are not valid token-id inputs.
input_ids_tensor = torch.tensor(input_ids_list, dtype=torch.long)
attention_mask_tensor = torch.tensor(attention_mask_list, dtype=torch.long)
labels_tensor = torch.tensor(label_ids_list, dtype=torch.long)

print("Shape de input_ids:", input_ids_tensor.shape)
print("Shape de attention_mask:", attention_mask_tensor.shape)
print("Shape de labels:", labels_tensor.shape)

# Load the model for training or prediction.
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Show the first processed record as a sanity check. A report without alerts
# leaves diverse_data empty, so guard the lookup instead of raising IndexError.
if diverse_data:
    ejemplo = diverse_data[0]
    print("\nEjemplo procesado:")
    print("Descripción:", ejemplo['func'])
    print("CWE objetivo:", ejemplo['cwe'])
else:
    ejemplo = None
    print("\nNo hay alertas procesadas para mostrar.")
