In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import json
import os

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo utilizado: {device}")

# Load the JSON file that holds the contracts and their clauses.
with open("./contracts_1000.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten every clause of every contract into three parallel lists:
#   clauses[i]        -> clause text
#   labels_legal[i]   -> 1 if the clause is flagged illegal, 0 otherwise
#   labels_article[i] -> violated article for illegal clauses, None otherwise
clauses = []
labels_legal = []
labels_article = []

for contract in data["contracts"]:
    for clause in contract["clauses"]:
        clauses.append(clause["text"])
        labels_legal.append(1 if clause["is_illegal"] else 0)  # 1: illegal, 0: legal
        labels_article.append(clause["violation"]["article"] if clause["is_illegal"] else None)

# Train/test split. The article labels are split together with the texts so
# that they stay aligned with the shuffled clauses; indexing the unshuffled
# `labels_article` list with positions from X_train / X_test (as was done
# before) pairs each clause with the article of an unrelated clause.
# Adding a third array does not change which samples land in each split.
X_train, X_test, y_train_legal, y_test_legal, article_train, article_test = train_test_split(
    clauses, labels_legal, labels_article, test_size=0.2, random_state=42
)

# Keep only illegal clauses that carry a valid article, for the article model.
train_illegal = [
    (text, article)
    for text, is_illegal, article in zip(X_train, y_train_legal, article_train)
    if is_illegal == 1 and article is not None
]
X_train_illegal = [text for text, _ in train_illegal]
y_train_article_raw = [article for _, article in train_illegal]

test_illegal = [
    (text, article)
    for text, is_illegal, article in zip(X_test, y_test_legal, article_test)
    if is_illegal == 1 and article is not None
]
X_test_illegal = [text for text, _ in test_illegal]
y_test_article_raw = [article for _, article in test_illegal]

# Map each article to a class index. The unique values are sorted so the
# mapping is identical on every run; plain set iteration order for strings
# changes between interpreter sessions, which would make the class indices
# of a saved model meaningless after a kernel restart.
unique_articles = sorted({article for article in labels_article if article is not None}, key=str)
article_to_idx = {article: idx for idx, article in enumerate(unique_articles)}
idx_to_article = {idx: article for article, idx in article_to_idx.items()}

# Convert the article labels to their numeric class indices.
y_train_article = [article_to_idx[label] for label in y_train_article_raw]
y_test_article = [article_to_idx[label] for label in y_test_article_raw]

# Tokenize with the BERT tokenizer (sequences truncated/padded to at most 128 tokens).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings_legal = tokenizer(X_train, truncation=True, padding=True, max_length=128)
test_encodings_legal = tokenizer(X_test, truncation=True, padding=True, max_length=128)

train_encodings_article = tokenizer(X_train_illegal, truncation=True, padding=True, max_length=128)
test_encodings_article = tokenizer(X_test_illegal, truncation=True, padding=True, max_length=128)

# Crear datasets para PyTorch
class ClauseDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per sample.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``"labels"``."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits in datasets the Trainer can consume.
train_dataset_legal = ClauseDataset(train_encodings_legal, y_train_legal)
test_dataset_legal = ClauseDataset(test_encodings_legal, y_test_legal)

train_dataset_article = ClauseDataset(train_encodings_article, y_train_article)
test_dataset_article = ClauseDataset(test_encodings_article, y_test_article)

# Mixed-precision training is only enabled on CUDA. With a hard-coded
# fp16=True the notebook fails on the CPU fallback selected by `device`.
use_fp16 = device.type == "cuda"

# --- Legal / illegal classifier (binary) ---
model_legal = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model_legal.to(device)

# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version.
training_args_legal = TrainingArguments(
    output_dir="./results_legal", evaluation_strategy="epoch", save_strategy="epoch",
    num_train_epochs=40, per_device_train_batch_size=16, per_device_eval_batch_size=16,
    warmup_steps=500, weight_decay=0.001, logging_dir="./logs_legal", fp16=use_fp16
)

trainer_legal = Trainer(
    model=model_legal,
    args=training_args_legal,
    train_dataset=train_dataset_legal,
    eval_dataset=test_dataset_legal,
)

# Train the legal/illegal model, then save it together with its tokenizer.
trainer_legal.train()
model_legal.save_pretrained("./saved_model_legal")
tokenizer.save_pretrained("./saved_model_legal")

# --- Article classifier (one class per distinct article) ---
model_article = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(unique_articles))
model_article.to(device)

# NOTE(review): with only 3 epochs, 500 warmup steps may exceed the total
# number of optimizer steps on a small illegal-clause subset -- verify.
training_args_article = TrainingArguments(
    output_dir="./results_article", evaluation_strategy="epoch", save_strategy="epoch",
    num_train_epochs=3, per_device_train_batch_size=16, per_device_eval_batch_size=16,
    warmup_steps=500, weight_decay=0.001, logging_dir="./logs_article", fp16=use_fp16
)

trainer_article = Trainer(
    model=model_article,
    args=training_args_article,
    train_dataset=train_dataset_article,
    eval_dataset=test_dataset_article,
)

# Train the article model, then save it together with its tokenizer.
trainer_article.train()
model_article.save_pretrained("./saved_model_article")
tokenizer.save_pretrained("./saved_model_article")

print("Entrenamiento completo y modelos guardados.")


In [None]:
# Predicción para una cláusula
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import json
import os
# Single-clause inference. This cell relies on `model_legal`, `model_article`
# and `idx_to_article` created by the training cell in the same kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the sample clause is Spanish while the checkpoint is the
# English "bert-base-uncased" -- confirm this is the intended base model.
text = "El comprador no tiene derecho a garantia"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text and move the resulting tensors to the selected device.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Legal / illegal prediction. eval() disables dropout so the prediction is
# deterministic; no_grad() avoids building an autograd graph for inference.
model_legal.eval()
with torch.no_grad():
    outputs_legal = model_legal(**inputs)
# Take the argmax over the class dimension rather than the flattened tensor.
predicted_legal = torch.argmax(outputs_legal.logits, dim=-1).item()

if predicted_legal == 0:
    print("La cláusula es: Legal")
else:
    print("La cláusula es: Ilegal")
    # Only illegal clauses are passed on to the article classifier.
    model_article.eval()
    with torch.no_grad():
        outputs_article = model_article(**inputs)
    predicted_article_idx = torch.argmax(outputs_article.logits, dim=-1).item()
    predicted_article = idx_to_article[predicted_article_idx]
    print(f"Artículo correspondiente: {predicted_article}")
