In [None]:
pip install transformers datasets torch scikit-learn


In [None]:
import pandas as pd

# Labelled document titles, one (title, class id) pair per example.
# Labels: 0: Academic, 1: Administrative, etc.
labeled_titles = [
    ("REGLAMENTO GENERAL GRADUACION TECNOLOGIA-2015-VERSION FINAL", 0),
    ("Políticas académicas", 2),
    ("ESTATUTO-ORGANICO-USFX-2010", 1),
    ("MODELO-ACADEMICO-2010", 2),
    ("REG.-DE-LA-CARRERA-DEL-DOCENTE-INVESTIGADOR-USFX", 0),
    ("REGLAMENTO-ESPECIFICO-SISTEMA-PROGRAMACION-OPERACIONES-2006", 3),
    ("Reglamento-Pgaeng-Tecnologia-Usfx-Final", 0),
    ("plan de estudios carreras sis cic dad tic", 2),
    ("Reglamento Específico de Sanciones Disciplinarias", 3),
    ("CONVOCATORIA_DE_BECAS_GESTION_2024_COMISION_DE_BIENESTAR", 4),
]

# Column-oriented view of the same pairs: parallel "text" and "label" lists
data = {
    "text": [title for title, _ in labeled_titles],
    "label": [class_id for _, class_id in labeled_titles],
}

# Build the DataFrame and show it in full
df = pd.DataFrame(data)
print(df)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible across runs.
titles, class_ids = df["text"], df["label"]
train_texts, test_texts, train_labels, test_labels = train_test_split(
    titles,
    class_ids,
    test_size=0.2,
    random_state=42,
)


In [None]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode both splits with identical settings: truncate to at most 128 tokens
# and pad the sequences within each split.
encode_options = {"truncation": True, "padding": True, "max_length": 128}
train_encodings = tokenizer(list(train_texts), **encode_options)
test_encodings = tokenizer(list(test_texts), **encode_options)


In [None]:
# Wrap the tokenized texts and their labels as PyTorch datasets
import torch


class EncodedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Each item is a dict with one example's tokenizer fields (every key present
    in ``encodings``, e.g. ``input_ids`` and ``attention_mask``) plus a
    ``labels`` entry. Returning a dict (rather than a tuple, as TensorDataset
    does) lets the fields be passed to the model by keyword, and keeps the
    attention mask so padding positions are not attended to.

    Args:
        encodings: Mapping from field name to a per-example list of values,
            as returned by calling the tokenizer on a list of texts.
        labels: Iterable of integer class ids, one per encoded example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Iterate the labels positionally and store plain ints. The labels
        # come from a pandas Series whose index was shuffled by the split, so
        # integer lookups on it would be label-based rather than positional.
        self.labels = [int(label) for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


train_dataset = EncodedTextDataset(train_encodings, train_labels)

test_dataset = EncodedTextDataset(test_encodings, test_labels)


In [None]:
# Define the model
from transformers import BertForSequenceClassification

# Sequence-classification model with 5 output labels
# (change the number to match your own categories)
checkpoint_name = "bert-base-uncased"
label_count = 5
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=label_count,
)


In [None]:
# Training setup: hyperparameters and the Trainer that will run them
import inspect

from transformers import Trainer, TrainingArguments

# The per-epoch evaluation option is called `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. Pick whichever name
# the installed TrainingArguments accepts so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training configuration: 3 epochs, batches of 8, evaluate and checkpoint
# once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=2e-5,
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [None]:
# Training: run the fine-tuning loop on the Trainer configured in the previous cell

trainer.train()


In [None]:
# Evaluate the model

from sklearn.metrics import classification_report

# Run the model over the held-out split and take, for each example, the
# index of the highest-scoring class as the predicted label.
predictions = trainer.predict(test_dataset)
class_scores = predictions.predictions
pred_labels = class_scores.argmax(axis=1)

# Per-class precision / recall / F1 summary
report = classification_report(test_labels, pred_labels)
print(report)


In [None]:
# Use the model to classify new texts

new_texts = ["REGLAMENTO LABORATORIO FISICA", "Código de conducta SCETIC"]
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

# Training may have moved the model to an accelerator (e.g. a GPU); the input
# tensors must live on the same device as the model's weights.
new_encodings = new_encodings.to(model.device)

# Inference only: switch off dropout and skip gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**new_encodings)

# Predicted class id per text = index of the largest logit; bring the result
# back to the CPU so it prints the same way regardless of device.
predictions = torch.argmax(outputs.logits, dim=1).cpu()

print(predictions)  # Shows the predicted category for each text
