In [1]:
!pip install datasets transformers


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import os
import pandas as pd

folder_path = '/content/drive/MyDrive/sis421/documentosTokens'
data = {'text': [], 'label': []}
os.environ["WANDB_DISABLED"] = "true"
categories = {
    'REGLAMENTO': 'Reglamento',
    'POLITICA': 'Política',
    'ESTATUTO': 'Estatuto',
    'MODELO': 'Modelo Académico',
    'PLAN': 'Plan de Estudios',
    'CODIGO': 'Código de Conducta',
    # Añadir más categorías según tus documentos
}

# Crear un mapeo de categorías a números
label_to_id = {label: idx for idx, label in enumerate(categories.values())}
id_to_label = {idx: label for label, idx in label_to_id.items()}

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        label = 'Otros'
        for key in categories:
            if key in filename.upper():
                label = categories[key]
                break
        with open(os.path.join(folder_path, filename), 'r', encoding='latin1') as file:
            text = file.read()
            data['text'].append(text)
            data['label'].append(label)

df = pd.DataFrame(data)
df.to_csv('dataset.csv', index=False)
print(df.head())


In [3]:
#DIVIDIR EL DATASET
from sklearn.model_selection import train_test_split
df['label'] = df['label'].map(label_to_id)

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_csv('train_dataset.csv', index=False)
val_df.to_csv('test_dataset.csv', index=False)


In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Cargar los datasets desde los archivos CSV
train_dataset = load_dataset('csv', data_files='train_dataset.csv', split='train')
test_dataset = load_dataset('csv', data_files='test_dataset.csv', split='train')

# Cargar el tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Función de preprocesamiento
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')

# Tokenizar los datasets
encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

encoded_train_dataset = encoded_train_dataset.with_format("torch", columns=['input_ids', 'attention_mask', 'label'])
encoded_test_dataset = encoded_test_dataset.with_format("torch", columns=['input_ids', 'attention_mask', 'label'])

# Eliminar columnas innecesarias
#encoded_train_dataset = encoded_train_dataset.remove_columns(['text'])
#encoded_test_dataset = encoded_test_dataset.remove_columns(['text'])

# Asegurar que las etiquetas sean enteros
#encoded_train_dataset = encoded_train_dataset.with_format('torch')
#encoded_test_dataset = encoded_test_dataset.with_format('torch')



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Cargar el modelo preentrenado con la cantidad correcta de etiquetas
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=len(categories) )

# Configuración de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Inicializar el entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset
)

# Entrenar el modelo
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


ValueError: Target size (torch.Size([4])) must be the same as input size (torch.Size([4, 6]))

In [None]:
import matplotlib.pyplot as plt

# Extraer las pérdidas del historial de entrenamiento
train_loss = [log['loss'] for log in trainer.state.log_history if 'loss' in log]
eval_loss = [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log]

# Graficar las pérdidas
if train_loss or eval_loss:  # Verificar que existan datos
    plt.plot(train_loss, label='Pérdida de entrenamiento')
    plt.plot(eval_loss, label='Pérdida de evaluación')
    plt.xlabel('Época')
    plt.ylabel('Pérdida')
    plt.legend()
    plt.title('Pérdidas durante el entrenamiento')
    plt.show()
else:
    print("No hay datos de pérdidas disponibles para graficar.")


In [None]:
trainer.save_model("path/to/save_model/content/drive/MyDrive/sis421/models/clasificacion")


In [None]:
# Clasificación de texto
def classify_text(text):
    try:
        # Preparar las entradas
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        outputs = model(**inputs)  # Obtener los logits
        prediction = outputs.logits.argmax(dim=1).item()  # Índice de la predicción

        # Retornar la categoría predicha
        return id_to_label[prediction]
    except Exception as e:
        print(f"Error clasificando el texto: {e}")
        return None

# Ejemplo de clasificación
new_text = "Contenido del documento..."
classification = classify_text(new_text)

if classification:
    print(f"Categoría: {classification}")
else:
    print("No se pudo clasificar el texto.")

