In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from tqdm import tqdm


In [3]:
# Load the three source CSV files, one per intent.
artista_df = pd.read_csv(
    '/kaggle/input/datasetclasificador/DatasetClassifier-Artista.csv'
)
consulta_df = pd.read_csv(
    '/kaggle/input/datasetclasificador/Dataset classifier-ConsultarInformacion.csv'
)
playlist_df = pd.read_csv(
    '/kaggle/input/datasetclasificador/Dataset classifier-CrearPlaylist.csv'
)

# Tag every row of each frame with the human-readable name of its intent.
for frame, intent_name in (
    (artista_df, 'Artista'),
    (consulta_df, 'ConsultarInformacion'),
    (playlist_df, 'CrearPlaylist'),
):
    frame['label'] = intent_name

# Stack the three frames into a single frame with a fresh 0..n-1 index.
data = pd.concat([artista_df, consulta_df, playlist_df], ignore_index=True)

# Shift the numeric class id down by one.
# NOTE(review): presumably the CSVs store ids starting at 1 and this makes them 0-based
# (as the classifiers below expect) — confirm against the raw files.
data['tipo'] = data['tipo'].sub(1)



In [4]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,frase,tipo,label
0,¿Puedes recomendarme un artista nuevo?,0,Artista
1,Necesito sugerencias de artistas similares a C...,0,Artista
2,¿Qué artista me recomendarías para relajarme?,0,Artista
3,¿Tienes alguna recomendación de artistas pop?,0,Artista
4,Recomiéndame un buen artista de jazz.,0,Artista


In [5]:
# (rows, columns) of the combined frame.
data.shape

(3020, 3)

In [6]:
# Preview the last rows of the combined frame.
data.tail()

Unnamed: 0,frase,tipo,label
3015,Quiero canciones modernas y energéticas para h...,2,CrearPlaylist
3016,Busco música que combine con un ambiente juven...,2,CrearPlaylist
3017,Quiero una lista de canciones de pop y indie p...,2,CrearPlaylist
3018,Necesito música optimista y alegre para animar...,2,CrearPlaylist
3019,Quiero una playlist fresca y estilosa para dar...,2,CrearPlaylist


In [7]:
# Inputs are the raw phrases; targets are the numeric class ids.
X, y = data['frase'], data['tipo']

# Hold out 20% of the rows for testing, keeping the class proportions (stratify).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the TF-IDF vocabulary on the training phrases only, then reuse it for the test set.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Candidate classifiers, keyed by the name printed in the report header.
models = dict([
    ('SVM', SVC(kernel='linear', probability=True)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=200, random_state=42)),
])

# Fit each candidate on the TF-IDF features and report per-class metrics on the test set.
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    print(f"Resultados para {name}:")
    print(classification_report(y_test, y_pred))


Resultados para SVM:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       200
           1       1.00      1.00      1.00       203
           2       0.98      1.00      0.99       201

    accuracy                           0.99       604
   macro avg       0.99      0.99      0.99       604
weighted avg       0.99      0.99      0.99       604

Resultados para Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       200
           1       0.99      1.00      0.99       203
           2       0.99      0.99      0.99       201

    accuracy                           0.99       604
   macro avg       0.99      0.99      0.99       604
weighted avg       0.99      0.99      0.99       604

Resultados para Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       200
           1       0.97      1.00      0

In [8]:
# Encode every phrase into a dense vector with a pre-trained sentence encoder.
# NOTE(review): 'all-MiniLM-L6-v2' looks like an English-oriented checkpoint while the
# phrases are Spanish — confirm whether a multilingual encoder would be a better fit.
model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)
X_embeddings = embedder.encode(
    X.tolist(),
    show_progress_bar=True,
)

# Split with the same test_size / random_state / stratify settings as the earlier split.
X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(
    X_embeddings,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Linear SVM trained directly on the sentence embeddings.
clf = SVC(kernel='linear')
y_pred_emb = clf.fit(X_train_emb, y_train_emb).predict(X_test_emb)

print("Resultados con Sentence Transformers:")
print(classification_report(y_test_emb, y_pred_emb))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/95 [00:00<?, ?it/s]

Resultados con Sentence Transformers:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       200
           1       0.97      1.00      0.99       203
           2       0.99      0.97      0.98       201

    accuracy                           0.99       604
   macro avg       0.99      0.99      0.99       604
weighted avg       0.99      0.99      0.99       604



In [9]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of input strings (list, tuple, array or pandas Series).
        labels: Iterable of integer class ids, aligned position-by-position with ``texts``.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length each encoded sequence is truncated / padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same number of items.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise both inputs as plain lists so that __getitem__ is strictly
        # positional. Indexing a pandas Series with an integer is label-based, so a
        # Series whose index was not reset (e.g. straight out of train_test_split)
        # would raise KeyError or silently return a different row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors (the batch
            dimension added by ``return_tensors="pt"`` is flattened away) and a scalar
            ``torch.long`` tensor under ``'labels'``.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',  # fixed-length items so the default collate can stack them
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [10]:
# Re-index the train/test splits to 0..n-1 (dropping the original row labels) so that
# integer lookups by position work for downstream consumers.
X_train = X_train.reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)

# Sanity check: features and targets must have matching lengths within each split.
print(f"Longitud de X_train: {len(X_train)}")
print(f"Longitud de y_train: {len(y_train)}")
# The last two messages report the test split, so they are labelled as such
# (they previously repeated the X_train / y_train labels).
print(f"Longitud de X_test: {len(X_test)}")
print(f"Longitud de y_test: {len(y_test)}")

Longitud de X_train: 2416
Longitud de y_train: 2416
Longitud de X_train: 604
Longitud de y_train: 604


In [11]:
# Seed PyTorch before the model is created so that the newly initialised classifier
# head and the DataLoader shuffling order are repeatable across runs. 42 matches the
# random_state used for the scikit-learn split and models.
torch.manual_seed(42)

# NOTE(review): 'bert-base-uncased' is documented as an English, lower-cased checkpoint,
# while the phrases are Spanish — consider a multilingual or Spanish checkpoint; confirm
# before changing, since it alters the results.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# One output logit per distinct class id in the data.
num_classes = data['tipo'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Wrap both splits so each item is tokenized to a fixed length of 128 tokens.
train_dataset = TextDataset(
    X_train,
    y_train,
    tokenizer,
    max_len=128
)

test_dataset = TextDataset(
    X_test,
    y_test,
    tokenizer,
    max_len=128
)


# Shuffle only the training batches; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = Adam(model.parameters(), lr=2e-5)

# Fine-tune for three passes over the training set.
for epoch in range(3):
    model.train()
    # The description is constant within an epoch, so set it once on the progress bar
    # instead of on every batch.
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Show the loss of the most recent batch next to the progress bar.
        loop.set_postfix(loss=loss.item())


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 151/151 [25:55<00:00, 10.30s/it, loss=0.0276]
Epoch 1: 100%|██████████| 151/151 [25:56<00:00, 10.31s/it, loss=0.0191] 
Epoch 2: 100%|██████████| 151/151 [25:42<00:00, 10.21s/it, loss=0.00288]


In [12]:
# Distinct intent names present in the combined frame.
data['label'].unique()

array(['Artista', 'ConsultarInformacion', 'CrearPlaylist'], dtype=object)

In [14]:
# Evaluate the fine-tuned model on the held-out test split.
model.eval()
all_preds = []
all_labels = []

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The reference labels are only compared on the host, so they stay on the CPU.
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# Build an explicit class-id -> name mapping. classification_report assigns
# target_names positionally to the sorted class ids, so the names must be ordered by
# id rather than by order of first appearance in the frame.
label_by_id = dict(zip(data['tipo'], data['label']))
class_ids = sorted(label_by_id)
class_names = [label_by_id[class_id] for class_id in class_ids]

print("Resultados con BERT:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names))


Resultados con BERT:
                      precision    recall  f1-score   support

             Artista       1.00      0.99      0.99       200
ConsultarInformacion       1.00      0.99      1.00       203
       CrearPlaylist       0.98      1.00      0.99       201

            accuracy                           0.99       604
           macro avg       0.99      0.99      0.99       604
        weighted avg       0.99      0.99      0.99       604

