In [None]:
import pandas as pd
import torch
import json
import numpy as np
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Modellname hier anpassen
model_name = "distilbert-base-uncased"  # z.B. "roberta-base", "albert-base-v2", "prajjwal1/bert-tiny"

# ✅ Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_amp = device.type == "cuda"

# ✅ Load and prepare data
df = pd.read_csv("super_sms_dataset.csv", encoding="latin1")
df = df.rename(columns={"SMSes": "text", "Labels": "label"})
df = df.dropna(subset=["label", "text"])
df["label"] = df["label"].astype(int)

# ✅ Reduziere auf 30 % (stratifiziert)
df_sampled = df.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=0.3, random_state=42))
print("✅ Genutzter Datensatz:", len(df_sampled), "Einträge")

# ✅ Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_sampled["text"].tolist(), df_sampled["label"].tolist(), test_size=0.2, random_state=42
)

# ✅ Tokenizer und Modell (dynamisch)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
for param in model.base_model.parameters():  # 🧊 Encoder optional einfrieren oder trainieren
    param.requires_grad = True
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ✅ Dataset
class SpamDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=64, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self): return len(self.labels)
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = SpamDataset(train_texts, train_labels)
val_dataset = SpamDataset(val_texts, val_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# ✅ Early Stopping Setup
best_val_loss = float('inf')
patience = 4
counter = 0
early_stop = False

# ✅ Training
num_epochs = 9
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"🔧 Avg Training Loss: {avg_train_loss:.4f}")

    # ✅ Validation loss
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"🧪 Avg Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        print("✅ Validation loss improved.")
    else:
        counter += 1
        print(f"⚠️ No improvement. Patience: {counter}/{patience}")
        if counter >= patience:
            print("⏹️ Early stopping triggered.")
            early_stop = True
            break

# ✅ Lernkurve
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss")
plt.plot(range(1, len(val_losses) + 1), val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("📉 Lernkurve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# ✅ Evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = outputs.logits.argmax(dim=1).cpu()
        labels = batch['labels'].cpu()
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

print("\n📊 Classification Report:")
print(classification_report(all_labels, all_preds))

# ✅ Confusion Matrix
cm = confusion_matrix(all_labels, all_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=["Ham", "Spam"], yticklabels=["Ham", "Spam"])
plt.xlabel("Vorhergesagt")
plt.ylabel("Tatsächlich")
plt.title("📊 Confusion Matrix")
plt.tight_layout()
plt.show()

# ✅ Save metrics
bert_metrics = {
    "model": model_name,
    "model_version": f"{model_name.replace('/', '_')}_v9",
    "threshold": 0.5,
    "precision": precision_score(all_labels, all_preds),
    "recall": recall_score(all_labels, all_preds),
    "f1_score": f1_score(all_labels, all_preds),
    "accuracy": (np.array(all_labels) == np.array(all_preds)).mean(),
    "config": {
        "frozen": False,
        "pretrained": model_name,
        "epochs": len(train_losses),
        "max_length": 64,
        "batch_size": 16
    }
}

Path("results").mkdir(exist_ok=True)
metrics_path = f"results/metrics_{model_name.replace('/', '_')}.json"
with open(metrics_path, "w") as f:
    json.dump(bert_metrics, f, indent=4)

from google.colab import files
files.download(metrics_path)


  df_sampled = df.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=0.3, random_state=42))


✅ Genutzter Datensatz: 20102 Einträge


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [1]:
from google.colab import files
uploaded = files.upload()

Saving super_sms_dataset.csv to super_sms_dataset.csv
