In [1]:
import pandas as pd

In [6]:
# Read the first 10,000 rows of the semicolon-separated review export.
df = pd.read_csv(
    './data/merged_reviews_metadata_renamed.csv',
    sep=';',
    nrows=10000,
)
print(df.shape)

# Build the model input by joining title and body with a single space, then
# keep only the rating and the joined text. A row whose title or body is
# missing ends up with a missing `combined_text`.
df = df.assign(
    combined_text=df['review_title'] + " " + df['review_text']
)[['review_rating', 'combined_text']]

(10000, 7)


In [7]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed; rows without usable text are removed from
# each part after the split, so the final sizes can be slightly below 80/20.
train_df, test_df = (
    split.dropna(subset=['combined_text'])
    for split in train_test_split(df, test_size=0.2, random_state=42)
)

for split in (train_df, test_df):
    print(split.shape)

(7979, 2)
(1993, 2)


In [8]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the whole training text column in a single call, truncating at 512
# tokens and returning PyTorch tensors.
# NOTE(review): `tokens` is not referenced by any later cell visible here;
# the dataset class tokenizes on the fly instead.
tokens = tokenizer(
    train_df['combined_text'].tolist(),
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/can/Projects/Python/ml-bert-ratings/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/can/Projects/Python/ml-bert-ratings/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/can/Projects/Python/ml-bert-rat

In [9]:
from transformers import BertForSequenceClassification

# Pretrained 'bert-base-uncased' encoder with a sequence-classification head
# sized for 5 classes (presumably one per star rating, 1-5).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Args:
        texts: Iterable of review strings.
        labels: Iterable of star ratings aligned with ``texts``. Ratings are
            expected to be whole numbers starting at 1 (e.g. 1-5); they are
            shifted to zero-based class indices.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialize as plain lists so positional indexing works regardless
        # of the index carried by a pandas Series.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded review at ``idx`` plus its zero-based label.

        Returns:
            dict: The tokenizer's tensors with the batch dimension removed,
            and ``'labels'`` as a scalar ``torch.long`` tensor.

        Raises:
            ValueError: If the stored rating is not a whole number.
        """
        text = self.texts[idx]
        rating = self.labels[idx]  # assumption: ratings lie between 1 and 5

        # Ratings may arrive as floats (e.g. 5.0). Convert explicitly so the
        # label is always an integer class index and never a float tensor.
        class_index = int(rating) - 1
        if class_index != rating - 1:
            raise ValueError(
                f"Label at index {idx} is not a whole-number rating: {rating!r}"
            )

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop the batch dimension of size 1 added by return_tensors="pt".
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Shift ratings 1-5 down to class indices 0-4.
        item['labels'] = torch.tensor(class_index, dtype=torch.long)
        return item


In [15]:
from torch.utils.data import DataLoader

# Wrap each split in a ReviewDataset that tokenizes with the shared tokenizer.
train_dataset = ReviewDataset(
    texts=train_df['combined_text'],
    labels=train_df['review_rating'],
    tokenizer=tokenizer,
)
test_dataset = ReviewDataset(
    texts=test_df['combined_text'],
    labels=test_df['review_rating'],
    tokenizer=tokenizer,
)

# Batches of 16; only the training data is reshuffled each epoch.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)


In [19]:
import torch
from torch.optim import Adam  # Alternativer Optimizer

# Choose the compute device: MPS (Apple Silicon) first, then CUDA, else CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print("Verwende Device:", device)

# Move the weights to the chosen device and enable training mode.
model.to(device)
model.train()

# Adam optimizer with weight decay.
# NOTE(review): torch.optim.Adam applies weight_decay as an L2 penalty on the
# gradient; AdamW decouples it. Confirm which behavior is intended.
optimizer = Adam(model.parameters(), lr=2e-5, weight_decay=0.01)
epochs = 3

for epoch in range(epochs):
    num_batches = len(train_loader)
    running_loss = 0.0
    print(f"\n--- Epoch {epoch+1}/{epochs} ---")

    for i, batch in enumerate(train_loader, start=1):
        # Every tensor of the batch has to live on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Standard step: clear old gradients, forward pass, backward pass, update.
        # The batch contains `labels`, so the model output carries the loss.
        optimizer.zero_grad()
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Report the running mean loss every 10 steps and on the final step.
        if i == num_batches or i % 10 == 0:
            avg_loss = running_loss / i
            print(f"Batch {i}/{num_batches} - Aktuelle durchschnittliche Loss: {avg_loss:.4f}")

    avg_epoch_loss = running_loss / num_batches
    print(f"Epoch {epoch+1} abgeschlossen. Durchschnittliche Loss: {avg_epoch_loss:.4f}")


Verwende Device: mps

--- Epoch 1/3 ---
Batch 10/499 - Aktuelle durchschnittliche Loss: 1.1826
Batch 20/499 - Aktuelle durchschnittliche Loss: 1.1290
Batch 30/499 - Aktuelle durchschnittliche Loss: 1.0833
Batch 40/499 - Aktuelle durchschnittliche Loss: 1.0289
Batch 50/499 - Aktuelle durchschnittliche Loss: 1.0052
Batch 60/499 - Aktuelle durchschnittliche Loss: 0.9818
Batch 70/499 - Aktuelle durchschnittliche Loss: 0.9605
Batch 80/499 - Aktuelle durchschnittliche Loss: 0.9220
Batch 90/499 - Aktuelle durchschnittliche Loss: 0.9141
Batch 100/499 - Aktuelle durchschnittliche Loss: 0.9071
Batch 110/499 - Aktuelle durchschnittliche Loss: 0.8923
Batch 120/499 - Aktuelle durchschnittliche Loss: 0.8809
Batch 130/499 - Aktuelle durchschnittliche Loss: 0.8763
Batch 140/499 - Aktuelle durchschnittliche Loss: 0.8581
Batch 150/499 - Aktuelle durchschnittliche Loss: 0.8528
Batch 160/499 - Aktuelle durchschnittliche Loss: 0.8442
Batch 170/499 - Aktuelle durchschnittliche Loss: 0.8325
Batch 180/499 - A

In [20]:
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing; otherwise a finished training run
# could be lost to a missing-directory error.
os.makedirs("./data/models", exist_ok=True)

# Save both variants: the weights only (state_dict) and the full pickled model.
# NOTE: the full-model file is a pickle of the module object. Loading it
# requires the same class definitions to be importable, and unpickling can
# execute arbitrary code, so only load such files from trusted sources.
torch.save(model.state_dict(), "./data/models/model_weights.pth")
torch.save(model, "./data/models/model_full.pth")
print("Modell und Gewichte wurden in './data/models/' gespeichert.")

Modell und Gewichte wurden in './data/models/' gespeichert.


In [23]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()  # switch the model to evaluation mode
all_preds = []
all_labels = []
batch_count = 0

print("Starte die Evaluation...")

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        batch_count += 1
        print(f"\n--- Batch {batch_count}/{len(test_loader)} ---")

        # Move the batch to the same device as the model (MPS, CUDA or CPU).
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch contains `labels`, so the output carries both logits and loss.
        outputs = model(**batch)
        logits = outputs.logits
        loss = outputs.loss
        print(f"Batch Loss: {loss.item():.4f}")

        # Predicted class = index of the largest logit per sample.
        preds = torch.argmax(logits, dim=1)
        print(f"Predicted Labels: {preds.cpu().tolist()}")
        print(f"Ground Truth Labels: {batch['labels'].cpu().tolist()}")

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['labels'].cpu().tolist())

print("\nEvaluation abgeschlossen. Bereite finale Ergebnisse vor...")

accuracy = accuracy_score(all_labels, all_preds)
# Pass the class ids explicitly: without `labels`, the report infers the
# classes from the data and raises if fewer than five distinct classes occur,
# because the number of target_names would no longer match.
report = classification_report(
    all_labels,
    all_preds,
    labels=list(range(5)),
    target_names=[f"Klasse {i}" for i in range(5)],
)
print(f"\nOverall Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

Starte die Evaluation...

--- Batch 1/125 ---
Batch Loss: 0.7375
Predicted Labels: [3, 3, 4, 4, 4, 1, 0, 3, 4, 4, 4, 1, 4, 4, 2, 4]
Ground Truth Labels: [3, 4, 3, 4, 4, 1, 0, 3, 4, 4, 3, 1, 4, 4, 3, 4]

--- Batch 2/125 ---
Batch Loss: 0.4189
Predicted Labels: [4, 4, 4, 1, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 0, 4]
Ground Truth Labels: [4, 3, 4, 1, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 1, 4]

--- Batch 3/125 ---
Batch Loss: 1.0483
Predicted Labels: [4, 4, 0, 4, 4, 4, 4, 4, 4, 0, 4, 4, 1, 3, 1, 4]
Ground Truth Labels: [4, 3, 0, 2, 3, 4, 4, 4, 4, 2, 4, 3, 3, 4, 0, 4]

--- Batch 4/125 ---
Batch Loss: 0.4805
Predicted Labels: [4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 2]
Ground Truth Labels: [4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 3, 3, 2]

--- Batch 5/125 ---
Batch Loss: 0.6580
Predicted Labels: [0, 4, 0, 4, 4, 1, 2, 0, 4, 4, 4, 4, 4, 0, 4, 4]
Ground Truth Labels: [2, 4, 0, 4, 3, 2, 2, 1, 3, 4, 4, 4, 4, 0, 4, 4]

--- Batch 6/125 ---
Batch Loss: 0.4023
Predicted Labels: [4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4

In [39]:
def predict_review(review_text):
    """Predict the star rating for a single review text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        review_text: The review text to classify.

    Returns:
        int: Predicted rating, i.e. the arg-max class index plus one.
    """
    # Tokenize the input text, padded/truncated to 512 tokens.
    encoding = tokenizer(
        review_text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Move the tensors to the model's device (MPS, CUDA, CPU).
    encoding = {k: v.to(device) for k, v in encoding.items()}

    # Inference must run in eval mode; otherwise dropout stays active when
    # this is called right after training and predictions become random.
    # The previous mode is restored afterwards so callers see no side effect.
    was_training = model.training
    model.eval()
    try:
        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = model(**encoding)
    finally:
        model.train(was_training)

    pred = torch.argmax(outputs.logits, dim=1)

    # Class indices are zero-based; shift back to the 1-based rating scale.
    return pred.item() + 1

# Quick smoke test: classify one clearly negative review and show the result.
sample_text = "the product was a piece of shit"
prediction = predict_review(review_text=sample_text)
print(f"Die vorhergesagte Sternebewertung: {prediction}")


Die vorhergesagte Sternebewertung: 1
