In [20]:
import tempfile
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from skorch.callbacks import EarlyStopping, ProgressBar, LRScheduler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import mlflow
from rich import print
import spacy
import re
import time
import pandas as pd
from pathlib import Path
import tarfile
import joblib
import json

In [None]:
# Check whether CUDA (GPU support) is available to PyTorch

import torch
print(torch.cuda.is_available())

In [13]:
# Create the folders this notebook writes to; folders that already exist are left untouched

for folder in (
    "./Plots",
    "./data",
    "./data/test_data",
    "./data/train_data",
    "./Models",
    "./Models/Streamlit",
):
    os.makedirs(folder, exist_ok=True)


In [4]:
# Initialise the MLflow experiment

# NOTE(review): the name suggests an experiment id — confirm what mlflow.set_experiment returns
exp_id = mlflow.set_experiment("Classification - NLP")

# The run is opened without a context manager; it is only closed by the
# mlflow.end_run() call further down in the notebook.
run = mlflow.start_run(run_name="Logistic Regression und MLP")
run_id = run.info.run_id

# Filled by the training cells: pipeline name -> MLflow model URI
model_uris = {}


In [17]:
# Read the train and test CSV files (input for the TF-IDF models) straight from the archive

columns = ["Label", "Title", "Description"]
filename = "amazon_review_full_csv.tar.gz"
filepath = Path.cwd().joinpath("data", filename)

with tarfile.open(filepath, mode="r:gz") as tar:

    # Both members are read without a header row (header=None)
    test_file = tar.extractfile("amazon_review_full_csv/test.csv")
    df_test = pd.read_csv(test_file, header=None)

    train_file = tar.extractfile("amazon_review_full_csv/train.csv")
    df_train = pd.read_csv(train_file, header=None)

# Name the columns once both frames are loaded
df_test.columns, df_train.columns = columns, columns

In [None]:
# Total in-memory size of both frames; deep=True also inspects the contents of object columns.
# The value is computed before formatting because a replacement field that spans several
# lines inside a single-quoted f-string is a SyntaxError on Python versions before 3.12.
total_megabytes = (
    df_train.memory_usage(deep=True).sum() + df_test.memory_usage(deep=True).sum()
) / 1000000
print(f"Speicherverbrauch im RAM: {total_megabytes:.3f} MegaBytes")


In [None]:
# Shape of the combined data set, derived from the two frames directly so that no
# temporary copy of every row has to be concatenated just to read `.shape`.
# Both frames were given the same column names when they were loaded.
total_shape = (len(df_train) + len(df_test), df_train.shape[1])
print(f"Shape des gesamten Datensatzes: {total_shape}")


In [20]:
# Log the first 10 training records to MLflow as a sample of the data

# The CSV only needs to exist until it has been logged, hence the temporary directory
with tempfile.TemporaryDirectory() as tmp:
    sample_data = df_train.head(10)
    local_path = os.path.join(tmp, "sample_data.csv")
    sample_data.to_csv(local_path)
    mlflow.log_artifact(local_path=local_path, artifact_path="Data")


In [None]:
# Class distribution: check whether the classes are imbalanced.
# An imbalanced distribution would call for countermeasures such as under-/oversampling,
# and the evaluation metric should be chosen with the class distribution in mind.

sns.set_theme(context="paper", style="whitegrid")

# Only the label column is needed, so only that column is concatenated instead of
# copying both complete frames.
labels_all = pd.concat([df_train["Label"], df_test["Label"]], ignore_index=True)
class_shares = labels_all.value_counts(normalize=True).sort_index()

# Explicit figure/axes instead of the implicit pyplot "current figure"
fig, ax = plt.subplots()
class_shares.plot(kind="bar", color="steelblue", ax=ax)

ax.set_title("Klassenverteilung")
ax.set_xlabel("Klassen (Review-Ratings)")
ax.set_ylabel("Anteil")
plt.xticks(rotation=0)

# Percentage label on top of every bar
for p in ax.patches:
    h = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., h + 0.005, f'{h:.1%}',
            ha='center', va='bottom', fontsize=9)

# Extra headroom so the labels are not clipped at the top
ax.set_ylim(top=ax.get_ylim()[1] * 1.08)

save_path = Path.cwd() / "Plots" / "Klassenverteilung.png"
fig.tight_layout()
fig.savefig(save_path, dpi=300, bbox_inches="tight")

mlflow.log_artifact(local_path=save_path, artifact_path="Plots")


--> Die Klassenverteilung ist exakt ausgeglichen, daher sind keine Maßnahmen wie Under-/Oversampling erforderlich.


In [None]:
# Check for missing entries

# Adding the per-frame counts yields the same totals as counting on a concatenated
# copy of both frames, without materialising that copy.
print(df_train.isna().sum() + df_test.isna().sum())

# Drop incomplete rows and renumber the index
df_test = df_test.dropna().reset_index(drop=True)
df_train = df_train.dropna().reset_index(drop=True)


--> In 214 Zeilen fehlt der Titel der Review.
--> Da dies im Verhältnis zur Größe des Datensatzes vernachlässigbar ist, werden diese Zeilen gelöscht.

In [23]:
# Split into X and y
# Title and description text are merged into a single text feature.


def _join_title_and_description(frame):
    """Return one text per row: the "Title" and "Description" columns joined by a space."""
    return frame["Title"] + " " + frame["Description"]


X_train = _join_title_and_description(df_train)
X_test = _join_title_and_description(df_test)

y_train = df_train["Label"]
y_test = df_test["Label"]

# Persist all four series (no header, no index) so that later cells can reload them
for series, target in (
    (X_train, "./data/train_data/X_train.csv"),
    (X_test, "./data/test_data/X_test.csv"),
    (y_train, "./data/train_data/y_train.csv"),
    (y_test, "./data/test_data/y_test.csv"),
):
    series.to_csv(target, header=None, index=False)


In [None]:
# The first 5 values of the combined text feature

print(X_train.head(5))

In [None]:
# Folgender Befehl muss im Terminal ausgeführt werden, um das Spacy-Sprachpaket zu laden
# python -m spacy download en_core_web_sm

In [None]:
# Training eines Logistic Regression Models mit TF-IDF als Vectorizer
# TF-IDF von Sklearn gibt eine Sparse-Matrix aus


class TextCleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that normalises raw texts before vectorisation.

    Every text is lower-cased and reduced to the letters a-z separated by single
    spaces, then lemmatised with spaCy; stop words and non-alphabetic tokens are
    dropped.

    Parameters
    ----------
    model_name : str
        Name of the spaCy pipeline handed to ``spacy.load``.
    batch_size : int
        Batch size handed to ``nlp.pipe``.
    """

    def __init__(self, model_name="en_core_web_sm", batch_size=256):
        self.model_name = model_name
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    @staticmethod
    def _normalise(text):
        """Lower-case ``text``, delete digits, blank out other non-letters, collapse whitespace.

        Anything that is not a ``str`` becomes the empty string.
        """
        if not isinstance(text, str):
            return ""
        lowered = text.strip().lower()
        without_digits = re.sub(r"\d+", "", lowered)
        letters_only = re.sub(r"[^a-z\s]", " ", without_digits)
        return re.sub(r"\s+", " ", letters_only)

    def transform(self, X):
        """Return a ``pd.Series`` of cleaned, lemmatised texts carrying the index of ``X``."""
        texts = X if isinstance(X, pd.Series) else pd.Series(X)

        # The spaCy pipeline is loaded inside the call and is never stored on the instance
        nlp = spacy.load(self.model_name, disable=["parser", "ner"])

        lemmatised = []
        for doc in nlp.pipe(texts.apply(self._normalise), batch_size=self.batch_size):
            kept = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
            lemmatised.append(" ".join(kept))

        return pd.Series(lemmatised, index=texts.index).dropna()


# TF-IDF vectoriser limited to the 10,000 top features over unigrams and bigrams
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=0.001, # ignore terms that occur in fewer than 0.1 % of the documents
    max_df=0.95 # ignore terms that occur in more than 95 % of the documents (remaining stop words)
)

# L2-regularised logistic regression trained with the SAGA solver
lr = LogisticRegression(solver="saga",
                        penalty="l2",
                        max_iter=1000,
                        random_state=42,
                        verbose=1
                        )

# Full pipeline: text cleaning -> TF-IDF -> logistic regression
pipeline_tfidf_lr = Pipeline([
    ("text_cleaner", TextCleaner()),
    ("tfidf_vectorizer", tfidf),
    ("logistic_regression", lr)
], verbose=True)


def get_x_train():
    """Load ./data/train_data/X_train.csv (no header row) and squeeze its columns axis."""
    csv_path = "./data/train_data/X_train.csv"
    frame = pd.read_csv(csv_path, header=None)
    return frame.squeeze("columns")

def get_y_train():
    """Load ./data/train_data/y_train.csv (no header row) and squeeze its columns axis."""
    csv_path = "./data/train_data/y_train.csv"
    frame = pd.read_csv(csv_path, header=None)
    return frame.squeeze("columns")


X_train = get_x_train()
y_train = get_y_train()

# perf_counter is a monotonic clock, so the measured duration cannot be distorted
# by adjustments of the system time
start_time = time.perf_counter()

pipeline_tfidf_lr.fit(X_train, y_train)

# Convert to minutes first and round afterwards (two decimals). Rounding the seconds
# before dividing by 60 logged long, unrounded fractions of a minute.
overall_train_time = round((time.perf_counter() - start_time) / 60, 2)

mlflow.log_metric(key="Training_time_lr_tfidf_minutes", value=overall_train_time)

# Log the fitted pipeline and remember its URI for the evaluation cell
mlflow.sklearn.log_model(pipeline_tfidf_lr, artifact_path="tfidf_lr_pipeline")
model_uris["tfidf_lr_pipeline"] = f"runs:/{run_id}/tfidf_lr_pipeline"

# Store the class labels seen during training next to the model
with open("class_labels.json", "w", encoding="utf-8") as f:
    json.dump(pipeline_tfidf_lr.classes_.tolist(), f)

mlflow.log_artifact("class_labels.json", artifact_path="Data")


In [None]:
# Free RAM with the Python garbage collector
# NOTE(review): gc.collect() can only reclaim objects that are no longer referenced —
# confirm that large objects from the previous cells are released (e.g. via `del`) if memory matters

import gc

gc.collect()

In [None]:
# Open the run for the Sentence-BERT training only when no run is active.
# The run started when the experiment was initialised is not ended before this cell,
# and mlflow.start_run() raises while another run is still active — which made this
# cell fail on "Restart & Run All".
if mlflow.active_run() is None:
    mlflow.start_run(run_name="NochmalBertTraining")

In [6]:
# Id of the currently active MLflow run; the training cells use it to build model URIs
run_id = mlflow.active_run().info.run_id

In [None]:
from sklearn.preprocessing import StandardScaler
# Training eines Logistic Regression Models mit Sentence-BERT als Vectorizer
# Der Sentence-BERT wird in einer Sklearn-Transformer-Klasse eingebettet, damit eine Pipeline erstellt werden kann


# all-MiniLM-L6-v2 produces embeddings with 384 dimensions
class SentenceBERTVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that encodes texts with a Sentence-BERT model.

    Parameters
    ----------
    model_name : str
        Name of the SentenceTransformer model to load.
    batch_size : int
        Batch size passed to ``SentenceTransformer.encode``.
    device : str
        Device used for loading and encoding. A CUDA device falls back to ``"cpu"``
        (with a warning) when CUDA is not available.

    Attributes
    ----------
    model_ : SentenceTransformer
        The loaded model. It is excluded from pickling and re-created on demand.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", batch_size=256, device="cuda"):
        self.model_name = model_name
        self.batch_size = batch_size
        self.device = device

    def _resolve_device(self):
        """Return the configured device, or "cpu" if CUDA was requested but is unavailable."""
        import warnings

        import torch

        if str(self.device).startswith("cuda") and not torch.cuda.is_available():
            warnings.warn(
                f"Device {self.device!r} requested but CUDA is not available; using 'cpu'."
            )
            return "cpu"
        return self.device

    def _load_model(self):
        """Create the SentenceTransformer for the configured model name and device."""
        # Imported here so that the class does not depend on an import statement
        # that only runs in a later notebook cell.
        from sentence_transformers import SentenceTransformer

        return SentenceTransformer(self.model_name, device=self._resolve_device())

    def fit(self, X, y=None):
        """Load the model; nothing is learned from ``X``. Returns ``self``."""
        self.model_ = self._load_model()
        return self

    def transform(self, X):
        """Return the normalised sentence embeddings of ``X`` as a numpy array."""
        # model_ is missing after unpickling (see __getstate__), so it is loaded lazily
        if getattr(self, "model_", None) is None:
            self.model_ = self._load_model()

        # Hand a plain list to encode() so that a pandas index (e.g. after filtering
        # rows) cannot interfere with positional access inside the encoder
        sentences = X.tolist() if hasattr(X, "tolist") else X

        return self.model_.encode(
            sentences,
            batch_size=self.batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            device=self._resolve_device(),
            normalize_embeddings=True
        )

    def __getstate__(self):
        # For joblib/pickle: do not store the model itself (too large)
        state = self.__dict__.copy()
        state.pop("model_", None)
        return state

    def __setstate__(self, state):
        # Only the parameters are restored here; the SentenceTransformer itself is
        # re-created on the first transform() call
        self.__dict__.update(state)


# Pipeline: Sentence-BERT embeddings -> L2-regularised logistic regression (SAGA solver)
pipeline_bert = Pipeline([
    ('sbert', SentenceBERTVectorizer(model_name='all-MiniLM-L6-v2', batch_size=256)),
    ('clf', LogisticRegression(solver="saga",
                        penalty="l2",
                        max_iter=100,
                        random_state=42,
                               verbose=1
                        ))
])


def get_x_train_trans():
    """Read the training texts from ./data/train_data/X_train.csv (no header), columns axis squeezed."""
    texts = pd.read_csv("./data/train_data/X_train.csv", header=None)
    return texts.squeeze("columns")

def get_y_train_trans():
    """Read the training labels from ./data/train_data/y_train.csv (no header), columns axis squeezed."""
    labels = pd.read_csv("./data/train_data/y_train.csv", header=None)
    return labels.squeeze("columns")

X_train = get_x_train_trans()
y_train = get_y_train_trans()

# perf_counter is a monotonic clock, so the measured duration cannot be distorted
# by adjustments of the system time
start_time = time.perf_counter()

pipeline_bert.fit(X_train, y_train)

# Convert to minutes first and round afterwards (two decimals). Rounding the seconds
# before dividing by 60 logged long, unrounded fractions of a minute.
overall_train_time = round((time.perf_counter() - start_time) / 60, 2)

mlflow.log_metric(key="Training_time_lr_bert_minutes", value=overall_train_time)

# Log the fitted pipeline and remember its URI for the evaluation cell
mlflow.sklearn.log_model(pipeline_bert, artifact_path="bert_lr_pipeline")
model_uris["bert_lr_pipeline"] = f"runs:/{run_id}/bert_lr_pipeline"


In [None]:
# Free RAM with the Python garbage collector
# NOTE(review): gc.collect() can only reclaim objects that are no longer referenced —
# confirm that large objects from the previous cells are released (e.g. via `del`) if memory matters

import gc

gc.collect()

In [None]:
# Natives PyTorch MLP

import torch
import torch.nn as nn
# Skorch ermöglicht es, ein natives PyTorch Model in eine Sklearn-artige Pipeline einzubetten
from skorch import NeuralNetClassifier

from sentence_transformers import SentenceTransformer


class MLPClassifier(nn.Module):
    """Feed-forward classifier: input_dim -> hidden_dim -> hidden_dim // 2 -> num_classes.

    Parameters
    ----------
    input_dim : int
        Size of the input feature vector.
    hidden_dim : int
        Width of the first hidden layer; the second hidden layer has half that width.
    num_classes : int
        Number of output scores.
    dropout : float or None
        Dropout probability applied after each ReLU. With ``None`` (default) no
        dropout layer is created, so the layer layout — and with it the state-dict
        keys — is exactly the one of the network without dropout.
    """

    def __init__(self, input_dim, hidden_dim=128, num_classes=5, dropout=None):
        super().__init__()
        layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU()]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
        layers += [nn.Linear(hidden_dim, hidden_dim // 2), nn.ReLU()]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
        layers.append(nn.Linear(hidden_dim // 2, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw, un-normalised class scores for ``x``."""
        return self.net(x)


class SentenceTransformerEmbedder(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns texts into SentenceTransformer embeddings.

    Parameters
    ----------
    model_name : str
        Name of the SentenceTransformer model; it is loaded in the constructor and
        kept in ``self.model``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        super().__init__()
        # scikit-learn reads every constructor argument back from an attribute of the
        # same name (get_params / clone / repr); without it those calls raise AttributeError
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def fit(self, X, y=None):
        """Nothing to fit; returns ``self``."""
        return self

    def transform(self, X):
        """Return the embeddings of ``X`` as a numpy array."""
        # Hand a plain list to encode() so that a pandas index cannot interfere with
        # positional access inside the encoder
        sentences = X.tolist() if hasattr(X, "tolist") else X
        return self.model.encode(sentences, convert_to_numpy=True)


# Skorch wrapper around the PyTorch model
# Train on the GPU when CUDA is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on {device}")
net = NeuralNetClassifier(
    MLPClassifier,
    # 384 is the embedding size of all-MiniLM-L6-v2 noted earlier in this notebook
    module__input_dim=384,
    module__num_classes=5,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.Adam,
    lr=0.001,
    max_epochs=100,
    batch_size=128,
    device=device,
    verbose=1,
    callbacks=[
        # Early stopping monitors the validation loss with a patience of 5
        EarlyStopping(
            monitor="valid_loss",
            patience=5,
            threshold=1e-4,
            lower_is_better=True
        )
    ]
)

# Pipeline: sentence embeddings -> skorch-wrapped MLP
pipeline_skorch = Pipeline([
    ("embedder", SentenceTransformerEmbedder()),
    ("classifier", net)
])


def get_x_train_trans():
    """Return the contents of ./data/train_data/X_train.csv (read without header), columns axis squeezed."""
    source = "./data/train_data/X_train.csv"
    loaded = pd.read_csv(source, header=None)
    return loaded.squeeze("columns")

def get_y_train_trans():
    """Return the contents of ./data/train_data/y_train.csv (read without header), columns axis squeezed."""
    source = "./data/train_data/y_train.csv"
    loaded = pd.read_csv(source, header=None)
    return loaded.squeeze("columns")


X_train = get_x_train_trans()
y_train = get_y_train_trans()

# Skorch expects labels from 0 to C-1
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# perf_counter is a monotonic clock, so the measured duration cannot be distorted
# by adjustments of the system time
start_time = time.perf_counter()

pipeline_skorch.fit(X_train, y_train_encoded)

# Minutes with two decimals, the same precision as the other training-time metrics;
# rounding to whole minutes reported 0 for runs shorter than half a minute.
overall_train_time = round((time.perf_counter() - start_time) / 60, 2)
print(f"Beendet nach {overall_train_time} Minuten")

mlflow.log_metric(key="Training_time_MLP_bert_minutes", value=overall_train_time)

# Log the fitted pipeline and remember its URI for the evaluation cell
mlflow.sklearn.log_model(pipeline_skorch, artifact_path="bert_PyTorch_MLP_withoutDo")
model_uris["bert_PyTorch_MLP_withoutDo"] = f"runs:/{run_id}/bert_PyTorch_MLP_withoutDo"

In [None]:
# Show the callbacks configured on the skorch net
print(net.callbacks)

In [None]:
# Model Evaluation

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


def get_x_test():
    """Load ./data/test_data/X_test.csv (no header row) and squeeze its columns axis."""
    test_texts = pd.read_csv("./data/test_data/X_test.csv", header=None)
    return test_texts.squeeze("columns")


def get_y_test():
    """Load ./data/test_data/y_test.csv (no header row) and squeeze its columns axis."""
    test_labels = pd.read_csv("./data/test_data/y_test.csv", header=None)
    return test_labels.squeeze("columns")


def get_models_from_mlflow():
    """Load every pipeline registered in ``model_uris`` from MLflow.

    Returns
    -------
    dict
        Maps each pipeline name (the keys of ``model_uris``) to the loaded
        scikit-learn pipeline.
    """
    # model_uris maps name -> URI: the URI (value) is what load_model needs,
    # the name (key) identifies the model in plots and metrics
    return {name: mlflow.sklearn.load_model(uri) for name, uri in model_uris.items()}


# Load the model pipelines from their MLflow URIs
model_pipelines = get_models_from_mlflow()

# The test data is the same for every model, so it is read only once
X_test = get_x_test()
y_test_raw = get_y_test()

for name, model in model_pipelines.items():

    # The skorch pipeline was trained on the encoded labels 0, 1, 2, 3, 4
    if "MLP" in name:
        print("LabelEncoder anwenden...")
        le = LabelEncoder()
        y_test = le.fit_transform(y_test_raw)
    else:
        y_test = y_test_raw

    y_pred = model.predict(X_test)

    ac_score = accuracy_score(y_true=y_test, y_pred=y_pred)

    f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")

    cr = pd.DataFrame(classification_report(y_true=y_test, y_pred=y_pred, output_dict=True))

    # Confusion matrix: save as PNG, log to MLflow, then close the figure
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
    fig = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    fig.plot()
    plot_path = f"./Plots/confusion_matrix_{name}.png"
    plt.savefig(plot_path, dpi=300)
    mlflow.log_artifact(plot_path, artifact_path="Plots")
    plt.close()

    # The classification report only needs to exist until it has been logged
    with tempfile.TemporaryDirectory() as tmp:
        local_path = os.path.join(tmp, f"classification_report{name}.csv")
        cr.to_csv(local_path)
        mlflow.log_artifact(local_path, artifact_path="Plots")

    metrics = {
        f"{name}_accuracy_score": ac_score,
        f"{name}_f1_score": f1
    }

    mlflow.log_metrics(metrics=metrics)


In [None]:
# Save the scikit-learn model pipelines as .joblib files

# model_pipelines is meant to map pipeline name -> loaded pipeline; iterating a dict
# directly yields only its keys, so unpacking into (name, model) requires .items()
for name, model in model_pipelines.items():
    joblib.dump(model, filename=f"./Models/{name}.joblib")

# End the MLflow run for good
mlflow.end_run()


In [None]:
# Free RAM with the Python garbage collector
# NOTE(review): gc.collect() can only reclaim objects that are no longer referenced —
# confirm that large objects from the previous cells are released (e.g. via `del`) if memory matters

import gc

gc.collect()

In [None]:
# Zero-shot sentiment analysis with a Hugging Face transformer (BART by Facebook)
# Sentiment polarities

from transformers import pipeline

classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    truncation=True,
    max_length=512,
    batch_size=128
)

# Only the first 50,000 test reviews are evaluated
with tarfile.open("./data/amazon_review_full_csv.tar.gz", "r:gz") as archive:
    samples = pd.read_csv(archive.extractfile("amazon_review_full_csv/test.csv"), header=None, nrows=50000)

samples.columns = ["Label", "Title", "Description"]

# A missing title or description would turn the concatenated text into NaN, which is
# not a string the classifier can process; drop such rows before building the input
samples = samples.dropna().reset_index(drop=True)

X_sample = samples.loc[:, "Title"] + " " + samples.loc[:, "Description"]
y_sample = samples.loc[:, "Label"]


# Collapse the 5 star ratings into the 3 sentiment classes offered to the classifier
mapping = {1: "negative", 2: "negative", 3: "neutral", 4: "positive", 5: "positive"}
y_sample_mapped = y_sample.map(mapping).tolist()

result = classifier(
    X_sample.to_list(),
    candidate_labels=["positive", "negative", "neutral"]
)

# The first entry of each result's "labels" list is used as the prediction
y_pred_labels = [d["labels"][0] for d in result]

acc_score = accuracy_score(y_true=y_sample_mapped, y_pred=y_pred_labels)

# Kept under its own name: assigning the result to `f1_score` would overwrite the
# imported sklearn function, and the cell could not be executed a second time
f1_transformer = f1_score(y_true=y_sample_mapped, y_pred=y_pred_labels, average="macro")

metrics = {
    "f1_score_Transformer": f1_transformer,
    "accuracy_Transformer": acc_score
}

# NOTE(review): mlflow.end_run() was called in an earlier cell — confirm which run
# these metrics are supposed to be logged to
mlflow.log_metrics(metrics)

print(f"Die Genauigkeit durch den Transformer im Zero-Shot beträgt: {acc_score * 100:.2f} %")

print(f"Der F1-Score durch den Transformer im Zero-Shot beträgt: {f1_transformer * 100:.2f} %")




In [None]:
import torch
import torch.nn as nn
# Skorch ermöglicht es, ein natives PyTorch Model in eine Sklearn-artige Pipeline einzubetten
from skorch import NeuralNetClassifier

from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score


# There are now 3 classes instead of the previous 5
class MLPClassifier(nn.Module):
    """Feed-forward classifier: input_dim -> hidden_dim -> hidden_dim // 2 -> num_classes.

    Parameters
    ----------
    input_dim : int
        Size of the input feature vector.
    hidden_dim : int
        Width of the first hidden layer; the second hidden layer has half that width.
    num_classes : int
        Number of output scores (3 by default).
    dropout : float or None
        Dropout probability applied after each ReLU. With ``None`` (default) no
        dropout layer is created, so the layer layout — and with it the state-dict
        keys — is exactly the one of the network without dropout.
    """

    def __init__(self, input_dim, hidden_dim=128, num_classes=3, dropout=None):
        super().__init__()
        layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU()]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
        layers += [nn.Linear(hidden_dim, hidden_dim // 2), nn.ReLU()]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
        layers.append(nn.Linear(hidden_dim // 2, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw, un-normalised class scores for ``x``."""
        return self.net(x)


class SentenceTransformerEmbedder(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns texts into SentenceTransformer embeddings.

    Parameters
    ----------
    model_name : str
        Name of the SentenceTransformer model; it is loaded in the constructor and
        kept in ``self.model``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        super().__init__()
        # scikit-learn reads every constructor argument back from an attribute of the
        # same name (get_params / clone / repr); without it those calls raise AttributeError
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def fit(self, X, y=None):
        """Nothing to fit; returns ``self``."""
        return self

    def transform(self, X):
        """Return the embeddings of ``X`` as a numpy array."""
        # Hand a plain list to encode() so that a pandas index cannot interfere with
        # positional access inside the encoder
        sentences = X.tolist() if hasattr(X, "tolist") else X
        return self.model.encode(sentences, convert_to_numpy=True)


# Skorch wrapper around the PyTorch model (3-class variant)
# Train on the GPU when CUDA is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on {device}")
net = NeuralNetClassifier(
    MLPClassifier,
    # 384 is the embedding size of all-MiniLM-L6-v2 noted earlier in this notebook
    module__input_dim=384,
    module__num_classes=3,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.Adam,
    lr=0.001,
    max_epochs=100,
    batch_size=128,
    device=device,
    verbose=1,
    callbacks=[
        # Early stopping monitors the validation loss with a patience of 5
        EarlyStopping(
            monitor="valid_loss",
            patience=5,
            threshold=1e-4,
            lower_is_better=True
        )
    ]
)

# Pipeline: sentence embeddings -> skorch-wrapped MLP
pipeline_skorch = Pipeline([
    ("embedder", SentenceTransformerEmbedder()),
    ("classifier", net)
])


def get_x_train_trans():
    """Training texts from ./data/train_data/X_train.csv, read without header and squeezed along columns."""
    raw = pd.read_csv("./data/train_data/X_train.csv", header=None)
    squeezed = raw.squeeze("columns")
    return squeezed

def get_y_train_trans():
    """Training labels from ./data/train_data/y_train.csv, read without header and squeezed along columns."""
    raw = pd.read_csv("./data/train_data/y_train.csv", header=None)
    squeezed = raw.squeeze("columns")
    return squeezed


X_train = get_x_train_trans()
y_train = get_y_train_trans()

# Collapse the 5 star ratings into 3 sentiment classes
mapping = {1: "negative", 2: "negative", 3: "neutral", 4: "positive", 5: "positive"}
# Despite the "sample" in its name, this holds the mapped labels of the training data loaded above
y_sample_mapped = y_train.map(mapping).tolist()

# Skorch expects labels from 0 to C-1
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_sample_mapped)


pipeline_skorch.fit(X_train, y_train_encoded)


# Evaluate on the test split
X_test = pd.read_csv("./data/test_data/X_test.csv", header=None).squeeze("columns")
y_test = pd.read_csv("./data/test_data/y_test.csv", header=None).squeeze("columns")
y_test_mapped = y_test.map(mapping).tolist()

# Reuse the encoder fitted on the training labels so both splits share one encoding
y_test_encoded = le.transform(y_test_mapped)

y_pred = pipeline_skorch.predict(X_test)

acc_score = accuracy_score(y_true=y_test_encoded, y_pred=y_pred)
f1 = f1_score(y_true=y_test_encoded, y_pred=y_pred, average="macro")

print(f"Folgender Accuracy Score konnte erzielt werden: {acc_score:.2f}")
print(f"Folgender F1-Score konnte erzielt werden: {f1:.2f}")


In [None]:
# Store the MLP pipeline for the web interface
import joblib

# The model URI is specific to one local MLflow run and therefore has to be adapted individually
# NOTE(review): the experiment and model ids below are hard-coded — confirm they exist on the machine running this cell
model_uri_MLP = f"./mlruns/457594409970679324/models/m-7d7d43bbe4ae4d7280caa04604264fbe/artifacts"
model = mlflow.sklearn.load_model(model_uri_MLP)

# Dump the loaded pipeline into the folder reserved for the Streamlit model
joblib.dump(model, "./Models/Streamlit/streamlit_model.joblib")