# Imports

In [43]:

from __future__ import annotations

import json
import hashlib
from pathlib import Path
from copy import deepcopy


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from scipy.sparse import hstack



# Global Settings

In [26]:
# Figure dimensions are given in centimetres; matplotlib works in inches.
cm = 1 / 2.54

# Uniform look across all figures: one font size, one stroke width for data
# lines, axes spines and ticks, and a fixed marker size.
plt.rcParams.update({
    "figure.figsize": (18*cm, 8*cm),   # default canvas: 18 x 8 cm
    "figure.dpi": 300,                 # crisp rendering in notebooks/exports
})

# Every text element shares the same font size.
plt.rcParams.update(dict.fromkeys(
    ("font.size", "axes.titlesize", "axes.labelsize",
     "xtick.labelsize", "ytick.labelsize", "legend.fontsize"),
    10,
))

# Every stroke (data lines, axes frame, major and minor ticks) has one width.
plt.rcParams.update(dict.fromkeys(
    ("lines.linewidth", "axes.linewidth",
     "xtick.major.width", "ytick.major.width",
     "xtick.minor.width", "ytick.minor.width"),
    1.0,
))

plt.rcParams["lines.markersize"] = 4


In [27]:
# -----------------------------
# Output locations for experiment artefacts
# -----------------------------

EXPERIMENTS_DIR = Path("experiments") / "LinSVM"
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)  # no-op when it already exists

# Long format: one row per (experiment_id, class).
METRICS_PATH = EXPERIMENTS_DIR.joinpath("metrics_long.csv")
# One row per experiment run.
RUNS_PATH = EXPERIMENTS_DIR.joinpath("runs_global.csv")


# -----------------------------
# Dataset columns and reproducibility
# -----------------------------
TEXT_VARIANT_COL = "text_stripped"   # text column fed to the vectorizers
LABEL_COL = "prdtypecode"            # target column

RANDOM_STATE = 42


In [28]:
def load_or_empty_csv(path, columns):
    """Load a CSV log if it has content, otherwise return an empty frame.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the CSV file.
    columns : list of str
        Column names of the empty frame returned when nothing can be loaded.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty frame with ``columns`` when the file is
        missing or completely empty.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    if path.exists():
        try:
            return pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header row to parse; treat it exactly
            # like a missing file instead of aborting the logging step.
            pass
    return pd.DataFrame(columns=columns)


def append_and_dedup(df_old, df_new, subset):
    """Stack ``df_new`` under ``df_old`` and drop duplicate keys.

    Rows are considered duplicates when they agree on the ``subset`` columns;
    the last occurrence wins, so rows from ``df_new`` replace matching rows
    from ``df_old``. The index of the surviving rows is not renumbered.
    """
    combined = pd.concat([df_old, df_new], ignore_index=True)
    return combined.drop_duplicates(subset=subset, keep="last")


def save_confusion_matrices(experiment_id, cm_raw, cm_norm):
    """Write both confusion matrices of a run into its experiment folder.

    The folder ``EXPERIMENTS_DIR / experiment_id`` is created when needed and
    receives ``confusion_raw.npy`` and ``confusion_norm.npy``.
    """
    target_dir = EXPERIMENTS_DIR / experiment_id
    target_dir.mkdir(parents=True, exist_ok=True)

    matrices = (
        ("confusion_raw.npy", cm_raw),
        ("confusion_norm.npy", cm_norm),
    )
    for filename, matrix in matrices:
        np.save(target_dir / filename, matrix)


def log_run_global(experiment_id, cfg, global_metrics):
    """Record the global metrics of one run in ``RUNS_PATH``.

    The config is stored as a key-sorted JSON string. Only the columns listed
    below are written; an existing row with the same ``experiment_id`` is
    replaced by the new one.
    """
    columns = ["experiment_id", "config_json", "accuracy", "macro_f1", "weighted_f1"]
    previous = load_or_empty_csv(RUNS_PATH, columns)

    record = {
        "experiment_id": experiment_id,
        "config_json": json.dumps(cfg, sort_keys=True),
    }
    record.update(global_metrics)

    current = pd.DataFrame([record], columns=columns)
    merged = append_and_dedup(previous, current, subset=["experiment_id"])
    merged.to_csv(RUNS_PATH, index=False)

def _lookup_train_count(train_counts, class_id, report_key):
    """Return the training count of a class, or 0 when it is not listed.

    The count is looked up by label (never by position) under the integer
    class id, the original report key and the stringified class id, so both
    int-keyed and str-keyed mappings / Series work.
    """
    for key in (class_id, report_key, str(class_id)):
        if key in train_counts:
            return int(train_counts[key])
    return 0


def log_metrics_long(experiment_id, cfg, report_dict, train_counts):
    """Append the per-class metrics of one run to ``METRICS_PATH``.

    Parameters
    ----------
    experiment_id : str
        Identifier of the run; together with ``class_id`` it forms the
        de-duplication key of the long-format file.
    cfg : dict
        Run configuration. Stored as key-sorted JSON; keys that match the
        optional columns below are additionally written as own columns.
    report_dict : dict
        Per-class report with ``precision``, ``recall``, ``f1-score`` and
        ``support`` entries (presumably the ``output_dict=True`` result of
        ``classification_report`` - confirm against the caller). Class keys
        must be convertible to ``int``.
    train_counts : mapping or pandas.Series
        Number of training samples per class, keyed by class label (int or
        str).
    """
    # Minimal per-class columns that are always meaningful
    base_cols = [
        "experiment_id", "class_id",
        "precision", "recall", "f1", "support_val",
        "n_train_class",
        "config_json",
    ]

    # Optional "nice-to-have" columns (only filled if present)
    optional_cols = [
        "vectorizer",
        "model",
        "class_weight",
        "analyzer", "ngram_min", "ngram_max", "max_features",   # single-vectorizer case
        "word_ngram_min", "word_ngram_max", "word_max_features", "word_min_df",
        "char_ngram_min", "char_ngram_max", "char_max_features", "char_min_df",
    ]

    cols = base_cols + optional_cols

    # Aggregate rows of the report are not classes. "micro avg" replaces
    # "accuracy" in some reports and "samples avg" appears for multilabel
    # targets; both would make int(k) fail below.
    summary_keys = {"accuracy", "macro avg", "weighted avg", "micro avg", "samples avg"}

    df_old = load_or_empty_csv(METRICS_PATH, cols)

    rows = []
    cfg_json = json.dumps(cfg, sort_keys=True)

    for k, v in report_dict.items():
        if k in summary_keys:
            continue

        class_id = int(k)

        row = {
            "experiment_id": experiment_id,
            "class_id": class_id,
            "precision": float(v["precision"]),
            "recall": float(v["recall"]),
            "f1": float(v["f1-score"]),
            "support_val": int(v["support"]),
            # Labels are cast to str elsewhere in this notebook, so counts
            # may be keyed by "10" rather than 10; accept either form.
            "n_train_class": _lookup_train_count(train_counts, class_id, k),
            "config_json": cfg_json,
        }

        # Fill optional keys if they exist; otherwise leave blank/NaN
        for key in optional_cols:
            if key in cfg:
                row[key] = cfg[key]

        rows.append(row)

    df_new = pd.DataFrame(rows, columns=cols)

    # If old file has fewer columns (from earlier runs), align columns before concat
    for c in df_old.columns:
        if c not in df_new.columns:
            df_new[c] = np.nan
    for c in df_new.columns:
        if c not in df_old.columns:
            df_old[c] = np.nan

    df_new = df_new[df_old.columns]  # same order
    df_out = append_and_dedup(df_old, df_new, subset=["experiment_id", "class_id"])
    df_out.to_csv(METRICS_PATH, index=False)


def stratified_subsample(df, label_col, sample_size, random_state=42):
    """Draw a stratified subsample that preserves class proportions.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset.
    label_col : str
        Column holding the class label used for stratification.
    sample_size : int or None
        Target number of rows. ``0``, ``None`` or any value ``>= len(df)``
        returns the full dataset.
    random_state : int, default 42
        Seed used for both the per-class draw and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        Shuffled subsample with a fresh 0..n-1 index and all original
        columns (including ``label_col``). Each class contributes
        ``max(1, floor(class_size * sample_size / len(df)))`` rows, so the
        result size can differ slightly from ``sample_size``.
    """
    if not sample_size or sample_size >= len(df):
        return df.copy().reset_index(drop=True)

    frac = sample_size / len(df)

    # Sample every class explicitly instead of going through groupby.apply:
    # applying a function to groups that still contain the grouping column is
    # deprecated in pandas, and newer versions drop that column from the
    # result, which would silently remove the labels.
    per_class = [
        group.sample(
            n=max(1, int(len(group) * frac)),   # keep at least one row per class
            random_state=random_state,
        )
        for _, group in df.groupby(label_col)
    ]

    if not per_class:
        # No usable label at all (e.g. every label is missing).
        return df.iloc[0:0].copy().reset_index(drop=True)

    df_sub = (
        pd.concat(per_class)
        .sample(frac=1, random_state=random_state)  # shuffle
        .reset_index(drop=True)
    )

    return df_sub

In [None]:


# Baseline configuration for the single-vectorizer pipeline.

CFG = {
    # --- data ---
    "run_name": "TF-IDF Word ngram (1,2)",
    "experiment_id": f"{datetime.now():%Y%m%d_%H%M%S}",  # timestamp used as run id
    "text_column": TEXT_VARIANT_COL,
    "label_column": LABEL_COL,

    # --- TF-IDF vectorizer ---
    "analyzer": "word",          # "word" or "char"
    "ngram_min": 1,
    "ngram_max": 2,
    "max_features": 100000,

    # --- classifier ---
    "model": "LinearSVC",
    "class_weight": "balanced",  # None or "balanced"
    "C": 2,
}


# Independent copy used as the template for every experiment override.
BASE_CFG = deepcopy(CFG)

# max_features sweep (earlier study). It is kept under its own name so that
# it is no longer silently overwritten by the n-gram sweep below. Run names
# are numbered per analyzer so every run has a unique, correctly labelled name.
MAX_FEATURES_EXPERIMENTS = [
    # word analyzer: n-gram range and analyzer come from the base config
    {"run_name": f"TF-IDF Word ngram max_feat_run_{run_no}",
     "max_features": n_features,
     }
    for run_no, n_features in enumerate(
        [20000, 30000, 50000, 80000, 120000, 150000, 200000], start=1
    )
] + [
    # same sweep for character 3-5-grams
    {"run_name": f"TF-IDF Char ngram max_feat_run_{run_no}",
     "ngram_min": 3,
     "ngram_max": 5,
     "analyzer": "char",
     "max_features": n_features,
     }
    for run_no, n_features in enumerate(
        [20000, 50000, 80000, 120000, 150000, 200000], start=1
    )
]

# n-gram range sweep at a fixed vocabulary size per analyzer.
# NOTE: the "max_feat_" part of these run names is historical; it is left
# unchanged so new runs stay comparable with the ones already logged.
NGRAM_EXPERIMENTS = [
    {"run_name": f"TF-IDF Word ngram max_feat_{nmin}_{nmax}",
     "analyzer": "word",
     "ngram_min": nmin,
     "ngram_max": nmax,
     "max_features": 100000,
     }
    for nmin, nmax in [(1, 2), (1, 3), (2, 3), (2, 4), (3, 5)]
] + [
    {"run_name": f"TF-IDF Char ngram max_feat_{nmin}_{nmax}",
     "analyzer": "char",
     "ngram_min": nmin,
     "ngram_max": nmax,
     "max_features": 300000,
     }
    for nmin, nmax in [(2, 3), (2, 4), (3, 5), (3, 4), (4, 6)]
]

# Sweep that is actually executed by the run loop.
EXPERIMENTS = NGRAM_EXPERIMENTS

# Ids handed out by make_cfg in this session; used to detect collisions.
_ISSUED_EXPERIMENT_IDS = set()


def make_cfg(base, overrides):
    """Build the config of one experiment from a base config plus overrides.

    Parameters
    ----------
    base : dict
        Template config; it is deep-copied and never modified.
    overrides : dict
        Keys to add or replace in the copy.

    Returns
    -------
    dict
        The merged config with a fresh ``experiment_id`` (any id in ``base``
        or ``overrides`` is replaced). The id is a ``YYYYmmdd_HHMMSS``
        timestamp; if that id was already issued in this session, a numeric
        suffix (``_1``, ``_2``, ...) is appended so two configs created within
        the same second do not share an output folder.
    """
    cfg = deepcopy(base)
    cfg.update(overrides)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_id = stamp
    suffix = 1
    while experiment_id in _ISSUED_EXPERIMENT_IDS:
        experiment_id = f"{stamp}_{suffix}"
        suffix += 1
    _ISSUED_EXPERIMENT_IDS.add(experiment_id)

    cfg["experiment_id"] = experiment_id
    return cfg

# Functions

# Single vectorizer pipeline

In [30]:
def run_experiment(cfg):
    """Train and persist one single-vectorizer TF-IDF + LinearSVC experiment.

    All settings are read from ``cfg`` (not from the module-level ``CFG``),
    so the function behaves the same no matter what the global currently
    holds.

    Parameters
    ----------
    cfg : dict
        Needs the keys ``experiment_id``, ``analyzer``, ``ngram_min``,
        ``ngram_max``, ``max_features``, ``class_weight`` and ``C``.
        ``text_column`` / ``label_column`` are used when present and fall
        back to ``TEXT_VARIANT_COL`` / ``LABEL_COL`` otherwise.

    Side effects
    ------------
    Reads ``data/train.csv`` and ``data/test.csv`` and writes the model, the
    vectorizer, the config and the vectorized validation data as joblib
    files into ``EXPERIMENTS_DIR / cfg["experiment_id"]``.
    """
    import joblib  # local import: joblib is not part of the notebook's import cell

    print("Running experiment with config:")
    print(json.dumps(cfg, indent=4))

    text_col = cfg.get("text_column", TEXT_VARIANT_COL)
    label_col = cfg.get("label_column", LABEL_COL)

    # -----------------------------
    # Data (the split was done beforehand)
    # -----------------------------
    DATA_DIR = Path("data")
    train_df = pd.read_csv(DATA_DIR / "train.csv")
    # NOTE(review): the validation frame is read from "test.csv" - confirm
    # that this file is the held-out validation split.
    val_df = pd.read_csv(DATA_DIR / "test.csv")

    print(f"Train shape: {train_df.shape}")
    print(f"Validation shape: {val_df.shape}")

    X_train = train_df[text_col].astype(str)
    y_train = train_df[label_col].astype(str)

    X_val = val_df[text_col].astype(str)
    y_val = val_df[label_col].astype(str)

    print("Experiment ID:", cfg["experiment_id"])

    # -----------------------------
    # Vectorizer
    # -----------------------------
    tfidf = TfidfVectorizer(
        analyzer=cfg["analyzer"],
        ngram_range=(cfg["ngram_min"], cfg["ngram_max"]),
        max_features=cfg["max_features"],
    )

    # Fit on the training text only; validation text is just transformed.
    X_train_vec = tfidf.fit_transform(X_train)
    X_val_vec   = tfidf.transform(X_val)

    print("Vectorized shapes:")
    print("X_train_vec:", X_train_vec.shape)
    print("X_val_vec  :", X_val_vec.shape)

    # -----------------------------
    # Model
    # -----------------------------
    clf = LinearSVC(
        class_weight=cfg["class_weight"],
        C=cfg["C"],
        random_state=RANDOM_STATE,
    )

    clf.fit(X_train_vec, y_train)

    print("Model trained.")

    # -----------------------------
    # Persist everything needed to evaluate later without retraining
    # -----------------------------
    out_dir = EXPERIMENTS_DIR / cfg["experiment_id"]
    out_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(clf, out_dir / "model.joblib")
    joblib.dump(tfidf, out_dir / "vectorizer.joblib")
    joblib.dump(cfg, out_dir / "config.joblib")
    print("Model and vectorizer saved.")

    joblib.dump(X_val_vec, out_dir / "X_val_vec.joblib")
    joblib.dump(y_val, out_dir / "y_val.joblib")
    print("Validation data saved.")

    return None

In [31]:
# Derive one config per experiment from the base template and run it.
# The module-level name CFG is rebound on every iteration, so any code that
# reads the global CFG sees the config of the run currently in progress.
for exp in EXPERIMENTS:
    CFG = make_cfg(BASE_CFG, exp)
    run_experiment(CFG)


Running experiment with config:
{
    "run_name": "TF-IDF Word ngram max_feat_1_2",
    "experiment_id": "20260120_230922",
    "text_column": "text_stripped",
    "label_column": "prdtypecode",
    "analyzer": "word",
    "ngram_min": 1,
    "ngram_max": 2,
    "max_features": 100000,
    "model": "LinearSVC",
    "class_weight": "balanced",
    "C": 2
}
Train shape: (66800, 6)
Validation shape: (16701, 6)
Experiment ID: 20260120_230922
Vectorized shapes:
X_train_vec: (66800, 100000)
X_val_vec  : (16701, 100000)
Model trained.
Model and vectorizer saved.
Validation data saved.
Running experiment with config:
{
    "run_name": "TF-IDF Word ngram max_feat_1_3",
    "experiment_id": "20260120_231004",
    "text_column": "text_stripped",
    "label_column": "prdtypecode",
    "analyzer": "word",
    "ngram_min": 1,
    "ngram_max": 3,
    "max_features": 100000,
    "model": "LinearSVC",
    "class_weight": "balanced",
    "C": 2
}
Train shape: (66800, 6)
Validation shape: (16701, 6)
Expe

In [None]:
# Baseline configuration for the combined word + char pipeline.

CFG = {
    "experiment_id": f"{datetime.now():%Y%m%d_%H%M%S}",  # timestamp used as run id
    "text_column": TEXT_VARIANT_COL,
    "label_column": LABEL_COL,

    # --- word-level TF-IDF ---
    "word_analyzer": "word",
    "word_ngram_min": 1,
    "word_ngram_max": 2,
    "word_max_features": 200000,
    "word_min_df": 2,
    "word_sublinear_tf": True,

    # --- character-level TF-IDF ---
    "char_analyzer": "char",
    "char_ngram_min": 3,
    "char_ngram_max": 5,
    "char_max_features": 300000,
    "char_min_df": 3,
    "char_sublinear_tf": True,

    # --- classifier ---
    "model": "LinearSVC",
    "class_weight": "balanced",
    "C": 2,
}

# Sweep over the regularisation strength C; everything else stays at baseline.
EXPERIMENTS = [
    {
        "run_name": f"TF-IDF Word+Char ngram combined C={penalty}",
        "C": penalty,
    }
    for penalty in (0.01, 0.1, 1, 2, 5, 10, 20)
]

# Independent copy used as the template for every experiment override.
BASE_CFG = deepcopy(CFG)

# update



# Two-vectorizer pipeline (word + char TF-IDF)

In [48]:
def run_experiment_2(cfg):
    """Train and persist one combined word + char TF-IDF LinearSVC experiment.

    All settings are read from ``cfg`` (not from the module-level ``CFG``),
    so the function behaves the same no matter what the global currently
    holds.

    Parameters
    ----------
    cfg : dict
        Needs ``experiment_id``, ``class_weight``, ``C`` and, for each of the
        prefixes ``word_`` and ``char_``, the keys ``analyzer``,
        ``ngram_min``, ``ngram_max``, ``max_features``, ``min_df`` and
        ``sublinear_tf``. ``text_column`` / ``label_column`` are used when
        present and fall back to ``TEXT_VARIANT_COL`` / ``LABEL_COL``.

    Side effects
    ------------
    Reads ``data/train.csv`` and ``data/test.csv`` and writes the model, both
    vectorizers, the config and the stacked validation features as joblib
    files into ``EXPERIMENTS_DIR / cfg["experiment_id"]``.
    """
    import joblib  # local import: joblib is not part of the notebook's import cell

    text_col = cfg.get("text_column", TEXT_VARIANT_COL)
    label_col = cfg.get("label_column", LABEL_COL)

    # -----------------------------
    # Data (the split was done beforehand)
    # -----------------------------
    DATA_DIR = Path("data")
    train_df = pd.read_csv(DATA_DIR / "train.csv")
    # NOTE(review): the validation frame is read from "test.csv" - confirm
    # that this file is the held-out validation split.
    val_df = pd.read_csv(DATA_DIR / "test.csv")

    print(f"Train shape: {train_df.shape}")
    print(f"Validation shape: {val_df.shape}")

    X_train = train_df[text_col].astype(str)
    y_train = train_df[label_col].astype(str)

    X_val = val_df[text_col].astype(str)
    y_val = val_df[label_col].astype(str)

    print("Experiment ID:", cfg["experiment_id"])

    # -----------------------------
    # Vectorizers
    # -----------------------------
    tfidf_word = TfidfVectorizer(
        analyzer=cfg["word_analyzer"],
        ngram_range=(cfg["word_ngram_min"], cfg["word_ngram_max"]),
        max_features=cfg["word_max_features"],
        min_df=cfg["word_min_df"],
        sublinear_tf=cfg["word_sublinear_tf"],
    )

    tfidf_char = TfidfVectorizer(
        analyzer=cfg["char_analyzer"],
        ngram_range=(cfg["char_ngram_min"], cfg["char_ngram_max"]),
        max_features=cfg["char_max_features"],
        min_df=cfg["char_min_df"],
        sublinear_tf=cfg["char_sublinear_tf"],
    )

    # Fit on the training text only; validation text is just transformed.
    X_train_word = tfidf_word.fit_transform(X_train)
    X_val_word   = tfidf_word.transform(X_val)

    X_train_char = tfidf_char.fit_transform(X_train)
    X_val_char   = tfidf_char.transform(X_val)

    # Word features first, char features second; converted to CSR after stacking.
    X_train_vec = hstack([X_train_word, X_train_char]).tocsr()
    X_val_vec   = hstack([X_val_word, X_val_char]).tocsr()

    print("Stacked shapes:")
    print("X_train_vec:", X_train_vec.shape)
    print("X_val_vec  :", X_val_vec.shape)

    # -----------------------------
    # Model
    # -----------------------------
    clf = LinearSVC(
        class_weight=cfg["class_weight"],
        random_state=RANDOM_STATE,
        C=cfg["C"],
    )
    clf.fit(X_train_vec, y_train)

    print("Model trained.")

    # -----------------------------
    # Persist everything needed to evaluate later without retraining
    # -----------------------------
    out_dir = EXPERIMENTS_DIR / cfg["experiment_id"]
    out_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(clf, out_dir / "model.joblib")
    joblib.dump(tfidf_word, out_dir / "vectorizer_word.joblib")
    joblib.dump(tfidf_char, out_dir / "vectorizer_char.joblib")
    joblib.dump(cfg, out_dir / "config.joblib")
    print("Model and vectorizer saved.")

    joblib.dump(X_val_vec, out_dir / "X_val_vec.joblib")
    joblib.dump(y_val, out_dir / "y_val.joblib")
    print("Validation data saved.")


In [None]:
# Derive one config per experiment from the base template, show it, and run it.
# The module-level name CFG is rebound on every iteration, so any code that
# reads the global CFG sees the config of the run currently in progress.
for exp in EXPERIMENTS:
    CFG = make_cfg(BASE_CFG, exp)
    print(CFG)
    run_experiment_2(CFG)


{'experiment_id': '20260121_093702', 'text_column': 'text_stripped', 'label_column': 'prdtypecode', 'word_analyzer': 'word', 'word_ngram_min': 1, 'word_ngram_max': 2, 'word_max_features': 200000, 'word_min_df': 2, 'word_sublinear_tf': True, 'char_analyzer': 'char', 'char_ngram_min': 3, 'char_ngram_max': 5, 'char_max_features': 300000, 'char_min_df': 3, 'char_sublinear_tf': True, 'model': 'LinearSVC', 'class_weight': 'balanced', 'C': 0.01, 'run_name': 'TF-IDF Word+Char ngram combined C=0.01'}
Train shape: (66800, 6)
Validation shape: (16701, 6)
Experiment ID: 20260121_093702
Stacked shapes:
X_train_vec: (66800, 500000)
X_val_vec  : (16701, 500000)
Model trained.
Model and vectorizer saved.
Validation data saved.
{'experiment_id': '20260121_094047', 'text_column': 'text_stripped', 'label_column': 'prdtypecode', 'word_analyzer': 'word', 'word_ngram_min': 1, 'word_ngram_max': 2, 'word_max_features': 200000, 'word_min_df': 2, 'word_sublinear_tf': True, 'char_analyzer': 'char', 'char_ngram_m



Model trained.
Model and vectorizer saved.
Validation data saved.


: 