In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
from ex_01_read_data import get_welding_data
from ex_03_feature_extraction import extract_features

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 


DATA_PATH = Path('/kaggle/input/uebdataset/data.csv')
RESULTS_PATH = Path('../results/')
RANDOM_SEEDS = [42, 1234, 123]
TEST_SIZE = 0.2



In [None]:

# --- Engineered feature set ---------------------------------------------------
print("Loading and preparing engineered features...")
data_full, labels_full, _ = get_welding_data(DATA_PATH)
features_df = extract_features(data_full, labels_full)
X_eng = features_df.drop('label', axis=1)
y_eng = features_df['label']

# Stratify on the label so train and test keep the same class ratio; without it
# the share of each class in the test set depends on the split seed.
X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(
    X_eng, y_eng,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEEDS[0],
    stratify=y_eng,
)

# --- Raw (flattened) data -----------------------------------------------------
print("Loading and preparing raw data...")
data_raw, labels_raw, _ = get_welding_data(DATA_PATH)
y_raw = labels_raw

# Collapse the two trailing axes so every sample becomes a single flat row
# of length n_timesteps * n_features.
n_samples, n_timesteps, n_features = data_raw.shape
X_raw_flat = data_raw.reshape((n_samples, n_timesteps * n_features))

# Same seed and the same stratification rule as for the engineered features.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw_flat, y_raw,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEEDS[0],
    stratify=y_raw,
)

print(f"Engineered Features - Train shape: {X_train_eng.shape}, Test shape: {X_test_eng.shape}")
print(f"Raw Data (Flattened) - Train shape: {X_train_raw.shape}, Test shape: {X_test_raw.shape}")

Loading and preparing engineered features...
Loading and preparing raw data...
Engineered Features - Train shape: (7200, 20), Test shape: (1800, 20)
Raw Data (Flattened) - Train shape: (5600, 400), Test shape: (1400, 400)


In [None]:
# Candidate estimators, keyed by the same names that `param_grids` uses.
# Every model that accepts a random_state gets a fixed one here.
classifiers = {
    'LogisticRegression': LogisticRegression(
        solver='saga',
        max_iter=1000,
        random_state=42,
    ),
    # probability=True switches on predict_proba for the SVC.
    'SVC': SVC(
        probability=True,
        random_state=42,
    ),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
}


# Hyperparameter search spaces, one per model name.
# Options are written with their bare estimator argument names and prefixed
# with 'classifier__' below, so each key addresses the pipeline step named
# 'classifier'.
_search_spaces = {
    'LogisticRegression': {
        'C': [0.1, 1, 10.0],
        'penalty': ['l1', 'l2'],
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf'],
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [1, 2, 3],
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
}

param_grids = {
    model_name: {
        f'classifier__{option}': candidates
        for option, candidates in space.items()
    }
    for model_name, space in _search_spaces.items()
}

In [None]:

def _test_set_metrics(model, X_test, y_test) -> dict:
    """
    Scores an already fitted model on the held-out test set.

    Returns a dict with the keys accuracy, precision, recall, f1_score and
    roc_auc. ROC AUC is computed from column 1 of ``predict_proba``; the other
    metrics are computed from ``predict`` with the scorers' default settings.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
    }


def run_evaluation_pipeline(X_train, y_train, X_test, y_test, dataset_name: str):
    """
    Runs the full evaluation pipeline for all classifiers.

    For every entry of the module-level ``classifiers`` dict this
      1. builds a SMOTE -> StandardScaler -> classifier pipeline,
      2. tunes it with ``GridSearchCV`` over ``param_grids[name]`` (scoring 'f1'),
      3. refits the best pipeline once per seed in ``RANDOM_SEEDS`` and scores
         each refit on the test set,
      4. aggregates the per-seed metrics into mean and standard deviation.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels, used only for scoring.
        dataset_name: label used in the progress output and in the CSV name.

    Returns:
        DataFrame with one row per classifier and the columns ``model``,
        ``<metric>_mean`` and ``<metric>_std`` for each evaluated metric.

    Side effects:
        Prints progress and writes the result table to
        ``RESULTS_PATH / "ex_05_results_<dataset_name>.csv"``; the results
        directory is created if it does not exist yet.
    """
    all_results = []

    for name, classifier in classifiers.items():
        print(f"--- Running for {name} on {dataset_name} ---")

        # Pipeline with SMOTE, scaler and classifier. Because the sampler sits
        # inside the pipeline, it is part of every fit that GridSearchCV runs.
        pipeline = Pipeline([
            ('smote', SMOTE(random_state=42)),
            ('scaler', StandardScaler()),
            ('classifier', classifier)
        ])

        grid_search = GridSearchCV(
            pipeline,
            param_grid=param_grids[name],
            scoring='f1'
        )

        # Best hyperparameters
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        print(f"Best parameters : {best_params}")

        # Both lookups are the same for every seed, so do them once.
        final_model = grid_search.best_estimator_
        classifier_is_seedable = (
            'random_state' in final_model.named_steps['classifier'].get_params()
        )

        # Train with the best params over multiple seeds and evaluate on the test set
        seed_metrics = []
        for seed in RANDOM_SEEDS:
            # Only seed the classifier when it exposes a random_state parameter.
            params_to_set = {'smote__random_state': seed}
            if classifier_is_seedable:
                params_to_set['classifier__random_state'] = seed
            final_model.set_params(**params_to_set)
            final_model.fit(X_train, y_train)

            seed_metrics.append(_test_set_metrics(final_model, X_test, y_test))

        # Mean and standard deviation of the metrics across seeds
        metrics_df = pd.DataFrame(seed_metrics)
        mean_metrics = metrics_df.mean()
        std_metrics = metrics_df.std()

        result_row = {'model': name}
        for metric in mean_metrics.index:
            result_row[f'{metric}_mean'] = mean_metrics[metric]
            result_row[f'{metric}_std'] = std_metrics[metric]
        all_results.append(result_row)

    results_df = pd.DataFrame(all_results)

    # to_csv does not create missing directories, so make sure the target exists.
    RESULTS_PATH.mkdir(parents=True, exist_ok=True)
    output_path = RESULTS_PATH / f"ex_05_results_{dataset_name}.csv"
    # The row index is written too (pandas default), as the first CSV column.
    results_df.to_csv(output_path)
    return results_df

# Evaluate every classifier on the engineered features first, then on the
# flattened raw signals.
results_eng = run_evaluation_pipeline(
    X_train=X_train_eng,
    y_train=y_train_eng,
    X_test=X_test_eng,
    y_test=y_test_eng,
    dataset_name="engineered_features",
)
results_raw = run_evaluation_pipeline(
    X_train=X_train_raw,
    y_train=y_train_raw,
    X_test=X_test_raw,
    y_test=y_test_raw,
    dataset_name="raw_data",
)

--- Running for LogisticRegression on engineered_features ---
Best parameters : {'classifier__C': 10.0, 'classifier__penalty': 'l2'}
--- Running for SVC on engineered_features ---
Best parameters : {'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
--- Running for RandomForest on engineered_features ---
Best parameters : {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 3, 'classifier__n_estimators': 200}
--- Running for KNeighborsClassifier on engineered_features ---
Best parameters : {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 7, 'classifier__weights': 'distance'}
--- Running for LogisticRegression on raw_data ---
Best parameters : {'classifier__C': 0.1, 'classifier__penalty': 'l1'}
--- Running for SVC on raw_data ---
Best parameters : {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
--- Running for RandomForest on raw_data ---
Best parameters : {'classifier__max_depth': 20, 'classifier__min

In [None]:

# Show each summary table under its own heading.
for heading, summary in (
    ("--- Results for Engineered Features ---", results_eng),
    ("\n--- Results for Raw Data (Flattened) ---", results_raw),
):
    print(heading)
    display(summary)

--- Results for Engineered Features ---


Unnamed: 0,model,accuracy_mean,accuracy_std,precision_mean,precision_std,recall_mean,recall_std,f1_score_mean,f1_score_std,roc_auc_mean,roc_auc_std,feature_set
0,LogisticRegression,0.718519,0.001156,0.6477,0.000961,0.814309,0.00612,0.721504,0.002201,0.805932,0.000674,Engineered
1,SVC,0.767778,0.000962,0.70494,0.002264,0.827957,0.003122,0.761506,0.000456,0.85419,0.000284,Engineered
2,RandomForest,0.782593,0.00274,0.730234,0.004963,0.815964,0.003988,0.770707,0.001986,0.882698,0.000422,Engineered
3,KNeighborsClassifier,0.766852,0.004456,0.726807,0.004434,0.76799,0.005686,0.746831,0.005009,0.859408,0.000767,Engineered



--- Results for Raw Data (Flattened) ---


Unnamed: 0,model,accuracy_mean,accuracy_std,precision_mean,precision_std,recall_mean,recall_std,f1_score_mean,f1_score_std,roc_auc_mean,roc_auc_std,feature_set
0,LogisticRegression,0.755,0.003113,0.684461,0.004077,0.780405,0.005852,0.72928,0.003375,0.826938,0.00133,Raw
1,SVC,0.715952,0.001798,0.610388,0.002223,0.907658,0.006827,0.729904,0.001675,0.839972,0.000381,Raw
2,RandomForest,0.793333,0.006402,0.737709,0.007634,0.793356,0.006395,0.76452,0.007012,0.883599,0.000819,Raw
3,KNeighborsClassifier,0.767857,0.007559,0.706312,0.007077,0.771959,0.011824,0.737675,0.009249,0.855628,0.001603,Raw


Warum diese Modelle:


    Logistic Regression: Dient als lineare Baseline. Damit prüfen wir, ob sich die Schweißqualität über einen einfachen Zusammenhang – etwa über die durchschnittliche Spannung – vorhersagen lässt. So sehen wir schnell, ob schon eine einfache Beziehung eine brauchbare Einschätzung liefert.

    SVC (Support Vector Classifier): Ideal, um nicht-lineare Muster in den Schweißdaten zu erkennen. Die Qualität hängt oft von komplexen Wechselwirkungen ab, die sich nicht durch eine einfache Gerade beschreiben lassen.

    Random Forest: Ideal für die komplexen Schweißdaten, da in jedem Baum nur eine zufällige Auswahl der Features betrachtet wird (z. B. stützen sich einige Bäume eher auf Spannungs-Features, andere eher auf Strom-Features). Das Modell verlässt sich nicht auf ein einziges Merkmal, sondern lernt aus vielen verschiedenen Perspektiven der Schweißdaten.

    K-Nearest Neighbors (KNN): Testet die Hypothese, dass sich gute Schweißnähte in ihren Messwerten ähneln. KNN prüft, ob die Datenpunkte für gute und schlechte Nähte im Merkmalsraum klar voneinander getrennte Gruppen bilden.

Warum diese Metriken:

    Recall: Die Sicherheitsmetrik. Jede schlechte Schweißnaht, die als "gut" durchgeht (ein False Negative), ist ein enormes Risiko für die Produktstabilität und kann zu teuren Rückrufen oder Unfällen führen.

    Precision (Präzision): Die Effizienzmetrik. Jede gute Schweißnaht, die fälschlicherweise als "schlecht" markiert wird (ein False Positive), verursacht unnötige Kosten.

    F1-Score: Der Kompromiss zwischen Recall und Precision bei der Erkennung der schlechten Schweißnähte. Ein Modell, das nur einen hohen Recall hat, würde vielleicht jede zweite Naht als fehlerhaft markieren (niedrige Precision), was die Produktion stark beeinträchtigt. Der F1-Score stellt sicher, dass das Modell sowohl präzise als auch gründlich ist.

