# Projekt 2
Julia Strzelczyk, Maksymilian Tabian, Łukasz Wyszomierski

- Jeśli masz zainstalowaną wersję scikit-learn < 1.2, zmień argument `sparse_output` na `sparse` w `OneHotEncoder` wewnątrz metody `fit()`
- Upewnij się, że masz zainstalowane w swoim środowisku pakiety xgboost oraz catboost

In [1]:
import json
import numpy as np
import pandas as pd
import warnings

# Sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import clone
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

# Modele Sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Modele zewnętrzne
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [2]:
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
warnings.filterwarnings("ignore", category=UserWarning, module="catboost")
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
class MiniAutoML:
    def __init__(self, models_config, random_state=123):
        self.models_config = models_config
        self.random_state = random_state
        self.model_results = []
        
        self.best_model_name = None
        self.fitted_pipeline = None   
        self.ensemble_models = []     

        self.le_ = None       
        self.classes_ = None

    def fit(self, X, y, cv_folds=5, use_ensemble=False, ensemble_size=3):
        if isinstance(y, pd.DataFrame):
            y = y.squeeze()
        if hasattr(y, "astype"):
            y = y.astype(str)

        numerical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler',  StandardScaler())
        ])

        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot',  OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('numerical', numerical_transformer, make_column_selector(dtype_include=np.number)),
                ('categorical', categorical_transformer, make_column_selector(dtype_include=object))
            ]
        )

        self.le_ = LabelEncoder()
        y = self.le_.fit_transform(y)
        self.classes_ = self.le_.classes_

        cv = KFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)
        self.model_results = []

        print(f"Rozpoczynam trenowanie {len(self.models_config)} modeli")

        for cfg in self.models_config:
            try:
                full_class_path = cfg["class"]
                parts = full_class_path.split(".")
                module_name = ".".join(parts[:-1])
                class_name = parts[-1]
                
                module = __import__(module_name, fromlist=[class_name])
                cls = getattr(module, class_name)
                
                params = cfg.get("params", {}).copy()
                model = cls(**params)

                pipeline = Pipeline(steps=[
                    ("preprocessing", preprocessor),
                    ("model", model)
                ])

                scores = cross_val_score(pipeline, X, y, cv=cv, scoring="balanced_accuracy")
                mean_score = scores.mean()
                
                self.model_results.append({
                    "name": cfg["name"],
                    "cv_score": mean_score,
                    "pipeline": pipeline
                })
            except ImportError:
                continue
            except Exception as e:
                print(f"Błąd modelu {cfg.get('name', 'unknown')}: {e}")
                continue

        if not self.model_results:
            raise RuntimeError("Żaden model nie został poprawnie wytrenowany. Sprawdź poprawność models.json i zainstalowane biblioteki.")

        self.model_results.sort(key=lambda x: x["cv_score"], reverse=True)

        self.ensemble_models = []
        self.fitted_pipeline = None

        if use_ensemble:
            n_best = min(ensemble_size, len(self.model_results))
            top_results = self.model_results[:n_best]
            print(f"\n--- Tryb ENSEMBLE ---")
            print(f"\nUśredniono prawdopodobieństwa następujących {n_best} modeli:")
            
            for res in top_results:
                print(f"-{res['name']} (balanced accuracy z CV: {res['cv_score']:.4f})")
                final_pipeline = clone(res["pipeline"])
                final_pipeline.fit(X, y)
                self.ensemble_models.append(final_pipeline)
        else:
            best = self.model_results[0]
            self.best_model_name = best["name"]
            print(f"\n--- Tryb SINGLE ---")
            print(f"\n{self.best_model_name} (balanced accuracy z CV: {best['cv_score']:.4f})")
            
            self.fitted_pipeline = clone(best["pipeline"])
            self.fitted_pipeline.fit(X, y)

        return self

    def transform_y(self, y):
        """Metoda do kodowania y_test tak samo jak y_train"""
        if self.le_ is None:
            raise ValueError("Model nie został wytrenowany (uruchom .fit() najpierw)")
        
        if isinstance(y, pd.DataFrame):
            y = y.squeeze()
        if hasattr(y, "astype"):
            y = y.astype(str)
            
        return self.le_.transform(y)


    def predict_proba(self, X):
        if self.ensemble_models:
            all_probas = []
            for model in self.ensemble_models:
                if hasattr(model[-1], "predict_proba"):
                    proba = model.predict_proba(X)[:, 1]
                else:
                    d = model.decision_function(X)
                    proba = 1 / (1 + np.exp(-d))
                all_probas.append(proba)
            return np.mean(all_probas, axis=0)

        elif self.fitted_pipeline:
            if hasattr(self.fitted_pipeline[-1], "predict_proba"):
                return self.fitted_pipeline.predict_proba(X)[:, 1]
            return self.fitted_pipeline.decision_function(X)
        
        else:
            raise ValueError("Model nie został wytrenowany")

    def predict(self, X):
        probas = self.predict_proba(X)
        return (probas > 0.5).astype(int)
    
    def __repr__(self):
        status = "Not fitted"
        if self.ensemble_models:
            status = f"Fitted (Ensemble: {len(self.ensemble_models)} models)"
        elif self.fitted_pipeline:
            status = f"Fitted (Single: {self.best_model_name})"
            
        return f"<MiniAutoML Object | Status: {status}>"

Blood- transfusion

In [4]:
import openml
dataset = openml.datasets.get_dataset('blood-transfusion-service-center')
X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123, stratify=y)

In [5]:
with open("models.json", "r") as f:
    models_config = json.load(f)

In [6]:
# Single model
automl_single = MiniAutoML(models_config)
automl_single.fit(X_train, y_train)

# Zakodowanie y_test tak samo jak y_train
y_test_encoded = automl_single.transform_y(y_test)

Rozpoczynam trenowanie 49 modeli

--- Tryb SINGLE ---

kknn_1954 (balanced accuracy z CV: 0.6368)


In [7]:
# Ensemble
automl_ensemble = MiniAutoML(models_config)
automl_ensemble.fit(X_train, y_train, use_ensemble=True)

Rozpoczynam trenowanie 49 modeli

--- Tryb ENSEMBLE ---

Uśredniono prawdopodobieństwa następujących 3 modeli:
-kknn_1954 (balanced accuracy z CV: 0.6368)
-randomForest_1744 (balanced accuracy z CV: 0.6359)
-kknn_1939 (balanced accuracy z CV: 0.6358)


<MiniAutoML Object | Status: Fitted (Ensemble: 3 models)>

In [None]:
# Sprawdzenie wyników
y_pred_single    = automl_single.predict(X_test)
y_pred_ensemble  = automl_ensemble.predict(X_test)

y_proba_single   = automl_single.predict_proba(X_test)
y_proba_ensemble = automl_ensemble.predict_proba(X_test)

In [None]:
acc_single   = balanced_accuracy_score(y_test_encoded, y_pred_single)
acc_ensemble = balanced_accuracy_score(y_test_encoded, y_pred_ensemble)

auc_single   = roc_auc_score(y_test_encoded, y_proba_single)
auc_ensemble = roc_auc_score(y_test_encoded, y_proba_ensemble)

In [10]:
print(f"Balanced accuracy Single Model:  {acc_single:.4f}")
print(f"Balanced accuracy Ensemble Model: {acc_ensemble:.4f}")

print(f"ROC AUC Single Model:  {auc_single:.4f}")
print(f"ROC AUC Ensemble Model: {auc_ensemble:.4f}")

Balanced accuracy Single Model:  0.5154
Balanced accuracy Ensemble Model: 0.6316
ROC AUC Single Model:  0.7076
ROC AUC Ensemble Model: 0.7094


Credit-g

In [18]:
import openml
dataset = openml.datasets.get_dataset('credit-g')
X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123, stratify=y)

In [19]:
with open("models.json", "r") as f:
    models_config = json.load(f)

In [20]:
# Single model
automl_single = MiniAutoML(models_config)
automl_single.fit(X_train, y_train)

# Zakodowanie y_test tak samo jak y_train
y_test_encoded = automl_single.transform_y(y_test)

Rozpoczynam trenowanie 49 modeli

--- Tryb SINGLE ---

gbm_1627 (balanced accuracy z CV: 0.5920)


In [21]:
# Ensemble
automl_ensemble = MiniAutoML(models_config)
automl_ensemble.fit(X_train, y_train, use_ensemble=True)

Rozpoczynam trenowanie 49 modeli

--- Tryb ENSEMBLE ---

Uśredniono prawdopodobieństwa następujących 3 modeli:
-gbm_1627 (balanced accuracy z CV: 0.5920)
-gbm_1272 (balanced accuracy z CV: 0.5918)
-xgboost_1049 (balanced accuracy z CV: 0.5854)


<MiniAutoML Object | Status: Fitted (Ensemble: 3 models)>

In [26]:
# Sprawdzenie wyników
y_pred_single = automl_single.predict(X_test)
y_pred_ensemble = automl_ensemble.predict(X_test)

y_proba_single = automl_single.predict_proba(X_test)
y_proba_ensemble = automl_ensemble.predict_proba(X_test)

In [34]:
acc_single   = balanced_accuracy_score(y_test_encoded, y_pred_single)
acc_ensemble = balanced_accuracy_score(y_test_encoded, y_pred_ensemble)

auc_single = roc_auc_score(y_test_encoded, y_proba_single)
auc_ensemble = roc_auc_score(y_test_encoded, y_proba_ensemble)

In [35]:
print(f"Balanced accuracy Single Model:  {acc_single:.4f}")
print(f"Balanced accuracy Ensemble Model: {acc_ensemble:.4f}")

print(f"ROC AUC Single Model:  {auc_single:.4f}")
print(f"ROC AUC Ensemble Model: {auc_ensemble:.4f}")

Balanced accuracy Single Model:  0.5810
Balanced accuracy Ensemble Model: 0.5560
ROC AUC Single Model:  0.6802
ROC AUC Ensemble Model: 0.6783
