## Custom Transformers

A custom transformer CustomScaler is defined for z-score scaling (standardizing) of data. Since scaling doesn’t require learning any parameters from the data, fit method just returns self. The transform method applies the z-score scaling formula.

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CustomScaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self  # nothing to fit
    
    def transform(self, X, y=None):
        return (X - np.mean(X)) / np.std(X)  # simple z-score scaling

In [5]:
custom_scaler = CustomScaler()

In [6]:
from sklearn.datasets import load_iris

data = load_iris()

In [7]:
X = data["data"]

In [8]:
y = data["target"]

In [9]:
custom_scaler.transform(X)

array([[ 0.82858665,  0.01798522, -1.04592915, -1.65388022],
       [ 0.72726147, -0.23532773, -1.04592915, -1.65388022],
       [ 0.62593629, -0.13400255, -1.09659174, -1.65388022],
       [ 0.5752737 , -0.18466514, -0.99526657, -1.65388022],
       [ 0.77792406,  0.06864781, -1.04592915, -1.65388022],
       [ 0.98057441,  0.22063558, -0.89394139, -1.55255505],
       [ 0.5752737 , -0.03267737, -1.04592915, -1.60321764],
       [ 0.77792406, -0.03267737, -0.99526657, -1.65388022],
       [ 0.47394852, -0.28599032, -1.04592915, -1.65388022],
       [ 0.72726147, -0.18466514, -0.99526657, -1.70454281],
       [ 0.98057441,  0.1193104 , -0.99526657, -1.65388022],
       [ 0.67659888, -0.03267737, -0.94460398, -1.65388022],
       [ 0.67659888, -0.23532773, -1.04592915, -1.70454281],
       [ 0.42328593, -0.23532773, -1.19791692, -1.70454281],
       [ 1.18322477,  0.27129817, -1.14725433, -1.65388022],
       [ 1.13256218,  0.47394852, -0.99526657, -1.55255505],
       [ 0.98057441,  0.

Similarly, `OutlierRemoval` is a transformer that removes outliers based on the Interquartile Range (IQR).

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class OutlierRemoval(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor
        
    def fit(self, X, y=None):
        self.median = np.median(X, axis=0)
        self.iqr = np.percentile(X, 75, axis=0) - np.percentile(X, 25, axis=0)
        return self
    
    def transform(self, X, y=None):
        lower_bound = self.median - self.factor * self.iqr
        upper_bound = self.median + self.factor * self.iqr
        return X[((X >= lower_bound) & (X <= upper_bound)).all(axis=1)]


## Custom Classifiers

Here two custom classifiers are defined - `MajorityClassClassifier`, which always predicts the majority class from the training data, and `ThresholdClassifier`, which makes predictions based on a threshold on a particular feature.

In [11]:
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.stats import mode

class MajorityClassClassifier(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        self.majority_class = mode(y)[0][0]
        return self
    
    def predict(self, X):
        return [self.majority_class] * len(X)

In [12]:
class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, threshold=0.5, feature_index=0):
        self.threshold = threshold
        self.feature_index = feature_index
    
    def fit(self, X, y):
        return self  # nothing to fit
    
    def predict(self, X):
        return (X[:, self.feature_index] > self.threshold).astype(int)

In [13]:
th_clf = ThresholdClassifier(threshold=0.7, feature_index=2)

In [14]:
th_clf.threshold

0.7

In [15]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class CustomRandomClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, p=0.5):
        self.p = p

    def fit(self, X, y):
        return self

    def predict(self, X):
        # Generate random predictions
        random_preds = np.random.choice([0, 1], size=len(X), p=[1-self.p, self.p])
        return random_preds

In [16]:
random_clf = CustomRandomClassifier(p=0.7)

In [17]:
random_clf.fit(X, y)

CustomRandomClassifier(p=0.7)

In [18]:
random_clf.predict(X)

array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1])

Additional classifiers like CustomRandomClassifier which randomly predicts classes, HybridClassifier which averages predictions from a Random Forest and an XGBoost model, and SklearnTFHybrid which averages predictions from a Random Forest and a simple Pytorch neural network are also defined.

In [19]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

class HybridClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, sklearn_classifier=RandomForestClassifier(), xgb_classifier=xgb.XGBClassifier()):
        self.sklearn_classifier = sklearn_classifier
        self.xgb_classifier = xgb_classifier
        
    def fit(self, X, y):
        self.sklearn_classifier.fit(X, y)
        self.xgb_classifier.fit(X, y)
        return self
    
    def predict(self, X):
        sklearn_preds = self.sklearn_classifier.predict(X)
        xgb_preds = self.xgb_classifier.predict(X)
        final_preds = (sklearn_preds + xgb_preds) / 2  # Averaging predictions
        return final_preds.round().astype(int)  # Round to nearest integer to get class labels


In [20]:
hyb_clf = HybridClassifier()

In [21]:
hyb_clf.xgb_classifier

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [22]:
hyb_clf.fit(X, y)

HybridClassifier()

In [23]:
hyb_clf.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(4, 64)  # Assuming 4 features as input
        self.fc2 = nn.Linear(64, 3)  # Output layer for 3 classes

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.softmax(self.fc2(x), dim=1)  # Use softmax for multi-class
        return x

class SklearnTorchHybrid(BaseEstimator, ClassifierMixin):
    def __init__(self, sklearn_classifier=RandomForestClassifier()):
        self.sklearn_classifier = sklearn_classifier
        self.torch_model = SimpleNN()
        self.criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class
        self.optimizer = optim.Adam(self.torch_model.parameters(), lr=0.001)
        
    def fit(self, X, y):
        scaler = StandardScaler().fit(X)
        X_scaled = scaler.transform(X)
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
        y_tensor = torch.tensor(y, dtype=torch.long)  # Labels for CrossEntropyLoss should be torch.long
        
        for epoch in range(10):  # Assuming 10 epochs
            self.optimizer.zero_grad()
            outputs = self.torch_model(X_tensor)
            loss = self.criterion(outputs, y_tensor)
            loss.backward()
            self.optimizer.step()
            
        return self
    
    def predict(self, X):
        scaler = StandardScaler().fit(X)
        X_scaled = scaler.transform(X)
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
        with torch.no_grad():
            torch_preds = self.torch_model(X_tensor).argmax(dim=1).numpy()  # Use argmax to get the predicted class
        final_preds = torch_preds
        return final_preds


## Custom Pipeline

A scikit-learn pipeline is created combining the CustomScaler transformer and HybridClassifier. This pipeline is then fitted on the training data and evaluated on the test data, showcasing how custom transformers and classifiers can be integrated into scikit-learn pipelines for streamlined model training and evaluation.

In [25]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load data
iris = load_iris()
X, y = iris.data, iris.target

## For simplicity, convert to a binary classification problem
#y_binary = (y == 0).astype(int)  # 1 if setosa, 0 otherwise

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Assume CustomScaler, ThresholdClassifier, and HybridClassifier are defined as before

pipeline = Pipeline([
    #('scaler', CustomScaler()),
    ('classifier', SklearnTorchHybrid()
)])

# Fit and evaluate
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89        10
           1       0.67      0.67      0.67         9
           2       0.77      0.91      0.83        11

    accuracy                           0.80        30
   macro avg       0.81      0.79      0.80        30
weighted avg       0.82      0.80      0.80        30

