In [1]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from eipy.ei import EnsembleIntegration
from eipy.additional_ensembles import MeanAggregation, CES
from sklearn.model_selection import train_test_split

In [2]:
"""
Generate multimodal multiclass data.
https://dev.pages.lis-lab.fr/scikit-multimodallearn/tutorial/auto_examples/combo/plot_combo_3_views_3_classes.html#
"""
def generate_data(n_samples, lim):
    """Draw uniformly distributed random points inside an axis-aligned box.

    Parameters
    ----------
    n_samples : int
        Number of points (rows) to draw.
    lim : array-like of shape (n_features, 2)
        One ``[low, high]`` pair per feature.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features)
        Points drawn with NumPy's global random state, so seed it first
        when reproducible output is needed.
    """
    bounds = np.array(lim)
    lows, highs = bounds[:, 0], bounds[:, 1]
    # Unit-square draws, then stretched and shifted into the requested box.
    unit_draws = np.random.random((n_samples, bounds.shape[0]))
    return (highs - lows) * unit_draws + lows


# Fix the global NumPy seed so the synthetic data is reproducible.
seed = 12
np.random.seed(seed)

n_samples = 300

# [low, high] bounds per feature of the three rectangles samples are drawn from.
left_half = [[0., 1.], [0., 1.]]
right_half = [[1., 2.], [0., 1.]]
full_width = [[0., 2.], [0., 1.]]

# Each view stacks n_samples rows for class 0, then class 1, then class 2.
# The views differ in which rectangle each class is drawn from.
# The draw order (view_0, view_1, view_2; class 0..2 inside each) is kept
# fixed so the random stream is consumed in the same sequence.
view_0 = np.concatenate(
    [generate_data(n_samples, box) for box in (left_half, right_half, full_width)]
)

view_1 = np.concatenate(
    [generate_data(n_samples, box) for box in (right_half, full_width, left_half)]
)

view_2 = np.concatenate(
    [generate_data(n_samples, box) for box in (full_width, left_half, right_half)]
)

# All three views side by side as one feature matrix.
X = np.concatenate((view_0, view_1, view_2), axis=1)

# Labels follow the stacking order: n_samples zeros, then ones, then twos.
y = np.repeat(np.arange(3, dtype=np.int64), n_samples)

In [3]:
# Split every view and the labels in a single call: all arrays are indexed
# with the same stratified shuffle, so rows stay aligned across modalities.
(
    X_0_train, X_0_test,
    X_1_train, X_1_test,
    X_2_train, X_2_test,
    y_train, y_test,
) = train_test_split(
    view_0, view_1, view_2, y,
    test_size=0.2,
    random_state=3,
    stratify=y,
)

# Modality name -> feature matrix, in the form passed to the ensemble below.
data_train = dict(
    Modality_0=X_0_train,
    Modality_1=X_1_train,
    Modality_2=X_2_train,
)

data_test = dict(
    Modality_0=X_0_test,
    Modality_1=X_1_test,
    Modality_2=X_2_test,
)

In [4]:
# Base learners, keyed by a short display name.
base_predictors = dict(
    ADAB=AdaBoostClassifier(),
    XGB=XGBClassifier(),
    DT=DecisionTreeClassifier(),
    RF=RandomForestClassifier(),
    GB=GradientBoostingClassifier(),
    KNN=KNeighborsClassifier(),
    LR=LogisticRegression(),
    NB=GaussianNB(),
    MLP=MLPClassifier(),
    SVM=SVC(probability=True),
)

# Meta learners: two aggregation ensembles from eipy plus fresh instances of
# the same scikit-learn / XGBoost classifiers, keyed with an "S." prefix.
meta_predictors = {
    "Mean": MeanAggregation(),
    "CES": CES(),
    "S.ADAB": AdaBoostClassifier(),
    "S.XGB": XGBClassifier(),
    "S.DT": DecisionTreeClassifier(),
    "S.RF": RandomForestClassifier(),
    "S.GB": GradientBoostingClassifier(),
    "S.KNN": KNeighborsClassifier(),
    "S.LR": LogisticRegression(),
    "S.NB": GaussianNB(),
    "S.MLP": MLPClassifier(),
    "S.SVM": SVC(probability=True),
}

# Number of distinct class labels in the full label vector.
n_classes = np.unique(y).size

In [5]:
# Constructor arguments shared by every per-class ensemble.
ei_settings = dict(
    base_predictors=base_predictors,
    k_outer=5,
    k_inner=5,
    n_samples=1,
    sampling_strategy="undersampling",
    sampling_aggregation="mean",
    n_jobs=-1,
    random_state=38,
    project_name="toy",
    model_building=True,
)

# One-vs-rest: train a separate binary EnsembleIntegration for each class.
class_EIs = []
for c in range(n_classes):
    # Binary target: 1 where the label equals class c, 0 otherwise.
    y_train_c = list((y_train == c).astype(int))

    EI = EnsembleIntegration(**ei_settings)
    for name, modality in data_train.items():
        EI.train_base(modality, y_train_c, modality_name=name)
    EI.train_meta(meta_predictors=meta_predictors)

    class_EIs.append(EI)

Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


Training base predictors on Modality_0...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on Modality_1...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on Modality_2...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


In [6]:
# Display the meta-model metrics table stored on the class-0 ensemble.
class_EIs[0].meta_summary["metrics"]

Unnamed: 0,Mean,CES,S.ADAB,S.XGB,S.DT,S.RF,S.GB,S.KNN,S.LR,S.NB,S.MLP,S.SVM
fmax (minority),0.970711,0.985386,0.989562,0.993763,0.987448,0.995816,0.995851,0.995816,0.987654,0.995816,0.993711,0.993711
f (majority),0.985447,0.992716,0.994797,0.996872,0.993763,0.997921,0.997912,0.997921,0.993711,0.997921,0.996885,0.996885
AUC,0.996467,0.998429,0.995313,0.999931,0.989583,0.999939,0.99997,0.997865,0.999696,0.999913,0.99987,0.999913
max MCC,0.956177,0.971848,0.984364,0.99064,0.98123,0.993756,0.993782,0.993756,0.981537,0.993756,0.990639,0.990639


In [7]:
# Accumulates one list of confidence scores per class (filled by the loop below).
y_preds = []
# NOTE(review): never appended to or read in the visible code — looks unused; confirm before removing.
thresholds = []

# Confidence relative to the threshold, weighted by the threshold.
# This is basically the only design decision in the whole implementation.
def confidence(prediction, thresh):
    """Return the signed, threshold-weighted margin of a prediction.

    Parameters
    ----------
    prediction : float
        Score to compare against the threshold.
    thresh : float
        Decision threshold for that score.

    Returns
    -------
    float
        ``(prediction - thresh) * thresh`` when the prediction is at or
        above the threshold, otherwise ``(prediction - thresh) * (1 - thresh)``.
    """
    # Compare against the *argument*, not a same-named notebook global:
    # the function must give the same answer wherever it is called from.
    margin = prediction - thresh
    if margin >= 0:
        return margin * thresh
    return margin * (1 - thresh)

# Alternative scoring: plain, unweighted distance from the threshold.
# Performs worse on the toy data.
def alt_confidence(prediction, thresh):
    """Return how far ``prediction`` lies above (+) or below (-) ``thresh``."""
    distance = prediction - thresh
    return distance
               

for c in range(n_classes):
    summary = class_EIs[c].meta_summary

    # Use the meta model with the highest "fmax (minority)" score for this class.
    class_preferred_model = summary["metrics"].loc["fmax (minority)"].idxmax()

    # Threshold stored for that model under the same metric.
    threshold = summary["thresholds"][class_preferred_model]["fmax (minority)"]

    y_pred = class_EIs[c].predict(
        X_dict=data_test,
        meta_model_key=class_preferred_model,
    )

    # Convert each raw prediction into a confidence score for class c.
    y_pred = [confidence(score, threshold) for score in y_pred]
    y_preds.append(y_pred)

# Rows are classes, columns are test samples.
y_preds = np.array(y_preds)

# For every test sample, pick the class with the largest confidence score
# (on ties the lowest class index wins, as with a per-sample argmax).
final_pred = list(np.argmax(y_preds, axis=0))
    

In [8]:
# Display the predicted class index for every test sample.
final_pred

[2,
 1,
 2,
 0,
 0,
 2,
 1,
 2,
 0,
 2,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 0,
 2,
 2,
 0,
 2,
 0,
 0,
 2,
 1,
 0,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 1,
 0,
 2,
 2,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 0,
 0,
 2,
 0,
 2,
 0,
 1,
 0,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 1,
 1,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 2,
 2,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 1,
 1,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 0,
 1,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 2,
 1,
 2,
 1,
 2,
 0,
 1]

In [9]:
# Display the true test labels for comparison with the predictions above.
y_test

array([2, 1, 2, 0, 0, 2, 1, 2, 0, 2, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 0, 0,
       2, 0, 1, 1, 1, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 1, 0, 1, 0, 0, 2, 1,
       1, 2, 1, 0, 2, 2, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0, 0, 2, 0, 2,
       0, 1, 0, 1, 1, 2, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 2, 1, 0, 0, 1, 1,
       2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 0, 0, 0, 2, 0, 1, 1, 2, 2,
       2, 2, 2, 0, 2, 0, 1, 1, 2, 0, 1, 0, 2, 2, 2, 2, 1, 0, 0, 2, 1, 1,
       2, 1, 1, 2, 0, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 0, 2, 1, 2,
       1, 2, 0, 1])

In [10]:
# Fraction of test samples whose predicted class matches the true label.
matches = [truth == guess for truth, guess in zip(y_test, final_pred)]
accuracy = sum(matches) / len(y_test)
accuracy

1.0