## Multi-output à la scikit-learn
From: https://calmcode.io/course/scikit-meta/multi-output

In [9]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor


# Titanic passenger table hosted by calmcode (downloaded on each run)
TITANIC_CSV_URL = "https://calmcode.io/static/data/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL)
df.head()


Unnamed: 0,survived,pclass,name,sex,age,fare,sibsp,parch
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,1,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,1,0
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,0,0


In [2]:
# Two targets per row: the `survived` and `pclass` columns, as a 2-D array.
target_cols = ['survived', 'pclass']
labels = df[target_cols].values

# Encode `sex` as a boolean ("male" -> True) and keep three feature columns.
is_male = df['sex'] == 'male'
X = df.assign(sex=is_male)[['sex', 'age', 'fare']]
X

Unnamed: 0,sex,age,fare
0,True,22.0,7.2500
1,False,38.0,71.2833
2,False,26.0,7.9250
3,False,35.0,53.1000
4,True,35.0,8.0500
...,...,...,...
709,False,39.0,29.1250
710,True,27.0,13.0000
711,False,19.0,30.0000
712,True,26.0,30.0000


In [3]:
labels.shape

(714, 2)

In [4]:
# MultiOutputClassifier fits one separate LogisticRegression per label column
clf = MultiOutputClassifier(LogisticRegression())
clf.fit(X, labels)
clf.predict(X)

array([[0, 3],
       [1, 1],
       [1, 3],
       ...,
       [1, 2],
       [0, 2],
       [0, 3]])

In [5]:
# We notice that the first label has 2 classes while the second has three
clf.classes_

[array([0, 1]), array([1, 2, 3])]

In [6]:
clf.predict(X).shape

(714, 2)

In [7]:
# Same wrapper, now with one k-nearest-neighbours classifier per label column
clf = MultiOutputClassifier(KNeighborsClassifier())
clf.fit(X, labels)
clf.predict(X)

array([[0, 3],
       [1, 1],
       [1, 3],
       ...,
       [1, 2],
       [1, 1],
       [0, 3]])

In [8]:
clf.predict_proba(X)


[array([[0.8, 0.2],
        [0.4, 0.6],
        [0.4, 0.6],
        ...,
        [0. , 1. ],
        [0.4, 0.6],
        [0.8, 0.2]]),
 array([[0. , 0. , 1. ],
        [0.8, 0.2, 0. ],
        [0. , 0. , 1. ],
        ...,
        [0.4, 0.6, 0. ],
        [0.6, 0.4, 0. ],
        [0. , 0. , 1. ]])]

In [9]:
[arr.shape for arr in clf.predict_proba(X)]

[(714, 2), (714, 3)]

In [10]:
clf.estimators_


[KNeighborsClassifier(), KNeighborsClassifier()]

## Multi-output à la river

### Create dataset for streaming applications

In [1]:
from river import stream
from sklearn import datasets

# Fetch the OpenML Titanic dataset, then expose it as a shuffled, seeded
# stream that yields one (features, target) pair at a time.
titanic = datasets.fetch_openml('Titanic', version=1, parser='auto', as_frame=True)
dataset = stream.iter_sklearn_dataset(dataset=titanic, shuffle=True, seed=42)


In [2]:
x, y = next(iter(dataset))
x

{'pclass': 1,
 'name': 'Penasco y Castellana, Mr. Victor de Satode',
 'sex': 'male',
 'age': 18.0,
 'sibsp': 1,
 'parch': 0,
 'ticket': 'PC 17758',
 'fare': 108.9,
 'cabin': 'C65',
 'embarked': 'C',
 'boat': nan,
 'body': nan,
 'home.dest': 'Madrid, Spain'}

In [3]:
y

'0'

In [4]:
# use the same features and labels as the scikit-learn example above
feat_list = ['sex', 'age', 'fare']

def transform_dict(x, y, feat):
    """Split one streamed sample into a feature dict and a label dict.

    Parameters
    ----------
    x : dict
        Raw sample; must contain every key listed in `feat` plus 'pclass'.
    y
        Raw target; converted with `int()` to give the 'survived' label.
    feat : iterable
        Keys of `x` to keep as features.

    Returns
    -------
    tuple of (dict, dict)
        The selected features, and the labels 'survived' and 'pclass'.
    """
    # Copy only the requested keys, preserving the order given in `feat`.
    features = {}
    for key in feat:
        features[key] = x[key]

    # The second label ('pclass') is taken from the raw sample itself.
    targets = dict(survived=int(y), pclass=x['pclass'])
    return features, targets


In [5]:
# Pull one more sample: `dataset` is consumed as it is iterated, so this is
# not the same sample that the earlier `next(iter(dataset))` cell returned.
x, y = next(iter(dataset))
# Keep only the selected features and build the two-label target dict.
new_x, new_y = transform_dict(x, y, feat_list)
new_x, new_y

({'sex': 'male', 'age': 33.0, 'fare': 5.0}, {'survived': 0, 'pclass': 1})

In [6]:
import functools
from river import neighbors, multioutput, linear_model
from river.utils.math import minkowski_distance

### Test instance creation

In [7]:
# Minkowski distance with p=2, i.e. the Euclidean distance
euclidean = functools.partial(minkowski_distance, p=2)

# One 1-nearest-neighbour classifier per output, using a LazySearch engine
# configured with window_size=10.
knn_inst = multioutput.PerOutputClassifier(
    model=neighbors.KNNClassifier(
        n_neighbors=1,
        engine=neighbors.LazySearch(window_size=10, dist_func=euclidean),
    )
)


### Test multi-class property

In [8]:
# check if the model is multi-class
knn_inst._multiclass

True

In [9]:
# check predicted proba before seeing any data
# NOTE(review): this passes the raw sample `x`, not the `new_x` returned by
# transform_dict — presumably harmless while the model is untrained, but
# confirm before reusing this call after any learning step.
knn_inst.predict_proba_one(x)

{}

In [11]:
# check predictions before seeing any data
print(knn_inst.predict_one(x))

{}


In [12]:
# Same per-output wrapper, this time around a logistic regression
lin_inst = multioutput.PerOutputClassifier(model=linear_model.LogisticRegression())


In [13]:
lin_inst._multiclass

False

In [14]:
lin_inst.__dict__

{'model': LogisticRegression (
   optimizer=SGD (
     lr=Constant (
       learning_rate=0.01
     )
   )
   loss=Log (
     weight_pos=1.
     weight_neg=1.
   )
   l2=0.
   l1=0.
   intercept_init=0.
   intercept_lr=Constant (
     learning_rate=0.01
   )
   clip_gradient=1e+12
   initializer=Zeros ()
 )}

In [15]:
help(multioutput.PerOutputClassifier)

Help on class PerOutputClassifier in module river.multioutput.peroutput:

class PerOutputClassifier(river.base.multi_output.MultiLabelClassifier)
 |  PerOutputClassifier(model: 'base.Classifier')
 |  
 |  Multi-target classification.
 |  
 |  This class implements a classification strategy where a classifier is 
 |  fitting multiple targets separately.
 |  
 |  
 |  Parameters
 |  ----------
 |  model
 |      The classifier used for learning.
 |  
 |  Examples
 |  --------
 |  
 |  Method resolution order:
 |      PerOutputClassifier
 |      river.base.multi_output.MultiLabelClassifier
 |      river.base.estimator.Estimator
 |      river.base.base.Base
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, model: 'base.Classifier')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  learn_one(self, x, y)
 |      Update the model with a set of features `x` and the labels `y`.
 |      
 |      Parameters
 |      --------