## Multi-output à la scikit-learn
From: https://calmcode.io/course/scikit-meta/multi-output

In [3]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor


# Load the Titanic passenger table from the CSV hosted on calmcode
# and preview its first rows.
df = pd.read_csv("https://calmcode.io/static/data/titanic.csv")
df.head()


Unnamed: 0,survived,pclass,name,sex,age,fare,sibsp,parch
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,1,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,1,0
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,0,0


In [5]:
# Two targets per passenger, in this column order: `survived`, then `pclass`.
labels = df[['survived', 'pclass']].to_numpy()

# Features: `sex` encoded as a boolean (True means male), followed by age and fare.
X = df[['sex', 'age', 'fare']].assign(sex=lambda d: d['sex'].eq('male'))
X

Unnamed: 0,sex,age,fare
0,True,22.0,7.2500
1,False,38.0,71.2833
2,False,26.0,7.9250
3,False,35.0,53.1000
4,True,35.0,8.0500
...,...,...,...
709,False,39.0,29.1250
710,True,27.0,13.0000
711,False,19.0,30.0000
712,True,26.0,30.0000


In [8]:
# Sanity check: one row per passenger and one column per target label.
labels.shape

(714, 2)

In [7]:
# MultiOutputClassifier fits one separate LogisticRegression per label column
# (one for `survived`, one for `pclass`), not a single joint model.
clf = (
    MultiOutputClassifier(estimator=LogisticRegression())
    .fit(X, labels)
)
clf.predict(X)

array([[0, 3],
       [1, 1],
       [1, 3],
       ...,
       [1, 2],
       [0, 2],
       [0, 3]])

In [14]:
# `classes_` holds one array per label: the first label (`survived`) has
# two classes, while the second (`pclass`) has three.
clf.classes_

[array([0, 1]), array([1, 2, 3])]

In [9]:
# The predictions have the same shape as `labels`: one column per output.
clf.predict(X).shape

(714, 2)

In [10]:
# Same wrapper with a different base estimator: one KNeighborsClassifier
# is fitted per label column. Note that this rebinds `clf`.
clf = (
    MultiOutputClassifier(estimator=KNeighborsClassifier())
    .fit(X, labels)
)
clf.predict(X)

array([[0, 3],
       [1, 1],
       [1, 3],
       ...,
       [1, 2],
       [1, 1],
       [0, 3]])

In [11]:
# `predict_proba` returns a list with one array per label; each array has
# one row per sample and one column per class of that label.
clf.predict_proba(X)


[array([[0.8, 0.2],
        [0.4, 0.6],
        [0.4, 0.6],
        ...,
        [0. , 1. ],
        [0.4, 0.6],
        [0.8, 0.2]]),
 array([[0. , 0. , 1. ],
        [0.8, 0.2, 0. ],
        [0. , 0. , 1. ],
        ...,
        [0.4, 0.6, 0. ],
        [0.6, 0.4, 0. ],
        [0. , 0. , 1. ]])]

In [13]:
# Shape of each per-label probability array: (n_samples, n_classes of that label).
[proba.shape for proba in clf.predict_proba(X)]

[(714, 2), (714, 3)]

In [15]:
# The fitted per-label estimators: one KNeighborsClassifier for each output column.
clf.estimators_


[KNeighborsClassifier(), KNeighborsClassifier()]

## Multi-output à la river

In [25]:
from river import stream
from sklearn import datasets

# Wrap the OpenML 'Titanic' dataset (version 1) in a river stream so it can be
# read one (features, target) pair at a time. The order is shuffled with a
# fixed seed (42).
dataset = stream.iter_sklearn_dataset(
    dataset=datasets.fetch_openml('Titanic', version=1, parser='auto', as_frame=True),
    shuffle=True,
    seed=42
    )


In [30]:
# Pull one (features, target) pair from the stream and inspect the raw
# feature dict. `pclass` arrives as a feature here, not as a label.
x, y = next(iter(dataset))
x

{'pclass': 2,
 'name': 'Mellinger, Miss. Madeleine Violet',
 'sex': 'female',
 'age': 13.0,
 'sibsp': 0,
 'parch': 1,
 'ticket': '250644',
 'fare': 19.5,
 'cabin': nan,
 'embarked': 'S',
 'boat': '14',
 'body': nan,
 'home.dest': 'England / Bennington, VT'}

In [32]:
# The raw target of the sample; it is shown as a string, hence the int()
# conversion when the labels are built later in the notebook.
y

'0'

In [35]:
# Keep the same three features as in the scikit-learn example, so that both
# approaches work with the same features and labels.
feat_list = ['sex', 'age', 'fare']

def transform_dict(x, y, feat):
    """Split one streamed sample into a feature dict and a multi-label dict.

    Parameters
    ----------
    x : dict
        Raw feature mapping of a single sample. It must contain every key
        listed in ``feat`` as well as ``'pclass'``.
    y : str or int
        Survival flag of the sample; it is converted with ``int()``.
    feat : iterable of str
        Keys of ``x`` to keep as model features.

    Returns
    -------
    tuple of (dict, dict)
        The selected features, followed by the labels
        ``{'survived': int(y), 'pclass': x['pclass']}``.
    """
    # Keep only the requested features, in the order given by `feat`.
    features = dict((name, x[name]) for name in feat)
    # `pclass` is moved from the feature side to the label side.
    targets = dict(survived=int(y), pclass=x['pclass'])
    return features, targets


In [36]:
# Draw another sample from the stream and split it into the feature dict
# and the two-label dict (`survived`, `pclass`).
x, y = next(iter(dataset))
new_x, new_y = transform_dict(x, y, feat_list)
new_x, new_y

({'sex': 'male', 'age': 30.0, 'fare': 57.75}, {'survived': 1, 'pclass': 1})