In [6]:
import numpy as np
import pandas as pd
from skmultilearn.dataset import load_dataset
import sklearn.metrics as metrics
import scipy.sparse as sparse
import seaborn as sns

# Name of the benchmark dataset, kept in one place so both splits
# are guaranteed to come from the same source.
DATASET_NAME = 'emotions'

# Training split: feature matrix, label matrix, plus feature/label names.
X_train, y_train, feature_names, label_names = load_dataset(DATASET_NAME, 'train')

# Test split: only the matrices are kept; the names returned alongside
# them are discarded.
X_test, y_test, _, _ = load_dataset(DATASET_NAME, 'test')

emotions:train - exists, not redownloading
emotions:test - exists, not redownloading


In [7]:
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import IGraphLabelGraphClusterer, LabelCooccurrenceGraphBuilder
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

# Fixed seed for the random forest so that fitting -- and therefore
# `predictions` -- is reproducible under Restart Kernel -> Run All.
RANDOM_SEED = 42

# Base learner: a 1000-tree random forest, seeded for reproducibility.
base_classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=RANDOM_SEED,
)

# Graph builder for the label co-occurrence graph: edges are weighted by
# how many times two labels co-occurred in the data, without self-edges.
graph_builder = LabelCooccurrenceGraphBuilder(
    weighted=True,
    include_self_edges=False,
)

# Problem transformation approach wrapped around the forest.
# require_dense=[False, False] keeps the matrices handed to the base
# classifier sparse (no dense conversion requested for X or y).
problem_transform_classifier = LabelPowerset(
    classifier=base_classifier,
    require_dense=[False, False],
)

# Label-space clusterer: fast greedy modularity maximization on the
# co-occurrence graph produced by `graph_builder`.
clusterer = IGraphLabelGraphClusterer(graph_builder=graph_builder, method='fastgreedy')

# Ensemble meta-classifier combining the problem-transformation
# classifier with the label-space clusterer.
classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer)

# Train on the training split.
classifier.fit(X_train, y_train)

# Predict label assignments for the test split.
predictions = classifier.predict(X_test)