In [1]:
import numpy as np
import pandas as pd

from skmultilearn.dataset import load_dataset
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import BRkNNaClassifier
from skmultilearn.adapt import MLkNN
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelD

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import sklearn.metrics as metrics
import scipy.sparse as sparse

import copy

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Select seaborn's "whitegrid" theme for the figures drawn later in the notebook.
sns.set_theme(style="whitegrid")

In [10]:
def trans_result(y_testing, pred):
    """Score a multi-label prediction one label column at a time.

    Parameters
    ----------
    y_testing : matrix exposing ``.shape`` and ``.todense()``
        Ground-truth label indicator matrix (one column per label).
    pred : matrix exposing ``.todense()``
        Predicted label indicator matrix with the same column layout.

    Returns
    -------
    pandas.DataFrame
        One row per label with the columns ``name``, ``precision``,
        ``recall``, ``accuracy`` and ``f1``.

    Notes
    -----
    Column names are taken from the notebook-level ``label_names``
    (first element of each entry), so that variable must exist when the
    function is called.
    """
    n_labels = y_testing.shape[1]
    names = [label_names[idx][0] for idx in range(n_labels)]
    truth_df = pd.DataFrame(y_testing.todense(), columns=names)
    guess_df = pd.DataFrame(pred.todense(), columns=names)

    def _score(label):
        # Compare the true and predicted columns of a single label.
        truth, guess = truth_df[label], guess_df[label]
        return {
            "name": label,
            "precision": metrics.precision_score(truth, guess),
            "recall": metrics.recall_score(truth, guess),
            "accuracy": metrics.accuracy_score(truth, guess),
            "f1": metrics.f1_score(truth, guess),
        }

    return pd.DataFrame([_score(label) for label in truth_df.columns])

def feature_filter(X_training, X_testing, filter_arr):
    """Keep only the selected feature columns of the train and test matrices.

    Parameters
    ----------
    X_training, X_testing : scipy sparse matrix or array-like, 2-D
        Feature matrices with samples in rows and features in columns.
    filter_arr : array-like of int (or boolean mask)
        Column selector applied identically to both matrices.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Dense ``(n_samples, n_selected)`` arrays for training and testing.

    Notes
    -----
    The columns are selected with a single vectorized indexing operation
    instead of a Python loop over ``np.matrix`` rows, so a matrix with zero
    rows keeps its 2-D ``(0, n_selected)`` shape and dense inputs are
    accepted as well.
    """
    def _select_columns(X):
        # Densify sparse input once; np.asarray also unwraps np.matrix.
        dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        return dense[:, filter_arr]

    return _select_columns(X_training), _select_columns(X_testing)

## Import Data

In [11]:
# Load the 'train' and 'test' variants of the 'emotions' dataset.
# The train call also supplies feature_names and label_names, which later
# cells index as entry[0] to get a display name; the copies returned by the
# test call are discarded.
X_train, y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')

emotions:train - exists, not redownloading
emotions:test - exists, not redownloading


In [9]:
# Peek at the first label: element 0 of the first label_names entry is its name.
label_names[0][0]

'amazed-suprised'

## Select `base classifier` and `label classifier`

In [12]:
# Base single-label estimator that the multi-label wrapper will train.
# random_state is pinned so the forest -- and therefore the per-label scores
# and feature importances compared later in the notebook -- is reproducible
# across re-runs instead of changing with every fit.
base_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [13]:
# Wrap the base estimator in a LabelPowerset problem transformation.
# NOTE(review): require_dense=None presumably leaves the dense/sparse input
# handling to the library default -- confirm in the scikit-multilearn docs.
clf = LabelPowerset(classifier=base_clf, require_dense=None)

In [14]:
# Train on the full feature set, predict the test split, then tabulate the
# per-label precision / recall / accuracy / F1.
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
result_df = trans_result(y_test, prediction)
result_df

Unnamed: 0,name,precision,recall,accuracy,f1
0,amazed-suprised,0.59322,0.648148,0.787129,0.619469
1,happy-pleased,0.596154,0.525424,0.757426,0.558559
2,relaxing-calm,0.706422,0.802083,0.747525,0.75122
3,quiet-still,0.746269,0.847458,0.871287,0.793651
4,sad-lonely,0.807018,0.630137,0.811881,0.707692
5,angry-aggresive,0.693333,0.896552,0.856436,0.781955


## Show feature importances

In [None]:
# Importances of the forest trained through the LabelPowerset wrapper
# (clf.fit trains base_clf itself, which is why the attribute exists here).
impt = base_clf.feature_importances_
# Guard against stale kernel state: if base_clf was last fitted on a reduced
# feature matrix, the positional indices below would point at the wrong features.
assert len(impt) == X_train.shape[1], "base_clf was not fitted on the full feature set"
# Index features by position, deriving the count from the data rather than
# hard-coding it.
impt_df = pd.DataFrame({"feature": range(len(impt)), "impt": impt})
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(x="feature", y="impt", data=impt_df, ax=ax)
ax.set(xlabel="feature index", ylabel="importance", title="Random forest feature importances");

In [None]:
# Zoom in on the features whose importance reaches the 0.0175 cut-off.
sns.barplot(data=impt_df.loc[impt_df["impt"] >= 0.0175], x="feature", y="impt")

In [None]:
# Print the name of every feature at or above the 0.0175 importance cut-off.
for feat_idx in impt_df.loc[impt_df["impt"] >= 0.0175, "feature"]:
    print(feature_names[feat_idx][0])

## Apply only important features and retrain

### Use all features

In [None]:
# Baseline for the comparisons that follow: refit on every feature and score
# the test split per label.
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
result_df = trans_result(y_test, prediction)
result_df

### Importance >= 0.015, 26 features
Improved: L2, L6 <br>
Reduced: L1, L3, L4, L5

In [None]:
# Keep only the features whose importance is at least 0.0150.
impt_arr = impt_df.loc[impt_df["impt"] >= 0.0150, "feature"].to_numpy()
X_train_new, X_test_new = feature_filter(X_train, X_test, impt_arr)
# Fit a deep copy of the wrapper: fitting `clf` directly would retrain the
# shared base_clf in place and overwrite the all-features model (and its
# feature_importances_) that other cells rely on.
clf_new = copy.deepcopy(clf).fit(X_train_new, y_train)
prediction_new = clf_new.predict(X_test_new)
result_df_new = trans_result(y_test, prediction_new)
result_df_new

### Importance >= 0.0175, 9 features
Improved: L4, L5

In [None]:
# Keep only the features whose importance is at least 0.0175.
impt_arr = impt_df.loc[impt_df["impt"] >= 0.0175, "feature"].to_numpy()
X_train_new, X_test_new = feature_filter(X_train, X_test, impt_arr)
# Fit a deep copy of the wrapper: fitting `clf` directly would retrain the
# shared base_clf in place and overwrite the all-features model (and its
# feature_importances_) that other cells rely on.
clf_new = copy.deepcopy(clf).fit(X_train_new, y_train)
prediction_new = clf_new.predict(X_test_new)
result_df_new = trans_result(y_test, prediction_new)
result_df_new

## Show details of `label classifiers`

In [None]:
# Inspect the label combinations recorded by the fitted LabelPowerset model.
# NOTE(review): presumably maps each distinct label set seen in training to
# its class id -- confirm against the scikit-multilearn documentation.
clf.unique_combinations_

In [None]:
# Number of features with importance of at least 0.0150.
impt_df.loc[impt_df["impt"] >= 0.0150].shape[0]

In [None]:
# Number of features with importance of at least 0.0175.
impt_df.loc[impt_df["impt"] >= 0.0175].shape[0]