In [1]:
import numpy as np 
import pandas as pd
data = pd.read_pickle('multi_label_sample_10000_feature_800.pickle')

In [2]:
data.shape

(3316, 924)

In [3]:
X = data.iloc[:,:800]
Y = data.iloc[:,800:]
Y.shape[1]
X_train = X.loc[:2500].as_matrix()
X_test = X.loc[2500:].as_matrix()
Y_train = Y.loc[:2500].as_matrix()
Y_test = Y.loc[2500:].as_matrix()

# KNN

In [4]:
from skmultilearn.adapt import MLkNN
knn = MLkNN(k=3)
knn.fit(X_train, Y_train)

MLkNN(ignore_first_neighbours=0, k=3, s=1.0)

In [5]:
knn.score(X_train,Y_train)

0.3218712514994002

In [6]:
knn.score(X_test,Y_test)

0.028186274509803922

# rf

In [7]:
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import BinaryRelevance

# construct base forest classifier
base_classifier = RandomForestClassifier(n_estimators= 100, random_state=1)

rf = LabelPowerset(base_classifier, require_dense = [False, False])

# train
rf.fit(X_train,Y_train)



LabelPowerset(classifier=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       require_dense=[False, False])

In [8]:
rf.score(X_train,Y_train)

1.0

In [9]:
rf.score(X_test,Y_test)

0.07965686274509803

In [10]:
#rf_pred = rf.predict(X_test)
#from sklearn.metrics import accuracy_score
#accuracy_score(Y_test,rf_pred)

# SVM

In [11]:
from sklearn.svm import SVC
# svm with linear kernel
linear_base = SVC(kernel='linear')
linear_svm = LabelPowerset(linear_base, require_dense = [False, False])

# train
linear_svm.fit(X_train,Y_train)

LabelPowerset(classifier=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       require_dense=[False, False])

In [12]:
linear_svm.score(X_train,Y_train)

1.0

In [13]:
linear_svm.score(X_test,Y_test)

0.4178921568627451

In [14]:
#svm with rbf kernel
rbf_base = SVC(kernel='rbf')
rbf_svm = LabelPowerset(rbf_base, require_dense = [False, False])

# train
rbf_svm.fit(X_train,Y_train)

LabelPowerset(classifier=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       require_dense=[False, False])

In [15]:
rbf_svm.score(X_train,Y_train)

0.062375049980008

In [16]:
rbf_svm.score(X_test,Y_test)

0.041666666666666664

# NNET 

In [17]:
from sklearn.neural_network import MLPClassifier

In [18]:
nnet = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(3, 300), random_state=1)
nnet.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(3, 300), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [19]:
nnet.score(X_train, Y_train)

0.06517393042782887

In [20]:
nnet.score(X_test, Y_test)

0.03676470588235294

# using binary relevance transformation

In [21]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
#classifier = BinaryRelevance(GaussianNB())
classifier = BinaryRelevance(MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(3, 300), random_state=1))


# train
classifier.fit(X_train, Y_train)


BinaryRelevance(classifier=MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(3, 300), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False),
        require_dense=[True, True])

In [22]:
# predict
train_pred = classifier.predict(X_train)
from sklearn.metrics import accuracy_score
accuracy_score(Y_train,train_pred)

0.8864454218312675

In [23]:
test_pred = classifier.predict(X_test)
accuracy_score(Y_test,test_pred)

0.058823529411764705