## Multi-label Classification

## Import Packages 

In [91]:
# Import relevant packages

import numpy as np
import matplotlib.pyplot as plt
import math
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import MetaEstimatorMixin
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score , multilabel_confusion_matrix
#from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import check_X_y, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression

##  Load the Yeast Dataset

In [64]:
# Load the Yeast DataSet

dataset = pd.read_csv('yeast.csv')
dataset.head(3)
X = dataset[dataset.columns[1:103]]
Y = dataset[dataset.columns[103:117]]
Y.head(3)

Unnamed: 0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0,0,0,0,0,0,1,1,0,0,0,1,1,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,1,1,0


##  Implement the Binary Relevance Algorithm

In [65]:
# Write your code here
# Implementation of binary relevance function

class BinaryR(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
    
    def __init__(self,estimator):
        
        self.estimator = estimator
        self.estimators_ =[]
       
        
    def fit(self, X, y):
        
        X,y = check_X_y(X,y, accept_sparse = True, multi_output = True)
        y = check_array(y,accept_sparse=True)
        
        for i in range(y.shape[1]):
            self.estimators_.append(clone(self.estimator).fit(X,y[:,i])) 
    
    
            
    def predict(self,X):
        pred =[]
                
        check_is_fitted(self,'estimators_')
        for e in self.estimators_ :
            pred.append(e.predict(X))
        arr = np.asarray(pred)
        return arr.T

In [66]:
#Splitting the data into 70-30% train and test sets

X_train,X_test,y_train,y_test = train_test_split(X,Y,train_size=0.7) 

In [68]:
# Implementation of binary relevance with Grid Search
# This gives the best performing estimator

param_grid = [
             
             {'estimator':[DecisionTreeClassifier()], 'estimator__criterion': ['gini', 'entropy'], 'estimator__max_depth':[10, 25, 50], 'estimator__min_samples_split': [2,5,10]},
             {'estimator':[SVC()], 'estimator__C':[0.1,1,10,100,1000], 'estimator__gamma':[1,0.1,0.01,0.001,0.0001], 'estimator__kernel':['rbf','linear']},
             {'estimator':[LogisticRegression()], 'estimator__max_iter':[1000,20000,40000]}
]

model_tuned = GridSearchCV(BinaryR(param_grid), param_grid, verbose=2, n_jobs=1, scoring=make_scorer(hamming_loss, greater_is_better = False))
model_tuned.fit(X_train, y_train)

print("Best param set:")
print(model_tuned.best_params_)
print(model_tuned.best_score_)

Fitting 5 folds for each of 71 candidates, totalling 355 fits
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=10, estimator__min_samples_split=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=10, estimator__min_samples_split=2, total=   1.6s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__crit

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=10, estimator__min_samples_split=2, total=   1.6s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__crit

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=10, estimator__min_samples_split=5, total=   1.6s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__crit

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=25, estimator__min_samples_split=2, total=   2.4s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__crit

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=25, estimator__min_samples_split=10, total=   3.2s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__cri

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=50, estimator__min_samples_split=2, total=   2.5s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__crit

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=gini, estimator__max_depth=50, estimator__min_samples_split=10, total=   2.9s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__cri

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=entropy, estimator__max_depth=10, estimator__min_samples_split=2, total=   2.4s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__c

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=entropy, estimator__max_depth=10, estimator__min_samples_split=10, total=   2.7s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=entropy, estimator__max_depth=25, estimator__min_samples_split=5, total=   2.6s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__c

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=entropy, estimator__max_depth=25, estimator__min_samples_split=10, total=   2.7s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=entropy, estimator__max_depth=50, estimator__min_samples_split=5, total=   6.0s
[CV] estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__c

[CV]  estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), estimator__criterion=entropy, estimator__max_depth=50, estimator__min_samples_split=10, total=   6.6s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=1, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', d

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf, total=   5.9s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rb

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.01, estimator__kernel=rbf, total=   5.0s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.01, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.01, estimator__ker

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=linear, total=   4.5s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.001, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=0.1, estimator__gamma=0.001, estimato

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=1, estimator__kernel=rbf, total=   7.3s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=1, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=1, estimator__kernel=rbf, total=   

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.1, estimator__kernel=rbf, total=   6.3s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.1, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.1, estimator__kernel=rbf, tot

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.01, estimator__kernel=linear, total=   4.6s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.01, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.01, estimator__kernel

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.001, estimator__kernel=linear, total=   4.6s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.0001, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1, estimator__gamma=0.0001, estimator__kern

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=1, estimator__kernel=rbf, total=   9.4s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=1, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=1, estimator__kernel=rbf, total=

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.1, estimator__kernel=linear, total=   6.5s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.1, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.1, estimator__kernel

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.01, estimator__kernel=linear, total=   6.5s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.01, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.01, estimator__ker

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf, total=   5.1s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=10, estimator__gamma=0.0001, estimator__ker

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=1, estimator__kernel=rbf, total=   9.3s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=1, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=1, estimator__kernel=linea

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.1, estimator__kernel=linear, total=  16.5s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.1, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.1, estimator__ker

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.001, estimator__kernel=rbf, total=   5.9s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.001, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.001, estimator__ker

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.0001, estimator__kernel=rbf, total=   5.2s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.0001, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=100, estimator__gamma=0.0001, estimator__

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=1, estimator__kernel=linear, total= 2.8min
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=1, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=1, estimator__kernel

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.1, estimator__kernel=linear, total= 3.3min
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.01, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.01, estimator__k

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.001, estimator__kernel=rbf, total=  10.4s
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.001, estimator__kernel=rbf 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.001, estimator__

[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=linear, total= 3.2min
[CV] estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=linear 
[CV]  estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), estimator__C=1000, estimator__gamma=0.0001, es

[CV]  estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), estimator__max_iter=20000, total=   0.4s
[CV] estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), estimator__max_iter=20000 
[CV]  estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   

[Parallel(n_jobs=1)]: Done 355 out of 355 | elapsed: 111.7min finished


Best param set:
{'estimator': SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False), 'estimator__C': 1, 'estimator__gamma': 1, 'estimator__kernel': 'rbf'}
-0.1910114278732386


### Random Forest Classifier

In [71]:
#Implement binary relevance with base estimator RandomForest

model_rf = BinaryR(RandomForestClassifier(n_estimators = 100))
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
#print(("F1 Score : %f " ) % (f1_score(y_test,y_pred,average ='macro')))


Hamming Loss : 0.189197 


### Decision Tree Classifier

In [72]:
#Implement binary relevance with base estimator DecisionTree

model_dt = BinaryR(DecisionTreeClassifier())
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
#print(("F1 Score : %f " ) % (f1_score(y_test,y_pred,average ='macro')))

Hamming Loss : 0.275679 


### SVM

In [73]:
#Implement binary relevance with base estimator SVM

model_svm = BinaryR(SVC(gamma ='auto'))
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
#print(("F1 Score : %f " ) % (f1_score(y_test,y_pred,average ='macro')))

Hamming Loss : 0.227863 


## Implement the Binary Relevance Algorithm with Under-Sampling

In [74]:
# Implementation of Binary relevance with under-sampling function

class BinaryR_UnderSampling(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
    
    def __init__(self,estimator):
        
        self.estimator = estimator
        self.estimators_ =[]
       
        
    def fit(self, X, y):
        
        X,y = check_X_y(X,y, accept_sparse = True)
        self.estimators_.append(self.estimator.fit(X,y)) 
    
    
            
    def predict(self,X):
        pred =[]
                
        check_is_fitted(self,'estimators_')
        for e in self.estimators_ :
            pred.append(e.predict(X))
        arr = np.asarray(pred)
        return arr.T


In [75]:
# Implement binary relevance with under-sampling on RandomForest, DecisionTree and SVM estimators

for col in Y:
    target_count = Y[col].value_counts()

    label0_ratio = round(target_count[0] / (target_count[1] + target_count[0]), 2)
    label1_ratio = round(target_count[1] / (target_count[1] + target_count[0]), 2)
    label_ratio = (label0_ratio/label1_ratio)

    if label_ratio != 1:
        rus = RandomUnderSampler(random_state=42)
        X_res, y_res = rus.fit_resample(X, Y[col])
        X_train,X_test,y_train,y_test = train_test_split(X_res,y_res,train_size=0.7) 
        model_dt = BinaryR_UnderSampling(DecisionTreeClassifier())
        model_dt.fit(X_train, y_train)
        y_pred = model_dt.predict(X_test)
        print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
        
        model_rf = BinaryR_UnderSampling(RandomForestClassifier(n_estimators = 100))
        model_rf.fit(X_train, y_train)
        y_pred = model_rf.predict(X_test)
        print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
        
        model_svm = BinaryR_UnderSampling(SVC(gamma ='auto'))
        model_svm.fit(X_train, y_train)
        y_pred = model_dt.predict(X_test)
        print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
       

Hamming Loss : 0.353712 
Hamming Loss : 0.268559 
Hamming Loss : 0.353712 
Hamming Loss : 0.415730 
Hamming Loss : 0.357945 
Hamming Loss : 0.415730 
Hamming Loss : 0.347458 
Hamming Loss : 0.257627 
Hamming Loss : 0.347458 
Hamming Loss : 0.395753 
Hamming Loss : 0.291506 
Hamming Loss : 0.395753 
Hamming Loss : 0.400922 
Hamming Loss : 0.329493 
Hamming Loss : 0.400922 
Hamming Loss : 0.392758 
Hamming Loss : 0.309192 
Hamming Loss : 0.392758 
Hamming Loss : 0.435798 
Hamming Loss : 0.361868 
Hamming Loss : 0.435798 
Hamming Loss : 0.465278 
Hamming Loss : 0.388889 
Hamming Loss : 0.465278 
Hamming Loss : 0.467290 
Hamming Loss : 0.560748 
Hamming Loss : 0.467290 
Hamming Loss : 0.480263 
Hamming Loss : 0.401316 
Hamming Loss : 0.480263 
Hamming Loss : 0.471264 
Hamming Loss : 0.408046 
Hamming Loss : 0.471264 
Hamming Loss : 0.426593 
Hamming Loss : 0.418283 
Hamming Loss : 0.426593 
Hamming Loss : 0.479784 
Hamming Loss : 0.466307 
Hamming Loss : 0.479784 
Hamming Loss : 0.571429 


##  Implement the Classifier Chains Algorithm

In [84]:
# Implementation of Classifier chain algorithm function 

class ClassifierChain(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
    
    def __init__(self,estimator):
        
        self.estimator = estimator
        self.estimators_ = []
#         self.L = 0
        
        
    def fit(self, X, y):
        
        X,y = check_X_y(X,y, accept_sparse = True, multi_output = True)
        y = check_array(y,accept_sparse=True)
#         self.L = y.shape[1]
        
        
        for i in range(y.shape[1]):
            
            c = clone(self.estimator)
            
            if (i==0):
                c.fit(X,y[:,0])
            else:
                stacked = np.hstack((X, y[:, :i]))
                c.fit(stacked, y[:, i])
                
        
            self.estimators_.append(c)          
            
    def predict(self,X):
        
        check_is_fitted(self,'estimators_')
        
        for i, c in enumerate(self.estimators_):
            
            if i == 0:
                y_pred = (c.predict(X)).reshape(-1, 1)
            else:
                stacked = np.hstack((X, y_pred))
                new_y = c.predict(stacked)
                y_pred = np.hstack((y_pred, new_y.reshape(-1, 1)))
                
        return y_pred

##  Evaluate the Performance of the Classifier Chains Algorithm

In [88]:
# Evaluating the performance of the Classifier Chain Algorithm using the following base estimators to compute the hamming loss. 
X_train,X_test,y_train,y_test = train_test_split(X,Y,train_size=0.7) 

model_dt = Classifier_Chain(DecisionTreeClassifier())
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
#print(("F1 Score : %f " ) % (f1_score(y_test,y_pred,average ='macro')))

model_rf = Classifier_Chain(RandomForestClassifier(n_estimators = 100))
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))



Hamming Loss : 0.278827 
Hamming Loss : 0.193821 


In [89]:
model_svc = Classifier_Chain(SVC(C=1, gamma = 1))
model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)

print(("Hamming Loss : %f " ) % (hamming_loss(y_test,y_pred)))
#print(("F1 Score : %f " ) % (f1_score(y_test,y_pred,average ='macro')))

Hamming Loss : 0.221468 


##  Reflect on the Performance of the Different Models Evaluated

### Evaluation measures:

The evaluation measures we considered for evaluating the performance are Hamming Loss and F1 Measure.
Hamming loss gives each label an equal weight in multi-label classification and hence helps with evaluating an imbalanced data set (as in our case).
Lower the value of hamming loss, better is the accuracy and performance of the model.
Whereas, F1 measure is better suited for binary data. (Thus, we commented it out)


### Performance for binary relevance:

The base estimators we considered for implementing binary relevance were Random Forest, SVM and Decision tree classifiers.
We observed that the hamming loss was least for Random Forest (0.189) followed by SVM (0.227) followed by decision tree (0.275).
This means that Random forest is the best base estimator for our particular data set of multi-label classification.

 
### Binary relevance with Grid search:

The base estimators we considered here are Logistic Regression, DT and SVM out of which SVM was observed to perform the best with the least hamming loss of 0.191.

 
### Performance for binary relevance with under-sampling:

Under-sampling helps in creating a balanced data set by down-sizing the majority class by removing observations at random until the dataset is balanced. We get 14 different hamming losses for all the 14 labels with respect to all three base estimators considered (Random Forest, SVM and DT). However, we observed that the overall accuracy for under-sampling decreased as the hamming loss increased in comparison with binary relevance without under-sampling since most of the data was lost while performing the under-sampling.

 
### Classifier Chain:

A classifier chain model generates a chain of binary classifiers each of which predicts the presence or absence of a specific label. By considering the base estimator the same as above, it is again observed that Random forest performed the best (hamming loss=0.193) followed by SVC (0.221) followed by DT (0.278).

By looking at the above observations, we can conclude that Random forest is the best suited ensemble technique and SVM is the better suited classification technique with respect to our data set.