## Building Multi-label Classifiers 
### Base Classifiers to test: Binary Relevance (no correlation), Binary Relevance (with correlation), Multi-label Random Forest, Multi-label Decision Tree, Classifier Chain, Binary Releveance with Stacking Aggregation, Cross-Coupling Aggregation (to combat class-imbalances)

In [1]:
#Generic library imports and data import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd
import csv 
import ast
%matplotlib inline
import matplotlib.pyplot as plt


data = pd.read_csv('../Data_sets/Master_cleaned.csv')

#Functions

def remove_unwanted_observations(data, unwanted_observations = ["['Oily', 'Sensitive']",
                                                                "['Dry', 'Normal', 'Oily']",
                                                                "['Dry', 'Oily']",
                                                                "['Dry', 'Normal', 'Oily', 'Sensitive']",
                                                                "['Combination', 'Dry', 'Oily', 'Sensitive']",
                                                                "['Normal', 'Oily']"
                                                                ]):
    for observation in unwanted_observations:
        data = data[data.Skin_Type != observation]
    
    return data

In [2]:
# Specific Libary imports and functions
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import KFold, cross_validate, cross_val_predict, cross_val_score, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import sklearn.metrics as metrics
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from skmultilearn.adapt import MLkNN
from skmultilearn.adapt import BRkNNaClassifier
from skmultilearn.ensemble import RakelD
import time


def grid_search (model, search_space):
    pipe = Pipeline([("classifier", model)])
    gs = GridSearchCV(pipe, search_space, scoring = "accuracy", cv= kf)
    gs.fit(X_train, y_train)
    return gs.best_estimator_.get_params()["classifier"]

def get_cross_val_results (clf, base_classifier, X, y, X_test,y_test):
    try: 
        model = clf(classifier = best_classifier)
    except TypeError:
        model = clf(base_classifier = best_classifier)
    results = cross_validate(model, X, y, cv= kf, scoring =('accuracy', "f1_weighted"), return_train_score = True)
    # train results 
    train_accuracy = results['train_accuracy'].mean()
    train_f1 = results['train_f1_weighted']
    print(model)
    print("Training Accuracy = %.04f +/- %.04f" % (train_accuracy.mean(), train_accuracy.std()*2))
    print("Training F1 Score = %.04f +/- %.04f" % (train_f1.mean(), train_f1.std()*2))

    # test results
    y_pred = cross_val_predict(model, X_test, y_test, cv=kf)
    test_accuracy = results['test_accuracy'].mean()
    test_f1 = results['test_f1_weighted']
    print("Test Accuracy = %.04f +/- %.04f" % (test_accuracy.mean(), test_accuracy.std()*2))
    print("Test F1 Score = %.04f +/- %.04f" % (test_f1.mean(), test_f1.std()*2))
    print("Hamming Loss = %.04f" % metrics.hamming_loss(y_test, y_pred))
    
def rfc_grid_search(model, hyperparameters):
    gs = GridSearchCV(model, hyperparameters, scoring = ham_loss, cv= kf)
    gs.fit(X_train, y_train)
    return gs.best_estimator_.get_params()

In [3]:
data

Unnamed: 0,Product,Brand,Ingredients,Price,Skin_Type,Combination,Dry,Normal,Oily,Sensitive,...,num_of_Emollients,num_of_Hydration,num_of_Skin-Restoring,num_of_Plant Extracts,num_of_Preservatives,num_of_Skin-Softening,num_of_Sensitizing,num_of_Skin-Replenishing,top_3,top3_category_list
0,#InstantDetox Facial Mask,Biobelle,"Water, Butylene Glycol, Glycerin, Trehalose, H...",3.99,['Oily'],0,0,0,1,0,...,0,0,0,0,0,0,0,1,"['Butylene Glycol', 'Glycerin', 'Trehalose']","['Texture Enhancer', 'Skin-Replenishing, Skin-..."
1,#Peachy Facial Mask,Biobelle,"Water, Methylpropanediol, Butylene Glycol, Gly...",3.99,['Dry'],0,1,0,0,0,...,0,0,0,0,0,0,0,1,"['Methylpropanediol', 'Butylene Glycol', 'Glyc...","[None, 'Texture Enhancer', 'Skin-Replenishing,..."
2,#Rise&Shine Facial Mask,Biobelle,"Water, Glycerin, Butylene Glycol, Triethylhexa...",3.99,['Combination'],1,0,0,0,0,...,0,0,0,0,0,0,0,1,"['Glycerin', 'Butylene Glycol', 'Triethylhexan...","['Skin-Replenishing, Skin-Restoring', 'Texture..."
3,#RoséAllDay Facial Mask,Biobelle,"Water, Methylpropanediol, Glycerin, Propanedio...",3.99,['Combination'],1,0,0,0,0,...,0,0,0,1,2,0,0,2,"['Methylpropanediol', 'Glycerin', 'Propanediol']","[None, 'Skin-Replenishing, Skin-Restoring', None]"
4,#VitaminSea Facial Mask,Biobelle,"Water, Butylene Glycol, Glycerin, Hydroxyaceto...",3.99,['Dry'],0,1,0,0,0,...,0,0,0,1,0,0,0,1,"['Butylene Glycol', 'Glycerin', 'Hydroxyacetop...","['Texture Enhancer', 'Skin-Replenishing, Skin-..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,Youthful Vitamin C Fresh Radiance Essence,No7,"Aqua (Water), Butylene Glycol, Glycerin, Gluco...",24.99,"['Combination', 'Dry', 'Normal', 'Oily']",1,1,1,1,0,...,0,0,0,2,3,0,0,2,"['Butylene Glycol', 'Glycerin', 'Gluconolactone']","['Texture Enhancer', 'Skin-Replenishing, Skin-..."
2020,Yuza Sorbet Day Cream,Erborian,"Aqua/Water, Cyclomethicone, Glycerin, Nylon-12...",48.00,"['Combination', 'Dry', 'Normal', 'Oily', 'Sens...",1,1,1,1,1,...,1,0,0,1,1,0,0,1,"['Cyclomethicone', 'Glycerin', 'Nylon-12']","['Emollients', 'Skin-Replenishing, Skin-Restor..."
2021,Yuza Sorbet Night Treatment,Erborian,"Aqua/Water, Cyclomethicone, Glycerin, Cetearyl...",55.00,"['Combination', 'Dry', 'Normal', 'Oily', 'Sens...",1,1,1,1,1,...,2,0,0,0,0,0,0,1,"['Cyclomethicone', 'Glycerin', 'Cetearyl Alcoh...","['Emollients', 'Skin-Replenishing, Skin-Restor..."
2022,Yuzu Overnight Moisture Mask,Earth Therapeutics,"Water (Aqua), Propanediol, Glycerin, Hydrogena...",7.00,"['Combination', 'Dry', 'Normal', 'Sensitive']",1,1,1,0,1,...,2,0,0,1,0,0,0,1,"['Propanediol', 'Glycerin', 'Hydrogenated Poly...","[None, 'Skin-Replenishing, Skin-Restoring', None]"


## 1. Post Processing Data

In [5]:
#Split Data

X = data[data.columns[17:28]].values
y = data.Skin_Type

data = remove_unwanted_observations(data)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state = 1024, stratify=y)

print("Dataset sizes:\n\tTrain %s\n\tTest %s" % (X_train.shape,X_test.shape))

Dataset sizes:
	Train (1407, 11)
	Test (603, 11)


In [6]:
y_train = y_train.apply(ast.literal_eval)
y_test = y_test.apply(ast.literal_eval)

mlb=MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)
mlb.classes_

array(['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive'], dtype=object)

## 2. Baseline

In [8]:
dummy = DummyClassifier(strategy= 'uniform', random_state=1024)
dummy.fit(X_train, y_train)
dummy.score(X_test, y_test)

0.029850746268656716

In [9]:
p = dummy.predict(X_test)
print("Accuracy = ", metrics.accuracy_score(y_test,p))
print("F1 Score = ", metrics.f1_score(y_test,p, average = "weighted"))
print("Hamming Loss", metrics.hamming_loss(y_test, p))

Accuracy =  0.029850746268656716
F1 Score =  0.5882546298802469
Hamming Loss 0.5058043117744611


## 3. Problem Transformation Methods

In [10]:
kf = KFold(n_splits=5, shuffle = True, random_state=1024)
ham_loss = metrics.make_scorer(metrics.hamming_loss, greater_is_better= False)
mlb2 = MultiLabelBinarizer()
y = mlb2.fit_transform(y)

### 3-1. One vs the Rest Classifier (Binary Relevance)

In [11]:
search_space1 = [{'classifier':[DecisionTreeClassifier()], 'classifier__max_depth' :[5,6,7,8,9],
    'classifier__max_leaf_nodes': [5,6,7,8,9,10,15]},{'classifier': [RandomForestClassifier()], 'classifier__n_estimators': [5,8,10,12,14,15],'classifier__max_features': [[8,9,10,11], "auto", 'sqrt', 'log2']}, {'classifier': [SVC()], 'classifier__kernel':['rbf', 'linear']},{'classifier': [MultinomialNB()], 'classifier__alpha': [.7, 1.0]}]

In [12]:
best_classifier = grid_search(BinaryRelevance(), search_space1)
get_cross_val_results(BinaryRelevance, best_classifier, X_train, y_train, X_test, y_test)

BinaryRelevance(classifier=DecisionTreeClassifier(max_depth=5,
                                                  max_leaf_nodes=5),
                require_dense=[True, True])
Training Accuracy = 0.4451 +/- 0.0000
Training F1 Score = 0.8525 +/- 0.0046
Test Accuracy = 0.4158 +/- 0.0000
Test F1 Score = 0.8455 +/- 0.0177
Hamming Loss = 0.2779


### 3-2. Classifer Chains

In [13]:
best_classifier = grid_search(ClassifierChain(), search_space1)
get_cross_val_results(ClassifierChain, best_classifier, X_train, y_train, X_test, y_test)

ClassifierChain(classifier=DecisionTreeClassifier(max_depth=5,
                                                  max_leaf_nodes=5),
                require_dense=[True, True])
Training Accuracy = 0.4586 +/- 0.0000
Training F1 Score = 0.8525 +/- 0.0042
Test Accuracy = 0.4513 +/- 0.0000
Test F1 Score = 0.8494 +/- 0.0210
Hamming Loss = 0.2713


### 3-3. Label Powerset

In [14]:
best_classifier = grid_search(LabelPowerset(), search_space1)
get_cross_val_results(LabelPowerset,best_classifier, X_train, y_train, X_test, y_test)

LabelPowerset(classifier=DecisionTreeClassifier(max_depth=5, max_leaf_nodes=5),
              require_dense=[True, True])
Training Accuracy = 0.4595 +/- 0.0000
Training F1 Score = 0.8517 +/- 0.0042
Test Accuracy = 0.4542 +/- 0.0000
Test F1 Score = 0.8512 +/- 0.0176
Hamming Loss = 0.2753


## 4. Ensemble Methods

### 4-1. Random Forest Classifier

In [15]:
model4 = RandomForestClassifier(random_state = 1024)
hyperparameters = {'n_estimators' :[4,5,8,10,15,20], "criterion" :['gini', 'entropy'], 'max_leaf_nodes': [5,6,7,8,9,10,15], 'max_features': [8,9,10,11]}
best_param = rfc_grid_search(model4, hyperparameters)

In [16]:
best_param

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 8,
 'max_leaf_nodes': 15,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1024,
 'verbose': 0,
 'warm_start': False}

In [17]:
model4 = RandomForestClassifier(bootstrap= True, criterion= "entropy", max_features= 8, max_leaf_nodes= 15, n_estimators=10,random_state=1024)

# train results
results = cross_validate(model4, X_train, y_train, cv= kf, scoring =('accuracy', "f1_weighted"), return_train_score = True)
train_accuracy = results['train_accuracy'].mean()
train_f1 = results['train_f1_weighted']
print("Training Accuracy = %.04f +/- %.04f" % (train_accuracy.mean(), train_accuracy.std()*2))
print("Training F1 Score = %.04f +/- %.04f" % (train_f1.mean(), train_f1.std()*2))

# test results
y_pred = cross_val_predict(model4, X_test, y_test, cv=kf)
test_accuracy = results['test_accuracy']
test_f1 = results['test_f1_weighted']
print("Test Accuracy = %.04f +/- %.04f" % (test_accuracy.mean(), test_accuracy.std()*2))
print("Test F1 Score = %.04f +/- %.04f" % (test_f1.mean(), test_f1.std()*2))
print("Hamming Loss = %.04f" % (metrics.hamming_loss(y_test, y_pred)))

Training Accuracy = 0.4481 +/- 0.0000
Training F1 Score = 0.8546 +/- 0.0060
Test Accuracy = 0.4257 +/- 0.0415
Test F1 Score = 0.8495 +/- 0.0183
Hamming Loss = 0.2653


### 4-2. RAkELd: random label space partitioning with Label powerset

In [18]:
get_cross_val_results(RakelD, SVC(kernel='linear', random_state= 1024), X_train, y_train, X_test, y_test)

RakelD(base_classifier=DecisionTreeClassifier(max_depth=5, max_leaf_nodes=5))
Training Accuracy = 0.4586 +/- 0.0000
Training F1 Score = 0.8521 +/- 0.0042
Test Accuracy = 0.4471 +/- 0.0000
Test F1 Score = 0.8490 +/- 0.0204
Hamming Loss = 0.2769


## 5. Adaptive Methods

### 5-1. Multi-label K Nearest Neighbors

In [43]:
parameters = {'k': range(1,9), 
              's': [0.5, 0.7, 1.0]}

score = ham_loss
classifier = GridSearchCV(MLkNN(), parameters, scoring=score)
classifier.fit(X_train, y_train)
print('best parameters :', classifier.best_params_, 'best score: ',
      classifier.best_score_)

best parameters : {'k': 7, 's': 0.5} best score:  -0.2929431362156432


In [39]:
MlKnn = MLkNN(k= 7, s = .5)
MlKnn.fit(X_train, y_train)
train_pred = MlKnn.predict(X_train)
pred = MlKnn.predict(X_test)

print("Training Accuracy = %.04f" % metrics.accuracy_score(y_train, train_pred))
print("Training F1 Score = %.04f" % metrics.f1_score(y_train, train_pred, average='weighted'))
print("Accuracy = %.04f" % metrics.accuracy_score(y_test, pred))
print("F1 Score = %.04f" % metrics.f1_score(y_test, pred, average='weighted'))
print("Hamming Loss = %.04f" % metrics.hamming_loss(y_test, pred))

Training Accuracy = 0.4385
Training F1 Score = 0.8623
Accuracy = 0.2869
F1 Score = 0.8106
Hamming Loss = 0.3022


### 5-2. Binary Relevance K Nearest Neighbors

In [44]:
parameters = {'k': range(3,9)}

classifier = GridSearchCV(BRkNNaClassifier(), parameters, scoring=score)
classifier.fit(X_train, y_train)

print('best parameters :', classifier.best_params_,
      'best score: ',classifier.best_score_)

best parameters : {'k': 7} best score:  -0.2886792357588147


In [42]:
BrKnn = BRkNNaClassifier(k= 7)
BrKnn.fit(X_train, y_train)
pred = BrKnn.predict(X_test)
print("Accuracy = %.04f" % metrics.accuracy_score(y_test, pred))
print("F1 Score = %.04f" % metrics.f1_score(y_test, pred, average='weighted'))
print("Hamming Loss = %.04f" % metrics.hamming_loss(y_test, pred))

Accuracy = 0.2869
F1 Score = 0.8106
Hamming Loss = 0.3022
