## Synthetic Minority Oversampling Technique (SMOTE)
### Using SMOTE to oversample the minority classes

In [1]:
#Generic library imports and data import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd
import csv
import ast
%matplotlib inline
import matplotlib.pyplot as plt

data = pd.read_csv('../Data_sets/Master_cleaned.csv')

#Functions

def remove_unwanted_observations(data, unwanted_observations = ["['Oily', 'Sensitive']",
                                                                "['Dry', 'Normal', 'Oily']",
                                                                "['Dry', 'Oily']",
                                                                "['Dry', 'Normal', 'Oily', 'Sensitive']",
                                                                "['Combination', 'Dry', 'Oily', 'Sensitive']",
                                                                "['Normal', 'Oily']"
                                                                ]):
    for observation in unwanted_observations:
        data = data[data.Skin_Type != observation]
    
    return data

In [2]:
# Specific Libary imports and functions
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from sklearn.model_selection import KFold, GridSearchCV, cross_validate, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics

kf = KFold(n_splits=5, shuffle = True, random_state=1024)

def grid_search (model, search_space):
    pipe = Pipeline([("classifier", model)])
    gs = GridSearchCV(pipe, search_space, scoring = "accuracy", cv= kf)
    gs.fit(X_train, y_train)
    return gs.best_estimator_.get_params()["classifier"]

def rfc_grid_search(model, hyperparameters):
    gs = GridSearchCV(model, hyperparameters, scoring = ham_loss, cv= kf)
    gs.fit(X_train, y_train)
    return gs.best_estimator_.get_params()

def get_cross_val_results (clf, base_classifier, X, y, X_test,y_test):
    try: 
        model = clf(classifier = best_classifier)
    except TypeError:
        model = clf(base_classifier = best_classifier)
    results = cross_validate(model, X, y, cv= kf, scoring =('accuracy', "f1_weighted"), return_train_score = True)
    # train results 
    train_accuracy = results['train_accuracy'].mean()
    train_f1 = results['train_f1_weighted']
    print(model)
    print("Training Accuracy = %.04f +/- %.04f" % (train_accuracy.mean(), train_accuracy.std()*2))
    print("Training F1 Score = %.04f +/- %.04f" % (train_f1.mean(), train_f1.std()*2))

    # test results
    y_pred = cross_val_predict(model, X_test, y_test, cv=kf)
    test_accuracy = results['test_accuracy'].mean()
    test_f1 = results['test_f1_weighted']
    print("Test Accuracy = %.04f +/- %.04f" % (test_accuracy.mean(), test_accuracy.std()*2))
    print("Test F1 Score = %.04f +/- %.04f" % (test_f1.mean(), test_f1.std()*2))
    print("Hamming Loss = %.04f" % metrics.hamming_loss(y_test, y_pred))

In [3]:
data = remove_unwanted_observations(data)
X = data[data.columns[17:28]].values
y = data.Skin_Type
counter = Counter(y)
print(counter)

Counter({"['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']": 918, "['Combination', 'Dry', 'Normal', 'Oily']": 239, "['Combination', 'Normal', 'Oily']": 113, "['Combination', 'Oily']": 91, "['Normal']": 83, "['Dry', 'Normal']": 72, "['Dry']": 71, "['Combination', 'Dry', 'Normal']": 66, "['Combination', 'Dry', 'Normal', 'Sensitive']": 61, "['Combination']": 59, "['Sensitive']": 46, "['Combination', 'Normal']": 30, "['Dry', 'Normal', 'Sensitive']": 29, "['Oily']": 26, "['Combination', 'Normal', 'Oily', 'Sensitive']": 23, "['Combination', 'Dry']": 22, "['Dry', 'Sensitive']": 19, "['Combination', 'Dry', 'Oily']": 13, "['Normal', 'Sensitive']": 12, "['Combination', 'Oily', 'Sensitive']": 10, "['Combination', 'Dry', 'Sensitive']": 7})


In [4]:
data

Unnamed: 0,Product,Brand,Ingredients,Price,Skin_Type,Combination,Dry,Normal,Oily,Sensitive,...,num_of_Emollients,num_of_Hydration,num_of_Skin-Restoring,num_of_Plant Extracts,num_of_Preservatives,num_of_Skin-Softening,num_of_Sensitizing,num_of_Skin-Replenishing,top_3,top3_category_list
0,#InstantDetox Facial Mask,Biobelle,"Water, Butylene Glycol, Glycerin, Trehalose, H...",3.99,['Oily'],0,0,0,1,0,...,0,0,0,0,0,0,0,1,"['Butylene Glycol', 'Glycerin', 'Trehalose']","['Texture Enhancer', 'Skin-Replenishing, Skin-..."
1,#Peachy Facial Mask,Biobelle,"Water, Methylpropanediol, Butylene Glycol, Gly...",3.99,['Dry'],0,1,0,0,0,...,0,0,0,0,0,0,0,1,"['Methylpropanediol', 'Butylene Glycol', 'Glyc...","[None, 'Texture Enhancer', 'Skin-Replenishing,..."
2,#Rise&Shine Facial Mask,Biobelle,"Water, Glycerin, Butylene Glycol, Triethylhexa...",3.99,['Combination'],1,0,0,0,0,...,0,0,0,0,0,0,0,1,"['Glycerin', 'Butylene Glycol', 'Triethylhexan...","['Skin-Replenishing, Skin-Restoring', 'Texture..."
3,#RoséAllDay Facial Mask,Biobelle,"Water, Methylpropanediol, Glycerin, Propanedio...",3.99,['Combination'],1,0,0,0,0,...,0,0,0,1,2,0,0,2,"['Methylpropanediol', 'Glycerin', 'Propanediol']","[None, 'Skin-Replenishing, Skin-Restoring', None]"
4,#VitaminSea Facial Mask,Biobelle,"Water, Butylene Glycol, Glycerin, Hydroxyaceto...",3.99,['Dry'],0,1,0,0,0,...,0,0,0,1,0,0,0,1,"['Butylene Glycol', 'Glycerin', 'Hydroxyacetop...","['Texture Enhancer', 'Skin-Replenishing, Skin-..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,Youthful Vitamin C Fresh Radiance Essence,No7,"Aqua (Water), Butylene Glycol, Glycerin, Gluco...",24.99,"['Combination', 'Dry', 'Normal', 'Oily']",1,1,1,1,0,...,0,0,0,2,3,0,0,2,"['Butylene Glycol', 'Glycerin', 'Gluconolactone']","['Texture Enhancer', 'Skin-Replenishing, Skin-..."
2020,Yuza Sorbet Day Cream,Erborian,"Aqua/Water, Cyclomethicone, Glycerin, Nylon-12...",48.00,"['Combination', 'Dry', 'Normal', 'Oily', 'Sens...",1,1,1,1,1,...,1,0,0,1,1,0,0,1,"['Cyclomethicone', 'Glycerin', 'Nylon-12']","['Emollients', 'Skin-Replenishing, Skin-Restor..."
2021,Yuza Sorbet Night Treatment,Erborian,"Aqua/Water, Cyclomethicone, Glycerin, Cetearyl...",55.00,"['Combination', 'Dry', 'Normal', 'Oily', 'Sens...",1,1,1,1,1,...,2,0,0,0,0,0,0,1,"['Cyclomethicone', 'Glycerin', 'Cetearyl Alcoh...","['Emollients', 'Skin-Replenishing, Skin-Restor..."
2022,Yuzu Overnight Moisture Mask,Earth Therapeutics,"Water (Aqua), Propanediol, Glycerin, Hydrogena...",7.00,"['Combination', 'Dry', 'Normal', 'Sensitive']",1,1,1,0,1,...,2,0,0,1,0,0,0,1,"['Propanediol', 'Glycerin', 'Hydrogenated Poly...","[None, 'Skin-Replenishing, Skin-Restoring', None]"


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state = 1024, stratify=y)

In [7]:
#SMOTE
num = 100
num2 = 50

# summarize the class distribution
counter = Counter(y_train)
print(f"Before resampling:\n{counter}\n") 

oversample = SMOTE(k_neighbors = 3, sampling_strategy = {"['Combination', 'Normal', 'Oily']": num, 
                                                         "['Combination', 'Oily']": num,
                                                         "['Dry']": num,
                                                         "['Normal']": num,
                                                         "['Combination', 'Dry', 'Normal', 'Sensitive']": num,
                                                         "['Dry', 'Normal']": num,
                                                         "['Combination', 'Dry', 'Normal']": num,
                                                         "['Combination']": num,
                                                         "['Sensitive']": num2,
                                                         "['Combination', 'Normal']": num2,
                                                         "['Dry', 'Normal', 'Sensitive']": num2,
                                                         "['Dry', 'Sensitive']": num2,
                                                         "['Combination', 'Dry']": num2,
                                                         "['Oily']": num2,
                                                         "['Combination', 'Normal', 'Oily', 'Sensitive']": num2,
                                                         "['Combination', 'Dry', 'Oily']": num2,
                                                         "['Normal', 'Sensitive']": num2,
                                                         "['Combination', 'Oily', 'Sensitive']": num2,
                                                         "['Combination', 'Dry', 'Sensitive']": num2
                                                        })
X_train, y_train = oversample.fit_resample(X_train, y_train)

# summarize the new class distribution
counter = Counter(y_train)
print(f"After resampling:\n{counter}\n") 

Before resampling:
Counter({"['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']": 643, "['Combination', 'Dry', 'Normal', 'Oily']": 167, "['Combination', 'Normal', 'Oily']": 79, "['Combination', 'Oily']": 64, "['Normal']": 58, "['Dry', 'Normal']": 50, "['Dry']": 50, "['Combination', 'Dry', 'Normal']": 46, "['Combination', 'Dry', 'Normal', 'Sensitive']": 43, "['Combination']": 41, "['Sensitive']": 32, "['Combination', 'Normal']": 21, "['Dry', 'Normal', 'Sensitive']": 20, "['Oily']": 18, "['Combination', 'Dry']": 16, "['Combination', 'Normal', 'Oily', 'Sensitive']": 16, "['Dry', 'Sensitive']": 13, "['Combination', 'Dry', 'Oily']": 9, "['Normal', 'Sensitive']": 9, "['Combination', 'Oily', 'Sensitive']": 7, "['Combination', 'Dry', 'Sensitive']": 5})

After resampling:
Counter({"['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']": 643, "['Combination', 'Dry', 'Normal', 'Oily']": 167, "['Normal']": 100, "['Combination', 'Dry', 'Normal', 'Sensitive']": 100, "['Combination']": 100, "['Com

In [8]:
y_train = y_train.apply(ast.literal_eval)
y_test = y_test.apply(ast.literal_eval)

mlb=MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)
mlb.classes_

array(['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive'], dtype=object)

## Results after running SMOTE

In [9]:
search_space1 = [{'classifier':[DecisionTreeClassifier()], 'classifier__max_depth' :[5,6,7,8,9],
    'classifier__max_leaf_nodes': [5,6,7,8,9,10,15]},{'classifier': [RandomForestClassifier()], 'classifier__n_estimators': [5,8,10,12,14,15],'classifier__max_features': [[8,9,10,11], "auto", 'sqrt', 'log2']}, {'classifier': [SVC()], 'classifier__kernel':['rbf', 'linear']},{'classifier': [MultinomialNB()], 'classifier__alpha': [.7, 1.0]}]

In [10]:
kf = KFold(n_splits=5, shuffle = True, random_state=1024)
ham_loss = metrics.make_scorer(metrics.hamming_loss, greater_is_better= False)
mlb2 = MultiLabelBinarizer()
y = mlb2.fit_transform(y)

In [11]:
model4 = RandomForestClassifier(random_state = 1024)
hyperparameters = {'n_estimators' :[4,5,8,10,15,20], "criterion" :['gini', 'entropy'], 'max_leaf_nodes': [5,6,7,8,9,10,15], 'max_features': [8,9,10,11]}
best_param = rfc_grid_search(model4, hyperparameters)

clf = [BinaryRelevance, ClassifierChain, LabelPowerset]
for classifier in clf:
    best_classifier = grid_search(classifier(), search_space1)
    get_cross_val_results(classifier,best_classifier, X_train, y_train, X_test, y_test)

BinaryRelevance(classifier=RandomForestClassifier(max_features='log2',
                                                  n_estimators=10),
                require_dense=[True, True])
Training Accuracy = 0.8516 +/- 0.0000
Training F1 Score = 0.9643 +/- 0.0017
Test Accuracy = 0.3444 +/- 0.0000
Test F1 Score = 0.7913 +/- 0.0083
Hamming Loss = 0.3154
ClassifierChain(classifier=RandomForestClassifier(n_estimators=15),
                require_dense=[True, True])
Training Accuracy = 0.9045 +/- 0.0000
Training F1 Score = 0.9642 +/- 0.0038
Test Accuracy = 0.4486 +/- 0.0000
Test F1 Score = 0.8053 +/- 0.0183
Hamming Loss = 0.2889
LabelPowerset(classifier=RandomForestClassifier(n_estimators=15),
              require_dense=[True, True])
Training Accuracy = 0.9190 +/- 0.0000
Training F1 Score = 0.9689 +/- 0.0018
Test Accuracy = 0.4556 +/- 0.0000
Test F1 Score = 0.8042 +/- 0.0057
Hamming Loss = 0.2975


In [12]:
rfc_SMOTE = RandomForestClassifier(random_state = 1024)
best_param = rfc_grid_search(rfc_SMOTE, hyperparameters)

In [13]:
best_param

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 9,
 'max_leaf_nodes': 15,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 15,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1024,
 'verbose': 0,
 'warm_start': False}

In [15]:
rfc_SMOTE = RandomForestClassifier(bootstrap= True, criterion= "entropy", max_features= 11, max_leaf_nodes= 15, n_estimators=10,random_state=1024)

# train results
results = cross_validate(rfc_SMOTE, X_train, y_train, cv= kf, scoring =('accuracy', "f1_weighted"), return_train_score = True)
train_accuracy = results['train_accuracy'].mean()
train_f1 = results['train_f1_weighted']
print("Training Accuracy = %.04f +/- %.04f" % (train_accuracy.mean(), train_accuracy.std()*2))
print("Training F1 Score = %.04f +/- %.04f" % (train_f1.mean(), train_f1.std()*2))

# test results
y_pred = cross_val_predict(rfc_SMOTE, X_test, y_test, cv=kf)
test_accuracy = results['test_accuracy']
test_f1 = results['test_f1_weighted']
print("Test Accuracy = %.04f +/- %.04f" % (test_accuracy.mean(), test_accuracy.std()*2))
print("Test F1 Score = %.04f +/- %.04f" % (test_f1.mean(), test_f1.std()*2))
print("Hamming Loss = %.04f" % (metrics.hamming_loss(y_test, y_pred)))

Training Accuracy = 0.2449 +/- 0.0000
Training F1 Score = 0.7863 +/- 0.0075
Test Accuracy = 0.2306 +/- 0.0332
Test F1 Score = 0.7744 +/- 0.0109
Hamming Loss = 0.2743
