In [1]:
# Imports
import warnings
from itertools import *
from itertools import chain, combinations
from pathlib import Path

import numpy as np
import pandas as pd
from clover.over_sampling import ClusterOverSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset from .csv and keep only the rows of the 'yen' threshold method
df = pd.read_csv('/datc/nano/notebooks/Target variable & Features (V3).csv', index_col=0)
df = df.loc[df['Threshold method'] == 'yen']

# Seed used by the stochastic steps of the experiment
RANDOM_SEED = 3

# Feature columns from which every non-empty subset is evaluated
features_used = [
    'Threshold: area spread',
    'Threshold: border',
    'Threshold: count',
    'Threshold: fill',
    'Threshold: intensity',
    'Threshold: separation',
]

# Name of the column holding the class label that has to be predicted
pred_label = 'score_label'

# Number of class-count settings to run (the experiment loops over range(total_classes_amount))
total_classes_amount = 4

# Results container: model name -> balancing method -> scores DataFrame
df_combinations_dict = dict()

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

In [3]:
def score(y_true, y_pred):
    """Compute the evaluation metrics for one set of predictions.

    Returns a tuple ``(accuracy, precision, recall, f1, mean_absolute_error)``;
    precision, recall and F1 are macro-averaged over the classes.
    """
    macro = {'average': 'macro'}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **macro),
        recall_score(y_true, y_pred, **macro),
        f1_score(y_true, y_pred, **macro),
        mean_absolute_error(y_true, y_pred),
    )

### Model function definitions 
In this section the functions for the different models are defined. Please follow the format below to make sure a new model can be used by this program.

#### input
* X_train
* X_test
* y_train
* y_test

#### output
* scores : tuple of floats — `(accuracy, precision, recall, f1, mean_absolute_error)`, as returned by `score`

In [4]:
def trainDecisionTree(X_train, X_test, y_train, y_test):
    """Fit an entropy-based decision tree (max depth 5) and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        The result of ``score(y_test, predictions)``.
    """
    # Seeded with the notebook-wide RANDOM_SEED so repeated runs build the same tree.
    model = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=RANDOM_SEED)
    model.fit(X_train, y_train)
    return score(y_test, model.predict(X_test))

In [5]:
def trainRandomForestClassifier(X_train, X_test, y_train, y_test):
    """Fit a random forest with default hyper-parameters and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        The result of ``score(y_test, predictions)``.
    """
    # Seeded with the notebook-wide RANDOM_SEED so bootstrapping and feature
    # sampling are repeatable between runs.
    model = RandomForestClassifier(random_state=RANDOM_SEED)
    model.fit(X_train, y_train)
    return score(y_test, model.predict(X_test))

In [6]:
def trainLogisticRegression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression (up to 5000 iterations) and score it on the test split.

    Returns the result of ``score(y_test, predictions)``.
    """
    classifier = LogisticRegression(max_iter=5000)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return score(y_test, predictions)

In [7]:
def trainMLPClassifier(X_train, X_test, y_train, y_test):
    """Fit a multi-layer perceptron (adam, up to 5000 iterations) and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        The result of ``score(y_test, predictions)``.
    """
    # Seeded with the notebook-wide RANDOM_SEED so weight initialisation and
    # batch shuffling are repeatable between runs.
    model = MLPClassifier(solver='adam', max_iter=5000, random_state=RANDOM_SEED)
    model.fit(X_train, y_train)
    return score(y_test, model.predict(X_test))

### Balancing algorithms function definitions 
In this section the functions for the different balancing algorithms are defined. Please follow the format below to make sure a new balancing algorithm can be used by this program.

#### input
* data : DataFrame (Dataframe that the balancing has to be done over)

#### output
* df : DataFrame (the data with balanced classes; the class labels are in the column named by `pred_label`)

In [8]:
def applyNoBalancing(data):
    """Identity balancing step: hand the data back untouched."""
    unbalanced = data
    return unbalanced

In [9]:
def applyRandomOversampling(data):
    """Balance the classes by randomly duplicating rows of the smaller classes.

    The label column ``pred_label`` is split off, the remaining columns are
    resampled together with it, and both are joined back into one DataFrame.
    """
    sampler = RandomOverSampler(random_state=RANDOM_SEED + 3)
    features, target = sampler.fit_resample(
        data.drop(pred_label, axis=1),
        data[pred_label],
    )
    return features.join(target)

In [10]:
def applyClusterBasedOversamplingSMOTE(data):
    """Balance the classes with SMOTE applied inside KMeans clusters (KMeans-SMOTE).

    Parameters
    ----------
    data : DataFrame
        Features plus the label column named by ``pred_label``.

    Returns
    -------
    DataFrame
        The resampled features joined with the resampled label column.
    """
    X = data.drop(pred_label, axis=1)
    y = data[pred_label]

    # Create the KMeans-SMOTE instance. Every stochastic component gets the
    # notebook-wide seed so that repeated runs resample identically.
    # NOTE(review): the wrapper is seeded as well because it may re-seed the
    # estimators it clones — confirm against the cluster-over-sampling docs.
    smote = SMOTE(random_state=RANDOM_SEED)
    kmeans = KMeans(n_clusters=8, random_state=RANDOM_SEED)
    kmeans_smote = ClusterOverSampler(oversampler=smote, clusterer=kmeans, random_state=RANDOM_SEED)

    # Fit and resample the imbalanced data
    X_res, y_res = kmeans_smote.fit_resample(X, y)

    return X_res.join(y_res)

In [11]:
def applySMOTE(data):
    """Balance the classes with SMOTE using 4 nearest neighbours.

    Parameters
    ----------
    data : DataFrame
        Features plus the label column named by ``pred_label``.

    Returns
    -------
    DataFrame
        The resampled features joined with the resampled label column.
    """
    # Seeded with the notebook-wide RANDOM_SEED so the synthetic samples are repeatable.
    sm = SMOTE(k_neighbors=4, random_state=RANDOM_SEED)

    X = data.drop(pred_label, axis=1)
    y = data[pred_label]

    # `fit_sample` was the old alias and has been removed from imbalanced-learn;
    # `fit_resample` is what the other balancing functions in this notebook use.
    X_res, y_res = sm.fit_resample(X, y)

    return X_res.join(y_res)

### Splitting classes function definitions 

In [12]:
def normalSplitter(data, classes_amount):
    """Label rows by cutting the sorted distinct user scores into equal-sized groups.

    The distinct values of the 'User score' column are sorted and divided with
    ``np.array_split`` into ``classes_amount`` consecutive groups; each row gets
    the index of the group its score falls in.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'User score' column.
    classes_amount : int
        Number of classes to create.

    Returns
    -------
    DataFrame
        A copy of ``data`` with an added 'score_label' column. The input frame
        is left untouched, so repeated calls with different class counts do
        not overwrite each other's labels on a shared frame.
    """
    unique_scores = np.sort(data['User score'].unique())
    score_groups = np.array_split(unique_scores, classes_amount)

    # One lookup table instead of scanning every group for every row.
    score_to_label = {
        score_value: label
        for label, group in enumerate(score_groups)
        for score_value in group
    }

    labelled = data.copy()
    # 'score_label' is the value of the notebook-level `pred_label` setting.
    labelled['score_label'] = labelled['User score'].map(score_to_label)
    return labelled

In [13]:
def nonLinearSplitter(data, classes_amount):
    """Label rows by grouping consecutive user scores into classes of similar size.

    Every way of cutting the scores 1..10 into ``classes_amount`` consecutive,
    non-empty groups is enumerated. The grouping whose largest and smallest
    group differ least in number of rows is used; on ties the first grouping
    enumerated wins.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'User score' column.
    classes_amount : int
        Number of classes to create, between 1 and 10.

    Returns
    -------
    DataFrame
        A copy of ``data`` with the class index written to the column named by
        ``pred_label``. The input frame is left untouched.

    Raises
    ------
    ValueError
        If ``classes_amount`` is not between 1 and 10.
    """
    score_count = 10
    start = 0

    # Outside this range the enumeration below yields nothing (too many
    # classes) or never terminates (zero or negative classes).
    if not 1 <= classes_amount <= score_count:
        raise ValueError(
            f'classes_amount must be between 1 and {score_count}, got {classes_amount}'
        )

    # n_score_list[k] is the number of rows whose user score equals k + 1
    n_score_list = [
        data[data['User score'] == value].shape[0]
        for value in range(1, score_count + 1)
    ]

    def recursiveFunction(start, classes, scores):
        """Enumerate groupings of the scores start+1..scores into `classes` groups.

        Each grouping is a list of ``[row_count, [score values]]`` entries.
        """
        lst_combinations = []
        # Scores left over once every class has been given exactly one score
        n = scores - start - classes

        # N score is 0 so only groups of 1 are possible
        if n == 0:
            lst = []
            for i in range(classes):
                lst.append([sum(n_score_list[start + i:start + i + 1]), list(range(start + i + 1, start + i + 2))])
            lst_combinations.append(lst)

        # Only one class is left so group remaining scores
        elif classes == 1:
            return [[[sum(n_score_list[start:]), list(range(start + 1, scores + 1))]]]

        # Loop over range of N score (Recursive part): the first group takes
        # n - m + 1 scores, the remaining classes share what is left
        else:
            for m in range(n + 1):
                sub_lst = recursiveFunction(start + n - m + 1, classes - 1, scores)
                for k in sub_lst:
                    lst = [[sum(n_score_list[start:start + n - m + 1]), list(range(start + 1, start + n - m + 2))]]
                    lst.extend(k)
                    lst_combinations.append(lst)
        return lst_combinations

    def size_spread(grouping):
        """Difference in row count between the largest and the smallest group."""
        sizes = [group[0] for group in grouping]
        return max(sizes) - min(sizes)

    groupings = recursiveFunction(start, classes_amount, score_count)
    # min() keeps the first grouping among equally balanced ones
    best_grouping = min(groupings, key=size_spread)

    labelled = data.copy()
    labelled[pred_label] = 0
    for cur_label, (_, scores_in_class) in enumerate(best_grouping):
        mask = labelled['User score'].isin(scores_in_class)
        labelled.loc[mask, pred_label] = cur_label

    return labelled

In [14]:
def generateScoresDataframe(model, balancing_method):
    """Score one model / balancing-method pair for every feature subset and class count.

    For each class count (2 up to ``total_classes_amount + 1``) the module-level
    ``df`` is labelled with ``class_splitter`` and cut into 5 contiguous folds.
    For every fold the remaining rows are balanced with the chosen balancing
    method, and each non-empty subset of ``features_used`` is scaled, trained on
    the balanced rows and scored on the held-out fold. The scores are averaged
    over the folds.

    Parameters
    ----------
    model : str
        Key into ``models_dict``.
    balancing_method : str
        Key into ``balancing_dict``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(score_type, features_used)`` with one ``n_classes=<k>``
        column per class count. ``score_type`` is one of 'accuracy',
        'precision', 'recall', 'f1', 'difference'; ``features_used`` is the
        concatenation of the first letters of the feature names in the subset.
    """
    n_folds = 5
    # Same order as the tuple returned by score(); also the row order of the result
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'difference')
    # Feature names look like 'Threshold: <name>'; the label letter is the first
    # character after that prefix
    label_position = len('Threshold: ')

    progress_string = f'%-40s Balancing method: {balancing_method}' % f'Model: {model}'

    # Define scaler used to scale the features
    # NOTE(review): Box-Cox only accepts strictly positive values — confirm the
    # threshold features satisfy this.
    scaler = PowerTransformer(method='box-cox')

    # Create all non-empty subsets of the features (independent of the class count)
    subsets_features = list(chain.from_iterable(
        combinations(features_used, size) for size in range(1, len(features_used) + 1)
    ))
    n_subsets = len(subsets_features)

    # metric name -> {'n_classes=<k>': fold-averaged score per feature subset}
    averaged_scores = {name: {} for name in metric_names}

    # Loop over all possible class amounts and feature combinations
    for amount in range(total_classes_amount):
        n_classes = amount + 2

        # Label the rows, then drop the columns that must not be used as features
        split_df = class_splitter(df, n_classes)
        split_df = split_df.drop(['User score', 'Threshold method'], axis=1)

        all_positions = np.arange(len(split_df))
        fold_scores = []

        for n_split, test_positions in enumerate(np.array_split(all_positions, n_folds)):
            test_scaled_df = split_df.iloc[test_positions]
            # Select the training rows by position. Removing the test rows via
            # concat + drop_duplicates(keep=False) would also discard training
            # rows that merely share their values with another row.
            train_scaled_df = split_df.iloc[np.setdiff1d(all_positions, test_positions)]

            # Apply balancing method to training set only
            train_balanced_scaled_df = balancing_dict[balancing_method](train_scaled_df)

            y_train = train_balanced_scaled_df[pred_label]
            y_test = test_scaled_df[pred_label]

            subset_scores = []
            for i, subset in enumerate(subsets_features, start=1):
                columns = list(subset)

                # Scaling of dataset features: fitted on the training rows only
                X_train = scaler.fit_transform(train_balanced_scaled_df[columns])
                X_test = scaler.transform(test_scaled_df[columns])

                # Call the method linked to model key to get the scores
                subset_scores.append(models_dict[model](X_train, X_test, y_train, y_test))

                progress = round((((i / n_subsets + n_split) / n_folds) * (1 / total_classes_amount) + (amount / total_classes_amount)) * 100, 1)
                print(f'%-100s {progress}%% done' % progress_string, end='\r')

            fold_scores.append(subset_scores)

        # (folds, subsets, metrics) -> average over the folds
        mean_scores = np.mean(np.array(fold_scores, dtype=float), axis=0)
        for position, name in enumerate(metric_names):
            averaged_scores[name]['n_classes=' + str(n_classes)] = mean_scores[:, position]

    print(f'%-100s 100.0%% done' % (progress_string))

    # Create labels for feature combinations
    subset_labels = [
        ''.join(feature[label_position] for feature in subset)
        for subset in subsets_features
    ]

    metric_frames = []
    for name in metric_names:
        metric_frame = pd.DataFrame({'score_type': name, **averaged_scores[name]})
        metric_frame['features_used'] = subset_labels
        metric_frames.append(metric_frame)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
    dataframe = pd.concat(metric_frames, ignore_index=True)
    return dataframe.set_index(['score_type', 'features_used'])



## Main script

In [15]:
# Registry of the models to evaluate: display name -> training function
models_dict = dict(
    Decision_Tree=trainDecisionTree,
    Logistic_Regression=trainLogisticRegression,
    MLP_Classifier=trainMLPClassifier,
    Random_Forest_Classifier=trainRandomForestClassifier,
)

# Registry of the balancing algorithms: display name -> balancing function
# (the names contain hyphens, so they are given as pairs rather than keywords)
balancing_dict = dict([
    ('No-balancing', applyNoBalancing),
    ('Random-oversampling', applyRandomOversampling),
    ('SMOTE', applySMOTE),
    ('Cluster-based_Oversampling_SMOTE', applyClusterBasedOversamplingSMOTE),
])

# Function used to turn user scores into class labels
class_splitter = normalSplitter

In [16]:
# Evaluate every model with every balancing method and collect the results in
# df_combinations_dict[model][balancing method]

print('Generating the Scores DataFrames\n')
for key_model in models_dict:
    for key_bal in balancing_dict:
        scores_df = generateScoresDataframe(key_model, key_bal)
        # The per-model dict is created on the first result for that model
        df_combinations_dict.setdefault(key_model, {})[key_bal] = scores_df

print('\nProgram finished')

Generating the Scores DataFrames

Model: Decision_Tree                     Balancing method: No-balancing                              100.0% done
Model: Decision_Tree                     Balancing method: Random-oversampling                       100.0% done
Model: Decision_Tree                     Balancing method: SMOTE                                     100.0% done
Model: Decision_Tree                     Balancing method: Cluster-based_Oversampling_SMOTE          100.0% done
Model: Logistic_Regression               Balancing method: No-balancing                              100.0% done
Model: Logistic_Regression               Balancing method: Random-oversampling                       100.0% done
Model: Logistic_Regression               Balancing method: SMOTE                                     100.0% done
Model: Logistic_Regression               Balancing method: Cluster-based_Oversampling_SMOTE          100.0% done
Model: MLP_Classifier                    Balancing method: No-

In [17]:
# Scores table of the Decision Tree without balancing, inspected in the cells below
df_temp = df_combinations_dict['Decision_Tree']['No-balancing']

In [18]:
# Export every scores DataFrame to `location` as '<model>-<balancing method>.csv'
location = 'data/nano/Results/DataFrames/'
models = ['Decision_Tree', 'Logistic_Regression', 'MLP_Classifier', 'Random_Forest_Classifier']
balancing_methods = ['No-balancing', 'Random-oversampling', 'SMOTE', 'Cluster-based_Oversampling_SMOTE']

# `location` used to be defined but ignored, so the files ended up in the
# working directory. Write into it, creating the folder when it is missing.
export_dir = Path(location)
export_dir.mkdir(parents=True, exist_ok=True)

for model in models:
    for method in balancing_methods:
        df_combinations_dict[model][method].to_csv(export_dir / f'{model}-{method}.csv')

In [None]:
# Intended as: highest 2-class accuracy over all feature subsets in df_temp
df_temp.loc['accuracy', 'n_classes=2'].max()

In [None]:
# Intended as: 2-class F1 score of every feature subset in df_temp
df_temp.loc['f1', 'n_classes=2']

In [18]:
# Display the scores table selected into df_temp
df_temp

Unnamed: 0_level_0,Unnamed: 1_level_0,n_classes=2,n_classes=3,n_classes=4,n_classes=5
score_type,features_used,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accuracy,a,0.518182,0.430303,0.136364,0.210606
accuracy,b,0.622727,0.450000,0.227273,0.157576
accuracy,c,0.516667,0.377273,0.383333,0.190909
accuracy,f,0.548485,0.322727,0.236364,0.203030
accuracy,i,0.539394,0.503030,0.310606,0.189394
...,...,...,...,...,...
difference,abcis,0.445455,0.672727,1.093939,1.568182
difference,abfis,0.463636,0.996970,1.603030,1.601515
difference,acfis,0.428788,0.671212,1.206061,1.653030
difference,bcfis,0.393939,0.890909,1.112121,1.601515


In [28]:
# Recall of the feature subset labelled 'cfis', one value per class count
df_temp.loc['recall', 'cfis']

n_classes=2    0.595714
n_classes=3    0.393889
n_classes=4    0.279762
n_classes=5    0.238000
Name: (recall, cfis), dtype: float64

In [None]:
# Mean absolute error ('difference') averaged over all feature subsets, per class count
df_temp.loc['difference'].mean() 