In [None]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import *
import os
from scipy.stats import zscore
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import shap
import lime
from PyALE import ale
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import seaborn as sns
import matplotlib.pyplot as plt

import matplotlib.style as style
#style.available
# Both sheets are applied in order; where they overlap, the later one wins.
style.use('tableau-colorblind10')
# style.use('seaborn-notebook')
# Newer matplotlib releases renamed the bundled seaborn sheets to 'seaborn-v0_8-*'.
# Try the new name first and fall back to the legacy one so either version works.
try:
    style.use('seaborn-v0_8-whitegrid')
except OSError:
    style.use('seaborn-whitegrid')

# Seed passed to train_test_split throughout the notebook.
random_state = 1000

In [None]:
def save_plot(figname: str):
    """Save the current matplotlib figure as ``<figname>.jpg`` in the working directory.

    Parameters
    ----------
    figname : str
        File name without extension; '.jpg' is appended.

    Notes
    -----
    The path is assembled with ``os.path.join`` so it is valid on every
    platform (the previous hard-coded backslash only worked on Windows).
    """
    filedir = os.path.join(os.getcwd(), figname + '.jpg')
    plt.savefig(filedir, bbox_inches='tight', pad_inches=0, transparent=True)
    return

In [None]:
df = pd.read_csv('../data/rock_data.csv')

# handle missing values
if df.isnull().values.any():
    # numeric_only=True keeps the column means computable when the frame also
    # holds non-numeric columns (pandas >= 2 raises otherwise); such columns
    # are simply left unfilled.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print('Missing values filled with mean.')

# detect and remove outliers
# NOTE(review): no outlier handling is implemented in this cell.

# drop highly correlated features (this variant keeps 'Mean Height')
df.drop(['Area', 'Perimeter','Median Height', 'Max Height', 'STD Height'], axis=1, inplace=True)
df.columns

In [None]:
# Feature matrix: every column from position 3 onward, cast to float.
X = df.iloc[:, 3:].to_numpy(dtype=float)


def map_class_values(x):
    """Return the integer label for a raw class code (12 -> 0, 58 -> 1, 34 -> 2).

    Codes that are not in the table yield None.
    """
    return {12: 0, 58: 1, 34: 2}.get(x, None)


# Element-wise relabelling of the 'Class' column.
y = np.vectorize(map_class_values)(df['Class'].values)

In [None]:
# training:validation:testing = 60:20:20
# First hold out 20% of all rows for testing ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# ... then take 25% of the remaining 80% (= 20% overall) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=random_state
)

In [None]:
# model_names, models and grid_params are index-aligned: entry k of each list
# describes the same estimator.
# NOTE(review): none of the estimators is given a random_state, so the
# stochastic ones will not reproduce exactly between runs.
model_names = ['KNN', 'SVC', 'RF', 'XGB', 'TREE', 'MLP']
models = [KNeighborsClassifier(), SVC(), RandomForestClassifier(), XGBClassifier(), DecisionTreeClassifier(), MLPClassifier(max_iter=500)]
# Exhaustive grids consumed by GridSearchCV, one dict per estimator above.
grid_params = [{'n_neighbors': [5, 10, 15, 20], 'weights': ['uniform', 'distance']}, {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'sigmoid']}, {'n_estimators': [10, 100, 200], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_features': ['sqrt', 'log2', None]}, {'max_depth': [3,5,7], 'n_estimators': [10, 100, 200], 'reg_lambda': [1e-3, 1, 1e3]}, {'criterion': ['gini', 'entropy', 'log_loss'],'max_depth': [3,5,7]}, {'hidden_layer_sizes': [(100,), (100,50), (100,50,100)], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'alpha': [1e-3, 1, 1e3]}]

In [None]:
def parse_report(y_val, model_best_pred, m, model_names, grid_search, dataset, opt_method):
    """
    Parse classification report values to a dataframe for posterior comparison of model performance.

    Parameters
    ----------
    y_val: true labels
    model_best_pred: predicted labels for the same samples
    m: position of the model in ``model_names``; also used as the row index label
    model_names: sequence of model names
    grid_search: fitted search object; only its ``best_params_`` attribute is read
    dataset: tag stored in the 'dataset' column
    opt_method: tag stored in the 'opt_method' column

    Returns:
    --------
    df: one-row classification report values dataframe. Nested report entries
        are flattened to ``<entry>_<metric>`` columns (numeric class labels get a
        ``class_`` prefix, spaces become underscores); scalar entries such as
        'accuracy' keep their own name.
    """
    report = classification_report(y_val, model_best_pred, output_dict=True)
    dict_values = {
        'model': model_names[m],
        # Wrapped in a length-1 array so the params dict lands in a single cell.
        'params': np.array([grid_search.best_params_]),
        'dataset': dataset,
        'opt_method': opt_method,
    }
    for elem, score in report.items():
        if isinstance(score, dict):
            prefix = elem.replace(" ", "_")
            if elem.isnumeric():
                prefix = '_'.join(['class', prefix])
            for score_name, score_num in score.items():
                dict_values['_'.join([prefix, score_name])] = score_num
        else:
            # Scalar entry (e.g. 'accuracy'): store its own value. The previous
            # bare-except path stored a stale metric from the preceding entry.
            dict_values[elem] = score
    df = pd.DataFrame(dict_values, index=[m])
    return df

In [None]:
dataset = 'mean_height'
opt_method = 'grid_search'
for m, model in enumerate(models):
    # 5-fold exhaustive search on the training split; the winner is scored on validation.
    grid_search = GridSearchCV(model, grid_params[m], cv=5)
    grid_search.fit(X_train, y_train)
    model_best = grid_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models:
    # the first model starts the table, later ones are appended to it.
    df_i = parse_report(y_val, model_best_pred, m, model_names, grid_search, dataset, opt_method)
    results_df = df_i if m == 0 else pd.concat([results_df, df_i])
    # Print results
    print(f'{model_names[m]} with {grid_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    cm = confusion_matrix(y_val, model_best_pred)
    ConfusionMatrixDisplay(cm).plot()
    plt.show()

## Bayes Search Optimization

In [None]:
# skopt search spaces, index-aligned with the first five entries of `models`
# (there is no entry for the last estimator, MLP).
search_spaces = [{
    # KNN
    'n_neighbors': Integer(3,50),
    'weights': Categorical(['uniform', 'distance'])
    },
    
    {
    # SVC
    'C': Real(0.1, 10, 'log-uniform'),
    'kernel': Categorical(['linear', 'rbf', 'sigmoid']),
    },
    
    {
    # RF
    'n_estimators': Integer(10,200),
    'criterion': Categorical(['gini', 'entropy', 'log_loss']),
    'max_features': Categorical(['sqrt', 'log2', None]),
    },
    
    {
    # XGB
    'max_depth': Integer(3,10),
    'n_estimators': Integer(10,200),
    'reg_lambda': Real(1e-3, 1e3, 'log-uniform'),
    },
    {
    # TREE (plain lists rather than skopt dimension objects)
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3,5,7]
    }
    ]

In [None]:
dataset = 'mean_height'
opt_method = 'bayes_search'
# models[:-1] leaves out the last estimator (MLP), for which search_spaces has no entry.
for m,model in enumerate(models[:-1]):
    # Seed the optimiser so the sequence of sampled candidates is repeatable between runs.
    bayes_search = BayesSearchCV(model, search_spaces[m], n_iter=50, cv=5, n_jobs=-1, random_state=random_state)
    bayes_search.fit(X_train, y_train)
    model_best = bayes_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models
    # (appended to results_df, so re-running this cell adds the rows again).
    df_i = parse_report(y_val, model_best_pred, m, model_names, bayes_search, dataset, opt_method)
    results_df = pd.concat([results_df, df_i])
    print(f'{model_names[m]} with {bayes_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    ConfusionMatrixDisplay(confusion_matrix(y_val, model_best_pred)).plot()
    show()

In [None]:
# Hand-picked configurations (KNN, SVC, RF, XGB, MLP) with hard-coded hyper-parameters.
# NOTE(review): presumably copied from the search output above — confirm they still match.
best_models = [KNeighborsClassifier(n_neighbors=15, weights= 'distance'), SVC(C=10, kernel='rbf'), RandomForestClassifier(criterion='log_loss', max_features= 'sqrt', n_estimators=200), XGBClassifier(max_depth=3, n_estimators=100, reg_lambda=1), MLPClassifier(max_iter=500, activation='logistic', alpha=0.001, hidden_layer_sizes=(100,50))]

In [None]:
# Refit each hand-picked model on the training split and report on the held-out test split.
for m, model in enumerate(best_models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{model} \n {classification_report(y_test, y_pred)}\n')
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm).plot()
    plt.show()

In [None]:
# best_model = KNeighborsClassifier(n_neighbors=15, weights='distance') 
       
# Legend labels and line colours, index-aligned with the three models plotted below.
labels = ['Uncalibrated', 'Isotonic', 'Sigmoid']
# Raw class codes in the order of the integer labels 0, 1, 2 used in y.
classes = ['12', '58', '34']
colors= ['r', 'g', 'b']

In [None]:
# Fresh 70/30 split of the full data for the calibration plots; this overwrites
# the X_train / y_train created by the earlier 60:20:20 split.
X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3, random_state=random_state+2)

In [None]:
# Uncalibrated linear SVC plus isotonic- and sigmoid-calibrated wrappers around it.
best_model = SVC(C=0.1, kernel='linear', probability=True)
best_model_cal_iso = CalibratedClassifierCV(best_model, method='isotonic')
best_model_cal_sig = CalibratedClassifierCV(best_model, method='sigmoid')
models = [best_model, best_model_cal_iso, best_model_cal_sig]

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16,4))

# Each model is fitted once on the multiclass target.
for m, model in enumerate(models):
    model.fit(X_train, y_train)

for i in range(3):
    # One-vs-rest indicator for class index i on the calibration split.
    y_cal_b = (y_cal == i).astype(int)
    for m, model in enumerate(models):
        y_pred = model.predict_proba(X_cal)
        y_pred_b = y_pred[:,i]

        # predict_proba already returns values in [0, 1]. They are passed as-is:
        # min-max rescaling them first would change the quantity whose
        # calibration is being assessed (and relied on ndarray.ptp(), which
        # NumPy 2 no longer provides).
        fop, mpv = calibration_curve(y_cal_b, y_pred_b, n_bins=10)

        axs[i].plot(fop, mpv, color=colors[m], linestyle='-', linewidth=3, label=labels[m])

    axs[i].plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=3, label='Ideal')
    axs[i].set_xlim([0, 1])
    axs[i].set_ylim([0, 1])
    axs[i].set_xlabel('Fraction of positives', fontsize=24)
    axs[i].set_ylabel('Mean predicted value', fontsize=24)
    axs[i].set_title(f'{classes[i]} vs. else', fontsize=24)
    
    if i==0:
        axs[i].legend(loc='best' , fontsize=12)

plt.show()

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16,4))

for i in range(3):
    # Fresh estimators per class: each is trained on the binary "class i vs. rest" task.
    best_model = SVC(C=0.1, kernel='linear', probability=True)
    best_model_cal_iso = CalibratedClassifierCV(best_model, method='isotonic')
    best_model_cal_sig = CalibratedClassifierCV(best_model, method='sigmoid')
    models = [best_model, best_model_cal_iso, best_model_cal_sig]

    # Binary indicators do not depend on the model, so build them once per class.
    y_train_b = (y_train == i).astype(int)
    y_cal_b = (y_cal == i).astype(int)

    for m, model in enumerate(models):
        model.fit(X_train, y_train_b)

        y_pred = model.predict_proba(X_cal)
        # Column 1 is the probability of the positive label (1).
        y_pred_b = y_pred[:,1]

        # Probabilities are passed unscaled: min-max rescaling would alter the
        # values whose calibration is being assessed (and used ndarray.ptp(),
        # which NumPy 2 no longer provides).
        fop, mpv = calibration_curve(y_cal_b, y_pred_b, n_bins=10)

        axs[i].plot(fop, mpv, color=colors[m], linestyle='-', linewidth=3, label=labels[m])

    axs[i].plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=3, label='Ideal')
    axs[i].set_xlim([0, 1])
    axs[i].set_ylim([0, 1])
    axs[i].set_xlabel('Fraction of positives', fontsize=24)
    axs[i].set_ylabel('Mean predicted value', fontsize=24)
    axs[i].set_title(f'{classes[i]} vs. else', fontsize=24)
    
    if i==0:
        axs[i].legend(loc='best' , fontsize=12)

plt.show()

In [None]:
# Another 70/30 split (different seed offset) used by the ROC plots below;
# X_train / y_train / X_test / y_test are overwritten again.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state+4)

# Refit whatever `models` currently holds on the new multiclass training split.
# NOTE(review): both ROC cells below rebuild `models` before using it, so this
# fit looks redundant — confirm before removing.
for m, model in enumerate(models):
        model.fit(X_train, y_train)

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16,4))

for i in range(3):
    # New estimators for every class: each learns the binary "class i vs. rest" task.
    best_model = SVC(C=0.1, kernel='linear', probability=True)
    best_model_cal_iso = CalibratedClassifierCV(best_model, method='isotonic')
    best_model_cal_sig = CalibratedClassifierCV(best_model, method='sigmoid')
    models = [best_model, best_model_cal_iso, best_model_cal_sig]

    # 0/1 indicators of class i for both splits.
    y_train_b = (y_train == i).astype(int)
    y_test_b = (y_test == i).astype(int)

    for m, model in enumerate(models):
        model.fit(X_train, y_train_b)

        y_pred = model.predict_proba(X_test)
        y_pred_b = y_pred[:,1]

        fpr, tpr, thresholds = roc_curve(y_test_b, y_pred_b)

        axs[i].plot(fpr, tpr, color=colors[m], linestyle='--', linewidth=3, label=labels[m])
        # AUC annotations are stacked downwards, one line per model.
        axs[i].text(0.7, 0.6-0.05*m, f'AUC={auc(fpr, tpr):.2f}', color=colors[m], fontsize=12)

    # Chance diagonal.
    axs[i].plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=3)
    axs[i].set_xlim([0, 1])
    axs[i].set_ylim([0, 1])
    axs[i].set_xlabel('False Positive Rate', fontsize=24)
    axs[i].set_ylabel('True Positive Rate', fontsize=24)
    axs[i].set_title(f'{classes[i]} vs. else', fontsize=24)

    if i==0:
        axs[i].legend(loc='best' , fontsize=12)

plt.show()

In [None]:
# Same three models as above, this time fitted once on the multiclass target.
best_model = SVC(C=0.1, kernel='linear', probability=True)
best_model_cal_iso = CalibratedClassifierCV(best_model, method='isotonic')
best_model_cal_sig = CalibratedClassifierCV(best_model, method='sigmoid')
models = [best_model, best_model_cal_iso, best_model_cal_sig]

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16,4))

for m, model in enumerate(models):
    model.fit(X_train, y_train)

for i in range(3):
    # 0/1 indicator of class i on the test split.
    y_test_b = (y_test == i).astype(int)
    for m, model in enumerate(models):
        y_pred = model.predict_proba(X_test)
        # Column i of the multiclass probabilities serves as the score for class i.
        y_pred_b = y_pred[:,i]

        fpr, tpr, thresholds = roc_curve(y_test_b, y_pred_b)

        axs[i].plot(fpr, tpr, color=colors[m], linestyle='--', linewidth=3, label=labels[m])
        axs[i].text(0.7, 0.6-0.05*m, f'AUC={auc(fpr, tpr):.2f}', color=colors[m], fontsize=12)

    # Chance diagonal.
    axs[i].plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=3)
    axs[i].set_xlim([0, 1])
    axs[i].set_ylim([0, 1])
    axs[i].set_xlabel('False Positive Rate', fontsize=24)
    axs[i].set_ylabel('True Positive Rate', fontsize=24)
    axs[i].set_title(f'{classes[i]} vs. else', fontsize=24)

    if i==0:
        axs[i].legend(loc='best' , fontsize=12)

plt.show()

In [None]:
# Rebind X / y as pandas objects (named columns, raw class codes) for the
# explainability cells; the numpy versions used for training are replaced.
X = df.iloc[:,3:]
y = df['Class']

In [None]:
# Random forest fitted on the full DataFrame, then explained with SHAP's tree explainer.
# NOTE(review): no random_state is set, so the forest (and its SHAP values) vary between runs.
best_model = RandomForestClassifier(criterion='log_loss', n_estimators=200)
best_model.fit(X, y)
explainer = shap.TreeExplainer(best_model)
# Indexed below as [row, feature, output].
shap_values = explainer(X)

In [None]:
# SHAP summary plot for output index 0, displaying every feature column of X.
shap.summary_plot(shap_values[:,:,0], X, max_display=X.shape[1])

In [None]:
# SHAP summary plot for output index 1, displaying every feature column of X.
shap.summary_plot(shap_values[:,:,1], X, max_display=X.shape[1])

In [None]:
# SHAP summary plot for output index 2, displaying every feature column of X.
shap.summary_plot(shap_values[:,:,2], X, max_display=X.shape[1])

In [None]:
# Waterfall plot for the sample at row 100, output index 0.
shap.plots.waterfall(shap_values[100,:,0])

In [None]:
# Waterfall plot for the sample at row 100, output index 1.
shap.plots.waterfall(shap_values[100,:,1])

In [None]:
# Waterfall plot for the sample at row 100, output index 2.
shap.plots.waterfall(shap_values[100,:,2])

In [None]:
# Show the feature names; the scatter plots below address features by position.
X.columns

In [None]:
# SHAP scatter for the feature at position 1, coloured by the feature at position 2 (output index 0).
shap.plots.scatter(shap_values[:,1,0], color=shap_values[:,2,0])

In [None]:
# SHAP scatter for the feature at position 1, coloured by the feature at position 2 (output index 1).
shap.plots.scatter(shap_values[:,1,1], color=shap_values[:,2,1])

In [None]:
# SHAP scatter for the feature at position 1, coloured by the feature at position 2 (output index 2).
shap.plots.scatter(shap_values[:,1,2], color=shap_values[:,2,2])

In [None]:
# Switch the model under inspection to KNN, fitted on the full DataFrame with raw class codes.
best_model = KNeighborsClassifier(n_neighbors=15, weights='distance')
best_model.fit(X, y)
features_PDP = ["Axis Major Length", "Axis Minor Length"]
# kind='both' is requested for the two features; target=12 selects the class
# by its raw code because y holds the raw 'Class' values here.
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=12)

In [None]:
# PyALE effect of a single feature on best_model, 20 grid bins, no confidence interval.
ale_eff = ale(X=X, model=best_model, feature=["Axis Minor Length"], grid_size=20, include_CI=False)

In [None]:
# PyALE effect for the pair of features in features_PDP; ale_eff from the previous cell is overwritten.
ale_eff = ale(X=X, model=best_model, feature=features_PDP, grid_size=20, include_CI=False)

In [None]:
# A bare `import lime` is not guaranteed to load the lime_tabular submodule,
# so it is imported explicitly before use.
import lime.lime_tabular

# class_names must follow the column order of predict_proba, i.e.
# best_model.classes_. The fitted labels are the raw class codes, so the names
# are derived from the model instead of being hard-coded in another order.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X.to_numpy(),
    class_names=[str(label) for label in best_model.classes_],
    feature_names=X.columns,
    kernel_width=2,
    mode='classification',
)

In [None]:
# Explain one randomly chosen row (unseeded, so a different row on every run).
i = np.random.randint(0, X.shape[0])
# LIME expects the instance as a 1-d numpy array; a pandas Series carries string
# labels that break LIME's positional indexing on recent pandas versions.
exp = explainer.explain_instance(X.iloc[i,:].to_numpy(), best_model.predict_proba, num_features=X.shape[1], top_labels=1)
exp.save_to_file(f'original_lime_row{i}.html')

# Model training - change Mean height to Max Height (based on SHAP results)

In [None]:
df = pd.read_csv('../data/rock_data.csv')

# handle missing values
if df.isnull().values.any():
    # numeric_only=True keeps the column means computable when the frame also
    # holds non-numeric columns (pandas >= 2 raises otherwise); such columns
    # are simply left unfilled.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print('Missing values filled with mean.')

# detect and remove outliers
# NOTE(review): no outlier handling is implemented in this cell.

# drop highly correlated features (this variant keeps 'Max Height' instead of 'Mean Height')
df.drop(['Area', 'Perimeter','Median Height', 'Mean Height', 'STD Height'], axis=1, inplace=True)
df.columns

In [None]:
# Feature matrix: every column from position 3 onward, cast to float.
X = df.iloc[:, 3:].to_numpy(dtype=float)


def map_class_values(x):
    """Return the integer label for a raw class code (12 -> 0, 58 -> 1, 34 -> 2).

    Codes that are not in the table yield None.
    """
    return {12: 0, 58: 1, 34: 2}.get(x, None)


# Element-wise relabelling of the 'Class' column.
y = np.vectorize(map_class_values)(df['Class'].values)

In [None]:
# training:validation:testing = 60:20:20
# First hold out 20% of all rows for testing ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# ... then take 25% of the remaining 80% (= 20% overall) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=random_state
)

In [None]:
# Index-aligned lists for the max-height run; unlike the first run there is no
# decision-tree ('TREE') entry here.
# NOTE(review): none of the estimators is given a random_state.
model_names = ['KNN', 'SVC', 'RF', 'XGB', 'MLP']
models = [KNeighborsClassifier(), SVC(), RandomForestClassifier(), XGBClassifier(), MLPClassifier(max_iter=500)]
# Exhaustive grids consumed by GridSearchCV, one dict per estimator above.
grid_params = [{'n_neighbors': [5, 10, 15, 20], 'weights': ['uniform', 'distance']}, {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'sigmoid']}, {'n_estimators': [10, 100, 200], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_features': ['sqrt', 'log2', None]}, {'max_depth': [3,5,7], 'n_estimators': [10, 100, 200], 'reg_lambda': [1e-3, 1, 1e3]}, {'hidden_layer_sizes': [(100,), (100,50), (100,50,100)], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'alpha': [1e-3, 1, 1e3]}]

In [None]:
dataset = 'max_height'
opt_method = 'grid_search'
for m, model in enumerate(models):
    # 5-fold exhaustive search on the training split; the winner is scored on validation.
    grid_search = GridSearchCV(model, grid_params[m], cv=5)
    grid_search.fit(X_train, y_train)
    model_best = grid_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models.
    # Rows are appended to the results_df built for the mean-height run, so this
    # cell needs that table to exist and adds the rows again when re-run.
    df_i = parse_report(y_val, model_best_pred, m, model_names, grid_search, dataset, opt_method)
    results_df = pd.concat([results_df, df_i])
    # Print results
    print(f'{model_names[m]} with {grid_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    cm = confusion_matrix(y_val, model_best_pred)
    ConfusionMatrixDisplay(cm).plot()
    plt.show()

## Bayes Search Optimization

In [None]:
# skopt search spaces, index-aligned with the first four entries of `models`
# (there is no entry for the last estimator, MLP).
search_spaces = [{
    # KNN
    'n_neighbors': Integer(3,50),
    'weights': Categorical(['uniform', 'distance'])
    },
    
    {
    # SVC
    'C': Real(0.1, 10, 'log-uniform'),
    'kernel': Categorical(['linear', 'rbf', 'sigmoid']),
    },
    
    {
    # RF
    'n_estimators': Integer(10,200),
    'criterion': Categorical(['gini', 'entropy', 'log_loss']),
    'max_features': Categorical(['sqrt', 'log2', None]),
    },
    
    {
    # XGB
    'max_depth': Integer(3,10),
    'n_estimators': Integer(10,200),
    'reg_lambda': Real(1e-3, 1e3, 'log-uniform'),
    }
    ]

In [None]:
dataset = 'max_height'
opt_method = 'bayes_search'
# models[:-1] leaves out the last estimator (MLP), for which search_spaces has no entry.
for m,model in enumerate(models[:-1]):
    # Seed the optimiser so the sequence of sampled candidates is repeatable between runs.
    bayes_search = BayesSearchCV(model, search_spaces[m], n_iter=50, cv=5, n_jobs=-1, random_state=random_state)
    bayes_search.fit(X_train, y_train)
    model_best = bayes_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models
    # (appended to results_df, so re-running this cell adds the rows again).
    df_i = parse_report(y_val, model_best_pred, m, model_names, bayes_search, dataset, opt_method)
    results_df = pd.concat([results_df, df_i])
    print(f'{model_names[m]} with {bayes_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    ConfusionMatrixDisplay(confusion_matrix(y_val, model_best_pred)).plot()
    show()

### Compare Model Results

In [None]:
# Combined '<opt_method>_<dataset>' key, used as the hue in the comparison plot below.
results_df['opt_method_dataset']=results_df['opt_method']+'_'+results_df['dataset']

In [None]:
# Weighted-average F1 per model, one bar per (search method, dataset) combination.
f1_scores = results_df['weighted_avg_f1-score']
plt.title('Model Comparison - Grid or Bayes Search \nDataset with Max or Mean Height')
sns.barplot(
    x=results_df['model'],
    y=f1_scores,
    hue=results_df['opt_method_dataset'],
)
# The y-axis is zoomed to 0.60-0.75, so bar heights are not proportional to the scores.
plt.ylim(0.6, 0.75)
plt.show()

In [None]:
# Weighted-average F1 per model, grouped by dataset only (both search methods share a bar).
f1_scores = results_df['weighted_avg_f1-score']
plt.title('Model Comparison\nDataset with Max or Mean Height')
sns.barplot(
    x=results_df['model'],
    y=f1_scores,
    hue=results_df['dataset'],
)
# The y-axis is zoomed to 0.60-0.75, so bar heights are not proportional to the scores.
plt.ylim(0.6, 0.75)
plt.show()

# SHAP Values

In [None]:
df = pd.read_csv('../data/rock_data.csv')

# handle missing values
if df.isnull().values.any():
    # numeric_only=True keeps the column means computable when the frame also
    # holds non-numeric columns (pandas >= 2 raises otherwise); such columns
    # are simply left unfilled.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print('Missing values filled with mean.')

# detect and remove outliers
# NOTE(review): no outlier handling is implemented in this cell.

# drop highly correlated features (keeps 'Max Height')
df.drop(['Area', 'Perimeter','Median Height', 'Mean Height', 'STD Height'], axis=1, inplace=True)
df.columns

# Keep X as a DataFrame (named columns for the SHAP / PDP plots) and y as the raw class codes.
X = df.iloc[:,3:]#.to_numpy(dtype=float)
y = df['Class'].values #np.vectorize(map_class_values)(df['Class'].values)

In [None]:
# Best model - from plot
# The next cell builds a RandomForestClassifier from these parameters
# ('criterion', 'max_features', 'n_estimators') and explains it with
# TreeExplainer, so the random-forest row is selected here; an SVC row only
# holds 'C' and 'kernel' and would raise KeyError downstream.
model_mask = (results_df['model']=='RF')
dataset_mask = (results_df['dataset']=='max_height')
opt_method_mask = (results_df['opt_method']=='grid_search')
mask = model_mask & dataset_mask & opt_method_mask
# First matching row wins if the table holds duplicates from re-run cells.
params_best_model = results_df.loc[mask, 'params'].iloc[0]
results_df[mask]

In [None]:
# Display the selected hyper-parameter dict.
params_best_model

In [None]:
# best_model = SVC(C= params_best_model['C'], kernel= params_best_model['kernel'])
# Requires params_best_model to hold random-forest keys; an SVC parameter dict
# would raise KeyError here.
best_model = RandomForestClassifier(criterion= params_best_model['criterion'], 
                                    max_features= params_best_model['max_features'], 
                                    n_estimators= params_best_model['n_estimators'])
# X_train / y_train are whatever split an earlier cell left in the namespace.
# NOTE(review): presumably the max-height 60:20:20 split with labels 0/1/2 — confirm.
best_model.fit(X_train, y_train)

# svm_explainer = shap.KernelExplainer(best_model.predict,X_test)
# shap_values = svm_explainer.shap_values(X_test)

explainer = shap.TreeExplainer(best_model)
# SHAP values for every row of the full DataFrame X; indexed below as [row, feature, output].
shap_values = explainer(X)

In [None]:
# SHAP summary plot for output index 0, displaying every feature column of X.
shap.summary_plot(shap_values[:,:,0], X, max_display=X.shape[1])

In [None]:
# SHAP summary plot for output index 1, displaying every feature column of X.
shap.summary_plot(shap_values[:,:,1], X, max_display=X.shape[1])

In [None]:
# SHAP summary plot for output index 2, displaying every feature column of X.
shap.summary_plot(shap_values[:,:,2], X, max_display=X.shape[1])

In [None]:
# show=False keeps the figure open so save_plot writes the actual plot; with the
# default show=True the figure is displayed and released first, and the saved
# file would be blank.
shap.plots.scatter(shap_values[:,2,0], color=shap_values[:,3,0], show=False)
save_plot('shap_scatter_minor len vs max H_class 0')
plt.show()

In [None]:
# show=False keeps the figure open so save_plot writes the actual plot; with the
# default show=True the figure is displayed and released first, and the saved
# file would be blank.
shap.plots.scatter(shap_values[:,2,1], color=shap_values[:,3,1], show=False)
save_plot('shap_scatter_minor len vs max H_class 1')
plt.show()

In [None]:
# show=False keeps the figure open so save_plot writes the actual plot; with the
# default show=True the figure is displayed and released first, and the saved
# file would be blank.
shap.plots.scatter(shap_values[:,2,2], color=shap_values[:,3,2], show=False)
save_plot('shap_scatter_minor len vs max H_class 2')
plt.show()

# Partial dependence plots

In [None]:
# best_model = KNeighborsClassifier(n_neighbors=15, weights='distance')
# best_model.fit(X, y)
# Partial dependence of best_model on "Axis Minor Length" for target 0, saved as a JPG.
features_PDP = ["Axis Minor Length"]
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=0)
save_plot('pdp_axis minor len_class 0')

In [None]:
# Partial dependence of best_model on "Axis Minor Length" for target 1, saved as a JPG.
features_PDP = ["Axis Minor Length"]
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=1)
save_plot('pdp_axis minor len_class 1')

In [None]:
# Partial dependence of best_model on "Axis Minor Length" for target 2, saved as a JPG.
features_PDP = ["Axis Minor Length"]
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=2)
save_plot('pdp_axis minor len_class 2')

In [None]:
# best_model = KNeighborsClassifier(n_neighbors=15, weights='distance')
# best_model.fit(X, y)
# Partial dependence of best_model on "Max Height" for target 0, saved as a JPG.
# NOTE(review): the file name still begins with 'pdp_axis minor len' although the
# plotted feature is Max Height.
features_PDP = ["Max Height"]
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=0)
save_plot('pdp_axis minor len_class 0_max h')

In [None]:
# Partial dependence of best_model on "Max Height" for target 1, saved as a JPG.
# NOTE(review): the file name still begins with 'pdp_axis minor len' although the
# plotted feature is Max Height.
features_PDP = ["Max Height"]
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=1)
save_plot('pdp_axis minor len_class 1_max h')

In [None]:
# Partial dependence of best_model on "Max Height" for target 2, saved as a JPG.
# NOTE(review): the file name still begins with 'pdp_axis minor len' although the
# plotted feature is Max Height.
features_PDP = ["Max Height"]
PartialDependenceDisplay.from_estimator(best_model, X, features_PDP, kind='both', target=2)
save_plot('pdp_axis minor len_class 2_max h')

# Instance - Waterfall

In [None]:
# Waterfall plot for the sample at row 100, output index 0.
shap.plots.waterfall(shap_values[100,:,0])

In [None]:
# Waterfall plot for the sample at row 100, output index 1.
shap.plots.waterfall(shap_values[100,:,1])

In [None]:
# Waterfall plot for the sample at row 100, output index 2.
shap.plots.waterfall(shap_values[100,:,2])

In [None]:
# Persist the accumulated comparison table to the working directory (the index is written too).
results_df.to_csv('models_performance.csv')

# Dropping Orientation

In [None]:
df = pd.read_csv('../data/rock_data.csv')

# handle missing values
if df.isnull().values.any():
    # numeric_only=True keeps the column means computable when the frame also
    # holds non-numeric columns (pandas >= 2 raises otherwise); such columns
    # are simply left unfilled.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print('Missing values filled with mean.')

# detect and remove outliers
# NOTE(review): no outlier handling is implemented in this cell.

# drop highly correlated features, plus 'Orientation' and the two ID columns
df.drop(['Area', 'Perimeter','Median Height', 'Mean Height', 'STD Height', 'Orientation','Sample ID', 'Rock ID'], axis=1, inplace=True)
df.columns

In [None]:
# Feature matrix: every column from position 3 onward, cast to float.
# NOTE(review): in this section 'Sample ID' and 'Rock ID' were already dropped from
# df, so position 3 may no longer be the first feature column — presumably two
# real features are skipped here; check df.columns and confirm the offset.
X = df.iloc[:, 3:].to_numpy(dtype=float)


def map_class_values(x):
    """Return the integer label for a raw class code (12 -> 0, 58 -> 1, 34 -> 2).

    Codes that are not in the table yield None.
    """
    return {12: 0, 58: 1, 34: 2}.get(x, None)


# Element-wise relabelling of the 'Class' column.
y = np.vectorize(map_class_values)(df['Class'].values)

In [None]:
# training:validation:testing = 60:20:20
# First hold out 20% of all rows for testing ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# ... then take 25% of the remaining 80% (= 20% overall) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=random_state
)

In [None]:
# model_names, models and grid_params are index-aligned: entry k of each list
# describes the same estimator.
# NOTE(review): none of the estimators is given a random_state.
model_names = ['KNN', 'SVC', 'RF', 'XGB', 'TREE', 'MLP']
models = [KNeighborsClassifier(), SVC(), RandomForestClassifier(), XGBClassifier(), DecisionTreeClassifier(), MLPClassifier(max_iter=500)]
# Exhaustive grids consumed by GridSearchCV, one dict per estimator above.
grid_params = [{'n_neighbors': [5, 10, 15, 20], 'weights': ['uniform', 'distance']}, {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'sigmoid']}, {'n_estimators': [10, 100, 200], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_features': ['sqrt', 'log2', None]}, {'max_depth': [3,5,7], 'n_estimators': [10, 100, 200], 'reg_lambda': [1e-3, 1, 1e3]}, {'criterion': ['gini', 'entropy', 'log_loss'],'max_depth': [3,5,7]}, {'hidden_layer_sizes': [(100,), (100,50), (100,50,100)], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'alpha': [1e-3, 1, 1e3]}]

In [None]:
dataset = 'max_height'
opt_method = 'grid_search'
for m, model in enumerate(models):
    # 5-fold exhaustive search on the training split; the winner is scored on validation.
    grid_search = GridSearchCV(model, grid_params[m], cv=5)
    grid_search.fit(X_train, y_train)
    model_best = grid_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models:
    # the first model starts the table for this experiment, later ones are appended.
    df_i = parse_report(y_val, model_best_pred, m, model_names, grid_search, dataset, opt_method)
    results_df_drop = df_i if m == 0 else pd.concat([results_df_drop, df_i])
    # Print results
    print(f'{model_names[m]} with {grid_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    cm = confusion_matrix(y_val, model_best_pred)
    ConfusionMatrixDisplay(cm).plot()
    plt.show()

## Bayes Search Optimization

In [None]:
# skopt search spaces, index-aligned with the first five entries of `models`
# (there is no entry for the last estimator, MLP).
search_spaces = [{
    # KNN
    'n_neighbors': Integer(3,50),
    'weights': Categorical(['uniform', 'distance'])
    },
    
    {
    # SVC
    'C': Real(0.1, 10, 'log-uniform'),
    'kernel': Categorical(['linear', 'rbf', 'sigmoid']),
    },
    
    {
    # RF
    'n_estimators': Integer(10,200),
    'criterion': Categorical(['gini', 'entropy', 'log_loss']),
    'max_features': Categorical(['sqrt', 'log2', None]),
    },
    
    {
    # XGB
    'max_depth': Integer(3,10),
    'n_estimators': Integer(10,200),
    'reg_lambda': Real(1e-3, 1e3, 'log-uniform'),
    },
    {
    # TREE (plain lists rather than skopt dimension objects)
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3,5,7]
    }
    ]

In [None]:
dataset = 'max_height'
opt_method = 'bayes_search'
# models[:-1] leaves out the last estimator (MLP), for which search_spaces has no entry.
for m,model in enumerate(models[:-1]):
    # Seed the optimiser so the sequence of sampled candidates is repeatable between runs.
    bayes_search = BayesSearchCV(model, search_spaces[m], n_iter=50, cv=5, n_jobs=-1, random_state=random_state)
    bayes_search.fit(X_train, y_train)
    model_best = bayes_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models
    # (appended to results_df_drop, so re-running this cell adds the rows again).
    df_i = parse_report(y_val, model_best_pred, m, model_names, bayes_search, dataset, opt_method)
    results_df_drop = pd.concat([results_df_drop, df_i])
    print(f'{model_names[m]} with {bayes_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    ConfusionMatrixDisplay(confusion_matrix(y_val, model_best_pred)).plot()
    show()

# Z-score normalized

In [None]:
df = pd.read_csv('../data/rock_data.csv')

# handle missing values
if df.isnull().values.any():
    # numeric_only=True keeps the column means computable when the frame also
    # holds non-numeric columns (pandas >= 2 raises otherwise); such columns
    # are simply left unfilled.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print('Missing values filled with mean.')

# detect and remove outliers
# NOTE(review): no outlier handling is implemented in this cell.

# drop highly correlated features (keeps 'Max Height')
df.drop(['Area', 'Perimeter','Median Height', 'Mean Height', 'STD Height'], axis=1, inplace=True)
df.columns

In [None]:
# Feature matrix: every column from position 3 onward, cast to float.
X = df.iloc[:, 3:].to_numpy(dtype=float)


def map_class_values(x):
    """Return the integer label for a raw class code (12 -> 0, 58 -> 1, 34 -> 2).

    Codes that are not in the table yield None.
    """
    return {12: 0, 58: 1, 34: 2}.get(x, None)


# Element-wise relabelling of the 'Class' column.
y = np.vectorize(map_class_values)(df['Class'].values)

In [None]:
# training:validation:testing = 60:20:20
# First hold out 20% of all rows for testing ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# ... then take 25% of the remaining 80% (= 20% overall) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=random_state
)

In [None]:
model_names = ['KNN', 'SVC', 'RF', 'XGB', 'TREE', 'MLP']
# Every estimator is wrapped in a pipeline that standardises the features first,
# so the scaler is refitted inside each cross-validation fold.
knn = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
svc = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
rf = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())])
xgb = Pipeline([('scaler', StandardScaler()), ('xgb', XGBClassifier())])
tree = Pipeline([('scaler', StandardScaler()), ('tree', DecisionTreeClassifier())])
mlp = Pipeline([('scaler', StandardScaler()), ('mlp', MLPClassifier(max_iter=500))])
models = [knn, svc, rf, xgb, tree, mlp]
# Same grids as before; keys carry the '<step name>__' prefix required to reach
# the estimator step inside each pipeline. Index-aligned with `models`.
grid_params = [{'knn__n_neighbors': [5, 10, 15, 20], 'knn__weights': ['uniform', 'distance']}, 
               {'svc__C': [0.1, 1, 10], 'svc__kernel': ['linear', 'rbf', 'sigmoid']}, 
               {'rf__n_estimators': [10, 100, 200], 'rf__criterion': ['gini', 'entropy', 'log_loss'], 'rf__max_features': ['sqrt', 'log2', None]}, 
               {'xgb__max_depth': [3,5,7], 'xgb__n_estimators': [10, 100, 200], 'xgb__reg_lambda': [1e-3, 1, 1e3]}, 
               {'tree__criterion': ['gini', 'entropy', 'log_loss'],'tree__max_depth': [3,5,7]}, 
               {'mlp__hidden_layer_sizes': [(100,), (100,50), (100,50,100)], 'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'], 'mlp__alpha': [1e-3, 1, 1e3]}]

In [None]:
dataset = 'max_height'
opt_method = 'grid_search'
for m, model in enumerate(models):
    # 5-fold exhaustive search on the training split; the winner is scored on validation.
    grid_search = GridSearchCV(model, grid_params[m], cv=5)
    grid_search.fit(X_train, y_train)
    model_best = grid_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models:
    # the first model starts the table for this experiment, later ones are appended.
    df_i = parse_report(y_val, model_best_pred, m, model_names, grid_search, dataset, opt_method)
    results_df_norm = df_i if m == 0 else pd.concat([results_df_norm, df_i])
    # Print results
    print(f'{model_names[m]} with {grid_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    cm = confusion_matrix(y_val, model_best_pred)
    ConfusionMatrixDisplay(cm).plot()
    plt.show()

## Bayes Search Optimization

In [None]:
# skopt search spaces for the pipelines, index-aligned with the first five entries
# of `models`; keys carry the '<step name>__' prefix (there is no entry for MLP).
search_spaces = [{
    # KNN
    'knn__n_neighbors': Integer(3,50),
    'knn__weights': Categorical(['uniform', 'distance'])
    },
    
    {
    # SVC
    'svc__C': Real(0.1, 10, 'log-uniform'),
    'svc__kernel': Categorical(['linear', 'rbf', 'sigmoid']),
    },
    
    {
    # RF
    'rf__n_estimators': Integer(10,200),
    'rf__criterion': Categorical(['gini', 'entropy', 'log_loss']),
    'rf__max_features': Categorical(['sqrt', 'log2', None]),
    },
    
    {
    # XGB
    'xgb__max_depth': Integer(3,10),
    'xgb__n_estimators': Integer(10,200),
    'xgb__reg_lambda': Real(1e-3, 1e3, 'log-uniform'),
    },
    {
    # TREE (here max_depth is an Integer range rather than a fixed list)
    'tree__criterion': ['gini', 'entropy', 'log_loss'],
    'tree__max_depth': Integer(3,10)
    }
    ]

In [None]:
dataset = 'max_height'
opt_method = 'bayes_search'
# models[:-1] leaves out the last pipeline (MLP), for which search_spaces has no entry.
for m,model in enumerate(models[:-1]):
    # Seed the optimiser so the sequence of sampled candidates is repeatable between runs.
    bayes_search = BayesSearchCV(model, search_spaces[m], n_iter=50, cv=5, n_jobs=-1, random_state=random_state)
    bayes_search.fit(X_train, y_train)
    model_best = bayes_search.best_estimator_
    model_best_pred = model_best.predict(X_val)
    # Parse report to dataframe to compare results among models
    # (appended to results_df_norm, so re-running this cell adds the rows again).
    df_i = parse_report(y_val, model_best_pred, m, model_names, bayes_search, dataset, opt_method)
    results_df_norm = pd.concat([results_df_norm, df_i])
    print(f'{model_names[m]} with {bayes_search.best_params_} \n {classification_report(y_val, model_best_pred)}\n')
    ConfusionMatrixDisplay(confusion_matrix(y_val, model_best_pred)).plot()
    show()

In [None]:
# Renumber the rows and keep only the first 20.
# NOTE(review): 20 presumably equals the rows produced by one pass over the four
# searches that fill results_df (6 + 5 + 5 + 4); re-running those cells appends
# further rows, which this slice then discards — confirm.
results_df = results_df.reset_index(drop=True).iloc[:20,:]

In [None]:
# Tag every result table with the experiment it came from before they are merged.
results_df['test'] = 'original'
results_df_drop['test'] = 'drop_orientation'
results_df_norm['test'] = 'z-score_norm'

In [None]:
# Stack the three experiment tables into one frame with a fresh 0..n-1 index.
results_final = pd.concat([results_df, results_df_drop, results_df_norm]).reset_index(drop=True)

In [None]:
# Compare the original feature set with the drop-orientation experiment
# (rows from the z-score experiment are filtered out).
mask = results_final['test'] != 'z-score_norm'
plt.title('Model Comparison\nOriginal vs. Drop Orientation Dataset')
sns.barplot(
    x=results_final['model'][mask],
    y=results_final['weighted_avg_f1-score'][mask],
    hue=results_final['test'][mask],
)
# The y-axis is zoomed to 0.50-0.75, so bar heights are not proportional to the scores.
plt.ylim(0.5, 0.75)
plt.show()

In [None]:
# Compare the original feature set with the z-score-normalised experiment
# (rows from the drop-orientation experiment are filtered out).
mask = results_final['test'] != 'drop_orientation'
plt.title('Model Comparison\nOriginal vs. Z-score Normalization Dataset')
sns.barplot(
    x=results_final['model'][mask],
    y=results_final['weighted_avg_f1-score'][mask],
    hue=results_final['test'][mask],
)
# The y-axis is zoomed to 0.60-0.75, so bar heights are not proportional to the scores.
plt.ylim(0.6, 0.75)
plt.show()