In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

import matplotlib.style as style
# `style.available` lists the style sheet names accepted by style.use()
style.use('tableau-colorblind10')
# Alternative style sheets (disabled):
# style.use('seaborn-notebook')
# style.use('seaborn-whitegrid')

# To pre-process and test cleaning results
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, ConfusionMatrixDisplay

In [None]:
def save_plot(figname: str):
    """
    Save the current matplotlib figure as a JPG file in the current working directory.

    Parameters:
    -----------
    figname: file name without extension ('.jpg' is appended)

    Returns:
    --------
    None
    """
    # os.path.join picks the separator of the running platform; a hard-coded
    # '\\' only produces a valid directory path on Windows.
    filedir = os.path.join(os.getcwd(), figname + '.jpg')
    plt.savefig(filedir, bbox_inches='tight', pad_inches=0, transparent=True)
    return

# Data Pre-processing for Classification

In [None]:
# Load the rock measurements table (path is relative to the notebook) and preview it
rock_csv_path = '../data/rock_data.csv'
data_df = pd.read_csv(rock_csv_path)
data_df.head()

In [None]:
# Summary statistics of the loaded table
data_df.describe()

## Main parameters distribution analysis

### Density distribution

In [None]:
# Plot distributions: one per-class KDE panel for every feature column
cols = data_df.columns[4:] # columns to plot - leaving out sample ID, rock #, Class and orientation
rows_num = 3 # Subplot grid rows
# Ceiling division: truncating with int(len/rows) leaves too few slots when the
# column count is not a multiple of rows_num, and plt.subplot then raises.
cols_num = (len(cols) + rows_num - 1) // rows_num # Subplot grid columns
# Plot
plt.figure(figsize=(18, 12))
for i, col in enumerate(cols):
    plt.subplot(rows_num, cols_num, i+1)
    # `fill=True` replaces the deprecated `shade` keyword
    sns.kdeplot(data=data_df, x=col, fill=True, hue='Class')

plt.tight_layout()
plt.show()

In [None]:
# Per-class KDE of 'Axis Minor Length'; the figure is written to disk by save_plot
plt.title('Axis Minor Length Distribution per Class')
# `fill=True` replaces the deprecated `shade` keyword
sns.kdeplot(data=data_df, x='Axis Minor Length', fill=True, hue='Class')
save_plot('axis_minor_len_dist')
plt.show()

In [None]:
# Per-class KDE of 'Max Height'; the figure is written to disk by save_plot
plt.title('Max Height Distribution per Class')
# `fill=True` replaces the deprecated `shade` keyword
sns.kdeplot(data=data_df, x='Max Height', fill=True, hue='Class')
save_plot('max_height_dist')
plt.show()

### Comments
The main parameters (Perimeter, Area, Axis Major and Minor Lengths and Mean Height) appear to be approximately normally distributed. The distributions of the different classes tend to overlap, which will make the classification task harder.

Median Height, Max Height and STD Height show bimodal distributions, again overlapping among classes.

## Boxplots

In [None]:
# Boxplots: one panel per feature column, split by Class
cols = data_df.columns[4:] # columns to plot - leaving out sample ID, rock #, Class and orientation
rows_num = 3 # Subplot grid rows
# Ceiling division: truncating with int(len/rows) leaves too few slots when the
# column count is not a multiple of rows_num, and plt.subplot then raises.
cols_num = (len(cols) + rows_num - 1) // rows_num # Subplot grid columns
# Plot
plt.figure(figsize=(18, 12))
for i, col in enumerate(cols):
    plt.subplot(rows_num, cols_num, i+1)
    sns.boxplot(data=data_df, y=col, x='Class', orient='v', dodge=True)

plt.tight_layout()
plt.show()

### Comments
In all the main parameters there is an indication of outliers on the high side, with most of the distribution concentrated on the low side (i.e. the distributions are right-skewed).

## Parameters correlation

In [None]:
# Subset for the pair plot: columns at positions 2-8, without 'Orientation'
pairplot_cols = data_df.columns[2:9]
data_pairplot = data_df[pairplot_cols].drop(columns='Orientation')

In [None]:
# Pairwise relationships coloured by Class; semi-transparent markers keep overlaps visible
sns.pairplot(data=data_pairplot, hue='Class', plot_kws=dict(alpha=0.5))
plt.show()

There's a clear indication of correlation between Perimeter and Area (which makes sense), and a weaker correlation of both Perimeter and Area with the Axis Major and Minor Lengths (which also makes sense).

### Pearson's Correlation Heatmap

In [None]:
# Compute correlation on every column from position 2 onward, except 'Orientation'
corr_input = data_df[data_df.columns[2:]].drop(columns='Orientation')
df_corr = corr_input.corr()
# Column labels reused as tick labels by the heatmap
cols = corr_input.columns

In [None]:
# Plot Correlation Heatmap (annotated with the coefficient in every cell)
heatmap_kwargs = dict(
    cbar=True,
    square=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 8},
    xticklabels=cols,
    yticklabels=cols,
    cmap='coolwarm',
    cbar_kws={'label': 'Pearson Correlation'},
)
plt.figure(figsize=(12, 6))
sns.heatmap(df_corr, **heatmap_kwargs)

plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Parameters Pearson Correlation Heatmap')
plt.show()

From the heatmap it can be observed that Area, Perimeter and Axis Major and Minor Length are highly correlated. The same happens with the Height mean, median, max. and standard deviation, and with the gradient mean, max. and standard deviation.

In [None]:
# Countplot per Class, bars ordered by first appearance of each class in the data
order = data_df['Class'].unique()
ax = sns.countplot(data=data_df, x='Class', order=order)
# Annotate every bar with its sample count
ax.bar_label(ax.containers[0])
plt.title('Sample count by Class')

plt.show()

Classes seem to be mostly balanced, with a slight imbalance in the middle class (58) relative to classes 12 and 34.

## Simple model training
This step is done to understand the main predictors and the impact of removing correlated parameters, as well as the impact of removing outliers.

In [None]:
# Features: every column from position 4 onward; target: the 'Class' column
feature_columns = data_df.columns[4:]
X = data_df[feature_columns].to_numpy()
y = data_df['Class'].to_numpy()
# Train/Test Split Data (seeded split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Train simple classifier
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree (and the importances and scores derived from it) can change
# from one run to the next.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
# Get Feature Importances (same order as the columns used to build X)
importances = tree_clf.feature_importances_
columns = data_df.columns[4:]
importances_df = pd.DataFrame({'feature_name': columns,
                               'feature_importance': importances})
# Most important feature first
importances_df = importances_df.sort_values(by='feature_importance', ascending=False)

In [None]:
# Plot feature importances (bars follow the row order of importances_df)
axis_label_font = dict(fontsize=12, fontweight='black')
plt.figure(figsize=(18, 6))
sns.barplot(x=importances_df['feature_name'], y=importances_df['feature_importance'])
plt.title('Feature importance - Test model', fontsize=14, fontweight='black')
plt.xticks(rotation=90)
plt.xlabel("Dataset features - Test model", **axis_label_font)
plt.ylabel("Feature importance [0 to 1]", **axis_label_font)
plt.show()

In [None]:
# Predict the held-out split and report the support-weighted F1 score
y_pred = tree_clf.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

In [None]:
# Confusion matrix on the held-out split, labelled with the classes seen by the tree
class_labels = tree_clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

In [None]:
def parse_report(y_val, model_best_pred, m, model_names, tree_clf, dataset, opt_method):
    """
    Parse classification report values to dataframe for posterior comparison of model performance.

    Parameters:
    -----------
    y_val: true labels, passed to classification_report
    model_best_pred: predicted labels, passed to classification_report
    m: position of the model in model_names; also used as the index label of the returned row
    model_names: sequence of model names
    tree_clf: estimator whose get_params() output is stored in the 'params' column
    dataset: value stored in the 'dataset' column
    opt_method: value stored in the 'opt_method' column

    Returns:
    --------
    df: classification report values dataframe (one row, indexed by m)
    """
    report = classification_report(y_val, model_best_pred, output_dict=True)
    dict_values = {
        'model': model_names[m],
        'params': np.array([tree_clf.get_params()]),
        'dataset': dataset,
        'opt_method': opt_method,
    }
    for elem, score in report.items():
        if not isinstance(score, dict):
            # Scalar entry (e.g. 'accuracy'): store its own value. The previous
            # bare `except` stored the stale `score_num` left over from the
            # preceding entry instead.
            dict_values[elem] = score
            continue
        label = elem.replace(" ", "_")
        # Numeric class labels get a 'class_' prefix; averages keep their own name
        prefix = ['class', label] if elem.isnumeric() else [label]
        for score_name, score_num in score.items():
            dict_values['_'.join(prefix + [score_name])] = score_num
    df = pd.DataFrame(dict_values, index=[m])
    return df

In [None]:
# Text report on the held-out split, with the class names used as row labels
target = ['12', '34', '58']
report_text = classification_report(y_test, y_pred, target_names=target)
print(report_text)

In [None]:
# Flatten the test-set classification report of the fitted tree into a one-row DataFrame
results_ = parse_report(y_test, y_pred, 0, ['DecisionTreeClassifier'], tree_clf, 'mean_height', 'no_optimization')

In [None]:
# Write the parsed report to a CSV file (path is relative to the working directory)
results_.to_csv('simple_tree_performance.csv')

In [None]:
# Show the parameters of the fitted tree
tree_clf.get_params()

### Train simple model with main parameters

In [None]:
# Keep the names of the 7 top-ranked features (importances_df is sorted by descending importance)
cols = importances_df['feature_name'].iloc[:7].values

In [None]:
# Display the selected feature names
cols

In [None]:
# Features: only the columns listed in `cols`; target: the 'Class' column
selected_features_df = data_df[cols]
X = selected_features_df.to_numpy()
y = data_df['Class'].to_numpy()
# Train/Test Split Data (seeded split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Train simple classifier on the reduced feature set
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree (and the importances and scores derived from it) can change
# from one run to the next.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
# Get Feature Importances (same order as `cols`, which was used to build X)
importances = tree_clf.feature_importances_
importances_df = pd.DataFrame({'feature_name': cols,
                               'feature_importance': importances})
# Most important feature first
importances_df = importances_df.sort_values(by='feature_importance', ascending=False)

In [None]:
# Plot feature importances (bars follow the row order of importances_df)
axis_label_font = dict(fontsize=12, fontweight='black')
plt.figure(figsize=(18, 6))
sns.barplot(x=importances_df['feature_name'], y=importances_df['feature_importance'])
plt.title('Feature importance - Test model', fontsize=14, fontweight='black')
plt.xticks(rotation=90)
plt.xlabel("Dataset features - Test model", **axis_label_font)
plt.ylabel("Feature importance [0 to 1]", **axis_label_font)
plt.show()

In [None]:
# Predict the held-out split and report the support-weighted F1 score
y_pred = tree_clf.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

In [None]:
# Confusion matrix on the held-out split, labelled with the classes seen by the tree
class_labels = tree_clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

In [None]:
# Text report on the held-out split, with the class names used as row labels
target = ['12', '34', '58']
report_text = classification_report(y_test, y_pred, target_names=target)
print(report_text)