# DEMO MACHINE LEARNING PIPELINE WITH SCIKIT-LEARN
# (WITH TRAIN - TEST SPLIT AND VALIDATION)

In [None]:
# Shared notebook settings, reused by the cells below.
myRandomState = 0  # seed passed as random_state so random draws are repeatable
mySampleSize = 10  # number of rows drawn when previewing a random sample

### DATA PREPARATION

In [None]:
# DATA PREPARATION

import pandas as pd
pd.options.display.max_rows = None  # None removes the row limit on displayed output
import seaborn as sns

# Load the iris example dataset through seaborn (returned as a DataFrame)
iris = sns.load_dataset('iris')

# Target feature to predict
y = iris['species']
# Predictors: every column except the target (drop() returns a new frame,
# so iris itself is left untouched)
X = iris.drop(columns='species')

# Type and shape of the full dataset, the predictors and the target
print(type(iris), iris.shape)
print(type(X), X.shape)
print(type(y), y.shape)

# First rows of the predictors, followed by the first rows of the target
display(X.head(5), y.head(5))

In [None]:
# Explore data: preview a random sample of the predictors and of the target
display(
    X.sample(n=mySampleSize, random_state=myRandomState),
    y.sample(n=mySampleSize, random_state=myRandomState),
)
# Mind that X and y are sampled independently, so the indexes of the sample
# of y might differ from the indexes of the sample of X.
# When the same random_state is passed to both calls, you should get the
# same indexes.

# Summary statistics of the predictors
display(X.describe())

In [None]:
# To be sure X and y are sampled on the same indexes even without
# random_state, store the sample of X and reuse its index to select the
# matching entries of y.
X_smpl = X.sample(n=mySampleSize)
y_smpl = y.loc[X_smpl.index]  # label-based selection with the sampled index

# Sampled predictors, the sampled index itself, and the matching targets
display(X_smpl, X_smpl.index, y_smpl)

### SPLIT LABELED DATA INTO TRAIN - TEST SAMPLE

In [None]:
# SPLIT LABELED DATA INTO TRAIN - TEST SAMPLE

from sklearn.model_selection import train_test_split
# Split the data randomly into 80% training set and 20% test set.
# The seed is taken from the shared myRandomState setting rather than a
# second hard-coded value, so the same random sample is drawn every time and
# the notebook's randomness is controlled from a single place.
X_tr, X_tst, y_tr, y_tst = train_test_split(
    X, y, train_size=0.8, random_state=myRandomState
)

# Type and shape of each part of the split
print(type(X_tr), X_tr.shape)
print(type(X_tst), X_tst.shape)
print(type(y_tr), y_tr.shape)
print(type(y_tst), y_tst.shape)

### MODEL SELECTION AND HYPERPARAMETER SELECTION (MODEL SPECIFIC)

In [None]:
# MODEL SELECTION AND HYPERPARAMETER SELECTION (MODEL SPECIFIC)

from sklearn.tree import DecisionTreeClassifier
# max_depth=1 restricts the tree to a single split (two leaves).
# random_state is fixed because scikit-learn permutes the features at every
# split: when several candidate splits are equally good, the one that is kept
# can otherwise differ from run to run, which would change the fitted tree
# and every metric computed from it.
model = DecisionTreeClassifier(max_depth=1, random_state=myRandomState)
print(model)
# List all selected hyperparameters
print(model.get_params(deep=True))

### DERIVE MODEL (TRAIN MODEL/FIT MODEL)

In [None]:
# DERIVE MODEL (TRAIN MODEL/FIT MODEL)

# Fit the model on the training predictors and the training target.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
model.fit(X=X_tr, y=y_tr)

### DISPLAY MODEL (MODEL SPECIFIC)

In [None]:
# DISPLAY MODEL (MODEL SPECIFIC)

from sklearn.tree import plot_tree
# Label the nodes with the predictor names and the class names; without them
# the plot only shows positional placeholders for the features and no class,
# which makes the tree hard to interpret.
# filled=True colours each node according to its majority class.
# The trailing semicolon keeps the returned list of annotation objects out of
# the cell output, so only the figure is shown.
plot_tree(
    model,
    feature_names=list(X.columns),
    class_names=list(model.classes_),
    filled=True,
);

### VALIDATE MODEL USING TEST DATA

In [None]:
# VALIDATE MODEL USING TEST DATA

from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_fscore_support,
    classification_report,
)
import matplotlib.pyplot as plt

# Predict target feature for the test data
y_tst_pred = pd.Series(model.predict(X_tst), name='y_tst_pred')

# Error flag per test row: 1 when the predicted value differs from the real
# value, 0 otherwise. Both series are re-indexed from 0 first so that they
# are compared position by position.
err = (
    y_tst_pred.reset_index(drop=True)
    .ne(y_tst.reset_index(drop=True))
    .astype(int)
    .rename('err')
)

# Real value, predicted value and error flag side by side
display(
    pd.concat(
        [y_tst.reset_index(drop=True), y_tst_pred.reset_index(drop=True), err],
        axis=1,
    )
)

In [None]:
# Confusion matrix
# Class labels that occur in the real or in the predicted test values, sorted
class_labels = sorted(pd.concat([y_tst, y_tst_pred], axis=0).unique())
# Alternative : model.classes_
# The labels are passed explicitly so the rows and columns of the matrix are
# guaranteed to follow the same order as the labels printed and drawn below.
cm = confusion_matrix(y_true=y_tst, y_pred=y_tst_pred, labels=class_labels)

# Display as text (console output)
print('Predicted label')
print(class_labels)
print(cm)

# Display as heatmap (nicer output in Jupyter)
# fmt='d' writes the counts as plain integers; the default annotation format
# switches to scientific notation once a count reaches 100.
disp = sns.heatmap(
    cm,
    square=True,
    annot=True,
    fmt='d',
    cbar=True,
    cmap='Greys',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted label')
plt.ylabel('True label')
disp.xaxis.tick_top()                 # Put x-axis tickers on top
disp.xaxis.set_label_position('top')  # Put x-axis label on top

In [None]:
# Metrics
# Mind this is a multiclass classification problem, so precision, recall and
# F1 are calculated by class and then averaged (average='weighted').
# zero_division=0: a class that is never predicted has no defined precision.
# Scoring it as 0 explicitly yields the same value scikit-learn falls back to
# by default, but without emitting an UndefinedMetricWarning. This matters
# here because a tree with max_depth=1 has two leaves and so cannot predict
# more than two classes.
acc = accuracy_score(y_true=y_tst, y_pred=y_tst_pred)
prec = precision_score(
    y_true=y_tst, y_pred=y_tst_pred, average='weighted', zero_division=0
)
rec = recall_score(
    y_true=y_tst, y_pred=y_tst_pred, average='weighted', zero_division=0
)
f1 = f1_score(
    y_true=y_tst, y_pred=y_tst_pred, average='weighted', zero_division=0
)
print(f'ACC : {acc:.3f} - PREC : {prec:.3f} - REC : {rec:.3f} - F1 : {f1:.3f}')


In [None]:
# The easiest way to get results by class is to use precision_recall_fscore_support
# Class labels that occur in the real or in the predicted test values, sorted
classes = sorted(pd.concat([y_tst, y_tst_pred], axis=0).unique())
# Compute the per-class scores once and reuse them for both displays.
# labels=classes ties the order of the scores to the column headers used
# below; zero_division=0 scores a never-predicted class as 0 without raising
# an UndefinedMetricWarning.
scores_by_class = precision_recall_fscore_support(
    y_true=y_tst, y_pred=y_tst_pred, labels=classes, zero_division=0
)
# Display precision/recall/fscore/support table as text (console output)
display(scores_by_class)
# Display precision/recall/fscore/support as pandas dataframe (nicer output in Jupyter)
display(
    pd.DataFrame(
        scores_by_class,
        index=['prec', 'rec', 'fscore', 'sup'],
        columns=classes,
    )
)

In [None]:
# Or use classification_report
# class_labels (built in the confusion matrix cell) supplies the row names.
# zero_division=0 scores a class that is never predicted as 0 without raising
# an UndefinedMetricWarning.
print(
    classification_report(
        y_true=y_tst,
        y_pred=y_tst_pred,
        target_names=class_labels,
        zero_division=0,
    )
)

### APPLY MODEL ON NEW DATA

In [None]:
# APPLY MODEL ON NEW DATA

# Two hand-written observations, laid out with the same columns as the
# predictors
X_pred = pd.DataFrame(
    data=[[10, 10, 10, 10], [5, 5, 5, 5]],
    columns=X.columns,
)
# Predicted class for each new observation
y_pred = model.predict(X_pred)
print(y_pred)
