In [None]:
%matplotlib inline
import numpy as np
import sklearn.datasets as data
import matplotlib.pyplot as plt
import matplotlib

import seaborn as sb; sb.set_style( 'darkgrid' ) # use whitegrid if prefer a white background
import pandas as pd

from numpy.random import SeedSequence, default_rng
# Generator seeded from freshly drawn entropy: each kernel start produces a
# different random stream, so anything driven by `rng` varies between runs.
rng = default_rng( SeedSequence().entropy )

import warnings
# Blanket filter: suppresses every warning for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

#matplotlib.rcParams.update( { 'font.size': 18 } ) # Use this to setup your preferred font size for plotting

#### 1- Use the Diabetes data set to perform a regression task using Decision trees, Gradient boosting, Multi-layered Perceptron and SVM. Compare the models by inspecting the decision path of the Decision tree, the feature importance of Gradient boosting, the support vectors of the SVM and the partial feature dependence of MLPs. Which gives more interpretable results in your mind? Vary different parameters in the models and compare the resulting modelling. 

In [None]:
# Full data-set object; later cells read the feature names from it.
diabetes = data.load_diabetes()
# Feature matrix and regression target as a plain (X, Y) pair.
X, Y = data.load_diabetes(return_X_y=True)

#### Diabetes data

Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.

Data Set Characteristics:

Number of Instances
442

Number of Attributes
First 10 columns are numeric predictive values

Target
Column 11 is a quantitative measure of disease progression one year after baseline

Attribute Information
age age in years

sex

bmi body mass index

bp average blood pressure

s1 tc, total serum cholesterol

s2 ldl, low-density lipoproteins

s3 hdl, high-density lipoproteins

s4 tch, total cholesterol / HDL

s5 ltg, possibly log of serum triglycerides level

s6 glu, blood sugar level

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree

# Fixed seed so the hold-out split is identical on every run; no test_size is
# given, so the library's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=666)

# Capping the number of leaves keeps the fitted tree small enough to read.
clf = DecisionTreeRegressor(max_leaf_nodes=5, random_state=34)
clf.fit(X_train, y_train)

# Compare the fit on seen data against the fit on held-out data.
print(f"Training score: {clf.score(X_train, y_train)}")
print(f"Test score: {clf.score(X_test, y_test)}")

# Draw the learned splits so the decision path can be inspected by eye.
plt.figure(figsize=(20, 10))
tree.plot_tree(clf)
plt.show()

In [None]:
# Refit the tree configured in the previous cell, this time on every sample.
clf.fit(X, Y)

# Only a training score is available here: no data was held out.
print(f"Training score: {clf.score(X, Y)}")

# Plot the refitted tree for comparison with the split-based one.
plt.figure(figsize=(20, 10))
tree.plot_tree(clf)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold-out split; it is not used by the fit below but is kept available for
# the exercise variant suggested in the comment on the fit call.
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.1, random_state=13 )

gb = GradientBoostingRegressor()
gb.fit( X, Y ) # Try also with the train/test split

# The model is evaluated on exactly the data it was fitted on, so this number
# is the training error, not a test-set error; the label says so explicitly.
print( "The mean squared error (MSE) on training data: {:.4f}".format( mean_squared_error( Y, gb.predict( X ) ) ) )

In [None]:
# Impurity-based importances of the fitted gradient-boosting model, ordered
# from least to most important so the largest bar ends up at the top.
feature_importance = gb.feature_importances_
sorted_idx         = np.argsort(feature_importance)
pos                = np.arange(sorted_idx.shape[0]) + 0.5


fig = plt.figure(figsize=(12, 6))

# Left panel: impurity-based importance.
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance, computed on the complete data set
# (X, Y) rather than on a held-out split.
result = permutation_importance(
    gb, X, Y, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(diabetes.feature_names)[sorted_idx],
)
# Title names the data that was actually permuted (previously said "test set").
plt.title("Permutation Importance (full data set)")
fig.tight_layout()
plt.show()

#### 2- Compare applying regularization on the Diabetes dataset and Breast Cancer data using SVM, ANNs and Gradient Boosting
+ Plot similar training and test error curves as in FML 2- SVMs and ANNs
+ Plot feature importance from Gradient Boosting, and extract the support vectors from SVM and compute the extracted data's statistics (as in FML 2 notebook). For MLP, use partial dependence plot (code given)

In [None]:
# From here on X and Y refer to the breast cancer data (requested as pandas
# objects), replacing the diabetes arrays bound to the same names earlier.
X, Y = data.load_breast_cancer(return_X_y=True, as_frame=True)
print(X.shape)
X.head()

In [None]:
# Train / test partition for the breast cancer experiments (30 % held out).
from sklearn.model_selection import train_test_split

# The seed is drawn from `rng`, so the partition differs between sessions.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=rng.integers(342),
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

# To keep it simple, sweep the number of estimators while min_samples_split is
# held constant; afterwards swap the roles of the two parameters.
param_range = np.arange(10, 20)
train_e = []
test_e = []

for n_trees in param_range:
    gb.set_params(n_estimators=n_trees)
    gb.fit(Xtrain, ytrain)
    # Accuracy on the data seen during fitting and on the held-out data.
    train_e.append(gb.score(Xtrain, ytrain))
    test_e.append(gb.score(Xtest, ytest))

# Pick the first setting that reaches the highest test accuracy.
i_optim = np.argmax(test_e)
optim = param_range[i_optim]
print("Optimal regularization parameter : %s" % optim)

# Refit on the complete data set with the selected number of estimators.
gb.set_params(n_estimators=optim)
gb.fit(X, Y)

# Train / test curves with a marker at the selected parameter value.
plt.figure(figsize=(18, 10))
plt.title('Gradient Boosting ')
plt.semilogx(param_range, train_e, label='Train')
plt.semilogx(param_range, test_e, label='Test')
plt.vlines(
    optim,
    plt.ylim()[0],
    np.max(test_e),
    color='k',
    linewidth=5,
    label='Optimum on test',
)
plt.legend(loc='lower left')
plt.ylim([min(test_e) - 0.1, max(train_e) + 0.1])
plt.xlabel('Parameter range')
plt.ylabel('Performance');

In [None]:
# Impurity-based importances of the fitted gradient-boosting classifier,
# ordered from least to most important so the largest bar ends up at the top.
feature_importance = gb.feature_importances_
sorted_idx         = np.argsort(feature_importance)
pos                = np.arange(sorted_idx.shape[0]) + 0.5


fig = plt.figure(figsize=(12, 6))

# Left panel: impurity-based importance.
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(X.columns)[sorted_idx])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance, computed on the training split.
result = permutation_importance(
    gb, Xtrain, ytrain, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(X.columns)[sorted_idx],
)
# Title names the data that was actually permuted (previously said "test set").
plt.title("Permutation Importance (train set)")
fig.tight_layout()
plt.show()

In [None]:
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 10 logistic units trained with plain SGD.
mlp = MLPClassifier( hidden_layer_sizes = 10, activation = 'logistic', solver = 'sgd' )

# Candidate values for alpha, the L2 penalty of the network.
param_range = np.logspace( -2, 1, 3 )
train_e     = []
test_e      = []

for a in param_range:
    mlp.set_params( alpha = a )
    mlp.fit( Xtrain, ytrain )
    # Score the MLP that was just fitted. The previous version scored the
    # gradient-boosting model `gb` here, so the curves never reflected alpha.
    train_e.append( mlp.score( Xtrain, ytrain ) )
    test_e.append( mlp.score( Xtest, ytest ) )
    
# First alpha that reaches the highest test accuracy.
i_optim = np.argmax(test_e)
optim   = param_range[i_optim]
print("Optimal regularization parameter : %s" % optim)

# Refit on the complete data set with the selected alpha.
mlp.set_params( alpha = optim )
mlp.fit( X, Y )

plt.figure( figsize = ( 18, 10 ) )
plt.title( 'Multi-layered Perceptron')
plt.semilogx(param_range, train_e, label='Train')
plt.semilogx(param_range, test_e, label='Test')
plt.vlines(optim, plt.ylim()[0], np.max(test_e), color='k',
           linewidth=5, label='Optimum on test')
plt.legend(loc='lower left')
plt.ylim([min(test_e)-0.1, max(train_e)+0.1])
plt.xlabel('Parameter range')
plt.ylabel('Performance');

In [None]:
from sklearn.inspection import partial_dependence

# `plot_partial_dependence` no longer exists in recent scikit-learn releases;
# its replacement is `PartialDependenceDisplay.from_estimator`. Prefer the
# replacement when it is available and fall back to the legacy function on
# older installations, so the cell runs on both.
try:
    from sklearn.inspection import PartialDependenceDisplay
    plot_partial_dependence = PartialDependenceDisplay.from_estimator
except ( ImportError, AttributeError ):
    from sklearn.inspection import plot_partial_dependence


# Pick the top four features from the results from Gradient Boosting feature importance
features = [ 'worst radius', 'worst perimeter', 'mean concave points', 'worst concave points' ]

# kind="both" draws the averaged curve together with individual (ICE) curves;
# `subsample` limits how many individual curves are drawn.
# NOTE: the name `display` shadows IPython's display() helper for the rest of
# the session; it is kept unchanged so existing references keep working.
display = plot_partial_dependence(
       mlp, Xtrain, features, kind="both", subsample=20,
       n_jobs=3, grid_resolution=20, random_state=0
)

display.figure_.suptitle(
    'Comparing Partial dependence of top features found by Gradient Boosting'
)
display.figure_.subplots_adjust(hspace=0.3)

In [None]:
from sklearn.svm import SVC

# A linear kernel has no gamma (gamma only affects the 'rbf', 'poly' and
# 'sigmoid' kernels), so sweeping gamma trained the same model every time.
# The sweep is therefore done over C, the regularization parameter that a
# linear SVM actually uses (smaller C = stronger regularization).
param_range = np.logspace( -10, 1, 10 ) # candidate C values
C           = 122.0                     # starting value only; replaced during the sweep
svc         = SVC( C = C, kernel = 'linear' )
train_e     = []
test_e      = []

for a in param_range:
    svc.set_params( C = a )
    svc.fit( Xtrain, ytrain )
    train_e.append( svc.score( Xtrain, ytrain ) )
    test_e.append( svc.score( Xtest, ytest ) )
    
# First C that reaches the highest test accuracy.
i_optim = np.argmax(test_e)
optim   = param_range[i_optim]
print("Optimal regularization parameter : %s" % optim)

# Refit on the training split so that svc.support_ indexes rows of Xtrain.
svc.set_params( C = optim )
svc.fit( Xtrain, ytrain )
print( "Number of support vectors per class:")
print( "Class 0: {0} | Class 1: {1}".format( svc.n_support_[ 0 ], svc.n_support_[ 1 ] ) )

plt.figure( figsize = ( 18, 10 ) )
plt.title( 'Linear SVM')
plt.semilogx(param_range, train_e, label='Train')
plt.semilogx(param_range, test_e, label='Test')
plt.vlines(optim, plt.ylim()[0], np.max(test_e), color='k',
           linewidth=5, label='Optimum on test')
plt.legend(loc='lower left')
plt.ylim([min(test_e)-0.1, max(train_e)+0.1])
plt.xlabel('Parameter range')
plt.ylabel('Performance');

In [None]:
# Rows of the training split that the fitted SVC selected as support vectors
# (svc.support_ holds positional indices, hence .iloc).
Xsv = Xtrain.iloc[svc.support_]
# Summary statistics of the support vectors, restricted to the chosen features.
Xsv.loc[:, features].describe()