# Bias - Variance Decomposition - Supervised discretizer

- Dataset: satimage
- Discretization: supervised: ChiMerge, DecisionTree
- Model: Knn-Hamming
- Updated: 27/04/23

Process:
- Load pre-trained model (skops)
- Run bias-variance decomposition
- Append results to "satimage_evaluation_sup_knn.txt" (the file is opened in append mode, so re-running the notebook adds duplicate entries)


In [1]:
# Import 
import pandas as pd
import numpy as np

import skops.io as sio
import mlxtend
from collections import Counter

In [2]:
# For model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
# Install a global "ignore" filter: from here on no warning of any category is
# shown in the cell outputs (presumably to keep the outputs readable).
# Be aware this also hides warnings that could matter while debugging.
warnings.filterwarnings('ignore')

In [3]:
import six
import sys
# Register the standalone `six` package under the module path
# 'sklearn.externals.six', so a later `import sklearn.externals.six`
# resolves to it instead of failing.
# NOTE(review): presumably a compatibility shim for a dependency that still
# imports six through scikit-learn - confirm which package needs it.
sys.modules['sklearn.externals.six'] = six

In [4]:
# Import 
import skops.io as sio
import joblib
import mlxtend

# 1. ChiMerge data

## ChiMerge, max_intervals = 6

In [5]:
# Data preparation: ChiMerge-discretized satimage, max intervals = 6
disc, k = 'CM', 6

# Load the discretized table and expose the target column 'class' as 'label'
df_cm1 = pd.read_csv('cm_satimage_6int.csv').rename(columns={'class': 'label'})
df_cm1.info()

# Every column other than the label is treated as a feature
features = df_cm1.drop(columns='label').columns
data = df_cm1.values

# Features are taken from the leading columns, the target from the last column
X, Y = data[:, :len(features)], data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, x_test = X[:4435, :], X[4435:, :]
y_train, y_test = Y[:4435], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_cm1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [6]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'CM', 'k = 6'

# File name is built from the tags above plus `k` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_CM_6.skops


In [7]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, k (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

## ChiMerge, max_intervals = 8

In [8]:
# Data preparation: ChiMerge-discretized satimage, max intervals = 8
disc, k = 'CM', 8

# Load the discretized table and expose the target column 'class' as 'label'
df_cm2 = pd.read_csv('cm_satimage_8int.csv').rename(columns={'class': 'label'})
df_cm2.info()

# Every column other than the label is treated as a feature
features = df_cm2.drop(columns='label').columns
data = df_cm2.values

# Features are taken from the leading columns, the target from the last column
X, Y = data[:, :len(features)], data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, x_test = X[:4435, :], X[4435:, :]
y_train, y_test = Y[:4435], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_cm2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [9]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'CM', 'k = 8'

# File name is built from the tags above plus `k` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_CM_8.skops


In [10]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, k (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

## ChiMerge, max_intervals = 10

In [11]:
# Data preparation: ChiMerge-discretized satimage, max intervals = 10
# `disc` is the discretizer tag written to the report's "Execution time" line.
# It is spelled 'CM' (upper-case) to match the tag used by the 6- and 8-interval
# ChiMerge cells, so all ChiMerge report entries carry the same label.
disc = 'CM'
k = 10

# Load the discretized table and expose the target column 'class' as 'label'
df_cm3 = pd.read_csv('cm_satimage_10int.csv').rename(columns={'class': 'label'})
df_cm3.info()

# Every column other than the label is treated as a feature
features = df_cm3.drop(columns='label').columns
data = df_cm3.values

# Features are taken from the leading columns, the target from the last column
X = data[:, :len(features)]
Y = data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, y_train = X[:4435, :], Y[:4435]
x_test, y_test = X[4435:, :], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_cm3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [12]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'CM', 'k = 10'

# File name is built from the tags above plus `k` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_CM_10.skops


In [13]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, k (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

## ChiMerge, max_intervals = 15

In [14]:
# Data preparation: ChiMerge-discretized satimage, max intervals = 15
# `disc` is the discretizer tag written to the report's "Execution time" line.
# It is spelled 'CM' (upper-case) to match the tag used by the 6- and 8-interval
# ChiMerge cells, so all ChiMerge report entries carry the same label.
disc = 'CM'
k = 15

# Load the discretized table and expose the target column 'class' as 'label'
df_cm4 = pd.read_csv('cm_satimage_15int.csv').rename(columns={'class': 'label'})
df_cm4.info()

# Every column other than the label is treated as a feature
features = df_cm4.drop(columns='label').columns
data = df_cm4.values

# Features are taken from the leading columns, the target from the last column
X = data[:, :len(features)]
Y = data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, y_train = X[:4435, :], Y[:4435]
x_test, y_test = X[4435:, :], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_cm4[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [15]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'CM', 'k = 15'

# File name is built from the tags above plus `k` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_CM_15.skops


In [16]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, k (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# 2. Decision Tree data

## DT, max_depth = 2

In [17]:
# Data preparation: Decision-Tree-discretized satimage, max_depth = 2
disc, max_depth = 'DT', 2

# Load the discretized table and expose the target column 'class' as 'label'
df_dt1 = pd.read_csv('DT_small_discretized_satimage.csv').rename(columns={'class': 'label'})
df_dt1.info()

# Every column other than the label is treated as a feature
features = df_dt1.drop(columns='label').columns
data = df_dt1.values

# Features are taken from the leading columns, the target from the last column
X, Y = data[:, :len(features)], data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, x_test = X[:4435, :], X[4435:, :]
y_train, y_test = Y[:4435], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_dt1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [18]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'DT', 'max_depth = 2'

# File name is built from the tags above plus `max_depth` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{max_depth}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_DT_2.skops


In [19]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, max_depth (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, max_depth = {max_depth} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

## DT, max_depth = 3

In [20]:
# Data preparation: Decision-Tree-discretized satimage, max_depth = 3
disc, max_depth = 'DT', 3

# Load the discretized table and expose the target column 'class' as 'label'
df_dt2 = pd.read_csv('DT_medium_discretized_satimage.csv').rename(columns={'class': 'label'})
df_dt2.info()

# Every column other than the label is treated as a feature
features = df_dt2.drop(columns='label').columns
data = df_dt2.values

# Features are taken from the leading columns, the target from the last column
X, Y = data[:, :len(features)], data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, x_test = X[:4435, :], X[4435:, :]
y_train, y_test = Y[:4435], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_dt2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [22]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'DT', 'max_depth = 3'

# File name is built from the tags above plus `max_depth` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{max_depth}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_DT_3.skops


In [23]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, max_depth (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, max_depth = {max_depth} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

## DT, max_depth = 4

In [24]:
# Data preparation: Decision-Tree-discretized satimage, max_depth = 4
disc, max_depth = 'DT', 4

# Load the discretized table and expose the target column 'class' as 'label'
df_dt3 = pd.read_csv('DT_large_discretized_satimage.csv').rename(columns={'class': 'label'})
df_dt3.info()

# Every column other than the label is treated as a feature
features = df_dt3.drop(columns='label').columns
data = df_dt3.values

# Features are taken from the leading columns, the target from the last column
X, Y = data[:, :len(features)], data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, x_test = X[:4435, :], X[4435:, :]
y_train, y_test = Y[:4435], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_dt3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [25]:
# Identify the pre-trained estimator for this configuration
dataset, model = 'satimage', 'Knn-Hamming'
discretizer, disc_param = 'DT', 'max_depth = 4'

# File name is built from the tags above plus `max_depth` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{max_depth}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_Knn-Hamming_DT_4.skops


In [26]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, max_depth (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, max_depth = {max_depth} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

## DT, max_depth = 5

In [27]:
# Data preparation: Decision-Tree-discretized satimage, max_depth = 5
disc, max_depth = 'DT', 5

# Load the discretized table and expose the target column 'class' as 'label'
df_dt4 = pd.read_csv('DT_verylarge_discretized_satimage.csv').rename(columns={'class': 'label'})
df_dt4.info()

# Every column other than the label is treated as a feature
features = df_dt4.drop(columns='label').columns
data = df_dt4.values

# Features are taken from the leading columns, the target from the last column
X, Y = data[:, :len(features)], data[:, -1]
print(X.shape, Y.shape)

# Positional hold-out: the first 4435 rows train, the remaining rows test
x_train, x_test = X[:4435, :], X[4435:, :]
y_train, y_test = Y[:4435], Y[4435:]

print(x_train.shape)
print(x_test.shape)
print('=================')

# Class balance of the full data and of each partition
print('Class representation - original: ', Counter(Y))
print('Class representation - training data: ', Counter(y_train))
print('Class representation - testing data: ', Counter(y_test))

# Number of distinct categories per discretized feature
n_categories = df_dt4[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [28]:
# Identify the pre-trained estimator for this configuration
# NOTE(review): this cell spells the model tag 'KNN-Hamming' while the other
# model-loading cells in this notebook use 'Knn-Hamming'. The file name and the
# report label therefore differ in case from the other runs - confirm which
# spelling the saved .skops file uses (it matters on case-sensitive file systems).
dataset, model = 'satimage', 'KNN-Hamming'
discretizer, disc_param = 'DT', 'max_depth = 5'

# File name is built from the tags above plus `max_depth` set in the data preparation cell
model_name = f"{dataset}_{model}_{discretizer}_{max_depth}.skops"
print(model_name)

# NOTE(review): trusted=True loads the file without skops' type audit; newer skops
# releases expect an explicit list of trusted types - confirm against the installed version.
loaded_knn = sio.load(model_name, trusted=True)

# Test-set predictions of the model exactly as it was saved
y_pred_knn = loaded_knn.predict(x_test)

satimage_KNN-Hamming_DT_5.skops


In [29]:
# Bias-variance decomposition for the currently loaded model.
# Reads: loaded_knn, x_train, y_train, x_test, y_test, y_pred_knn, model,
#        discretizer, disc_param, disc, max_depth (all set by the preceding cells).
# Writes: one result block appended to "satimage_evaluation_sup_knn.txt".
import time

start = time.time()  # For measuring time execution

# NOTE(review): mlxtend appears to call fit() on the estimator object it is given,
# so loaded_knn is left refitted on a bootstrap sample afterwards - confirm before reusing it.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    loaded_knn, x_train, y_train, x_test, y_test,
    loss='0-1_loss',
    random_seed=123)

# Plain hold-out error of the predictions made in the model-loading cell
sk_loss = zero_one_loss(y_test, y_pred_knn)
end = time.time()

# The report is opened only once every number is available, and `with` closes it
# even if a write fails, so an exception during the long decomposition can no
# longer leave the file handle open.
with open("satimage_evaluation_sup_knn.txt", "a") as f:
    print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
    print('Average expected loss: %.3f' % avg_expected_loss, file = f)
    print('Average bias: %.3f' % avg_bias, file = f)
    print('Average variance: %.3f' % avg_var, file = f)
    print('Sklearn 0-1 loss: %.3f' % sk_loss, file = f)
    print(f'Execution time {model}- default, {disc}, max_depth = {max_depth} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)