# Bias - Variance Decomposition

- Dataset: pageblock
- Discretization: unsupervised: EWD, EFD, FFD
- Model: Categorical Naive Bayes
- Updated: 22/03/23

Process:
- Load pre-trained model (sav).
- Run bias-variance decomposition
- Save result to "pageblock_evaluation_cnb.txt"

NOTE: NO CNB models for EWD (error index out of bound)

In [7]:
# Import 
import pandas as pd
import numpy as np

import skops.io as sio
import mlxtend
from collections import Counter

In [8]:
# For model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [9]:
import six
import sys
sys.modules['sklearn.externals.six'] = six

In [10]:
# Import 
import skops.io as sio
import joblib
import mlxtend

# 1. EWD

## EWD, k = 4

In [11]:
# Complete code for data preperation
# Read data
df_ewd1 = pd.read_csv('pageblock_ewd1.csv')
df_ewd1.drop(df_ewd1.columns[0], axis=1, inplace = True)
df_ewd1.rename(columns={'class':'label'}, inplace=True)

disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ewd1[features].nunique()

from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [12]:
# Load models
k=4
model = 'CNB'
dataset = 'pageblock'
discretizer = 'EWD'
disc_param = 'k = 4'
model_name = f"{dataset}_{model}_{discretizer}_{k}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_EWD_4.sav


FileNotFoundError: [Errno 2] No such file or directory: 'pageblock_CNB_EWD_4.sav'

In [None]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

## EWD, k = 7

In [None]:
# # Complete code for data preperation
# # Read data

# df_ewd2 = pd.read_csv('pageblock_ewd2.csv')
# df_ewd2.drop(df_ewd2.columns[0], axis=1, inplace = True)
# df_ewd2.rename(columns={'class':'label'}, inplace=True)

# disc = 'EWD'
# k = 7

# df_ewd2.info()
# data = df_ewd2.values
# data.shape

# features = df_ewd2.drop('label', axis = 1).columns

# # separate the data into X and y
# X = data[:, : len(features)]
# Y = data[:,-1]
# #X = df_ewd2[features]
# #Y = df_ewd2['class']

# print(X.shape, Y.shape)

# # Split train test
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# # Check representation of class
# print('Class representation - original: ', Counter(Y)) 
# print('Class representation - training data: ', Counter(y_train)) 
# print('Class representation - testing data: ', Counter(y_test)) 

# # Check number of categories for features
# n_categories = df_ewd2[features].nunique()

# # SMOTE-Tomek
# from imblearn.combine import SMOTETomek
# smt_tomek = SMOTETomek(random_state=42)
# x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

In [None]:
# # Load models
# k=7
# model = 'CNB'
# dataset = 'pageblock'
# discretizer = 'EWD'
# disc_param = 'k = 7'
# model_name = f"{dataset}_{model}_{discretizer}_{k}.sav"
# print(model_name)
# loaded_cnb = joblib.load(model_name)
# y_pred_cnb = loaded_cnb.predict(x_test)

In [None]:
# # Decomposition
# f = open("pageblock_evaluation_cnb.txt", "a")
# import time
# start = time.time() # For measuring time execution

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# loaded_cnb, x_resample, y_resample, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---

# print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
# print('Average expected loss: %.3f' % avg_expected_loss, file = f)
# print('Average bias: %.3f' % avg_bias, file = f)
# print('Average variance: %.3f' % avg_var, file = f)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

# end = time.time()
# print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
# print('=='*20, file = f)
# f.close()

## EWD, k = 10

In [None]:
# # Complete code for data preperation
# # Read data
# df_ewd3 = pd.read_csv('pageblock_ewd3.csv')
# df_ewd3.drop(df_ewd3.columns[0], axis=1, inplace = True)
# df_ewd3.rename(columns={'class':'label'}, inplace=True)


# disc = 'EWD'
# k = 10

# df_ewd3.info()
# data = df_ewd3.values
# data.shape

# features = df_ewd3.drop('label', axis = 1).columns

# # separate the data into X and y
# X = data[:, : len(features)]
# Y = data[:,-1]
# #X = df_ewd3[features]
# #Y = df_ewd3['class']

# print(X.shape, Y.shape)

# # Split train test
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# # Check representation of class
# print('Class representation - original: ', Counter(Y)) 
# print('Class representation - training data: ', Counter(y_train)) 
# print('Class representation - testing data: ', Counter(y_test)) 

# # Check number of categories for features
# n_categories = df_ewd3[features].nunique()

# # SMOTE-Tomek
# from imblearn.combine import SMOTETomek
# smt_tomek = SMOTETomek(random_state=42)
# x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

In [None]:
# # Load models
# k=10
# model = 'CNB'
# dataset = 'pageblock'
# discretizer = 'EWD'
# disc_param = 'k = 10'
# model_name = f"{dataset}_{model}_{discretizer}_{k}.sav"
# print(model_name)
# loaded_cnb = joblib.load(model_name)
# y_pred_cnb = loaded_cnb.predict(x_test)

In [None]:
# # Decomposition
# f = open("pageblock_evaluation_cnb.txt", "a")
# import time
# start = time.time() # For measuring time execution

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# loaded_cnb, x_resample, y_resample, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---

# print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
# print('Average expected loss: %.3f' % avg_expected_loss, file = f)
# print('Average bias: %.3f' % avg_bias, file = f)
# print('Average variance: %.3f' % avg_var, file = f)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

# end = time.time()
# print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
# print('=='*20, file = f)
# f.close()

# 2. EFD

# EFD, k = 4

In [13]:
# Complete code for data preperation
# Read data
df_efd1 = pd.read_csv('pageblock_efd1.csv')
df_efd1.drop(df_efd1.columns[0], axis=1, inplace = True)
df_efd1.rename(columns={'class':'label'}, inplace=True)

disc = 'efd'
k = 4

df_efd1.info()
data = df_efd1.values
data.shape

features = df_efd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]


print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_efd1[features].nunique()

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [14]:
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

In [15]:
# Load models
model = 'CNB'
dataset = 'pageblock'
discretizer = 'EFD'
disc_param = 'k = 4'
model_name = f"{dataset}_{model}_{discretizer}_{k}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_EFD_4.sav


In [16]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

## EFD, k = 7

In [17]:
# Complete code for data preperation
# Read data
df_efd2 = pd.read_csv('pageblock_efd2.csv')
df_efd2.drop(df_efd2.columns[0], axis=1, inplace = True)

df_efd2.rename(columns={'class':'label'}, inplace=True)

disc = 'efd'
k = 7

df_efd2.info()
data = df_efd2.values
data.shape

features = df_efd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]
#X = df_efd2[features]
#Y = df_efd2['class']

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_efd2[features].nunique()

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [18]:
# Load models
k=7
model = 'CNB'
dataset = 'pageblock'
discretizer = 'EFD'
disc_param = 'k = 7'
model_name = f"{dataset}_{model}_{discretizer}_{k}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_EFD_7.sav


In [19]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

## EFD, k = 10

In [20]:
# Complete code for data preperation
# Read data
df_efd3 = pd.read_csv('pageblock_efd3.csv')
df_efd3.drop(df_efd3.columns[0], axis=1, inplace = True)
df_efd3.rename(columns={'class':'label'}, inplace=True)

disc = 'efd'
k = 10

df_efd3.info()
data = df_efd3.values
data.shape

features = df_efd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]
#X = df_efd3[features]
#Y = df_efd3['class']

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_efd3[features].nunique()

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [21]:
# Load models
k=10
model = 'CNB'
dataset = 'pageblock'
discretizer = 'EFD'
disc_param = 'k = 10'
model_name = f"{dataset}_{model}_{discretizer}_{k}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_EFD_10.sav


In [22]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

# 3. FFD

## FFD, m = 10

In [23]:
# Complete code for data preperation
# Read data
df_ffd1 = pd.read_csv('pageblock_ffd1.csv')
df_ffd1.drop(df_ffd1.columns[0], axis=1, inplace = True)
df_ffd1.rename(columns={'class':'label'}, inplace=True)


disc = 'ffd'
m = 10

df_ffd1.info()
data = df_ffd1.values
data.shape

features = df_ffd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]
#X = df_ffd1[features]
#Y = df_ffd1['class']

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd1[features].nunique()

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [24]:
# Load models
model = 'CNB'
dataset = 'pageblock'
discretizer = 'FFD'
disc_param = 'm = 10'

model_name = f"{dataset}_{model}_{discretizer}_{m}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_FFD_10.sav


In [25]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

## FFD, m = 30

In [26]:
# Complete code for data preperation
# Read data
df_ffd2 = pd.read_csv('pageblock_ffd2.csv')
df_ffd2.drop(df_ffd2.columns[0], axis=1, inplace = True)
df_ffd2.rename(columns={'class':'label'}, inplace=True)


disc = 'ffd'
m = 30

df_ffd2.info()
data = df_ffd2.values
data.shape

features = df_ffd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]
#X = df_ffd2[features]
#Y = df_ffd2['class']

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd2[features].nunique()

from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [27]:
# Load models
model = 'CNB'
dataset = 'pageblock'
discretizer = 'FFD'
disc_param = 'm = 30'

model_name = f"{dataset}_{model}_{discretizer}_{m}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_FFD_30.sav


In [28]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

## FFD, m = 60

In [29]:
# Complete code for data preperation
# Read data
df_ffd3 = pd.read_csv('pageblock_ffd3.csv')
df_ffd3.drop(df_ffd3.columns[0], axis=1, inplace = True)
df_ffd3.rename(columns={'class':'label'}, inplace=True)
disc = 'ffd'
m = 60

df_ffd3.info()
data = df_ffd3.values
data.shape

features = df_ffd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]
#X = df_ffd3[features]
#Y = df_ffd3['class']

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd3[features].nunique()

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [30]:
# Load models
model = 'CNB'
dataset = 'pageblock'
discretizer = 'FFD'
disc_param = 'm = 60'

model_name = f"{dataset}_{model}_{discretizer}_{m}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_FFD_60.sav


In [31]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()

## FFD, m = 100

In [32]:
# Complete code for data preperation
# Read data
df_ffd4 = pd.read_csv('pageblock_ffd4.csv')
df_ffd4.drop(df_ffd4.columns[0], axis=1, inplace = True)
df_ffd4.rename(columns={'class':'label'}, inplace=True)

disc = 'ffd'
m = 100

df_ffd4.info()
data = df_ffd4.values
data.shape

features = df_ffd4.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30, stratify=Y)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd4[features].nunique()

# SMOTE-Tomek
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
x_resample, y_resample = smt_tomek.fit_resample(x_train, y_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  label     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB
(5473, 10) (5473,)
Class representation - original:  Counter({1: 4913, 2: 329, 5: 115, 4: 88, 3: 28})
Class representation - training data:  Counter({1: 3684, 2: 247, 5: 86, 4: 66, 3: 21})
Class representation - testing data:  Counter({1: 1229, 2: 82, 5: 29, 4: 22, 3: 7})


In [33]:
# Load models
model = 'CNB'
dataset = 'pageblock'
discretizer = 'FFD'
disc_param = 'm = 100'

model_name = f"{dataset}_{model}_{discretizer}_{m}.sav"
print(model_name)
loaded_cnb = joblib.load(model_name)
y_pred_cnb = loaded_cnb.predict(x_test)

pageblock_CNB_FFD_100.sav


In [34]:
# Decomposition
f = open("pageblock_evaluation_cnb.txt", "a")
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
loaded_cnb, x_resample, y_resample, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---

print(f'Evaluation result: {model}, {discretizer}, {disc_param}', file = f)
print('Average expected loss: %.3f' % avg_expected_loss, file = f)
print('Average bias: %.3f' % avg_bias, file = f)
print('Average variance: %.3f' % avg_var, file = f)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb), file = f)

end = time.time()
print(f'Execution time {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
print('=='*20, file = f)
f.close()