# Classification models

Dataset: australia
By: Sam
Update at: 12/10/2022
====

Summary:

- Import unsupervised discretised datasets (already encoded categorical attributes)
- Split dataset: 75% training, 25% testing, seed = 30
- Perform 3 classification models: ID3, Naive Bayes, KNN
- Evaluation metrics: accuracy, time for training, bias, variance

### About Dataset
*Continuous attributes:* 6

- A2:	continuous.
- A3:	continuous.
- A7:	continuous.
- A10: continuous.
- A13: continuous.
- A14: continuous.

*Categorical attributes:*
- A1:	0,1    CATEGORICAL a,b
- A4:	1,2,3         CATEGORICAL p,g,gg
- A5:  1, 2,3,4,5, 6,7,8,9,10,11,12,13,14    CATEGORICAL ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x 
- A6:	 1, 2,3, 4,5,6,7,8,9    CATEGORICAL ff,dd,j,bb,v,n,o,h,z 
- A8:	1, 0       CATEGORICAL t, f.
- A9: 1, 0	    CATEGORICAL t, f.
- A11:  1, 0	    CATEGORICAL t, f.
- A12:    1, 2, 3    CATEGORICAL s, g, p 

*Label*
A15:   1,2 +,- (class attribute)

In [23]:
import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

In [24]:
# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [25]:
# RIPPER (https://pypi.org/project/wittgenstein/) Only for binary
import wittgenstein as lw 

In [26]:
# For Naive Bayes
from sklearn.naive_bayes import CategoricalNB # Categorical Naive Bayes
from sklearn.naive_bayes import MultinomialNB # Multinominal Naive Bayes (suitable for NLP)
from mixed_naive_bayes import MixedNB # Mixed Naive Bayes for combination of both discrete & continuous feature

In [27]:
# For decision tree ID3 
# https://stackoverflow.com/questions/61867945/python-import-error-cannot-import-name-six-from-sklearn-externals
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from id3 import Id3Estimator # ID3 Decision Tree (https://pypi.org/project/decision-tree-id3/)
from id3 import export_graphviz

In [28]:
# For model evaluation
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

In [29]:
# For Knn using VDM metrics
from sklearn.neighbors import KNeighborsClassifier
from vdm3 import ValueDifferenceMetric

In [30]:
import seaborn as sns
import matplotlib.pyplot as plt

# EWD data

## EWD, k = 4

In [155]:
# Complete code for data preperation
# Read data
df_ewd1 = pd.read_csv('aus_ewd1.csv')
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ewd1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models - EWD, k=4

In [32]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.74      0.85      0.79        95
           1       0.78      0.64      0.70        78

    accuracy                           0.76       173
   macro avg       0.76      0.75      0.75       173
weighted avg       0.76      0.76      0.75       173

Time for training model ID3 - default, EWD, k = 4 is: 0.06848788261413574.


In [33]:
# Naive Bayes - min_categories = n_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.80      0.85      0.83        95
           1       0.81      0.74      0.77        78

    accuracy                           0.80       173
   macro avg       0.80      0.80      0.80       173
weighted avg       0.80      0.80      0.80       173

Time for training model Naive Bayes - default, EWD, k = 4 is: 0.007742881774902344.


In [156]:
# Knn-VDM complete code
# DONE
# Accuracy: 0.79
# Time for training: (1st attempt): 42.37504291534424.
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.78      0.84      0.81        95
           1       0.79      0.72      0.75        78

    accuracy                           0.79       173
   macro avg       0.79      0.78      0.78       173
weighted avg       0.79      0.79      0.78       173

Time for training model Knn-VDM, EWD, k = 4 is: 42.7268431186676.


In [35]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EWD, k = 4.
ID3: - Mean: 0.830918, Standard deviation: 0.038948
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.844928, Standard deviation: 0.037448


### Evaluation, EWD, k =4

In [36]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [37]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.238
Average bias: 0.220
Average variance: 0.106
Sklearn 0-1 loss: 0.243


In [38]:
# Naive Bayes - min_categories
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.198
Average bias: 0.197
Average variance: 0.035
Sklearn 0-1 loss: 0.197


In [157]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.229
Average bias: 0.220
Average variance: 0.092
Sklearn 0-1 loss: 0.214
Computing time:, EWD, k = 4 is: 14917.971321105957.


## 1.2 EWD, k = 7

In [130]:
# Complete code for data preperation
# Read data
df_ewd2 = pd.read_csv('aus_ewd2.csv')
disc = 'EWD'
k = 7

df_ewd2.info()
data = df_ewd2.values
data.shape

features = df_ewd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ewd2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models - EWD, k=7

In [41]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.75      0.83      0.79        95
           1       0.76      0.67      0.71        78

    accuracy                           0.76       173
   macro avg       0.76      0.75      0.75       173
weighted avg       0.76      0.76      0.75       173

Time for training model ID3 - default, EWD, k = 7 is: 0.08396506309509277.


In [51]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.79      0.84      0.82        95
           1       0.79      0.73      0.76        78

    accuracy                           0.79       173
   macro avg       0.79      0.79      0.79       173
weighted avg       0.79      0.79      0.79       173

Time for training model Naive Bayes - default, EWD, k = 7 is: 0.007652759552001953.


In [43]:
# # Knn-VDM complete code
# import time
# start = time.time() # For measuring time execution

# # specific the continuous columns index if any
# vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
# vdm.fit()
# # Knn model, n_neigbour = 3, metrics = vdm
# knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# ## Fit model
# knn_vdm.fit(x_train, y_train)
# # Testing
# y_pred_knn = knn_vdm.predict(x_test)
# knn_vdm.classes_
# print(classification_report(y_test, y_pred_knn))

# end = time.time()
# print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

KeyError: 3.0

In [52]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EWD, k = 7.
ID3: - Mean: 0.818841, Standard deviation: 0.041290
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: nan, Standard deviation: nan


### Evaluation, EWD, k=7

In [53]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [54]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.248
Average bias: 0.225
Average variance: 0.110
Sklearn 0-1 loss: 0.243


In [55]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.207
Average bias: 0.214
Average variance: 0.036
Sklearn 0-1 loss: 0.208


In [None]:
# # WARINING: LONG TIME 
# # Knn-VDM
# import time
# start = time.time() # For measuring time execution

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# end = time.time()
# print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

## 1.3 EWD, k = 10

In [131]:
# Complete code for data preperation
# Read data
df_ewd3 = pd.read_csv('aus_ewd3.csv')
disc = 'EWD'
k = 10

df_ewd3.info()
data = df_ewd3.values
data.shape

features = df_ewd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ewd3[features].nunique()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, EWD, k=10

In [57]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.75      0.87      0.81        95
           1       0.81      0.64      0.71        78

    accuracy                           0.77       173
   macro avg       0.78      0.76      0.76       173
weighted avg       0.77      0.77      0.76       173

Time for training model ID3 - default, EWD, k = 10 is: 0.11701393127441406.


In [48]:
# Naive Bayes - min_categories = n_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.79      0.84      0.82        95
           1       0.79      0.73      0.76        78

    accuracy                           0.79       173
   macro avg       0.79      0.79      0.79       173
weighted avg       0.79      0.79      0.79       173

Time for training model Naive Bayes - default, EWD, k = 7 is: 0.008220911026000977.


In [132]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.78      0.81      0.79        95
           1       0.76      0.72      0.74        78

    accuracy                           0.77       173
   macro avg       0.77      0.76      0.77       173
weighted avg       0.77      0.77      0.77       173

Time for training model Knn-VDM, EWD, k = 10 is: 45.563827991485596.


In [59]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EWD, k = 10.
ID3: - Mean: 0.817391, Standard deviation: 0.037718
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.849275, Standard deviation: 0.042927


### Evaluation, EWD, k=10

In [60]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [61]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.234
Average bias: 0.185
Average variance: 0.112
Sklearn 0-1 loss: 0.231


In [62]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 8 is out of bounds for axis 1 with size 8

In [133]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.224
Average bias: 0.214
Average variance: 0.092
Sklearn 0-1 loss: 0.231
Computing time:, EWD, k = 10 is: 23212.079768896103.


# 2. EFD datasets

## 2.1 EFD, k = 4

In [134]:
# Complete code for data preperation
# Read data
df_efd1 = pd.read_csv('aus_efd1.csv')
disc = 'EFD'
k = 4

df_efd1.info()
data = df_efd1.values
data.shape

features = df_efd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_efd1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, EFD, k=4

In [65]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.76      0.82      0.79        95
           1       0.76      0.69      0.72        78

    accuracy                           0.76       173
   macro avg       0.76      0.76      0.76       173
weighted avg       0.76      0.76      0.76       173

Time for training model ID3 - default, EFD, k = 4 is: 0.08470487594604492.


In [66]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.88      0.82        95
           1       0.83      0.67      0.74        78

    accuracy                           0.79       173
   macro avg       0.79      0.78      0.78       173
weighted avg       0.79      0.79      0.78       173

Time for training model Naive Bayes - default, EFD, k = 4 is: 0.007628679275512695.


In [135]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.85      0.80        95
           1       0.79      0.67      0.72        78

    accuracy                           0.77       173
   macro avg       0.77      0.76      0.76       173
weighted avg       0.77      0.77      0.77       173

Time for training model Knn-VDM, EFD, k = 4 is: 45.63760209083557.


In [68]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EFD, k = 4.
ID3: - Mean: 0.812077, Standard deviation: 0.042416
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.851208, Standard deviation: 0.041658


### Evaluation, EFD, k=4

In [69]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [70]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.235
Average bias: 0.185
Average variance: 0.123
Sklearn 0-1 loss: 0.237


In [71]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.221
Average bias: 0.214
Average variance: 0.027
Sklearn 0-1 loss: 0.214


In [136]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.232
Average bias: 0.214
Average variance: 0.087
Sklearn 0-1 loss: 0.231
Computing time:, EFD, k = 4 is: 43277.95894289017.


## 2.2 EFD, k = 7 (aus_efd2)

In [137]:
# Complete code for data preperation
# Read data
df_efd2 = pd.read_csv('aus_efd2.csv')
disc = 'EFD'
k = 7

df_efd2.info()
data = df_efd2.values
data.shape

features = df_efd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_efd2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, EFD, k=7

In [74]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.76      0.82      0.79        95
           1       0.76      0.69      0.72        78

    accuracy                           0.76       173
   macro avg       0.76      0.76      0.76       173
weighted avg       0.76      0.76      0.76       173

Time for training model ID3 - default, EFD, k = 7 is: 0.08503985404968262.


In [75]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.88      0.82        95
           1       0.83      0.67      0.74        78

    accuracy                           0.79       173
   macro avg       0.79      0.78      0.78       173
weighted avg       0.79      0.79      0.78       173

Time for training model Naive Bayes - default, EFD, k = 7 is: 0.007208824157714844.


In [138]:
# Knn-VDM complete code
# DONE
# accuracy: 0.77 
# time: 44.63345527648926.
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.85      0.80        95
           1       0.79      0.67      0.72        78

    accuracy                           0.77       173
   macro avg       0.77      0.76      0.76       173
weighted avg       0.77      0.77      0.77       173

Time for training model Knn-VDM, EFD, k = 7 is: 41.25791263580322.


In [77]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EFD, k = 7.
ID3: - Mean: 0.812077, Standard deviation: 0.042416
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.855072, Standard deviation: 0.037233


### Evaluation, EFD, k=7 

In [78]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [79]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.235
Average bias: 0.185
Average variance: 0.123
Sklearn 0-1 loss: 0.237


In [80]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.221
Average bias: 0.214
Average variance: 0.027
Sklearn 0-1 loss: 0.214


In [139]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.232
Average bias: 0.214
Average variance: 0.087
Sklearn 0-1 loss: 0.231
Computing time:, EFD, k = 7 is: 15698.569708108902.


## 2.3 EFD, k =10 (aus_efd3)

In [140]:
# Complete code for data preperation
# Read data
df_efd3 = pd.read_csv('aus_efd3.csv')
disc = 'EFD'
k = 10

df_efd3.info()
data = df_efd3.values
data.shape

features = df_efd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test))

# Check number of categories for features
n_categories = df_efd3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, EFD, k=10

In [83]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.82      0.79        95
           1       0.76      0.69      0.72        78

    accuracy                           0.76       173
   macro avg       0.76      0.76      0.76       173
weighted avg       0.76      0.76      0.76       173

Time for training model ID3 - default, EFD, k = 10 is: 0.07958078384399414.


In [84]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.88      0.82        95
           1       0.83      0.67      0.74        78

    accuracy                           0.79       173
   macro avg       0.79      0.78      0.78       173
weighted avg       0.79      0.79      0.78       173

Time for training model Naive Bayes - default, EFD, k = 10 is: 0.007249355316162109.


In [141]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.85      0.80        95
           1       0.79      0.67      0.72        78

    accuracy                           0.77       173
   macro avg       0.77      0.76      0.76       173
weighted avg       0.77      0.77      0.77       173

Time for training model Knn-VDM, EFD, k = 10 is: 42.22964096069336.


In [86]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EFD, k = 10.
ID3: - Mean: 0.812077, Standard deviation: 0.042416
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.855072, Standard deviation: 0.037233


### Evaluation, EFD, k=10

In [87]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [88]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.235
Average bias: 0.185
Average variance: 0.123
Sklearn 0-1 loss: 0.237


In [89]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.221
Average bias: 0.214
Average variance: 0.027
Sklearn 0-1 loss: 0.214


In [142]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.232
Average bias: 0.214
Average variance: 0.087
Sklearn 0-1 loss: 0.231
Computing time:, EFD, k = 10 is: 26336.05382823944.


# 3. FFD datasets

## 3.1 FFD, m =10 (aus_ffd1)

In [143]:
# Complete code for data preperation
# Read data
df_ffd1 = pd.read_csv('aus_ffd1.csv')
disc = 'FFD'
m = 10

df_ffd1.info()
data = df_ffd1.values
data.shape

features = df_ffd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, FFD, m=10

In [92]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.75      0.83      0.79        95
           1       0.76      0.67      0.71        78

    accuracy                           0.76       173
   macro avg       0.76      0.75      0.75       173
weighted avg       0.76      0.76      0.75       173

Time for training model ID3 - default, FFD, m = 10 is: 0.18407702445983887.


In [93]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.75      0.87      0.81        95
           1       0.81      0.64      0.71        78

    accuracy                           0.77       173
   macro avg       0.78      0.76      0.76       173
weighted avg       0.77      0.77      0.76       173

Time for training model Naive Bayes - default, FFD, m = 10 is: 0.006767749786376953.


In [144]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.77      0.84      0.80        95
           1       0.78      0.69      0.73        78

    accuracy                           0.77       173
   macro avg       0.78      0.77      0.77       173
weighted avg       0.78      0.77      0.77       173

Time for training model Knn-VDM, FFD, m = 10 is: 44.93128705024719.


In [95]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 10.
ID3: - Mean: 0.835266, Standard deviation: 0.034395
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: nan, Standard deviation: nan


### Evaluation, FFD, m=10

In [96]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [97]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.246
Average bias: 0.225
Average variance: 0.124
Sklearn 0-1 loss: 0.243


In [98]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.222
Average bias: 0.231
Average variance: 0.055
Sklearn 0-1 loss: 0.231


In [145]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.230
Average bias: 0.214
Average variance: 0.077
Sklearn 0-1 loss: 0.225
Computing time:, FFD, k = 10 is: 9381.151834011078.


## 3.2 FFD, m = 30 (aus_ffd2)

In [146]:
# Complete code for data preperation
# Read data
df_ffd2 = pd.read_csv('aus_ffd2.csv')
disc = 'FFD'
m = 30

df_ffd2.info()
data = df_ffd2.values
data.shape

features = df_ffd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test))

# Check number of categories for features
n_categories = df_ffd2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, FFD, m=30

In [101]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.75      0.80      0.77        95
           1       0.73      0.67      0.70        78

    accuracy                           0.74       173
   macro avg       0.74      0.73      0.73       173
weighted avg       0.74      0.74      0.74       173

Time for training model ID3 - default, FFD, m = 30 is: 0.13190603256225586.


In [102]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.88      0.82        95
           1       0.83      0.67      0.74        78

    accuracy                           0.79       173
   macro avg       0.79      0.78      0.78       173
weighted avg       0.79      0.79      0.78       173

Time for training model Naive Bayes - default, FFD, m = 30 is: 0.008085012435913086.


In [147]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.80      0.85      0.83        95
           1       0.81      0.74      0.77        78

    accuracy                           0.80       173
   macro avg       0.80      0.80      0.80       173
weighted avg       0.80      0.80      0.80       173

Time for training model Knn-VDM, FFD, m = 30 is: 41.25654697418213.


In [104]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 30.
ID3: - Mean: 0.833816, Standard deviation: 0.038840
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: nan, Standard deviation: nan


### Evaluation, FFD, m=30

In [105]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [106]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.244
Average bias: 0.225
Average variance: 0.123
Sklearn 0-1 loss: 0.260


In [107]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.215
Average bias: 0.214
Average variance: 0.039
Sklearn 0-1 loss: 0.214


In [148]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.217
Average bias: 0.197
Average variance: 0.089
Sklearn 0-1 loss: 0.197
Computing time:, FFD, k = 10 is: 58174.83967471123.


## 3.3 FFD, m = 60 (aus_ffd3)

In [149]:
# Complete code for data preperation
# Read data
df_ffd3 = pd.read_csv('aus_ffd3.csv')
disc = 'FFD'
m = 60

df_ffd3.info()
data = df_ffd3.values
data.shape

features = df_ffd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, FFD, m= 60

In [110]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.75      0.82      0.78        95
           1       0.75      0.67      0.71        78

    accuracy                           0.75       173
   macro avg       0.75      0.74      0.75       173
weighted avg       0.75      0.75      0.75       173

Time for training model ID3 - default, FFD, m = 60 is: 0.10687708854675293.


In [111]:
# Naive Bayes - min_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - min_categories, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.75      0.87      0.81        95
           1       0.81      0.65      0.72        78

    accuracy                           0.77       173
   macro avg       0.78      0.76      0.77       173
weighted avg       0.78      0.77      0.77       173

Time for training model Naive Bayes - min_categories, FFD, m = 60 is: 0.0066378116607666016.


In [150]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.78      0.85      0.81        95
           1       0.80      0.71      0.75        78

    accuracy                           0.79       173
   macro avg       0.79      0.78      0.78       173
weighted avg       0.79      0.79      0.78       173

Time for training model Knn-VDM, FFD, m = 60 is: 46.37032079696655.


In [113]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 60.
ID3: - Mean: 0.825121, Standard deviation: 0.042818
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.852174, Standard deviation: 0.044966


### Evaluation, FFD, m=60

In [114]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [115]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.238
Average bias: 0.185
Average variance: 0.128
Sklearn 0-1 loss: 0.249


In [116]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.221
Average bias: 0.220
Average variance: 0.042
Sklearn 0-1 loss: 0.225


In [151]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.225
Average bias: 0.191
Average variance: 0.086
Sklearn 0-1 loss: 0.214
Computing time:, FFD, k = 10 is: 21325.722386837006.


## 3.4 FFD, m = 100 (aus_ffd4)

In [152]:
# Complete code for data preperation
# Read data
df_ffd4 = pd.read_csv('aus_ffd4.csv')
disc = 'FFD'
m = 100

df_ffd4.info()
data = df_ffd4.values
data.shape

features = df_ffd4.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

# Check number of categories for features
n_categories = df_ffd4[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   A1      690 non-null    int64
 7   A11     690 non-null    int64
 8   A12     690 non-null    int64
 9   A4      690 non-null    int64
 10  A5      690 non-null    int64
 11  A6      690 non-null    int64
 12  A8      690 non-null    int64
 13  A9      690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB
(690, 14) (690,)
Class representation - original:  Counter({0: 383, 1: 307})
Class representation - training data:  Counter({0: 288, 1: 229})
Class representation - testing data:  Counter({0: 95, 1: 78})


### Models, FFD, m=100

In [119]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.74      0.79      0.77        95
           1       0.72      0.67      0.69        78

    accuracy                           0.73       173
   macro avg       0.73      0.73      0.73       173
weighted avg       0.73      0.73      0.73       173

Time for training model ID3 - default, FFD, m = 100 is: 0.09049105644226074.


In [120]:
# Naive Bayes - min_categories = n_categories
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB(min_categories = n_categories)
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - min_categories, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.76      0.86      0.81        95
           1       0.80      0.67      0.73        78

    accuracy                           0.77       173
   macro avg       0.78      0.76      0.77       173
weighted avg       0.78      0.77      0.77       173

Time for training model Naive Bayes - min_categories, FFD, m = 100 is: 0.007467985153198242.


In [153]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.79      0.89      0.84        95
           1       0.85      0.71      0.77        78

    accuracy                           0.81       173
   macro avg       0.82      0.80      0.80       173
weighted avg       0.81      0.81      0.81       173

Time for training model Knn-VDM, FFD, m = 100 is: 43.647905111312866.


In [122]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 100.
ID3: - Mean: 0.823188, Standard deviation: 0.042071
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.871014, Standard deviation: 0.033123


### Evaluation, FFD, m=100

In [123]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [124]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.236
Average bias: 0.202
Average variance: 0.128
Sklearn 0-1 loss: 0.266


In [125]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.224
Average bias: 0.237
Average variance: 0.030
Sklearn 0-1 loss: 0.225


In [154]:
# WARINING: LONG TIME 
# Knn-VDM
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Computing time:, {disc}, k = {k} is: {end - start}.') # Total time execution

Average expected loss: 0.213
Average bias: 0.202
Average variance: 0.082
Sklearn 0-1 loss: 0.191
Computing time:, FFD, k = 10 is: 49172.59988498688.
