# Classification models - Supervised Discretization
## Dataset: iris
 <br>
Updated by: Sam
Update at: 13/10/2022, Changes: Only re-run the Naive Bayes Discretization with Ïndex out of bound error <br>
---
Discretization methods: DecisionTree <br>
Classification models: CNB, ID3, KNN-VDM <br>
Evaluation metrics: accuracy, time for training, bias, variance <br>
---
NOTE: <br>
Long time for computation of Knn-VDM (Run this part last) <br>
Use Malina scripts for Knn, ID3 <br>
Use Sam scripts (with min_categories) for Naive Bayes <br>


In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import six
import sys
sys.modules['sklearn.externals.six'] = six
import id3
from id3 import Id3Estimator
from id3 import export_graphviz
import wittgenstein as lw
#import warnings
#warnings.filterwarnings('ignore')

# use repeated k-fold cross-validation - 3 repeats
from sklearn.model_selection import KFold
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
import warnings
warnings.filterwarnings('ignore')
#We will load the Boston house dataset for our example
from sklearn.datasets import load_boston
from sklearn import metrics

# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#read in data from DecisionTree discretization
iris_m2 = pd.read_csv('DT_small_discretized_iris.csv')
iris_m3 = pd.read_csv('DT_medium_discretized_iris.csv')
iris_m4 = pd.read_csv('DT_large_discretized_iris.csv')
iris_m5 = pd.read_csv('DT_verylarge_discretized_iris.csv')

In [3]:
iris_m3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB


In [4]:
iris_m4.head(10)

Unnamed: 0,slength,swidth,plength,pwidth,label
0,1,9,5,4,1
1,3,4,0,1,2
2,3,3,0,0,2
3,3,6,1,3,2
4,3,5,0,1,2
5,6,5,5,4,1
6,0,4,0,0,2
7,3,3,4,3,1
8,5,9,5,4,1
9,3,5,3,1,2


# 1. Categorical Naive Bayes

## 1.1 CNB model, DT, max_depth = 2

In [5]:
# make test & train split
X = iris_m2.drop('label', axis=1)
y = iris_m2['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)

In [6]:
# Count number of categories
n_categories = iris_m2.drop('label', axis=1).nunique()

In [7]:
import time
start = time.time() # For measuring time execution

cnb = CategoricalNB(min_categories = n_categories)
cnb.fit(X_train, y_train)

y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print("=" * 25)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_cnb , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_cnb , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_cnb , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

CategoricalNB
Misclassified samples: 1
Accuracy: 0.97
Recall score :  0.9736842105263158
Precision score :  0.9736842105263158
F1 score :  0.9736842105263158
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.94      1.00      0.97        15
           2       1.00      0.92      0.96        12

    accuracy                           0.97        38
   macro avg       0.98      0.97      0.97        38
weighted avg       0.98      0.97      0.97        38

Computation time:
0.012999534606933594


In [8]:
# y_pred_cnb
# y_prob_pred_cnb

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.023
Average bias: 0.026
Average variance: 0.005
Sklearn 0-1 loss: 0.026


In [9]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

# evaluate model
scores = cross_val_score(cnb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.960 (0.053)


## 1.2 CNB model, DT, max_depth = 3

In [10]:
# make test & train split
X = iris_m3.drop('label', axis=1)
y = iris_m3['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)

In [11]:
# Count number of categories
n_categories = iris_m3.drop('label', axis=1).nunique()

In [12]:
import time
start = time.time() # For measuring time execution

cnb = CategoricalNB(min_categories = n_categories) # Update 13/10
cnb.fit(X_train, y_train)

y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print("=" * 25)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_cnb , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_cnb , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_cnb , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

CategoricalNB
Misclassified samples: 4
Accuracy: 0.89
Recall score :  0.8947368421052632
Precision score :  0.8947368421052632
F1 score :  0.8947368421052632
Classification report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        11
           1       0.94      1.00      0.97        15
           2       1.00      0.67      0.80        12

    accuracy                           0.89        38
   macro avg       0.91      0.89      0.88        38
weighted avg       0.91      0.89      0.89        38

Computation time:
0.011998176574707031


In [13]:
# y_pred_cnb
# y_prob_pred_cnb

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.122
Average bias: 0.105
Average variance: 0.027
Sklearn 0-1 loss: 0.105


In [14]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

# evaluate model
scores = cross_val_score(cnb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.902 (0.056)


## 1.3 CNB model, DT, max_depth = 4

In [15]:
X = iris_m4.drop('label', axis=1)
y = iris_m4['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)

In [16]:
# Count number of categories
n_categories = iris_m4.drop('label', axis=1).nunique()

In [17]:
import time
start = time.time() # For measuring time execution

cnb = CategoricalNB(min_categories = n_categories) # Update 13/10
cnb.fit(X_train, y_train)

y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print("=" * 25)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_cnb , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_cnb , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_cnb , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

CategoricalNB
Misclassified samples: 5
Accuracy: 0.87
Recall score :  0.868421052631579
Precision score :  0.868421052631579
F1 score :  0.868421052631579
Classification report:
              precision    recall  f1-score   support

           0       0.73      1.00      0.85        11
           1       0.94      1.00      0.97        15
           2       1.00      0.58      0.74        12

    accuracy                           0.87        38
   macro avg       0.89      0.86      0.85        38
weighted avg       0.90      0.87      0.86        38

Computation time:
0.013999223709106445


In [18]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

# evaluate model
scores = cross_val_score(cnb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.900 (0.070)


In [19]:
# y_pred_cnb
# y_prob_pred_cnb

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.136
Average bias: 0.132
Average variance: 0.032
Sklearn 0-1 loss: 0.132


## 1.4 CNB model, DT, max_depth = 5

In [20]:
X = iris_m5.drop('label', axis=1)
y = iris_m5['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)

In [21]:
# Count number of categories
n_categories = iris_m5.drop('label', axis=1).nunique()

In [22]:
import time
start = time.time() # For measuring time execution

cnb = CategoricalNB(min_categories = n_categories)
cnb.fit(X_train, y_train)

y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print("=" * 25)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_cnb , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_cnb , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_cnb , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print(end - start) # Total time execution for this sample

CategoricalNB
Misclassified samples: 5
Accuracy: 0.87
Recall score :  0.868421052631579
Precision score :  0.868421052631579
F1 score :  0.868421052631579
Classification report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        11
           1       0.93      0.93      0.93        15
           2       0.89      0.67      0.76        12

    accuracy                           0.87        38
   macro avg       0.87      0.87      0.86        38
weighted avg       0.88      0.87      0.86        38

0.010995149612426758


In [23]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

# evaluate model
scores = cross_val_score(cnb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.889 (0.067)


In [24]:
# y_pred_cnb
# y_prob_pred_cnb

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.145
Average bias: 0.132
Average variance: 0.038
Sklearn 0-1 loss: 0.132


# 2. Decision Tree ID3

## 2.1 ID3 model, DT, max_depth = 2

In [29]:
#make splits
X = iris_m2.drop('label', axis=1)
y = iris_m2['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
#time recording
import time
start = time.time() # For measuring time execution
#build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
tree = export_graphviz(estimator.tree_, 'tree.dot', y)
#make predictions
y_pred_id3 = estimator.predict(X_test)
#_prob_pred_id3 = estimator.predict_proba(X_test)
#report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
#stop time recoridng
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.97
Recall score :  0.9736842105263158
Precision score :  0.9736842105263158
F1 score :  0.9736842105263158
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.94      1.00      0.97        15
           2       1.00      0.92      0.96        12

    accuracy                           0.97        38
   macro avg       0.98      0.97      0.97        38
weighted avg       0.98      0.97      0.97        38

Computation time:
0.014980316162109375


In [30]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.067
Average bias: 0.026
Average variance: 0.041
Sklearn 0-1 loss: 0.026


In [31]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

if __name__ == '__main__':
    # evaluate model
    #BUG: n_jobs=-1 does not work
    #SOLVED: n_jobs = 1 --> does not fix it but no error because of praralell processing issue in windows
    # see https://github.com/scikit-learn/scikit-learn/issues/13228
    #scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=1)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.960 (0.047)


## 2.2 ID3 model, DT, max_depth = 3

In [32]:
#make splits
X = iris_m3.drop('label', axis=1)
y = iris_m3['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
#time recording
import time
start = time.time() # For measuring time execution
#build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train)
tree = export_graphviz(estimator.tree_, 'tree.dot', y)
#make predictions
y_pred_id3 = estimator.predict(X_test)
#_prob_pred_id3 = estimator.predict_proba(X_test)
#report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
#stop time recoridng
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.84
Recall score :  0.8421052631578947
Precision score :  0.8421052631578947
F1 score :  0.8421052631578947
Classification report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.94      1.00      0.97        15
           2       0.80      0.67      0.73        12

    accuracy                           0.84        38
   macro avg       0.83      0.83      0.83        38
weighted avg       0.84      0.84      0.84        38

Computation time:
0.01839733123779297


In [33]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.202
Average bias: 0.158
Average variance: 0.104
Sklearn 0-1 loss: 0.158


In [34]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

if __name__ == '__main__':
    # evaluate model
    #BUG: n_jobs=-1 does not work
    #SOLVED: n_jobs = 1 --> does not fix it but no error because of praralell processing issue in windows
    # see https://github.com/scikit-learn/scikit-learn/issues/13228
    #scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    cores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.960 (0.047)


## 2.3 ID3 model, DT, max_depth = 4

In [35]:
#make splits
X = iris_m4.drop('label', axis=1)
y = iris_m4['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
#time recording
import time
start = time.time() # For measuring time execution
#build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
tree = export_graphviz(estimator.tree_, 'tree.dot', y)
#make predictions
y_pred_id3 = estimator.predict(X_test)
#_prob_pred_id3 = estimator.predict_proba(X_test)
#report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
#stop time recoridng
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.87
Recall score :  0.868421052631579
Precision score :  0.868421052631579
F1 score :  0.868421052631579
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83        11
           1       0.94      1.00      0.97        15
           2       0.89      0.67      0.76        12

    accuracy                           0.87        38
   macro avg       0.87      0.86      0.85        38
weighted avg       0.87      0.87      0.86        38

Computation time:
0.017242431640625


In [36]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.166
Average bias: 0.132
Average variance: 0.074
Sklearn 0-1 loss: 0.132


In [37]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

if __name__ == '__main__':
    # evaluate model
    #BUG: n_jobs=-1 does not work
    #SOLVED: n_jobs = 1 --> does not fix it but no error because of praralell processing issue in windows
    # see https://github.com/scikit-learn/scikit-learn/issues/13228
    #scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    cores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.960 (0.047)


## 2.4 ID3 model, DT, max_depth = 5

In [38]:
#make splits
X = iris_m5.drop('label', axis=1)
y = iris_m5['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
#time recording
import time
start = time.time() # For measuring time execution
#build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
tree = export_graphviz(estimator.tree_, 'tree.dot', y)
#make predictions
y_pred_id3 = estimator.predict(X_test)
#_prob_pred_id3 = estimator.predict_proba(X_test)
#report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
#stop time recoridng
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.87
Recall score :  0.868421052631579
Precision score :  0.868421052631579
F1 score :  0.868421052631579
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83        11
           1       0.94      1.00      0.97        15
           2       0.89      0.67      0.76        12

    accuracy                           0.87        38
   macro avg       0.87      0.86      0.85        38
weighted avg       0.87      0.87      0.86        38

Computation time:
0.018237590789794922


In [39]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.168
Average bias: 0.132
Average variance: 0.073
Sklearn 0-1 loss: 0.132


In [40]:
# use repeated k-fold cross-validation - 3 repeats
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model

if __name__ == '__main__':
    # evaluate model
    #BUG: n_jobs=-1 does not work
    #SOLVED: n_jobs = 1 --> does not fix it but no error because of praralell processing issue in windows
    # see https://github.com/scikit-learn/scikit-learn/issues/13228
    #scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    cores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.960 (0.047)


# 4. KNN classification

In [51]:
#read in data from ChiMerge discretization
iris_m2 = pd.read_csv('DT_small_discretized_iris.csv')
iris_m3 = pd.read_csv('DT_medium_discretized_iris.csv')
iris_m4 = pd.read_csv('DT_large_discretized_iris.csv')
iris_m5 = pd.read_csv('DT_verylarge_discretized_iris.csv')

## 3.1 KNN-VDM, DT, max_depth = 1

In [52]:
# Complete code for data preperation
# Read data
df_ewd1 = iris_m2
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({1: 50, 2: 50, 0: 50})
Class representation - training data:  Counter({0: 39, 2: 38, 1: 35})
Class representation - testing data:  Counter({1: 15, 2: 12, 0: 11})


In [53]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        12

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

Time for training model Knn-VDM: 0.7805743217468262.


In [54]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, x_train, y_train, x_test, y_test, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.043
Average bias: 0.000
Average variance: 0.043
Sklearn 0-1 loss: 0.000


## 3.2 KNN-VDM, DT, max_depth = 3

In [55]:
# Complete code for data preperation
# Read data
df_ewd1 = iris_m3
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({1: 50, 2: 50, 0: 50})
Class representation - training data:  Counter({0: 39, 2: 38, 1: 35})
Class representation - testing data:  Counter({1: 15, 2: 12, 0: 11})


In [56]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.94      1.00      0.97        15
           2       0.80      0.67      0.73        12

    accuracy                           0.84        38
   macro avg       0.83      0.83      0.83        38
weighted avg       0.84      0.84      0.84        38

Time for training model Knn-VDM: 0.7498893737792969.


In [57]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, x_train, y_train, x_test, y_test, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.193
Average bias: 0.105
Average variance: 0.115
Sklearn 0-1 loss: 0.158


## 3.3 KNN-VDM, DT, max_depth = 4

In [58]:
# Complete code for data preperation
# Read data
df_ewd1 = iris_m4
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({1: 50, 2: 50, 0: 50})
Class representation - training data:  Counter({0: 39, 2: 38, 1: 35})
Class representation - testing data:  Counter({1: 15, 2: 12, 0: 11})


In [59]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.94      1.00      0.97        15
           2       0.80      0.67      0.73        12

    accuracy                           0.84        38
   macro avg       0.83      0.83      0.83        38
weighted avg       0.84      0.84      0.84        38

Time for training model Knn-VDM: 0.8942842483520508.


In [60]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, x_train, y_train, x_test, y_test, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.186
Average bias: 0.105
Average variance: 0.102
Sklearn 0-1 loss: 0.158


## 3.4 KNN-VDM, DT, max_depth = 5

In [61]:
# Complete code for data preperation
# Read data
df_ewd1 = iris_m5
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({1: 50, 2: 50, 0: 50})
Class representation - training data:  Counter({0: 39, 2: 38, 1: 35})
Class representation - testing data:  Counter({1: 15, 2: 12, 0: 11})


In [62]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        11
           1       0.94      1.00      0.97        15
           2       0.82      0.75      0.78        12

    accuracy                           0.87        38
   macro avg       0.86      0.86      0.86        38
weighted avg       0.87      0.87      0.87        38

Time for training model Knn-VDM: 1.1675591468811035.


In [63]:
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, x_train, y_train, x_test, y_test, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.183
Average bias: 0.105
Average variance: 0.094
Sklearn 0-1 loss: 0.132
