# Classification models - Supervised Discretization
## Dataset: australia

Updated by: Sam
Updated at: 13/10/2022 <br>
---
Discretization methods: ChiMerge (manually defined function)<br>
Classification models: CNB, ID3, KNN-VDM
---
NOTE: 
KNN-VDM takes a long time to compute (run this part last) <br>
Use Malina's scripts for KNN and ID3 <br>
Use Sam's scripts (with min_categories) for Naive Bayes <br>
The KNN-VDM cells raise a KeyError for the 8-, 10- and 15-interval datasets (sections 3.2, 3.3 and 3.4)

In [94]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import six
import sys
sys.modules['sklearn.externals.six'] = six
import id3
from id3 import Id3Estimator
from id3 import export_graphviz
import wittgenstein as lw
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [95]:
# Load the four ChiMerge-discretized copies of the dataset, one CSV file per
# interval count (6, 8, 10 and 15 intervals).
aus6, aus8, aus10, aus15 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'chim_aus_6int.csv',
        'chim_aus_8int.csv',
        'chim_aus_10int.csv',
        'chim_aus_15int.csv',
    )
)

## Interval frequencies

In [96]:
# Columns whose interval frequencies are inspected below. The same six columns
# are used for every interval setting; each name still gets its own list object.
num_list6 = ["A2", "A3", "A7", "A10", "A13", "A14"]
num_list8 = list(num_list6)
num_list10 = list(num_list6)
num_list15 = list(num_list6)

In [97]:
# For the 6-interval data, print how many rows carry each interval code,
# one Counter per inspected column.
print('Interval frequency for 6 Intervals')
for column in num_list6:
    print('Interval for ' + column)
    interval_counts = Counter(aus6[column])
    print(interval_counts)

Interval frequency for 6 Intervals
Interval for A2
Counter({2: 351, 0: 193, 3: 78, 5: 47, 1: 15, 4: 6})
Interval for A3
Counter({2: 204, 0: 191, 5: 172, 3: 87, 1: 25, 4: 11})
Interval for A7
Counter({0: 296, 3: 174, 4: 105, 1: 96, 5: 14, 2: 5})
Interval for A10
Counter({0: 395, 1: 116, 3: 77, 5: 51, 2: 43, 4: 8})
Interval for A13
Counter({1: 364, 0: 243, 3: 37, 2: 18, 5: 17, 4: 11})
Interval for A14
Counter({0: 295, 5: 178, 2: 141, 1: 57, 3: 12, 4: 7})


In [98]:
# For the 8-interval data, print how many rows carry each interval code,
# one Counter per inspected column.
print('Frequency for aus with 8 Intervals')
for column in num_list8:
    print('Interval for ' + column)
    interval_counts = Counter(aus8[column])
    print(interval_counts)

Frequency for aus with 8 Intervals
Interval for A2
Counter({4: 351, 0: 91, 5: 78, 2: 72, 7: 47, 1: 30, 3: 15, 6: 6})
Interval for A3
Counter({2: 204, 0: 191, 5: 160, 3: 87, 1: 25, 4: 11, 6: 9, 7: 3})
Interval for A7
Counter({0: 255, 5: 174, 6: 105, 3: 96, 2: 40, 7: 14, 4: 5, 1: 1})
Interval for A10
Counter({0: 395, 1: 116, 3: 57, 6: 46, 2: 43, 4: 20, 5: 8, 7: 5})
Interval for A13
Counter({0: 243, 3: 210, 1: 129, 5: 37, 2: 25, 4: 18, 7: 17, 6: 11})
Interval for A14
Counter({0: 295, 6: 155, 2: 79, 3: 62, 1: 57, 7: 23, 4: 12, 5: 7})


In [99]:
# For the 10-interval data, print how many rows carry each interval code,
# one Counter per inspected column.
print('Frequency for aus with 10 Intervals')
for column in num_list10:
    print('Interval for ' + column)
    interval_counts = Counter(aus10[column])
    print(interval_counts)

Frequency for aus with 10 Intervals
Interval for A2
Counter({6: 351, 0: 91, 7: 78, 4: 62, 9: 47, 1: 30, 5: 15, 8: 6, 2: 6, 3: 4})
Interval for A3
Counter({2: 204, 0: 191, 5: 142, 3: 87, 1: 25, 4: 11, 7: 11, 8: 9, 6: 7, 9: 3})
Interval for A7
Counter({7: 174, 1: 152, 8: 105, 0: 103, 5: 94, 3: 40, 9: 14, 6: 5, 4: 2, 2: 1})
Interval for A10
Counter({0: 395, 1: 116, 3: 57, 2: 43, 6: 27, 4: 20, 7: 19, 5: 8, 9: 3, 8: 2})
Interval for A13
Counter({0: 243, 3: 210, 1: 129, 2: 25, 4: 18, 9: 17, 5: 17, 7: 16, 8: 11, 6: 4})
Interval for A14
Counter({0: 295, 8: 155, 2: 67, 5: 62, 1: 57, 9: 23, 6: 12, 4: 10, 7: 7, 3: 2})


In [100]:
# For the 15-interval data, print how many rows carry each interval code,
# one Counter per inspected column.
print('Frequency for aus with 15 Intervals')
for column in num_list15:
    print('Interval for ' + column)
    interval_counts = Counter(aus15[column])
    print(interval_counts)

Frequency for aus with 15 Intervals
Interval for A2
Counter({11: 220, 9: 110, 12: 78, 2: 65, 6: 62, 14: 47, 3: 30, 8: 16, 1: 15, 7: 15, 0: 11, 13: 6, 4: 6, 10: 5, 5: 4})
Interval for A3
Counter({10: 142, 7: 125, 8: 87, 6: 79, 0: 58, 2: 44, 4: 43, 3: 40, 5: 25, 9: 11, 12: 11, 13: 9, 11: 7, 1: 6, 14: 3})
Interval for A7
Counter({1: 152, 10: 142, 13: 105, 0: 103, 5: 64, 3: 40, 12: 28, 8: 19, 14: 14, 6: 7, 9: 5, 11: 4, 7: 4, 4: 2, 2: 1})
Interval for A10
Counter({0: 395, 1: 71, 2: 45, 3: 28, 6: 23, 8: 20, 10: 19, 12: 19, 5: 18, 7: 16, 4: 15, 11: 8, 9: 8, 14: 3, 13: 2})
Interval for A13
Counter({0: 243, 1: 129, 4: 128, 2: 25, 8: 23, 5: 22, 3: 19, 9: 18, 14: 17, 10: 17, 12: 16, 6: 15, 13: 11, 11: 4, 7: 3})
Interval for A14
Counter({0: 295, 10: 138, 7: 62, 1: 57, 4: 40, 2: 24, 14: 23, 8: 12, 6: 10, 11: 8, 9: 7, 12: 6, 13: 3, 3: 3, 5: 2})


In [101]:
# Print the column names, non-null counts and dtypes of the 6-interval frame.
aus6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      690 non-null    int64
 1   A3      690 non-null    int64
 2   A7      690 non-null    int64
 3   A10     690 non-null    int64
 4   A13     690 non-null    int64
 5   A14     690 non-null    int64
 6   label   690 non-null    int64
dtypes: int64(7)
memory usage: 37.9 KB


# 1. Categorical Naive Bayes

## 1.1 6 Intervals from ChiMerge

In [56]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus6.drop(columns='label')
y = aus6['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [57]:
# SAM
# Number of distinct interval codes in each feature column, counted over the
# whole 6-interval dataset (label column excluded).
features_only = aus6.drop(columns='label')
n_categories = features_only.nunique()

In [58]:
import time

# Wall-clock timer; it spans fitting, prediction and all the reporting below.
start = time.time()

# Fit Categorical Naive Bayes; min_categories is given the per-feature category
# counts computed in n_categories.
cnb = CategoricalNB(min_categories=n_categories).fit(X_train, y_train)

# Hard predictions and class probabilities for the hold-out rows.
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# All three scores use average='micro'; in the recorded output they coincide
# with the accuracy value.
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds since `start`

CategoricalNB
Misclassified samples: 44
Accuracy: 0.75
Recall score :  0.7456647398843931
Precision score :  0.7456647398843931
F1 score :  0.745664739884393
Classification report:
              precision    recall  f1-score   support

           0       0.72      0.87      0.79        95
           1       0.79      0.59      0.68        78

    accuracy                           0.75       173
   macro avg       0.76      0.73      0.73       173
weighted avg       0.75      0.75      0.74       173

Computation time:
0.016733884811401367


In [59]:
# Decompose the 0-1 loss of the Categorical Naive Bayes model (`cnb`) into
# bias and variance with mlxtend, and print the plain hold-out 0-1 loss of the
# earlier predictions (`y_pred_cnb`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.257
Average bias: 0.254
Average variance: 0.050
Sklearn 0-1 loss: 0.254


## 1.2 8 Intervals from ChiMerge

In [60]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus8.drop(columns='label')
y = aus8['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [61]:
# SAM
# Number of distinct interval codes in each feature column, counted over the
# whole 8-interval dataset (label column excluded).
features_only = aus8.drop(columns='label')
n_categories = features_only.nunique()

In [62]:
import time

# Wall-clock timer; it spans fitting, prediction and all the reporting below.
start = time.time()

# Fit Categorical Naive Bayes; min_categories is given the per-feature category
# counts computed in n_categories.
cnb = CategoricalNB(min_categories=n_categories).fit(X_train, y_train)

# Hard predictions and class probabilities for the hold-out rows.
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# All three scores use average='micro'; in the recorded output they coincide
# with the accuracy value.
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds since `start`

CategoricalNB
Misclassified samples: 43
Accuracy: 0.75
Recall score :  0.7514450867052023
Precision score :  0.7514450867052023
F1 score :  0.7514450867052023
Classification report:
              precision    recall  f1-score   support

           0       0.73      0.87      0.79        95
           1       0.80      0.60      0.69        78

    accuracy                           0.75       173
   macro avg       0.76      0.74      0.74       173
weighted avg       0.76      0.75      0.75       173

Computation time:
0.010998964309692383


In [63]:
# Decompose the 0-1 loss of the Categorical Naive Bayes model (`cnb`) into
# bias and variance with mlxtend, and print the plain hold-out 0-1 loss of the
# earlier predictions (`y_pred_cnb`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.241
Average bias: 0.249
Average variance: 0.051
Sklearn 0-1 loss: 0.249


## 1.3 10 Intervals from ChiMerge

In [64]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus10.drop(columns='label')
y = aus10['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [65]:
# SAM
# Number of distinct interval codes in each feature column, counted over the
# whole 10-interval dataset (label column excluded).
features_only = aus10.drop(columns='label')
n_categories = features_only.nunique()

In [66]:
import time

# Wall-clock timer; it spans fitting, prediction and all the reporting below.
start = time.time()

# Fit Categorical Naive Bayes; min_categories is given the per-feature category
# counts computed in n_categories.
cnb = CategoricalNB(min_categories=n_categories).fit(X_train, y_train)

# Hard predictions and class probabilities for the hold-out rows.
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# All three scores use average='micro'; in the recorded output they coincide
# with the accuracy value.
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds since `start`

CategoricalNB
Misclassified samples: 39
Accuracy: 0.77
Recall score :  0.7745664739884393
Precision score :  0.7745664739884393
F1 score :  0.7745664739884393
Classification report:
              precision    recall  f1-score   support

           0       0.75      0.88      0.81        95
           1       0.82      0.64      0.72        78

    accuracy                           0.77       173
   macro avg       0.78      0.76      0.77       173
weighted avg       0.78      0.77      0.77       173

Computation time:
0.01660013198852539


In [67]:
# Decompose the 0-1 loss of the Categorical Naive Bayes model (`cnb`) into
# bias and variance with mlxtend, and print the plain hold-out 0-1 loss of the
# earlier predictions (`y_pred_cnb`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.220
Average bias: 0.220
Average variance: 0.053
Sklearn 0-1 loss: 0.225


## 1.4 15 Intervals from ChiMerge

In [68]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus15.drop(columns='label')
y = aus15['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [69]:
# SAM
# Number of distinct interval codes in each feature column, counted over the
# whole 15-interval dataset (label column excluded).
features_only = aus15.drop(columns='label')
n_categories = features_only.nunique()

In [70]:
import time

# Wall-clock timer; it spans fitting, prediction and all the reporting below.
start = time.time()

# Fit Categorical Naive Bayes; min_categories is given the per-feature category
# counts computed in n_categories.
cnb = CategoricalNB(min_categories=n_categories).fit(X_train, y_train)

# Hard predictions and class probabilities for the hold-out rows.
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# how did our model perform?
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# All three scores use average='micro'; in the recorded output they coincide
# with the accuracy value.
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds since `start`

CategoricalNB
Misclassified samples: 36
Accuracy: 0.79
Recall score :  0.791907514450867
Precision score :  0.791907514450867
F1 score :  0.791907514450867
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.89      0.83        95
           1       0.84      0.67      0.74        78

    accuracy                           0.79       173
   macro avg       0.80      0.78      0.78       173
weighted avg       0.80      0.79      0.79       173

Computation time:
0.016885757446289062


In [71]:
# Decompose the 0-1 loss of the Categorical Naive Bayes model (`cnb`) into
# bias and variance with mlxtend, and print the plain hold-out 0-1 loss of the
# earlier predictions (`y_pred_cnb`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.220
Average bias: 0.202
Average variance: 0.068
Sklearn 0-1 loss: 0.208


# 2. Decision Tree ID3

## 2.1 6 Intervals from ChiMerge

In [72]:
# Train and evaluate an ID3 decision tree on the 6-interval data.
# make splits: predictors vs. 'label', 25% hold-out, fixed seed
X = aus6.drop('label', axis=1)
y = aus6['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# time recording (covers fitting, export, prediction and reporting)
import time
start = time.time()
# build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# Export the fitted tree to Graphviz. The third argument is the list of feature
# names used to label the split nodes, so the predictor column names are
# passed (previously the label Series `y` was passed here by mistake).
# NOTE: every ID3 section writes to the same 'tree.dot', so the file only
# holds the most recently exported tree.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# make predictions
y_pred_id3 = estimator.predict(X_test)
# report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# stop time recording
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.72
Recall score :  0.7225433526011561
Precision score :  0.7225433526011561
F1 score :  0.7225433526011561
Classification report:
              precision    recall  f1-score   support

           0       0.71      0.84      0.77        95
           1       0.75      0.58      0.65        78

    accuracy                           0.72       173
   macro avg       0.73      0.71      0.71       173
weighted avg       0.73      0.72      0.72       173

Computation time:
0.03800511360168457


In [73]:
# Decompose the 0-1 loss of the ID3 tree (`estimator`) into bias and variance
# with mlxtend, and print the plain hold-out 0-1 loss of the earlier
# predictions (`y_pred_id3`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.310
Average bias: 0.289
Average variance: 0.105
Sklearn 0-1 loss: 0.277


## 2.2 8 Intervals from ChiMerge

In [74]:
# Train and evaluate an ID3 decision tree on the 8-interval data.
# make splits: predictors vs. 'label', 25% hold-out, fixed seed
X = aus8.drop('label', axis=1)
y = aus8['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# time recording (covers fitting, export, prediction and reporting)
import time
start = time.time()
# build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# Export the fitted tree to Graphviz. The third argument is the list of feature
# names used to label the split nodes, so the predictor column names are
# passed (previously the label Series `y` was passed here by mistake).
# NOTE: every ID3 section writes to the same 'tree.dot', so the file only
# holds the most recently exported tree.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# make predictions
y_pred_id3 = estimator.predict(X_test)
# report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# stop time recording
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.73
Recall score :  0.7341040462427746
Precision score :  0.7341040462427746
F1 score :  0.7341040462427746
Classification report:
              precision    recall  f1-score   support

           0       0.71      0.86      0.78        95
           1       0.78      0.58      0.66        78

    accuracy                           0.73       173
   macro avg       0.74      0.72      0.72       173
weighted avg       0.74      0.73      0.73       173

Computation time:
0.02316451072692871


In [75]:
# Decompose the 0-1 loss of the ID3 tree (`estimator`) into bias and variance
# with mlxtend, and print the plain hold-out 0-1 loss of the earlier
# predictions (`y_pred_id3`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.309
Average bias: 0.295
Average variance: 0.119
Sklearn 0-1 loss: 0.266


## 2.3 10 Intervals from ChiMerge

In [76]:
# Train and evaluate an ID3 decision tree on the 10-interval data.
# make splits: predictors vs. 'label', 25% hold-out, fixed seed
X = aus10.drop('label', axis=1)
y = aus10['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# time recording (covers fitting, export, prediction and reporting)
import time
start = time.time()
# build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# Export the fitted tree to Graphviz. The third argument is the list of feature
# names used to label the split nodes, so the predictor column names are
# passed (previously the label Series `y` was passed here by mistake).
# NOTE: every ID3 section writes to the same 'tree.dot', so the file only
# holds the most recently exported tree.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# make predictions
y_pred_id3 = estimator.predict(X_test)
# report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# stop time recording
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.72
Recall score :  0.7225433526011561
Precision score :  0.7225433526011561
F1 score :  0.7225433526011561
Classification report:
              precision    recall  f1-score   support

           0       0.71      0.84      0.77        95
           1       0.75      0.58      0.65        78

    accuracy                           0.72       173
   macro avg       0.73      0.71      0.71       173
weighted avg       0.73      0.72      0.72       173

Computation time:
0.04064154624938965


In [77]:
# Decompose the 0-1 loss of the ID3 tree (`estimator`) into bias and variance
# with mlxtend, and print the plain hold-out 0-1 loss of the earlier
# predictions (`y_pred_id3`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.311
Average bias: 0.301
Average variance: 0.119
Sklearn 0-1 loss: 0.277


## 2.4 15 Intervals from ChiMerge

In [78]:
# Train and evaluate an ID3 decision tree on the 15-interval data.
# make splits: predictors vs. 'label', 25% hold-out, fixed seed
X = aus15.drop('label', axis=1)
y = aus15['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# time recording (covers fitting, export, prediction and reporting)
import time
start = time.time()
# build estimator
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# Export the fitted tree to Graphviz. The third argument is the list of feature
# names used to label the split nodes, so the predictor column names are
# passed (previously the label Series `y` was passed here by mistake).
# NOTE: every ID3 section writes to the same 'tree.dot', so the file only
# holds the most recently exported tree.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# make predictions
y_pred_id3 = estimator.predict(X_test)
# report performance
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# stop time recording
end = time.time()
print("Computation time:")
print(end - start) # Total time execution for this sample

Accuracy: 0.69
Recall score :  0.6936416184971098
Precision score :  0.6936416184971098
F1 score :  0.6936416184971098
Classification report:
              precision    recall  f1-score   support

           0       0.70      0.78      0.74        95
           1       0.69      0.59      0.63        78

    accuracy                           0.69       173
   macro avg       0.69      0.68      0.69       173
weighted avg       0.69      0.69      0.69       173

Computation time:
0.049723148345947266


In [79]:
# Decompose the 0-1 loss of the ID3 tree (`estimator`) into bias and variance
# with mlxtend, and print the plain hold-out 0-1 loss of the earlier
# predictions (`y_pred_id3`) next to it for comparison.
#
# Only the two names this cell needs are imported here. The previous version
# also imported `load_boston`, which no longer exists in scikit-learn >= 1.2
# and made the whole cell fail with ImportError, plus several unused modules.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# The pandas objects are handed over as NumPy arrays via `.values`;
# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.318
Average bias: 0.301
Average variance: 0.128
Sklearn 0-1 loss: 0.306


# 3. KNN classification

## 3.1 6 Intervals from ChiMerge

In [102]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus6.drop(columns='label')
y = aus6['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [103]:
# Knn-VDM complete code
# Figures noted from an earlier run:
#   Time for training model Knn-VDM: 16.900068044662476.
#   Accuracy: 0.71

import time

# Wall-clock timer; it spans metric construction, fitting, prediction and the report.
start = time.time()

# Build the value-difference metric from the training split. continuous=None:
# no continuous column indices are given.
vdm = ValueDifferenceMetric(X_train, y_train, continuous=None)
vdm.fit()

# 3-nearest-neighbour classifier using the VDM distance function, with
# brute-force neighbour search.
knn_vdm = KNeighborsClassifier(
    n_neighbors=3,
    metric=vdm.get_distance,
    algorithm='brute',
).fit(X_train, y_train)

# Predict the hold-out rows and print the per-class report.
y_pred_knn = knn_vdm.predict(X_test)
knn_vdm.classes_  # bare expression; not displayed because it is not the cell's last statement
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.72      0.76      0.74        95
           1       0.68      0.64      0.66        78

    accuracy                           0.71       173
   macro avg       0.70      0.70      0.70       173
weighted avg       0.70      0.71      0.70       173

Time for training model Knn-VDM: 18.20798110961914.


In [104]:
# Display the training features as a raw NumPy array.
X_train.values

array([[2, 3, 0, 0, 3, 0],
       [2, 5, 5, 3, 0, 5],
       [2, 2, 3, 2, 2, 5],
       ...,
       [0, 5, 1, 0, 1, 0],
       [0, 5, 0, 0, 3, 5],
       [2, 0, 3, 0, 1, 2]], dtype=int64)

In [105]:
# # !!!! WARNING: TIME CONSUMING (the recorded run took roughly 3900 seconds)
# Decompose the 0-1 loss of the KNN-VDM model (`knn_vdm`) into bias and
# variance with mlxtend, and print the plain hold-out 0-1 loss of the earlier
# predictions (`y_pred_knn`) next to it for comparison.
import time
# Imported here so the cell does not depend on the bias/variance cells of the
# Naive Bayes / ID3 sections having been run first.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

start = time.time() # For measuring time execution

# Bias and variance decomposition
# Convert the split to NumPy arrays under new names instead of overwriting
# X_train / X_test / y_train / y_test: the cell can then be re-run safely
# (np.asarray accepts both pandas objects and arrays) and the original
# DataFrames/Series stay available.
X_train_arr = np.asarray(X_train)
X_test_arr = np.asarray(X_test)
y_train_arr = np.asarray(y_train)
y_test_arr = np.asarray(y_test)

# random_seed is fixed so repeated runs give the same figures.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, X_train_arr, y_train_arr, X_test_arr, y_test_arr,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test_arr,y_pred_knn))

end = time.time()
print(f'Computing time: {end - start}.') # Total time execution

Average expected loss: 0.297
Average bias: 0.295
Average variance: 0.136
Sklearn 0-1 loss: 0.295
Computing time: 3911.5256078243256.


## 3.2 KNN with ChiMerge 8 intervals

In [85]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus8.drop(columns='label')
y = aus8['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [86]:
# Knn-VDM complete code (8-interval data)
# NOTE(review): in the recorded run this cell stopped with `KeyError: 1.0`.
# Presumably the VDM lookup built from X_train has no entry for an interval
# code that only occurs in X_test (the frequency tables show codes that occur
# only once in the 8-interval data) — TODO confirm against the vdm3 source.
import time
start = time.time() # For measuring time execution

# Build the value-difference metric from the training split; continuous=None
# means no continuous column indices are given.
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()
# KNN model: n_neighbors = 3, distance metric = VDM, brute-force search
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(X_train, y_train)
# Predict the hold-out rows and print the per-class report
y_pred_knn = knn_vdm.predict(X_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

KeyError: 1.0

In [87]:
# # !!!! WARNING: TIME CONSUMING

# # Bias and variance decomposition
# # Convert all dataframe to array first
# X_train = X_train.values
# X_test = X_test.values
# y_train = y_train.values
# y_test = y_test.values

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
#         knn_vdm, X_train, y_train, X_test, y_test, 
#         loss='0-1_loss',
#         random_seed=123)

# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## 3.3 KNN with ChiMerge 10 intervals

In [88]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus10.drop(columns='label')
y = aus10['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [89]:
# Knn-VDM complete code (10-interval data)
# NOTE(review): in the recorded run this cell stopped with `KeyError: 8.0`.
# Presumably the VDM lookup built from X_train has no entry for an interval
# code that only occurs in X_test (the frequency tables show codes that occur
# only once or twice in the 10-interval data) — TODO confirm against the vdm3 source.
import time
start = time.time() # For measuring time execution

# Build the value-difference metric from the training split; continuous=None
# means no continuous column indices are given.
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()
# KNN model: n_neighbors = 3, distance metric = VDM, brute-force search
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(X_train, y_train)
# Predict the hold-out rows and print the per-class report
y_pred_knn = knn_vdm.predict(X_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

KeyError: 8.0

In [90]:
# # !!!! WARNING: TIME CONSUMING

# # Bias and variance decomposition
# # Convert all dataframe to array first
# X_train = X_train.values
# X_test = X_test.values
# y_train = y_train.values
# y_test = y_test.values

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
#         knn_vdm, X_train, y_train, X_test, y_test, 
#         loss='0-1_loss',
#         random_seed=123)

# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## 3.4 KNN with ChiMerge 15 intervals

In [91]:
# Separate the predictors from the 'label' target, then hold out 25% of the
# rows for testing; random_state is fixed so the split is reproducible.
X = aus15.drop(columns='label')
y = aus15['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [92]:
# Knn-VDM complete code (15-interval data)
# NOTE(review): in the recorded run this cell stopped with `KeyError: 13.0`.
# Presumably the VDM lookup built from X_train has no entry for an interval
# code that only occurs in X_test (the frequency tables show codes that occur
# only once or twice in the 15-interval data) — TODO confirm against the vdm3 source.
import time
start = time.time() # For measuring time execution

# Build the value-difference metric from the training split; continuous=None
# means no continuous column indices are given.
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()
# KNN model: n_neighbors = 3, distance metric = VDM, brute-force search
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(X_train, y_train)
# Predict the hold-out rows and print the per-class report
y_pred_knn = knn_vdm.predict(X_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

KeyError: 13.0

In [93]:
# # !!!! WARNING: TIME CONSUMING

# # Bias and variance decomposition
# # Convert all dataframe to array first
# X_train = X_train.values
# X_test = X_test.values
# y_train = y_train.values
# y_test = y_test.values

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
#         knn_vdm, X_train, y_train, X_test, y_test, 
#         loss='0-1_loss',
#         random_seed=123)

# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))