# Classification models - Supervised Discretization
## Dataset: australia <br>

Updated by: Sam <br>
Updated at: 13/10/2022 <br>
---
Discretization methods: Decision Tree <br>
Classification models: CNB, ID3, KNN-VDM
---
NOTE: 
KNN-VDM takes a long time to compute (run that part last) <br>
Use Malina scripts for Knn, ID3 <br>
Use Sam scripts (with min_categories) for Naive Bayes <br>


In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import six
import sys
sys.modules['sklearn.externals.six'] = six
import id3
from id3 import Id3Estimator
from id3 import export_graphviz
import wittgenstein as lw
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Read the four decision-tree-discretized copies of the dataset.
# The mN suffix of each frame is the max_depth noted next to its file.
aus_m2, aus_m3, aus_m4, aus_m5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'DT_small_discretized_aus.csv',      # max_depth = 2
        'DT_medium_discretized_aus.csv',     # max_depth = 3
        'DT_large_discretized_aus.csv',      # max_depth = 4
        'DT_verylarge_discretized_aus.csv',  # max_depth = 5
    )
)

## Interval frequencies

In [3]:
# Attributes whose interval frequencies are inspected below. The same six
# columns are used for every depth; each list is an independent copy.
num_list_m2 = ["A2", "A3", "A7", "A10", "A13", "A14"]
num_list_m3 = list(num_list_m2)
num_list_m4 = list(num_list_m2)
num_list_m5 = list(num_list_m2)

In [4]:
# Interval-code frequencies per attribute for the max_depth = 2 frame.
# The header previously said "6 Intervals", but the saved output of this cell
# shows four interval codes (0-3) for every listed attribute.
print('Interval frequency for 4 Intervals')
for col in num_list_m2:
    print('Interval for ' + col)
    print(Counter(aus_m2[col]))

Interval frequency for 6 Intervals
Interval for A2
Counter({1: 459, 2: 141, 0: 53, 3: 37})
Interval for A3
Counter({0: 396, 2: 236, 1: 46, 3: 12})
Interval for A7
Counter({2: 301, 0: 296, 1: 77, 3: 16})
Interval for A10
Counter({0: 395, 3: 136, 1: 116, 2: 43})
Interval for A13
Counter({0: 382, 1: 146, 2: 136, 3: 26})
Interval for A14
Counter({0: 455, 2: 150, 1: 62, 3: 23})


In [5]:
# Interval-code frequencies per attribute for the max_depth = 3 frame:
# one name line followed by one Counter line per attribute.
print('Frequency for aus with 8 Intervals')
for col in num_list_m3:
    print('Interval for ' + col, Counter(aus_m3[col]), sep='\n')

Frequency for aus with 8 Intervals
Interval for A2
Counter({5: 303, 4: 156, 6: 114, 2: 39, 7: 30, 3: 27, 0: 14, 1: 7})
Interval for A3
Counter({1: 391, 5: 197, 4: 43, 2: 39, 6: 13, 0: 5, 3: 2})
Interval for A7
Counter({3: 198, 0: 193, 1: 103, 4: 103, 2: 75, 5: 18})
Interval for A10
Counter({0: 395, 6: 77, 2: 71, 5: 59, 1: 45, 3: 28, 4: 15})
Interval for A13
Counter({1: 301, 5: 132, 4: 130, 2: 81, 7: 17, 0: 16, 6: 9, 3: 4})
Interval for A14
Counter({2: 295, 0: 160, 5: 133, 3: 50, 6: 23, 4: 17, 1: 12})


In [6]:
# Interval-code frequencies per attribute for the max_depth = 4 frame.
# The header previously said "10 Intervals", but the saved output of this cell
# shows up to 12 interval codes (A2 uses codes 0-11).
print('Frequency for aus with up to 12 Intervals')
for col in num_list_m4:
    print('Interval for ' + col)
    print(Counter(aus_m4[col]))

Frequency for aus with 10 Intervals
Interval for A2
Counter({5: 244, 2: 105, 7: 88, 8: 59, 4: 51, 1: 36, 10: 26, 9: 24, 3: 23, 0: 21, 11: 9, 6: 4})
Interval for A3
Counter({1: 368, 6: 195, 2: 35, 3: 26, 5: 23, 4: 17, 7: 16, 0: 10})
Interval for A7
Counter({1: 181, 7: 145, 8: 86, 3: 71, 2: 70, 6: 53, 4: 33, 9: 18, 5: 17, 0: 16})
Interval for A10
Counter({0: 395, 3: 71, 6: 57, 7: 51, 1: 45, 4: 28, 8: 20, 5: 15, 2: 8})
Interval for A13
Counter({3: 272, 7: 132, 4: 118, 5: 80, 1: 29, 10: 18, 2: 12, 9: 11, 8: 9, 0: 5, 6: 4})
Interval for A14
Counter({3: 295, 2: 103, 7: 84, 1: 57, 9: 49, 5: 32, 10: 23, 8: 18, 6: 16, 0: 8, 4: 5})


In [7]:
# Interval-code frequencies per attribute for the max_depth = 5 frame:
# one name line followed by one Counter line per attribute.
print('Frequency for aus with 15 Intervals')
for col in num_list_m5:
    print('Interval for ' + col, Counter(aus_m5[col]), sep='\n')

Frequency for aus with 15 Intervals
Interval for A2
Counter({4: 171, 8: 111, 2: 99, 7: 83, 5: 47, 0: 31, 1: 25, 13: 23, 3: 22, 12: 21, 11: 21, 9: 21, 6: 11, 10: 4})
Interval for A3
Counter({3: 198, 2: 170, 8: 117, 6: 78, 4: 33, 5: 25, 7: 24, 10: 18, 0: 11, 1: 11, 9: 5})
Interval for A7
Counter({9: 140, 1: 119, 3: 70, 11: 68, 5: 64, 2: 62, 8: 37, 7: 33, 14: 23, 13: 18, 0: 16, 4: 16, 6: 13, 10: 7, 12: 4})
Interval for A10
Counter({0: 395, 3: 71, 9: 46, 1: 45, 8: 39, 5: 28, 10: 20, 7: 18, 6: 15, 2: 8, 4: 5})
Interval for A13
Counter({3: 154, 8: 132, 5: 118, 6: 115, 7: 76, 11: 29, 1: 23, 2: 11, 0: 11, 9: 9, 4: 8, 10: 4})
Interval for A14
Counter({4: 295, 2: 89, 10: 60, 13: 44, 1: 44, 3: 28, 11: 28, 7: 24, 0: 24, 8: 18, 6: 14, 12: 14, 9: 4, 5: 4})


In [8]:
# Show column dtypes and non-null counts for the max_depth = 2 frame.
aus_m2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      690 non-null    int64
 1   A2      690 non-null    int64
 2   A3      690 non-null    int64
 3   A4      690 non-null    int64
 4   A5      690 non-null    int64
 5   A6      690 non-null    int64
 6   A7      690 non-null    int64
 7   A8      690 non-null    int64
 8   A9      690 non-null    int64
 9   A10     690 non-null    int64
 10  A11     690 non-null    int64
 11  A12     690 non-null    int64
 12  A13     690 non-null    int64
 13  A14     690 non-null    int64
 14  label   690 non-null    int64
dtypes: int64(15)
memory usage: 81.0 KB


# 1. Categorical Naive Bayes

## 1.1 Max_depth = 2

## 1.1 script version

In [9]:
# Separate the class label from the attributes (max_depth = 2 data), then hold
# out 25% of the rows for testing with a fixed seed.
y = aus_m2['label']
X = aus_m2.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [10]:
# Number of category slots per feature, passed to CategoricalNB(min_categories=...).
# CategoricalNB indexes its per-feature count tables by the integer code
# itself, so each feature needs (largest code + 1) slots. nunique() gives a
# smaller number whenever the codes are not exactly 0..k-1 (for example
# 1-based codes or codes with gaps), and predict() can then fail on a code
# that is missing from the training split.
n_categories = aus_m2.drop('label', axis=1).max() + 1

In [11]:
import time
start = time.time()  # wall-clock start; the timed span covers fit, predict and reporting

# Fit Categorical Naive Bayes on the training split and predict the test split.
cnb = CategoricalNB(min_categories = n_categories).fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# Summary numbers for the report below.
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
accuracy = accuracy_score(y_test, y_pred_cnb)

print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# NOTE(review): with average='micro' on a single-label problem these three
# scores all equal the accuracy (see the saved output).
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds for this cell's timed span

CategoricalNB
Misclassified samples: 28
Accuracy: 0.84
Recall score :  0.838150289017341
Precision score :  0.838150289017341
F1 score :  0.8381502890173411
Classification report:
              precision    recall  f1-score   support

           0       0.78      0.92      0.84        83
           1       0.91      0.77      0.83        90

    accuracy                           0.84       173
   macro avg       0.85      0.84      0.84       173
weighted avg       0.85      0.84      0.84       173

Computation time:
0.012994527816772461


In [12]:
# Bias/variance decomposition of the 0-1 loss for the CategoricalNB model.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.162
Average bias: 0.162
Average variance: 0.023
Sklearn 0-1 loss: 0.162


## 1.2 Max_depth = 3

In [13]:
# Separate the class label from the attributes (max_depth = 3 data), then hold
# out 25% of the rows for testing with a fixed seed.
y = aus_m3['label']
X = aus_m3.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [14]:
# Number of category slots per feature, passed to CategoricalNB(min_categories=...).
# CategoricalNB indexes its per-feature count tables by the integer code
# itself, so each feature needs (largest code + 1) slots. nunique() gives a
# smaller number whenever the codes are not exactly 0..k-1 (for example
# 1-based codes or codes with gaps), and predict() can then fail on a code
# that is missing from the training split.
n_categories = aus_m3.drop('label', axis=1).max() + 1

In [15]:
import time
start = time.time()  # wall-clock start; the timed span covers fit, predict and reporting

# Fit Categorical Naive Bayes on the training split and predict the test split.
cnb = CategoricalNB(min_categories = n_categories).fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# Summary numbers for the report below.
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
accuracy = accuracy_score(y_test, y_pred_cnb)

print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# NOTE(review): with average='micro' on a single-label problem these three
# scores all equal the accuracy (see the saved output).
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds for this cell's timed span

CategoricalNB
Misclassified samples: 26
Accuracy: 0.85
Recall score :  0.8497109826589595
Precision score :  0.8497109826589595
F1 score :  0.8497109826589595
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.93      0.86        83
           1       0.92      0.78      0.84        90

    accuracy                           0.85       173
   macro avg       0.86      0.85      0.85       173
weighted avg       0.86      0.85      0.85       173

Computation time:
0.013386964797973633


In [16]:
# Bias/variance decomposition of the 0-1 loss for the CategoricalNB model.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.152
Average bias: 0.150
Average variance: 0.031
Sklearn 0-1 loss: 0.150


## 1.3 Max_depth = 4

In [17]:
# Separate the class label from the attributes (max_depth = 4 data), then hold
# out 25% of the rows for testing with a fixed seed.
y = aus_m4['label']
X = aus_m4.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [18]:
# Number of category slots per feature, passed to CategoricalNB(min_categories=...).
# CategoricalNB indexes its per-feature count tables by the integer code
# itself, so each feature needs (largest code + 1) slots. nunique() gives a
# smaller number whenever the codes are not exactly 0..k-1 (for example
# 1-based codes or codes with gaps), and predict() can then fail on a code
# that is missing from the training split.
n_categories = aus_m4.drop('label', axis=1).max() + 1

In [19]:
import time
start = time.time()  # wall-clock start; the timed span covers fit, predict and reporting

# Fit Categorical Naive Bayes on the training split and predict the test split.
cnb = CategoricalNB(min_categories = n_categories).fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# Summary numbers for the report below.
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
accuracy = accuracy_score(y_test, y_pred_cnb)

print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# NOTE(review): with average='micro' on a single-label problem these three
# scores all equal the accuracy (see the saved output).
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds for this cell's timed span

CategoricalNB
Misclassified samples: 26
Accuracy: 0.85
Recall score :  0.8497109826589595
Precision score :  0.8497109826589595
F1 score :  0.8497109826589595
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        83
           1       0.93      0.77      0.84        90

    accuracy                           0.85       173
   macro avg       0.86      0.85      0.85       173
weighted avg       0.86      0.85      0.85       173

Computation time:
0.012998342514038086


In [20]:
# Bias/variance decomposition of the 0-1 loss for the CategoricalNB model.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.153
Average bias: 0.156
Average variance: 0.030
Sklearn 0-1 loss: 0.150


## 1.4 Max_depth = 5

In [21]:
# Separate the class label from the attributes (max_depth = 5 data), then hold
# out 25% of the rows for testing with a fixed seed.
y = aus_m5['label']
X = aus_m5.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [22]:
# SAM
# Number of category slots per feature, passed to CategoricalNB(min_categories=...).
# CategoricalNB indexes its per-feature count tables by the integer code
# itself, so each feature needs (largest code + 1) slots. nunique() gives a
# smaller number whenever the codes are not exactly 0..k-1 (for example
# 1-based codes or codes with gaps), and predict() can then fail on a code
# that is missing from the training split.
n_categories = aus_m5.drop('label', axis=1).max() + 1

In [23]:
import time
start = time.time()  # wall-clock start; the timed span covers fit, predict and reporting

# Fit Categorical Naive Bayes on the training split and predict the test split.
cnb = CategoricalNB(min_categories = n_categories).fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)
y_prob_pred_cnb = cnb.predict_proba(X_test)

# Summary numbers for the report below.
separator = "=" * 25
count_misclassified = (y_test != y_pred_cnb).sum()
accuracy = accuracy_score(y_test, y_pred_cnb)

print("CategoricalNB")
print(separator)
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))
print(separator)
# NOTE(review): with average='micro' on a single-label problem these three
# scores all equal the accuracy (see the saved output).
for caption, scorer in (
    ("Recall score : ", recall_score),
    ("Precision score : ", precision_score),
    ("F1 score : ", f1_score),
):
    print(caption, scorer(y_test, y_pred_cnb, average='micro'))
print(separator)
print("Classification report:")
print(classification_report(y_test, y_pred_cnb))

end = time.time()
print("Computation time:")
print(end - start)  # elapsed seconds for this cell's timed span

CategoricalNB
Misclassified samples: 25
Accuracy: 0.86
Recall score :  0.8554913294797688
Precision score :  0.8554913294797688
F1 score :  0.8554913294797687
Classification report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86        83
           1       0.92      0.79      0.85        90

    accuracy                           0.86       173
   macro avg       0.86      0.86      0.86       173
weighted avg       0.86      0.86      0.86       173

Computation time:
0.017255544662475586


In [24]:
# Bias/variance decomposition of the 0-1 loss for the CategoricalNB model.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        cnb, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_cnb))

Average expected loss: 0.152
Average bias: 0.145
Average variance: 0.035
Sklearn 0-1 loss: 0.145


# 2. Decision Tree ID3

## 2.1 Max_depth = 2

In [25]:
# Train/test split on the max_depth = 2 data (25% held out, fixed seed).
X = aus_m2.drop('label', axis=1)
y = aus_m2['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# Start timing; the timed span covers fit, export, predict and reporting.
import time
start = time.time()
# Build and fit the ID3 estimator.
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# The third positional argument of id3's export_graphviz is the list of
# feature names. The label Series `y` was passed here before, which labelled
# tree nodes with class values; pass the attribute column names instead.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# Predict the held-out rows.
y_pred_id3 = estimator.predict(X_test)
# Report performance.
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# Stop timing.
end = time.time()
print("Computation time:")
print(end - start) # elapsed seconds for this cell's timed span

Accuracy: 0.83
Recall score :  0.8323699421965318
Precision score :  0.8323699421965318
F1 score :  0.8323699421965318
Classification report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        83
           1       0.86      0.81      0.83        90

    accuracy                           0.83       173
   macro avg       0.83      0.83      0.83       173
weighted avg       0.83      0.83      0.83       173

Computation time:
0.08155369758605957


In [26]:
# Bias/variance decomposition of the 0-1 loss for the ID3 estimator.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.208
Average bias: 0.145
Average variance: 0.129
Sklearn 0-1 loss: 0.168


## 2.2 Max_depth = 3

In [27]:
# Train/test split on the max_depth = 3 data (25% held out, fixed seed).
X = aus_m3.drop('label', axis=1)
y = aus_m3['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# Start timing; the timed span covers fit, export, predict and reporting.
import time
start = time.time()
# Build and fit the ID3 estimator.
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# The third positional argument of id3's export_graphviz is the list of
# feature names. The label Series `y` was passed here before, which labelled
# tree nodes with class values; pass the attribute column names instead.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# Predict the held-out rows.
y_pred_id3 = estimator.predict(X_test)
# Report performance.
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# Stop timing.
end = time.time()
print("Computation time:")
print(end - start) # elapsed seconds for this cell's timed span

Accuracy: 0.83
Recall score :  0.8265895953757225
Precision score :  0.8265895953757225
F1 score :  0.8265895953757225
Classification report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82        83
           1       0.84      0.82      0.83        90

    accuracy                           0.83       173
   macro avg       0.83      0.83      0.83       173
weighted avg       0.83      0.83      0.83       173

Computation time:
0.09459757804870605


In [28]:
# Bias/variance decomposition of the 0-1 loss for the ID3 estimator.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.204
Average bias: 0.145
Average variance: 0.128
Sklearn 0-1 loss: 0.173


## 2.3 Max_depth = 4

In [29]:
# Train/test split on the max_depth = 4 data (25% held out, fixed seed).
X = aus_m4.drop('label', axis=1)
y = aus_m4['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# Start timing; the timed span covers fit, export, predict and reporting.
import time
start = time.time()
# Build and fit the ID3 estimator.
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# The third positional argument of id3's export_graphviz is the list of
# feature names. The label Series `y` was passed here before, which labelled
# tree nodes with class values; pass the attribute column names instead.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# Predict the held-out rows.
y_pred_id3 = estimator.predict(X_test)
# Report performance.
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# Stop timing.
end = time.time()
print("Computation time:")
print(end - start) # elapsed seconds for this cell's timed span

Accuracy: 0.80
Recall score :  0.8034682080924855
Precision score :  0.8034682080924855
F1 score :  0.8034682080924856
Classification report:
              precision    recall  f1-score   support

           0       0.78      0.83      0.80        83
           1       0.83      0.78      0.80        90

    accuracy                           0.80       173
   macro avg       0.80      0.80      0.80       173
weighted avg       0.81      0.80      0.80       173

Computation time:
0.0886995792388916


In [30]:
# Bias/variance decomposition of the 0-1 loss for the ID3 estimator.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.200
Average bias: 0.168
Average variance: 0.126
Sklearn 0-1 loss: 0.197


## 2.4 Max_depth = 5

In [31]:
# Train/test split on the max_depth = 5 data (25% held out, fixed seed).
X = aus_m5.drop('label', axis=1)
y = aus_m5['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
# Start timing; the timed span covers fit, export, predict and reporting.
import time
start = time.time()
# Build and fit the ID3 estimator.
estimator = Id3Estimator()
estimator = estimator.fit(X_train, y_train, check_input=True)
# The third positional argument of id3's export_graphviz is the list of
# feature names. The label Series `y` was passed here before, which labelled
# tree nodes with class values; pass the attribute column names instead.
tree = export_graphviz(estimator.tree_, 'tree.dot', list(X.columns))
# Predict the held-out rows.
y_pred_id3 = estimator.predict(X_test)
# Report performance.
accuracy = accuracy_score(y_test, y_pred_id3)
print('Accuracy: {:.2f}'.format(accuracy))
print("=" * 25)
print("Recall score : ", recall_score(y_test, y_pred_id3 , average='micro'))
print("Precision score : ",precision_score(y_test, y_pred_id3 , average='micro'))
print("F1 score : ",f1_score(y_test, y_pred_id3 , average='micro'))
print("=" * 25)
print("Classification report:")
print(classification_report(y_test, y_pred_id3))
# Stop timing.
end = time.time()
print("Computation time:")
print(end - start) # elapsed seconds for this cell's timed span

Accuracy: 0.79
Recall score :  0.791907514450867
Precision score :  0.791907514450867
F1 score :  0.791907514450867
Classification report:
              precision    recall  f1-score   support

           0       0.76      0.82      0.79        83
           1       0.82      0.77      0.79        90

    accuracy                           0.79       173
   macro avg       0.79      0.79      0.79       173
weighted avg       0.79      0.79      0.79       173

Computation time:
0.1000971794128418


In [32]:
# Bias/variance decomposition of the 0-1 loss for the ID3 estimator.
# The unused `from sklearn.datasets import load_boston` was removed: it raises
# ImportError on scikit-learn >= 1.2, where the Boston dataset no longer exists.
# Re-imports of names already imported in the first cell were dropped as well.
from sklearn.metrics import zero_one_loss
# mlxtend helper that decomposes the loss into bias and variance
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# bias_variance_decomp is given plain arrays, hence the .values conversions.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, X_train.values, y_train.values, X_test.values, y_test.values,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.189
Average bias: 0.145
Average variance: 0.124
Sklearn 0-1 loss: 0.208


# 3. KNN classification

## 3.1 KNN with DT max_depth = 2

In [45]:
# Separate the class label from the attributes (max_depth = 2 data), then hold
# out 25% of the rows for testing with a fixed seed.
y = aus_m2['label']
X = aus_m2.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [46]:
# Knn-VDM complete code
# Earlier runs: accuracy 0.84 on 173 test rows;
# reported times 38.516900062561035 and 37.39223265647888.
import time
start = time.time()  # wall-clock start; the timed span covers fit, predict and report

# Value Difference Metric built from the training split; no continuous
# column indices are given (continuous=None).
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()

# 3-nearest-neighbour classifier using the VDM distance with brute-force search.
knn_vdm = KNeighborsClassifier(
    n_neighbors=3,
    metric=vdm.get_distance,
    algorithm='brute',
).fit(X_train, y_train)

# Predict the held-out rows and report.
y_pred_knn = knn_vdm.predict(X_test)
knn_vdm.classes_  # bare attribute access; not displayed because it is not the last expression
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.82      0.87      0.84        83
           1       0.87      0.82      0.85        90

    accuracy                           0.84       173
   macro avg       0.84      0.84      0.84       173
weighted avg       0.85      0.84      0.84       173

Time for training model Knn-VDM: 37.39223265647888.


In [47]:
# !!!! WARNING: TIME CONSUMING

# Bias and variance decomposition for the KNN-VDM model.
# Imported here so the cell does not depend on an earlier bias/variance cell
# having been run first.
from sklearn.metrics import zero_one_loss
from mlxtend.evaluate import bias_variance_decomp

# Convert the split to plain arrays. np.asarray accepts both pandas objects
# and ndarrays, so this cell can be re-run safely; the previous
# `X_train = X_train.values` raised AttributeError on a second run because
# the names had already been rebound to ndarrays.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, X_train, y_train, X_test, y_test,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Reference value: 0-1 loss of the predictions made in the previous cell.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.189
Average bias: 0.173
Average variance: 0.090
Sklearn 0-1 loss: 0.156


## 3.2 KNN with DT max_depth = 3

In [48]:
# Separate the class label from the attributes (max_depth = 3 data), then hold
# out 25% of the rows for testing with a fixed seed.
y = aus_m3['label']
X = aus_m3.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)


In [49]:
# Knn-VDM complete code
# Earlier runs: accuracy 0.83 on 173 test rows;
# reported times 38.24225306510925 and 39.29734921455383.
import time
start = time.time()  # wall-clock start; the timed span covers fit, predict and report

# Value Difference Metric built from the training split; no continuous
# column indices are given (continuous=None).
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()

# 3-nearest-neighbour classifier using the VDM distance with brute-force search.
knn_vdm = KNeighborsClassifier(
    n_neighbors=3,
    metric=vdm.get_distance,
    algorithm='brute',
).fit(X_train, y_train)

# Predict the held-out rows and report.
y_pred_knn = knn_vdm.predict(X_test)
knn_vdm.classes_  # bare attribute access; not displayed because it is not the last expression
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.79      0.87      0.83        83
           1       0.87      0.79      0.83        90

    accuracy                           0.83       173
   macro avg       0.83      0.83      0.83       173
weighted avg       0.83      0.83      0.83       173

Time for training model Knn-VDM: 39.29734921455383.


In [50]:
# !!!! WARNING: TIME CONSUMING

# Bias and variance decomposition of the fitted Knn-VDM classifier.
# Convert the pandas objects to NumPy arrays first.
# np.asarray is used instead of `.values` so this cell stays re-runnable:
# once these names hold ndarrays, `.values` would raise AttributeError on a
# second execution, whereas np.asarray passes an ndarray through unchanged.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# NOTE(review): knn_vdm measures distance with vdm.get_distance, and that vdm
# object was fitted once on the training split in the previous cell. If
# bias_variance_decomp refits the estimator on resampled data, the metric
# itself is presumably not refitted -- confirm this is intended.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, X_train, y_train, X_test, y_test,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Single-fit 0-1 loss of the predictions made in the previous cell, for comparison
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.197
Average bias: 0.179
Average variance: 0.091
Sklearn 0-1 loss: 0.173


## 3.3 KNN with DT max_depth = 4

In [51]:
# Separate the target column from the features of aus_m4, then hold out
# 25% of the rows as the test set (fixed random_state for a repeatable split).
y = aus_m4['label']
X = aus_m4.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=30,
)


In [52]:
# Knn-VDM complete code
# DONE
# Previously recorded results for this cell:
#   accuracy 0.82 (support 173)
#   Time for training model Knn-VDM: 37.76226568222046.
#   Time for training: 37.068912744522095
import time

# perf_counter is a monotonic clock meant for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
start = time.perf_counter()

# Specify the indices of the continuous columns, if any (none are declared here)
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()
# Knn model: n_neighbors = 3, distance metric = the fitted VDM distance function
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(X_train, y_train)
# Predict on the held-out test set and report per-class metrics
y_pred_knn = knn_vdm.predict(X_test)
print(classification_report(y_test, y_pred_knn))

end = time.perf_counter()
# The measured interval spans VDM fitting, model fitting, prediction and the
# report, not only the training step named in the message below.
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.78      0.86      0.82        83
           1       0.85      0.78      0.81        90

    accuracy                           0.82       173
   macro avg       0.82      0.82      0.82       173
weighted avg       0.82      0.82      0.81       173

Time for training model Knn-VDM: 37.068912744522095.


In [53]:
# !!!! WARNING: TIME CONSUMING

# Bias and variance decomposition of the fitted Knn-VDM classifier.
# Convert the pandas objects to NumPy arrays first.
# np.asarray is used instead of `.values` so this cell stays re-runnable:
# once these names hold ndarrays, `.values` would raise AttributeError on a
# second execution, whereas np.asarray passes an ndarray through unchanged.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# NOTE(review): knn_vdm measures distance with vdm.get_distance, and that vdm
# object was fitted once on the training split in the previous cell. If
# bias_variance_decomp refits the estimator on resampled data, the metric
# itself is presumably not refitted -- confirm this is intended.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, X_train, y_train, X_test, y_test,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Single-fit 0-1 loss of the predictions made in the previous cell, for comparison
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.201
Average bias: 0.179
Average variance: 0.089
Sklearn 0-1 loss: 0.185


## 3.4 KNN with DT max_depth = 5

In [54]:
# Separate the target column from the features of aus_m5, then hold out
# 25% of the rows as the test set (fixed random_state for a repeatable split).
y = aus_m5['label']
X = aus_m5.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=30,
)


In [55]:
# Knn-VDM complete code
# DONE:
# Previously recorded results for this cell:
#   Time for training model Knn-VDM: 37.23736929893494.
#   accuracy 0.83 (support 173)
#   New time for training: 41.03129029273987

import time

# perf_counter is a monotonic clock meant for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
start = time.perf_counter()

# Specify the indices of the continuous columns, if any (none are declared here)
vdm = ValueDifferenceMetric(X_train, y_train, continuous = None)
vdm.fit()
# Knn model: n_neighbors = 3, distance metric = the fitted VDM distance function
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
# Fit model
knn_vdm.fit(X_train, y_train)
# Predict on the held-out test set and report per-class metrics
y_pred_knn = knn_vdm.predict(X_test)
print(classification_report(y_test, y_pred_knn))

end = time.perf_counter()
# The measured interval spans VDM fitting, model fitting, prediction and the
# report, not only the training step named in the message below.
print(f'Time for training model Knn-VDM: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.79      0.87      0.83        83
           1       0.87      0.79      0.83        90

    accuracy                           0.83       173
   macro avg       0.83      0.83      0.83       173
weighted avg       0.83      0.83      0.83       173

Time for training model Knn-VDM: 41.03129029273987.


In [57]:
# !!!! WARNING: TIME CONSUMING

# Bias and variance decomposition of the fitted Knn-VDM classifier.
# Convert the pandas objects to NumPy arrays first.
# np.asarray is used instead of `.values` so this cell stays re-runnable:
# once these names hold ndarrays, `.values` would raise AttributeError on a
# second execution, whereas np.asarray passes an ndarray through unchanged.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# NOTE(review): knn_vdm measures distance with vdm.get_distance, and that vdm
# object was fitted once on the training split in the previous cell. If
# bias_variance_decomp refits the estimator on resampled data, the metric
# itself is presumably not refitted -- confirm this is intended.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        knn_vdm, X_train, y_train, X_test, y_test,
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
# Single-fit 0-1 loss of the predictions made in the previous cell, for comparison
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.199
Average bias: 0.173
Average variance: 0.095
Sklearn 0-1 loss: 0.173
