# Classification models - unsupervised discretization

Dataset: pendigits (REDO: training models on the entire dataset) <br>
By: Sam <br>
Updated on: 27/04/2023 <br>

====

Summary:<br>
- Import unsupervised discretised datasets (already encoded categorical attributes)
- Datasets are discretized from BNAIC (manually ChiMerge)
- Train a KNN-Hamming classifier on datasets from 3 unsupervised discretizers: EWD (3 settings), EFD (3 settings) and FFD (4 settings)
- Evaluation on testing data: Classification report (accuracy, precision, recall, f1-score) + G-mean
- Export the trained KNN-Hamming models with skops
- Write models performance to file: 'pendigits_models.txt'.

In [1]:
import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

In [2]:
# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# For Naive Bayes
from sklearn.naive_bayes import CategoricalNB # Categorical Naive Bayes
from sklearn.naive_bayes import MultinomialNB # Multinominal Naive Bayes (suitable for NLP)
from mixed_naive_bayes import MixedNB # Mixed Naive Bayes for combination of both discrete & continuous feature

In [4]:
# For decision tree ID3 
# https://stackoverflow.com/questions/61867945/python-import-error-cannot-import-name-six-from-sklearn-externals
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from id3 import Id3Estimator # ID3 Decision Tree (https://pypi.org/project/decision-tree-id3/)
from id3 import export_graphviz

In [5]:
# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# For model evaluation
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# 1. EWD data

## 1.1 EWD, k = 4

In [8]:
# Data preparation for pendigits discretized with EWD, k = 4
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ewd1 = pd.read_csv('pendigits_ewd1.csv')
df_ewd1 = df_ewd1.drop(columns=df_ewd1.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'EWD'
k = 4

df_ewd1.info()

# Every column except the target is treated as a feature
features = df_ewd1.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ewd1.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ewd1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models - EWD, k=4

In [9]:
# KNN-Hamming on the EWD, k = 4 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and k as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'EWD'
disc_param = 'k = 4'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}', file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_EWD_4.skops


## 1.2 EWD, k = 7

### Data prep

In [10]:
# Data preparation for pendigits discretized with EWD, k = 7
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ewd2 = pd.read_csv('pendigits_ewd2.csv')
df_ewd2 = df_ewd2.drop(columns=df_ewd2.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'EWD'
k = 7

df_ewd2.info()

# Every column except the target is treated as a feature
features = df_ewd2.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ewd2.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ewd2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models - EWD, k=7

In [11]:
# KNN-Hamming on the EWD, k = 7 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and k as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'EWD'
disc_param = 'k = 7'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_EWD_7.skops


## 1.3 EWD, k = 10

### Data prep

In [12]:
# Data preparation for pendigits discretized with EWD, k = 10
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ewd3 = pd.read_csv('pendigits_ewd3.csv')
df_ewd3 = df_ewd3.drop(columns=df_ewd3.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'EWD'
k = 10

df_ewd3.info()

# Every column except the target is treated as a feature
features = df_ewd3.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ewd3.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ewd3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models - EWD, k=10

In [13]:
# KNN-Hamming on the EWD, k = 10 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and k as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'EWD'
disc_param = 'k = 10'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_EWD_10.skops


# 2. EFD datasets

## 2.1 EFD, k = 4

### Data prep

In [14]:
# Data preparation for pendigits discretized with EFD, k = 4
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_efd1 = pd.read_csv('pendigits_efd1.csv')
df_efd1 = df_efd1.drop(columns=df_efd1.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'efd'
k = 4

df_efd1.info()

# Every column except the target is treated as a feature
features = df_efd1.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_efd1.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_efd1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models - EFD, k=4

In [15]:
# KNN-Hamming on the EFD, k = 4 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and k as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'EFD'
disc_param = 'k = 4'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_EFD_4.skops


## 2.2 EFD, k = 7

### Data prep

In [16]:
# Data preparation for pendigits discretized with EFD, k = 7
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_efd2 = pd.read_csv('pendigits_efd2.csv')
df_efd2 = df_efd2.drop(columns=df_efd2.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'efd'
k = 7

df_efd2.info()

# Every column except the target is treated as a feature
features = df_efd2.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_efd2.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_efd2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models, EFD, k=7

In [17]:
# KNN-Hamming on the EFD, k = 7 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and k as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'EFD'
disc_param = 'k = 7'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_EFD_7.skops


## 2.3 EFD, k = 10

### Data prep

In [18]:
# Data preparation for pendigits discretized with EFD, k = 10
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_efd3 = pd.read_csv('pendigits_efd3.csv')
df_efd3 = df_efd3.drop(columns=df_efd3.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'efd'
k = 10

df_efd3.info()

# Every column except the target is treated as a feature
features = df_efd3.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_efd3.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_efd3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models, EFD, k=10

In [19]:
# KNN-Hamming on the EFD, k = 10 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and k as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'EFD'
disc_param = 'k = 10'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, k = {k} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{k}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_EFD_10.skops


# 3. FFD datasets

## 3.1 FFD, m = 10 (pendigits_ffd1)

### Data prep

In [20]:
# Data preparation for pendigits discretized with FFD, m = 10
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ffd1 = pd.read_csv('pendigits_ffd1.csv')
df_ffd1 = df_ffd1.drop(columns=df_ffd1.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'ffd'
m = 10

df_ffd1.info()

# Every column except the target is treated as a feature
features = df_ffd1.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ffd1.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Sizes of the two feature matrices, followed by a separator line
for part in (x_train, x_test):
    print(part.shape)
print('=================')

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ffd1[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
(8244, 15)
(2748, 15)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counte

### Models, FFD, m = 10

In [21]:
# KNN-Hamming on the FFD, m = 10 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and m as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'FFD'
disc_param = 'm = 10'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{m}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_FFD_10.skops


## 3.2 FFD, m = 30 (pendigits_ffd2)

### Data prep

In [22]:
# Data preparation for pendigits discretized with FFD, m = 30
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ffd2 = pd.read_csv('pendigits_ffd2.csv')
df_ffd2 = df_ffd2.drop(columns=df_ffd2.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'ffd'
m = 30

df_ffd2.info()

# Every column except the target is treated as a feature
features = df_ffd2.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ffd2.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ffd2[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models, FFD, m = 30

In [23]:
# KNN-Hamming on the FFD, m = 30 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and m as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'FFD'
disc_param = 'm = 30'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{m}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_FFD_30.skops


## 3.3 FFD, m = 60 (pendigits_ffd3)

### Data prep

In [24]:
# Data preparation for pendigits discretized with FFD, m = 60
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ffd3 = pd.read_csv('pendigits_ffd3.csv')
df_ffd3 = df_ffd3.drop(columns=df_ffd3.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'ffd'
m = 60

df_ffd3.info()

# Every column except the target is treated as a feature
features = df_ffd3.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ffd3.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Sizes of the two feature matrices, followed by a separator line
for part in (x_train, x_test):
    print(part.shape)
print('=================')

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ffd3[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
(8244, 15)
(2748, 15)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counte

### Models, FFD, m = 60

In [25]:
# KNN-Hamming on the FFD, m = 60 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and m as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'FFD'
disc_param = 'm = 60'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{m}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_FFD_60.skops


## 3.4 FFD, m = 100 (pendigits_ffd4)

### Data prep

In [26]:
# Data preparation for pendigits discretized with FFD, m = 100
# Load the CSV, discard its first column and call the target column 'label'.
# NOTE(review): the info() listing printed by this cell starts at A2 - confirm that the
# dropped first column is a saved row index and not a real attribute.
df_ffd4 = pd.read_csv('pendigits_ffd4.csv')
df_ffd4 = df_ffd4.drop(columns=df_ffd4.columns[0]).rename(columns={'class': 'label'})

# Discretization settings; the model cell reads these when logging and naming files
disc = 'ffd'
m = 100

df_ffd4.info()

# Every column except the target is treated as a feature
features = df_ffd4.columns.drop('label')

# Split the raw array into X (leading feature columns) and Y (last column)
data = df_ffd4.to_numpy()
X = data[:, :len(features)]
Y = data[:, -1]

print(X.shape, Y.shape)

# Stratified 75/25 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=30, stratify=Y
)

# Class counts in the full data and in each split
for caption, labels in (
    ('Class representation - original: ', Y),
    ('Class representation - training data: ', y_train),
    ('Class representation - testing data: ', y_test),
):
    print(caption, Counter(labels))

# Number of distinct categories per feature
n_categories = df_ffd4[features].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A2      10992 non-null  int64
 1   A3      10992 non-null  int64
 2   A4      10992 non-null  int64
 3   A5      10992 non-null  int64
 4   A6      10992 non-null  int64
 5   A7      10992 non-null  int64
 6   A8      10992 non-null  int64
 7   A9      10992 non-null  int64
 8   A10     10992 non-null  int64
 9   A11     10992 non-null  int64
 10  A12     10992 non-null  int64
 11  A13     10992 non-null  int64
 12  A14     10992 non-null  int64
 13  A15     10992 non-null  int64
 14  A16     10992 non-null  int64
 15  label   10992 non-null  int64
dtypes: int64(16)
memory usage: 1.3 MB
(10992, 15) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training data:  Counter({4: 858, 2: 858, 0: 

### Models, FFD, m = 100

In [27]:
# KNN-Hamming on the FFD, m = 100 data: train, evaluate on the test split, log results, save the model.
# Uses x_train, x_test, y_train, y_test, disc and m as left in the kernel by the preceding
# data-preparation cell, so that cell must be run first.

# Imported before the timer starts so that a one-off import cost is not counted in the
# execution time written to the log.
import time
import skops.io as sio
from imblearn.metrics import geometric_mean_score as gmean

model = 'KNN-Hamming'
dataset = 'pendigits'
discretizer = 'FFD'
disc_param = 'm = 100'

# The context manager closes the results file even if training or evaluation raises.
with open("pendigits_models.txt", "a") as f:
    start = time.time() # For measuring time execution

    # 3-nearest-neighbours classifier using the Hamming distance
    knn_hamming = KNeighborsClassifier(n_neighbors=3, metric='hamming', algorithm='auto')
    knn_hamming.fit(x_train, y_train)

    # Testing
    y_pred_knn = knn_hamming.predict(x_test)
    print(f'Models results: model {model}, dataset {dataset}, discretization {discretizer} with parameter {disc_param}',
          file = f)
    print('Classification report', file = f)
    print(classification_report(y_test, y_pred_knn), file = f)
    print('G-mean:', gmean(y_test, y_pred_knn),file = f)

    end = time.time()
    # The measured span covers fitting, prediction and metric computation, not fitting alone.
    print(f'Time for training model {model}- default, {disc}, m = {m} is: {end - start}.', file = f) # Total time execution
    print('=='*20, file = f)

# Save models
model_name = f"{dataset}_{model}_{discretizer}_{m}.skops"
print(model_name)
obj = sio.dump(knn_hamming, model_name)

pendigits_KNN-Hamming_FFD_100.skops
