In [1]:
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
df = pd.read_csv("KnowledgeBase.csv")
df.fillna(0,inplace=True)

In [3]:
df.shape

(173, 101)

In [3]:
df_char_raw = df[[ 'cor.mean', 'cov.mean', 'eigenvalues.mean', 'g_mean.mean', 'h_mean.mean', 'iq_range.mean', 
 'kurtosis.mean', 'mad.mean', 'max.mean', 'mean.mean', 'median.mean', 'min.mean', 'nr_cor_attr', 'nr_norm', 'nr_outliers', 'range.mean', 'sd.mean', 'skewness.mean', 'sparsity.mean',
 't_mean.mean', 'var.mean', 'attr_to_inst', 'inst_to_attr', 'nr_attr', 'nr_bin', 'nr_inst', 'nr_num','attr_conc.mean', 
 'attr_ent.mean', 'nUnique', 'ena', 'snr.mean', 'cEntropy', 'FeatureAlgo']]

df_quality_raw = df[['Completeness', 'Conciseness', 'ClassImbRatio', 'ClassOverlapPerc', 'OutlierPerc', 'LabelIssuesPerc', 'FeatureAlgo']]

df_raw = df[['Completeness', 'Conciseness', 'cor.mean', 'cov.mean', 'eigenvalues.mean', 'g_mean.mean', 'h_mean.mean', 'iq_range.mean', 
 'kurtosis.mean', 'mad.mean', 'max.mean', 'mean.mean', 'median.mean', 'min.mean', 'nr_cor_attr', 'nr_norm', 'nr_outliers', 'range.mean', 'sd.mean', 'skewness.mean', 'sparsity.mean',
 't_mean.mean', 'var.mean', 'ClassImbRatio', 'ClassOverlapPerc', 'OutlierPerc', 'attr_to_inst', 'inst_to_attr', 'nr_attr', 'nr_bin', 'nr_inst', 'nr_num','attr_conc.mean', 
 'attr_ent.mean', 'LabelIssuesPerc','nUnique', 'ena', 'snr.mean', 'cEntropy', 'FeatureAlgo']]


df_char_bin = df[['cor.mean_bins', 'cov.mean_bins',
       'eigenvalues.mean_bins', 'g_mean.mean_bins', 'h_mean.mean_bins',
       'iq_range.mean_bins', 'kurtosis.mean_bins', 'mad.mean_bins',
       'max.mean_bins', 'mean.mean_bins', 'median.mean_bins', 'min.mean_bins',
       'nr_cor_attr_bins', 'nr_norm_bins', 'nr_outliers_bins',
       'range.mean_bins', 'sd.mean_bins', 'skewness.mean_bins',
       'sparsity.mean_bins', 't_mean.mean_bins', 'var.mean_bins',
       'attr_to_inst_bins', 'inst_to_attr_bins',
       'nr_attr_bins', 'nr_inst_bins', 'nr_num_bins', 'nr_bin_bins',
       'attr_conc.mean_bins', 'attr_ent.mean_bins', 'ena_bins',
       'snr.mean_bins', 'cEntropy_bins','FeatureAlgo']]

df_quality_bin = df[['Completeness_bins',
       'Conciseness_bins', 'LabelIssues_bins', 'ClassImbRatio_bins', 'OutlierPerc_bins',
       'ClassOverlapPerc_bins','FeatureAlgo']]

df_bin = df[['Completeness_bins',
       'Conciseness_bins', 'cor.mean_bins', 'cov.mean_bins',
       'eigenvalues.mean_bins', 'g_mean.mean_bins', 'h_mean.mean_bins',
       'iq_range.mean_bins', 'kurtosis.mean_bins', 'mad.mean_bins',
       'max.mean_bins', 'mean.mean_bins', 'median.mean_bins', 'min.mean_bins',
       'nr_cor_attr_bins', 'nr_norm_bins', 'nr_outliers_bins',
       'range.mean_bins', 'sd.mean_bins', 'skewness.mean_bins',
       'sparsity.mean_bins', 't_mean.mean_bins', 'var.mean_bins',
       'LabelIssues_bins', 'ClassImbRatio_bins', 'OutlierPerc_bins',
       'ClassOverlapPerc_bins', 'attr_to_inst_bins', 'inst_to_attr_bins',
       'nr_attr_bins', 'nr_inst_bins', 'nr_num_bins', 'nr_bin_bins',
       'attr_conc.mean_bins', 'attr_ent.mean_bins', 'ena_bins',
       'snr.mean_bins', 'cEntropy_bins', 'FeatureAlgo']]

In [5]:
df_raw['FeatureAlgo'].value_counts()

chisquare    102
GR            35
relief        20
fcbf          12
MI             4
Name: FeatureAlgo, dtype: int64

In [4]:
#update df_raw
df_raw['labels_tuple'] = df_raw.iloc[:, -1].apply(tuple)

tuple_counts = df_raw['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_raw[df_raw['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_raw = df_filtered

#update df_char_raw
df_char_raw['labels_tuple'] = df_char_raw.iloc[:, -1].apply(tuple)

tuple_counts = df_char_raw['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_char_raw[df_char_raw['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_char_raw = df_filtered

#update df_quality_raw
df_quality_raw['labels_tuple'] = df_quality_raw.iloc[:, -1].apply(tuple)

tuple_counts = df_quality_raw['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_quality_raw[df_quality_raw['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_quality_raw = df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
#update df_bin
df_bin['labels_tuple'] = df_bin.iloc[:, -1].apply(tuple)

tuple_counts = df_bin['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_bin[df_bin['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_bin = df_filtered

#update df_char_bin
df_char_bin['labels_tuple'] = df_char_bin.iloc[:, -1].apply(tuple)

tuple_counts = df_char_bin['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_char_bin[df_char_bin['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_char_bin = df_filtered

#update df_quality_bin
df_quality_bin['labels_tuple'] = df_quality_bin.iloc[:, -1].apply(tuple)

tuple_counts = df_quality_bin['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_quality_bin[df_quality_bin['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_quality_bin = df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## kNN

In [8]:
def evaluate_knn(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)


    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
    knn_classifier = KNeighborsClassifier(n_neighbors=4)
    
    cross_val_scores = cross_val_score(knn_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_knn(df)

comparison_df = pd.DataFrame(results).transpose()
print(comparison_df)



               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.647273                    0.352727   
df_char_raw                     0.647273                    0.352727   
df_quality_raw                  0.576503                    0.423497   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.682927        0.317073  0.698856  0.682927  0.677672   
df_char_raw         0.682927        0.317073  0.698856  0.682927  0.677672   
df_quality_raw      0.560976        0.439024  0.573448  0.560976  0.562444   

                                                 Confusion Matrix  
df_raw          [[14, 0, 1, 1], [6, 9, 3, 2], [3, 2, 18, 2], [...  
df_char_raw     [[14, 0, 1, 1], [6, 9, 3, 2], [3, 2, 18, 2], [...  
df_quality_raw  [[10, 4, 1, 1], [8, 8, 1, 3], [3, 3, 16, 3], [...  


## Decision trees

In [9]:

def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

comparison_df = pd.DataFrame(results).transpose()

print(comparison_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.754499                    0.245501   
df_char_raw                     0.754499                    0.245501   
df_quality_raw                  0.640979                    0.359021   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.707317        0.292683  0.708045  0.707317   0.70038   
df_char_raw         0.658537        0.341463    0.6714  0.658537  0.660034   
df_quality_raw      0.670732        0.329268   0.66557  0.670732  0.665999   

                                                 Confusion Matrix  
df_raw          [[13, 3, 0, 0], [4, 9, 3, 4], [1, 1, 23, 0], [...  
df_char_raw     [[11, 5, 0, 0], [4, 9, 4, 3], [3, 1, 21, 0], [...  
df_quality_raw  [[11, 4, 0, 1], [4, 9, 3, 4], [2, 1, 20, 2], [...  


## Distance-based

In [10]:
datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

metric_name = 'Euclidean'
metric_func = lambda X_train, X_test: np.sqrt(((X_train.values[:, np.newaxis] - X_test.values) ** 2).sum(axis=2))

kf = KFold(n_splits=5, shuffle=True, random_state=33)

for name, df in datasets.items():
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    cv_accuracies = []

    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        distances = metric_func(X_train, X_test)
        nearest_indices = np.argmin(distances, axis=0)
        predicted_labels = y_train[nearest_indices]
        cv_accuracy = accuracy_score(y_test, predicted_labels)
        
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    
    distances = metric_func(X_train, X_test)
    
    nearest_indices = np.argmin(distances, axis=0)
    
    predicted_labels = y_train[nearest_indices]
    
    test_accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(y_test, predicted_labels)

    results[name] = {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.713068                    0.286932   
df_char_raw                     0.713068                    0.286932   
df_quality_raw                  0.664408                    0.335592   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.731707        0.268293  0.726939  0.731707  0.720715   
df_char_raw         0.731707        0.268293  0.726939  0.731707  0.720715   
df_quality_raw      0.621951        0.378049  0.655045  0.621951  0.623613   

                                                 Confusion Matrix  
df_raw          [[11, 2, 1, 2], [5, 9, 3, 3], [0, 1, 22, 2], [...  
df_char_raw     [[11, 2, 1, 2], [5, 9, 3, 3], [0, 1, 22, 2], [...  
df_quality_raw  [[13, 3, 0, 0], [8, 8, 1, 3], [4, 1, 17, 3], [...  


## Unification

In [11]:
def unify_features(test_sample, train_data, train_labels):
    test_features = test_sample.values
    matching_indices = np.all(train_data == test_features, axis=1)
    if matching_indices.any():
        matching_labels = train_labels[matching_indices]
        return np.unique(matching_labels)
    return ["unknown"]

def evaluate_unification(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    cv_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    confusion_matrices = []
    
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        predicted_labels = []
        for i in range(X_test.shape[0]):
            label = unify_features(X_test.iloc[i], X_train, y_train)
            predicted_labels.append(label)
        
        predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
        valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        if not valid_indices:
            continue
        
        valid_predictions = np.array([predicted_labels[i] for i in valid_indices])

        valid_truth = np.array([y_test[i] for i in valid_indices])
        
        cv_accuracy = accuracy_score(valid_truth, valid_predictions)
       
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    predicted_labels = []
    for i in range(X_test.shape[0]):
        label = unify_features(X_test.iloc[i], X_train, y_train)
        predicted_labels.append(label)
    predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
    valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        
    valid_predictions = np.array([predicted_labels[i] for i in valid_indices])
    valid_truth = np.array([y_test[i] for i in valid_indices])
        
    precision = precision_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    f1 = f1_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(valid_truth, valid_predictions)    
    test_accuracy = accuracy_score(valid_truth, valid_predictions)

    return {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }


datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_unification(df)

results_df = pd.DataFrame(results).transpose()

print(results_df)


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                                 0                           1   
df_char_raw                            0                           1   
df_quality_raw                  0.563333                    0.436667   

               Test Accuracy Test Error Rate Precision Recall F1 Score  \
df_raw                   NaN             NaN         0      0        0   
df_char_raw              NaN             NaN         0      0        0   
df_quality_raw             1               0         1      1        1   

               Confusion Matrix  
df_raw                       []  
df_char_raw                  []  
df_quality_raw            [[2]]  


## Bin data

## kNN

In [12]:
def evaluate_knn(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)


    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    
    cross_val_scores = cross_val_score(knn_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_knn(df)

comparison_df = pd.DataFrame(results).transpose()
print(comparison_df)

               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.650583                    0.349417   
df_char_bin                     0.539953                    0.460047   
df_quality_bin                  0.401865                    0.598135   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.646341        0.353659  0.661878  0.646341  0.645223   
df_char_bin         0.560976        0.439024  0.544851  0.560976  0.534272   
df_quality_bin      0.378049        0.621951  0.344656  0.378049  0.345201   

                                                 Confusion Matrix  
df_bin          [[13, 3, 0, 0], [7, 7, 5, 1], [2, 5, 18, 0], [...  
df_char_bin     [[2, 9, 1, 4], [2, 10, 5, 3], [1, 3, 21, 0], [...  
df_quality_bin  [[11, 2, 1, 2], [8, 3, 5, 4], [1, 2, 15, 7], [...  


## Decision trees

In [13]:

def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}
results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

comparison_df = pd.DataFrame(results).transpose()

print(comparison_df)

               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.653613                    0.346387   
df_char_bin                      0.61972                     0.38028   
df_quality_bin                  0.435897                    0.564103   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.719512        0.280488  0.730331  0.719512  0.719622   
df_char_bin         0.609756        0.390244  0.600957  0.609756  0.596617   
df_quality_bin      0.439024        0.560976  0.425822  0.439024   0.40696   

                                                 Confusion Matrix  
df_bin          [[13, 3, 0, 0], [4, 11, 4, 1], [2, 2, 20, 1], ...  
df_char_bin     [[5, 6, 1, 4], [3, 9, 5, 3], [1, 3, 21, 0], [0...  
df_quality_bin  [[11, 1, 2, 2], [7, 4, 6, 3], [1, 3, 17, 4], [...  


## Distance based

In [14]:
datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}

results = {}

metric_name = 'Euclidean'
metric_func = lambda X_train, X_test: np.sqrt(((X_train.values[:, np.newaxis] - X_test.values) ** 2).sum(axis=2))

kf = KFold(n_splits=5, shuffle=True, random_state=33)

for name, df in datasets.items():
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    
  
    cv_accuracies = []

        
    smote = SMOTE(random_state=33)
    

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        distances = metric_func(X_train, X_test)
        
        nearest_indices = np.argmin(distances, axis=0)
        
        predicted_labels = y_train[nearest_indices]
        
        cv_accuracy = accuracy_score(y_test, predicted_labels)
        
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    
    distances = metric_func(X_train, X_test)
    
    nearest_indices = np.argmin(distances, axis=0)
    
    predicted_labels = y_train[nearest_indices]
    
    test_accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(y_test, predicted_labels)

    results[name] = {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.612948                    0.387052   
df_char_bin                     0.421921                    0.578079   
df_quality_bin                  0.331135                    0.668865   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.719512        0.280488  0.713106  0.719512  0.708814   
df_char_bin         0.585366        0.414634  0.573181  0.585366  0.552402   
df_quality_bin      0.341463        0.658537  0.355906  0.341463  0.345337   

                                                 Confusion Matrix  
df_bin          [[11, 3, 0, 2], [6, 8, 3, 3], [1, 1, 22, 1], [...  
df_char_bin     [[4, 4, 2, 6], [4, 6, 5, 5], [0, 0, 21, 4], [0...  
df_quality_bin  [[7, 5, 2, 2], [4, 5, 6, 5], [0, 3, 10, 12], [...  


## Unification

In [15]:
def unify_features(test_sample, train_data, train_labels):
    test_features = test_sample.values
    matching_indices = np.all(train_data == test_features, axis=1)
    if matching_indices.any():
        matching_labels = train_labels[matching_indices]
        return np.unique(matching_labels)
    return ["unknown"]

def evaluate_unification(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    cv_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    confusion_matrices = []
    
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        predicted_labels = []
        for i in range(X_test.shape[0]):
            label = unify_features(X_test.iloc[i], X_train, y_train)
            predicted_labels.append(label)
        
        predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
        valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        if not valid_indices:
            continue
        
        valid_predictions = np.array([predicted_labels[i] for i in valid_indices])

        valid_truth = np.array([y_test[i] for i in valid_indices])
        
        cv_accuracy = accuracy_score(valid_truth, valid_predictions)
       
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    predicted_labels = []
    for i in range(X_test.shape[0]):
        label = unify_features(X_test.iloc[i], X_train, y_train)
        predicted_labels.append(label)
    predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
    valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        
    valid_predictions = np.array([predicted_labels[i] for i in valid_indices])
    valid_truth = np.array([y_test[i] for i in valid_indices])
        
    precision = precision_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    f1 = f1_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(valid_truth, valid_predictions)    
    test_accuracy = accuracy_score(valid_truth, valid_predictions)

    return {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }


datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}


results = {}

for name, df in datasets.items():
    results[name] = evaluate_unification(df)

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.275617                    0.724383   
df_char_bin                     0.270635                    0.729365   
df_quality_bin                  0.271838                    0.728162   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.142857        0.857143  0.137302  0.142857   0.13985   
df_char_bin          0.17284         0.82716  0.169464   0.17284  0.167699   
df_quality_bin      0.219512        0.780488  0.221078  0.219512  0.218879   

                                                 Confusion Matrix  
df_bin          [[1, 4, 4, 7], [5, 3, 4, 4], [3, 4, 5, 6], [7,...  
df_char_bin     [[3, 5, 2, 5], [3, 5, 5, 7], [11, 7, 2, 5], [5...  
df_quality_bin  [[1, 4, 3, 8], [3, 6, 5, 6], [6, 7, 7, 5], [4,...  


In [7]:
df_normalised = pd.read_csv("NormalisedDataset.csv")
df_normalised_quality = df_normalised[['Completeness_normalized', 'Conciseness_normalized',  'ClassImbRatio_normalized', 'ClassOverlapPerc_normalized',
       'OutlierPerc_normalized',  'LabelIssuesPerc_normalized', 'FeatureAlgo_normalized']]
df_normalised_char = df_normalised[[ 'cor.mean_normalized', 'cov.mean_normalized',
       'eigenvalues.mean_normalized', 'g_mean.mean_normalized',
       'h_mean.mean_normalized', 'iq_range.mean_normalized',
       'kurtosis.mean_normalized', 'mad.mean_normalized',
       'max.mean_normalized', 'mean.mean_normalized', 'median.mean_normalized',
       'min.mean_normalized', 'nr_cor_attr_normalized', 'nr_norm_normalized',
       'nr_outliers_normalized', 'range.mean_normalized', 'sd.mean_normalized',
       'skewness.mean_normalized', 'sparsity.mean_normalized',
       't_mean.mean_normalized', 'var.mean_normalized',
        'attr_to_inst_normalized',
       'inst_to_attr_normalized', 'nr_attr_normalized', 'nr_bin_normalized',
       'nr_inst_normalized', 'nr_num_normalized', 'attr_conc.mean_normalized',
       'attr_ent.mean_normalized', 'nUnique_normalized', 'ena_normalized', 'snr.mean_normalized',
       'cEntropy_normalized', 'FeatureAlgo_normalized']]

In [8]:
df_normalised.columns

Index(['Completeness_normalized', 'Conciseness_normalized',
       'cor.mean_normalized', 'cov.mean_normalized',
       'eigenvalues.mean_normalized', 'g_mean.mean_normalized',
       'h_mean.mean_normalized', 'iq_range.mean_normalized',
       'kurtosis.mean_normalized', 'mad.mean_normalized',
       'max.mean_normalized', 'mean.mean_normalized', 'median.mean_normalized',
       'min.mean_normalized', 'nr_cor_attr_normalized', 'nr_norm_normalized',
       'nr_outliers_normalized', 'range.mean_normalized', 'sd.mean_normalized',
       'skewness.mean_normalized', 'sparsity.mean_normalized',
       't_mean.mean_normalized', 'var.mean_normalized',
       'ClassImbRatio_normalized', 'ClassOverlapPerc_normalized',
       'OutlierPerc_normalized', 'attr_to_inst_normalized',
       'inst_to_attr_normalized', 'nr_attr_normalized', 'nr_bin_normalized',
       'nr_inst_normalized', 'nr_num_normalized', 'attr_conc.mean_normalized',
       'attr_ent.mean_normalized', 'LabelIssuesPerc_normalized',


In [17]:
#update df_raw
df_normalised['labels_tuple'] = df_normalised.iloc[:, -1].apply(tuple)

tuple_counts = df_normalised['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_normalised[df_normalised['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_normalised = df_filtered

#update df_char_raw
df_normalised_char['labels_tuple'] = df_normalised_char.iloc[:, -1].apply(tuple)

tuple_counts = df_normalised_char['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_normalised_char[df_normalised_char['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_normalised_char = df_filtered

#update df_quality_raw
df_normalised_quality['labels_tuple'] = df_normalised_quality.iloc[:, -1].apply(tuple)

tuple_counts = df_normalised_quality['labels_tuple'].value_counts()

valid_tuples = tuple_counts[tuple_counts >= 6].index
df_filtered = df_normalised_quality[df_normalised_quality['labels_tuple'].isin(valid_tuples)].copy()
df_filtered = df_filtered.drop('labels_tuple', axis=1)

df_normalised_quality = df_filtered

## kNN

In [18]:

def evaluate_knn(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)


    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    
    cross_val_scores = cross_val_score(knn_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_knn(df)

comparison_df = pd.DataFrame(results).transpose()
print(comparison_df)

                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.701628                    0.298372   
df_normalised_char                     0.657169                    0.342831   
df_normalised_quality                  0.475622                    0.524378   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.714286        0.285714   0.73791  0.714286   
df_normalised_char         0.690476        0.309524  0.743773  0.690476   
df_normalised_quality      0.488095        0.511905  0.507829  0.488095   

                       F1 Score  \
df_normalised          0.720293   
df_normalised_char     0.701245   
df_normalised_quality  0.475191   

                                                        Confusion Matrix  
df_normalised          [[13, 7, 0, 0], [5, 10, 2, 1], [1, 1, 19, 0], ...  
df_normalised_char     [[15, 5, 0, 0], [6, 11, 0, 1], [0, 4, 17, 0], ...  
df_normalised_qu

## Decision trees

In [19]:
#df_raw decision
def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    smote = SMOTE(random_state=33)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

comparison_df = pd.DataFrame(results).transpose()

print(comparison_df)

                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.719991                    0.280009   
df_normalised_char                     0.740932                    0.259068   
df_normalised_quality                  0.575351                    0.424649   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.785714        0.214286  0.811965  0.785714   
df_normalised_char         0.785714        0.214286  0.804743  0.785714   
df_normalised_quality      0.511905        0.488095  0.569535  0.511905   

                       F1 Score  \
df_normalised          0.783955   
df_normalised_char     0.763734   
df_normalised_quality  0.499341   

                                                        Confusion Matrix  
df_normalised          [[18, 2, 0, 0], [7, 10, 0, 1], [0, 1, 20, 0], ...  
df_normalised_char     [[18, 1, 1, 0], [6, 6, 3, 3], [0, 0, 21, 0], [...  
df_normalised_qu

## Distance-based

In [20]:
datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

metric_name = 'Euclidean'
metric_func = lambda X_train, X_test: np.sqrt(((X_train.values[:, np.newaxis] - X_test.values) ** 2).sum(axis=2))

kf = KFold(n_splits=5, shuffle=True, random_state=33)

for name, df in datasets.items():
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
  
    cv_accuracies = []
        
    smote = SMOTE(random_state=33)
  
    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        distances = metric_func(X_train, X_test)
        
        nearest_indices = np.argmin(distances, axis=0)
        
        predicted_labels = y_train[nearest_indices]
        
        cv_accuracy = accuracy_score(y_test, predicted_labels)
        
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    
    distances = metric_func(X_train, X_test)
    
    nearest_indices = np.argmin(distances, axis=0)
    
    predicted_labels = y_train[nearest_indices]
    
    test_accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(y_test, predicted_labels)

    results[name] = {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }

results_df = pd.DataFrame(results).transpose()

print(results_df)


                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.762106                    0.237894   
df_normalised_char                     0.661044                    0.338956   
df_normalised_quality                  0.543316                    0.456684   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.738095        0.261905  0.742022  0.738095   
df_normalised_char         0.738095        0.261905  0.749217  0.738095   
df_normalised_quality      0.488095        0.511905  0.513825  0.488095   

                       F1 Score  \
df_normalised          0.737166   
df_normalised_char     0.736461   
df_normalised_quality  0.493817   

                                                        Confusion Matrix  
df_normalised          [[14, 5, 0, 1], [5, 9, 2, 2], [0, 0, 21, 0], [...  
df_normalised_char     [[13, 5, 1, 1], [3, 11, 2, 2], [0, 0, 21, 0], ...  
df_normalised_qu

In [21]:
predicted_labels

array([0, 1, 0, 3, 3, 1, 3, 1, 3, 0, 3, 3, 1, 1, 3, 2, 0, 0, 2, 2, 3, 0,
       2, 2, 2, 3, 0, 0, 0, 1, 0, 1, 2, 0, 1, 1, 1, 2, 1, 2, 3, 2, 0, 0,
       0, 2, 3, 2, 0, 3, 1, 0, 2, 2, 1, 2, 2, 2, 0, 2, 3, 1, 1, 1, 0, 3,
       0, 1, 2, 0, 0, 1, 0, 0, 2, 1, 2, 2, 1, 3, 1, 3, 0, 3])

## Unification

In [22]:

def unify_features(test_sample, train_data, train_labels):
    test_features = test_sample.values
    matching_indices = np.all(train_data == test_features, axis=1)
    if matching_indices.any():
        matching_labels = train_labels[matching_indices]
        return np.unique(matching_labels)
    return ["unknown"]

def evaluate_unification(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
   
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    cv_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    confusion_matrices = []
    
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        predicted_labels = []
        for i in range(X_test.shape[0]):
            label = unify_features(X_test.iloc[i], X_train, y_train)
            predicted_labels.append(label)
        
        predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
        valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        if not valid_indices:
            continue
        
        valid_predictions = np.array([predicted_labels[i] for i in valid_indices])

        valid_truth = np.array([y_test[i] for i in valid_indices])
        
        cv_accuracy = accuracy_score(valid_truth, valid_predictions)
       
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    predicted_labels = []
    for i in range(X_test.shape[0]):
        label = unify_features(X_test.iloc[i], X_train, y_train)
        predicted_labels.append(label)
    predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
    valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        
    valid_predictions = np.array([predicted_labels[i] for i in valid_indices])
    valid_truth = np.array([y_test[i] for i in valid_indices])
        
    precision = precision_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    f1 = f1_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(valid_truth, valid_predictions)    
    test_accuracy = accuracy_score(valid_truth, valid_predictions)

    return {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }


datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_unification(df)

results_df = pd.DataFrame(results).transpose()

print(results_df)


                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.553733                    0.446267   
df_normalised_char                      0.21981                     0.78019   
df_normalised_quality                  0.246171                    0.753829   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.354839        0.645161  0.408705  0.354839   
df_normalised_char         0.238806        0.761194  0.248866  0.238806   
df_normalised_quality      0.265625        0.734375  0.277257  0.265625   

                       F1 Score  \
df_normalised          0.364055   
df_normalised_char     0.242553   
df_normalised_quality  0.269998   

                                                        Confusion Matrix  
df_normalised          [[4, 1, 3, 2], [1, 4, 1, 1], [1, 3, 1, 0], [1,...  
df_normalised_char     [[2, 6, 2, 5], [4, 4, 3, 3], [5, 4, 5, 4], [7,...  
df_normalised_qu

## Saving the better model

In [6]:
import pickle
from sklearn.tree import export_text

def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    smote = SMOTE(random_state=33)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    rules = export_text(tree_classifier, feature_names=list(X_train.columns))
    print(rules)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'model': tree_classifier,
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_raw': df_raw,
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

with open('df_raw_decision_tree_model.pkl', 'wb') as model_file:
    pickle.dump(results['df_raw']['model'], model_file)

|--- nr_inst <= 335.50
|   |--- cEntropy <= 0.95
|   |   |--- iq_range.mean <= 310.88
|   |   |   |--- cEntropy <= 0.41
|   |   |   |   |--- cor.mean <= 0.24
|   |   |   |   |   |--- class: chisquare
|   |   |   |   |--- cor.mean >  0.24
|   |   |   |   |   |--- class: relief
|   |   |   |--- cEntropy >  0.41
|   |   |   |   |--- attr_to_inst <= 0.36
|   |   |   |   |   |--- inst_to_attr <= 8.72
|   |   |   |   |   |   |--- LabelIssuesPerc <= 0.02
|   |   |   |   |   |   |   |--- class: chisquare
|   |   |   |   |   |   |--- LabelIssuesPerc >  0.02
|   |   |   |   |   |   |   |--- class: GR
|   |   |   |   |   |--- inst_to_attr >  8.72
|   |   |   |   |   |   |--- attr_to_inst <= 0.01
|   |   |   |   |   |   |   |--- class: chisquare
|   |   |   |   |   |   |--- attr_to_inst >  0.01
|   |   |   |   |   |   |   |--- ena <= -40.54
|   |   |   |   |   |   |   |   |--- class: chisquare
|   |   |   |   |   |   |   |--- ena >  -40.54
|   |   |   |   |   |   |   |   |--- OutlierPerc <= 0.03
|

In [24]:
#df_raw decision
def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    smote = SMOTE(random_state=33)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'model': tree_classifier,
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_normalised': df_normalised
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

with open('df_raw_normalised_decision_tree_model.pkl', 'wb') as model_file:
    pickle.dump(results['df_normalised']['model'], model_file)