In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
df = pd.read_csv("KnowledgeBase.csv")
df.fillna(0,inplace=True)


In [3]:
df_char_raw = df[[ 'cor.mean', 'cov.mean', 'eigenvalues.mean', 'g_mean.mean', 'h_mean.mean', 'iq_range.mean', 
 'kurtosis.mean', 'mad.mean', 'max.mean', 'mean.mean', 'median.mean', 'min.mean', 'nr_cor_attr', 'nr_norm', 'nr_outliers', 'range.mean', 'sd.mean', 'skewness.mean', 'sparsity.mean',
 't_mean.mean', 'var.mean', 'attr_to_inst', 'inst_to_attr', 'nr_attr', 'nr_bin', 'nr_inst', 'nr_num','attr_conc.mean', 
 'attr_ent.mean', 'nUnique', 'ena', 'snr.mean', 'cEntropy', 'FeatureAlgo']]

df_quality_raw = df[['Completeness', 'Conciseness', 'ClassImbRatio', 'ClassOverlapPerc', 'OutlierPerc', 'LabelIssuesPerc', 'FeatureAlgo']]

df_raw = df[['Completeness', 'Conciseness', 'cor.mean', 'cov.mean', 'eigenvalues.mean', 'g_mean.mean', 'h_mean.mean', 'iq_range.mean', 
 'kurtosis.mean', 'mad.mean', 'max.mean', 'mean.mean', 'median.mean', 'min.mean', 'nr_cor_attr', 'nr_norm', 'nr_outliers', 'range.mean', 'sd.mean', 'skewness.mean', 'sparsity.mean',
 't_mean.mean', 'var.mean', 'ClassImbRatio', 'ClassOverlapPerc', 'OutlierPerc', 'attr_to_inst', 'inst_to_attr', 'nr_attr', 'nr_bin', 'nr_inst', 'nr_num','attr_conc.mean', 
 'attr_ent.mean', 'LabelIssuesPerc','nUnique', 'ena', 'snr.mean', 'cEntropy', 'FeatureAlgo']]


df_char_bin = df[['cor.mean_bins', 'cov.mean_bins',
       'eigenvalues.mean_bins', 'g_mean.mean_bins', 'h_mean.mean_bins',
       'iq_range.mean_bins', 'kurtosis.mean_bins', 'mad.mean_bins',
       'max.mean_bins', 'mean.mean_bins', 'median.mean_bins', 'min.mean_bins',
       'nr_cor_attr_bins', 'nr_norm_bins', 'nr_outliers_bins',
       'range.mean_bins', 'sd.mean_bins', 'skewness.mean_bins',
       'sparsity.mean_bins', 't_mean.mean_bins', 'var.mean_bins',
       'attr_to_inst_bins', 'inst_to_attr_bins',
       'nr_attr_bins', 'nr_inst_bins', 'nr_num_bins', 'nr_bin_bins',
       'attr_conc.mean_bins', 'attr_ent.mean_bins', 'ena_bins',
       'snr.mean_bins', 'cEntropy_bins','FeatureAlgo']]

df_quality_bin = df[['Completeness_bins',
       'Conciseness_bins', 'LabelIssues_bins', 'ClassImbRatio_bins', 'OutlierPerc_bins',
       'ClassOverlapPerc_bins','FeatureAlgo']]

df_bin = df[['Completeness_bins',
       'Conciseness_bins', 'cor.mean_bins', 'cov.mean_bins',
       'eigenvalues.mean_bins', 'g_mean.mean_bins', 'h_mean.mean_bins',
       'iq_range.mean_bins', 'kurtosis.mean_bins', 'mad.mean_bins',
       'max.mean_bins', 'mean.mean_bins', 'median.mean_bins', 'min.mean_bins',
       'nr_cor_attr_bins', 'nr_norm_bins', 'nr_outliers_bins',
       'range.mean_bins', 'sd.mean_bins', 'skewness.mean_bins',
       'sparsity.mean_bins', 't_mean.mean_bins', 'var.mean_bins',
       'LabelIssues_bins', 'ClassImbRatio_bins', 'OutlierPerc_bins',
       'ClassOverlapPerc_bins', 'attr_to_inst_bins', 'inst_to_attr_bins',
       'nr_attr_bins', 'nr_inst_bins', 'nr_num_bins', 'nr_bin_bins',
       'attr_conc.mean_bins', 'attr_ent.mean_bins', 'ena_bins',
       'snr.mean_bins', 'cEntropy_bins', 'FeatureAlgo']]

In [4]:
df['FeatureAlgo'].value_counts()

chisquare    102
GR            35
relief        20
fcbf          12
MI             4
Name: FeatureAlgo, dtype: int64

## Raw data

## kNN

In [5]:
def evaluate_knn(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y) 

    ros = RandomOverSampler(random_state=33)

    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    
    cross_val_scores = cross_val_score(knn_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_knn(df)

comparison_df = pd.DataFrame(results).transpose()
print(comparison_df)



               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.752454                    0.247546   
df_char_raw                     0.752454                    0.247546   
df_quality_raw                  0.698796                    0.301204   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.833333        0.166667  0.819334  0.833333  0.810676   
df_char_raw         0.833333        0.166667  0.819334  0.833333  0.810676   
df_quality_raw      0.735294        0.264706  0.776197  0.735294  0.712222   

                                                 Confusion Matrix  
df_raw          [[12, 0, 1, 1, 0], [0, 20, 0, 0, 0], [4, 2, 3,...  
df_char_raw     [[12, 0, 1, 1, 0], [0, 20, 0, 0, 0], [4, 2, 3,...  
df_quality_raw  [[10, 1, 1, 2, 0], [0, 20, 0, 0, 0], [6, 2, 3,...  


## Decision trees

In [6]:
#decision

def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    ros = RandomOverSampler(random_state=33)

    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

comparison_df = pd.DataFrame(results).transpose()

print(comparison_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.914303                   0.0856971   
df_char_raw                     0.906986                   0.0930142   
df_quality_raw                  0.857964                    0.142036   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.960784       0.0392157  0.966301  0.960784  0.959023   
df_char_raw         0.960784       0.0392157  0.966301  0.960784  0.959023   
df_quality_raw      0.892157        0.107843  0.900767  0.892157  0.883325   

                                                 Confusion Matrix  
df_raw          [[14, 0, 0, 0, 0], [0, 20, 0, 0, 0], [3, 0, 10...  
df_char_raw     [[14, 0, 0, 0, 0], [0, 20, 0, 0, 0], [3, 0, 10...  
df_quality_raw  [[13, 1, 0, 0, 0], [0, 20, 0, 0, 0], [3, 1, 7,...  


## Distance based

In [7]:
datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

metric_name = 'Euclidean'
metric_func = lambda X_train, X_test: np.sqrt(((X_train.values[:, np.newaxis] - X_test.values) ** 2).sum(axis=2))

kf = KFold(n_splits=5, shuffle=True, random_state=33)

for name, df in datasets.items():
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    cv_accuracies = []

    ros = RandomOverSampler(random_state=33)
    smote = SMOTE(random_state=33)
    
    pipeline = Pipeline([
        ('ros', ros),
        ('smote', smote)
    ])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        distances = metric_func(X_train, X_test)
        
        nearest_indices = np.argmin(distances, axis=0)
        
        predicted_labels = y_train[nearest_indices]
        
        cv_accuracy = accuracy_score(y_test, predicted_labels)
        
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    
    distances = metric_func(X_train, X_test)
    
    nearest_indices = np.argmin(distances, axis=0)
    
    predicted_labels = y_train[nearest_indices]
    
    test_accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(y_test, predicted_labels)

    results[name] = {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.905882                   0.0941176   
df_char_raw                     0.905882                   0.0941176   
df_quality_raw                  0.815686                    0.184314   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.931373       0.0686275   0.93961  0.931373  0.922151   
df_char_raw         0.931373       0.0686275   0.93961  0.931373  0.922151   
df_quality_raw      0.872549        0.127451  0.876166  0.872549  0.861582   

                                                 Confusion Matrix  
df_raw          [[14, 0, 0, 0, 0], [0, 20, 0, 0, 0], [3, 2, 7,...  
df_char_raw     [[14, 0, 0, 0, 0], [0, 20, 0, 0, 0], [3, 2, 7,...  
df_quality_raw  [[12, 1, 1, 0, 0], [0, 20, 0, 0, 0], [4, 2, 6,...  


## Unification

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def unify_features(test_sample, train_data, train_labels):
    test_features = test_sample.values
    matching_indices = np.all(train_data == test_features, axis=1)
    if matching_indices.any():
        matching_labels = train_labels[matching_indices]
        return np.unique(matching_labels)
    return ["unknown"]

def evaluate_unification(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    cv_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    confusion_matrices = []
    
    ros = RandomOverSampler(random_state=33)

    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        predicted_labels = []
        for i in range(X_test.shape[0]):
            label = unify_features(X_test.iloc[i], X_train, y_train)
            predicted_labels.append(label)
        
        predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
        valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        if not valid_indices:
            continue
        
        valid_predictions = np.array([predicted_labels[i] for i in valid_indices])

        valid_truth = np.array([y_test[i] for i in valid_indices])
        
        cv_accuracy = accuracy_score(valid_truth, valid_predictions)
       
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    predicted_labels = []
    for i in range(X_test.shape[0]):
        label = unify_features(X_test.iloc[i], X_train, y_train)
        predicted_labels.append(label)
    predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
    valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        
    valid_predictions = np.array([predicted_labels[i] for i in valid_indices])
    valid_truth = np.array([y_test[i] for i in valid_indices])
        
    precision = precision_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    f1 = f1_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(valid_truth, valid_predictions)    
    test_accuracy = accuracy_score(valid_truth, valid_predictions)

    return {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }


datasets = {
    'df_raw': df_raw,
    'df_char_raw': df_char_raw,
    'df_quality_raw': df_quality_raw
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_unification(df)

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_raw                          0.948492                   0.0515076   
df_char_raw                     0.948492                   0.0515076   
df_quality_raw                  0.586965                    0.413035   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_raw              0.454545        0.545455  0.411315  0.454545  0.431564   
df_char_raw         0.454545        0.545455  0.411315  0.454545  0.431564   
df_quality_raw      0.362637        0.637363   0.37798  0.362637  0.363931   

                                                 Confusion Matrix  
df_raw          [[6, 1, 0, 2, 3], [2, 11, 0, 0, 4], [1, 3, 0, ...  
df_char_raw     [[6, 1, 0, 2, 3], [2, 11, 0, 0, 4], [1, 3, 0, ...  
df_quality_raw  [[4, 2, 1, 1, 5], [4, 7, 0, 3, 5], [0, 3, 3, 3...  


## BIN data

## knn

In [9]:
def evaluate_knn(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    ros = RandomOverSampler(random_state=33)
    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)


    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    
    cross_val_scores = cross_val_score(knn_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_knn(df)

comparison_df = pd.DataFrame(results).transpose()
print(comparison_df)

               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.661759                    0.338241   
df_char_bin                     0.487654                    0.512346   
df_quality_bin                   0.42171                     0.57829   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.764706        0.235294  0.792974  0.764706  0.753066   
df_char_bin         0.568627        0.431373  0.725897  0.568627   0.57731   
df_quality_bin       0.45098         0.54902  0.503912   0.45098  0.459663   

                                                 Confusion Matrix  
df_bin          [[11, 1, 2, 0, 0], [0, 20, 0, 0, 0], [4, 2, 6,...  
df_char_bin     [[8, 4, 2, 0, 0], [0, 20, 0, 0, 0], [2, 4, 7, ...  
df_quality_bin  [[6, 1, 5, 2, 0], [0, 12, 6, 0, 2], [1, 1, 5, ...  


## Decision trees

In [10]:
def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    ros = RandomOverSampler(random_state=33)

    # SMOTE
    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

# Evaluate all three datasets
datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}
results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

comparison_df = pd.DataFrame(results).transpose()

print(comparison_df)

               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.806444                    0.193556   
df_char_bin                     0.580939                    0.419061   
df_quality_bin                  0.507438                    0.492562   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.813725        0.186275  0.848702  0.813725  0.807594   
df_char_bin         0.637255        0.362745  0.698977  0.637255  0.617408   
df_quality_bin      0.558824        0.441176  0.611434  0.558824  0.494522   

                                                 Confusion Matrix  
df_bin          [[12, 1, 1, 0, 0], [0, 20, 0, 0, 0], [3, 1, 9,...  
df_char_bin     [[8, 4, 0, 2, 0], [0, 20, 0, 0, 0], [4, 3, 3, ...  
df_quality_bin  [[11, 1, 0, 2, 0], [0, 20, 0, 0, 0], [3, 4, 2,...  


## Distance based

In [11]:
datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}

results = {}

metric_name = 'Euclidean'
metric_func = lambda X_train, X_test: np.sqrt(((X_train.values[:, np.newaxis] - X_test.values) ** 2).sum(axis=2))

kf = KFold(n_splits=5, shuffle=True, random_state=33)

for name, df in datasets.items():
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    
    cv_accuracies = []

    ros = RandomOverSampler(random_state=33)
    smote = SMOTE(random_state=33)
    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        distances = metric_func(X_train, X_test)
        
        nearest_indices = np.argmin(distances, axis=0)
        
        predicted_labels = y_train[nearest_indices]
        
        cv_accuracy = accuracy_score(y_test, predicted_labels)
        
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    
    distances = metric_func(X_train, X_test)
    
    nearest_indices = np.argmin(distances, axis=0)
    
    predicted_labels = y_train[nearest_indices]
    
    test_accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(y_test, predicted_labels)

    results[name] = {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.637255                    0.362745   
df_char_bin                     0.333333                    0.666667   
df_quality_bin                   0.35098                     0.64902   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.803922        0.196078  0.833787  0.803922  0.794862   
df_char_bin         0.578431        0.421569  0.607732  0.578431  0.571409   
df_quality_bin      0.431373        0.568627  0.453515  0.431373  0.432584   

                                                 Confusion Matrix  
df_bin          [[12, 1, 1, 0, 0], [0, 20, 0, 0, 0], [3, 2, 7,...  
df_char_bin     [[5, 2, 3, 2, 2], [0, 15, 0, 5, 0], [0, 3, 6, ...  
df_quality_bin  [[7, 0, 6, 0, 1], [0, 11, 0, 2, 7], [2, 2, 3, ...  


## Unification

In [12]:
def unify_features(test_sample, train_data, train_labels):
    test_features = test_sample.values
    matching_indices = np.all(train_data == test_features, axis=1)
    if matching_indices.any():
        matching_labels = train_labels[matching_indices]
        return np.unique(matching_labels)
    return ["unknown"]

def evaluate_unification(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()

    for col in X:
        X[col] = label_encoder.fit_transform(X[col])
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    cv_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    confusion_matrices = []
    
    ros = RandomOverSampler(random_state=33)

    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        predicted_labels = []
        for i in range(X_test.shape[0]):
            label = unify_features(X_test.iloc[i], X_train, y_train)
            predicted_labels.append(label)
        
        predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
        valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        if not valid_indices:
            continue
        
        valid_predictions = np.array([predicted_labels[i] for i in valid_indices])

        valid_truth = np.array([y_test[i] for i in valid_indices])
        
        cv_accuracy = accuracy_score(valid_truth, valid_predictions)
       
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    predicted_labels = []
    for i in range(X_test.shape[0]):
        label = unify_features(X_test.iloc[i], X_train, y_train)
        predicted_labels.append(label)
    predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
    valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        
    valid_predictions = np.array([predicted_labels[i] for i in valid_indices])
    valid_truth = np.array([y_test[i] for i in valid_indices])
        
    precision = precision_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    f1 = f1_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(valid_truth, valid_predictions)    
    test_accuracy = accuracy_score(valid_truth, valid_predictions)

    return {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }


datasets = {
    'df_bin': df_bin,
    'df_char_bin': df_char_bin,
    'df_quality_bin': df_quality_bin
}


results = {}

for name, df in datasets.items():
    results[name] = evaluate_unification(df)

results_df = pd.DataFrame(results).transpose()

print(results_df)


               Cross-Validation Accuracy Cross-Validation Error Rate  \
df_bin                          0.206199                    0.793801   
df_char_bin                     0.193499                    0.806501   
df_quality_bin                  0.196772                    0.803228   

               Test Accuracy Test Error Rate Precision    Recall  F1 Score  \
df_bin              0.222222        0.777778  0.262511  0.222222  0.233824   
df_char_bin         0.196078        0.803922  0.206998  0.196078  0.190664   
df_quality_bin      0.166667        0.833333  0.170434  0.166667  0.161078   

                                                 Confusion Matrix  
df_bin          [[3, 2, 6, 2, 0], [6, 4, 4, 3, 3], [3, 4, 2, 3...  
df_char_bin     [[3, 2, 3, 2, 4], [5, 1, 4, 3, 7], [3, 3, 6, 1...  
df_quality_bin  [[4, 0, 4, 1, 5], [4, 2, 7, 3, 4], [2, 0, 4, 4...  


## Standard deviation

In [13]:
df_normalised = pd.read_csv("NormalisedDataset.csv")
df_normalised_quality = df_normalised[['Completeness_normalized', 'Conciseness_normalized',  'ClassImbRatio_normalized', 'ClassOverlapPerc_normalized',
       'OutlierPerc_normalized',  'LabelIssuesPerc_normalized', 'FeatureAlgo_normalized']]
df_normalised_char = df_normalised[[ 'cor.mean_normalized', 'cov.mean_normalized',
       'eigenvalues.mean_normalized', 'g_mean.mean_normalized',
       'h_mean.mean_normalized', 'iq_range.mean_normalized',
       'kurtosis.mean_normalized', 'mad.mean_normalized',
       'max.mean_normalized', 'mean.mean_normalized', 'median.mean_normalized',
       'min.mean_normalized', 'nr_cor_attr_normalized', 'nr_norm_normalized',
       'nr_outliers_normalized', 'range.mean_normalized', 'sd.mean_normalized',
       'skewness.mean_normalized', 'sparsity.mean_normalized',
       't_mean.mean_normalized', 'var.mean_normalized',
        'attr_to_inst_normalized',
       'inst_to_attr_normalized', 'nr_attr_normalized', 'nr_bin_normalized',
       'nr_inst_normalized', 'nr_num_normalized', 'attr_conc.mean_normalized',
       'attr_ent.mean_normalized', 'nUnique_normalized', 'ena_normalized', 'snr.mean_normalized',
       'cEntropy_normalized', 'FeatureAlgo_normalized']]

## kNN

In [14]:
def evaluate_knn(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    ros = RandomOverSampler(random_state=33)
    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)


    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    
    cross_val_scores = cross_val_score(knn_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_knn(df)

comparison_df = pd.DataFrame(results).transpose()
print(comparison_df)

                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.730866                    0.269134   
df_normalised_char                     0.644435                    0.355565   
df_normalised_quality                  0.442312                    0.557688   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.759615        0.240385  0.784856  0.759615   
df_normalised_char         0.682692        0.317308  0.744291  0.682692   
df_normalised_quality      0.403846        0.596154  0.565061  0.403846   

                       F1 Score  \
df_normalised          0.761207   
df_normalised_char     0.688986   
df_normalised_quality  0.388166   

                                                        Confusion Matrix  
df_normalised          [[12, 1, 3, 0, 0], [0, 13, 0, 0, 0], [5, 2, 10...  
df_normalised_char     [[11, 2, 2, 0, 1], [0, 13, 0, 0, 0], [6, 3, 8,...  
df_normalised_qu

## Decision trees

In [15]:
#df_raw decision

def evaluate_decision_tree(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    ros = RandomOverSampler(random_state=33)

    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Cross-Validation Error Rate': error_rates.mean(),
        'Test Accuracy': accuracy,
        'Test Error Rate': 1 - accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }

datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_decision_tree(df)

comparison_df = pd.DataFrame(results).transpose()

print(comparison_df)

                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.819707                    0.180293   
df_normalised_char                     0.733133                    0.266867   
df_normalised_quality                  0.550516                    0.449484   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.846154        0.153846  0.860741  0.846154   
df_normalised_char         0.778846        0.221154  0.834188  0.778846   
df_normalised_quality      0.490385        0.509615  0.687627  0.490385   

                       F1 Score  \
df_normalised          0.842221   
df_normalised_char     0.787035   
df_normalised_quality  0.498323   

                                                        Confusion Matrix  
df_normalised          [[14, 1, 1, 0, 0], [0, 13, 0, 0, 0], [4, 1, 10...  
df_normalised_char     [[13, 2, 1, 0, 0], [0, 13, 0, 0, 0], [5, 1, 11...  
df_normalised_qu

## Distance based

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE

datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

metric_name = 'Euclidean'
metric_func = lambda X_train, X_test: np.sqrt(((X_train.values[:, np.newaxis] - X_test.values) ** 2).sum(axis=2))

kf = KFold(n_splits=5, shuffle=True, random_state=33)

for name, df in datasets.items():
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    
    cv_accuracies = []

    ros = RandomOverSampler(random_state=33)
    smote = SMOTE(random_state=33)
    
    pipeline = Pipeline([
        ('ros', ros),
        ('smote', smote)
    ])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        distances = metric_func(X_train, X_test)
        
        nearest_indices = np.argmin(distances, axis=0)
        
        predicted_labels = y_train[nearest_indices]
        
        cv_accuracy = accuracy_score(y_test, predicted_labels)
        
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    
    distances = metric_func(X_train, X_test)
    
    nearest_indices = np.argmin(distances, axis=0)
    
    predicted_labels = y_train[nearest_indices]
    
    test_accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(y_test, predicted_labels)

    results[name] = {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }

results_df = pd.DataFrame(results).transpose()

print(results_df)


                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.728846                    0.271154   
df_normalised_char                     0.630769                    0.369231   
df_normalised_quality                  0.394231                    0.605769   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised              0.769231        0.230769  0.777686  0.769231   
df_normalised_char         0.798077        0.201923  0.820765  0.798077   
df_normalised_quality      0.403846        0.596154  0.613018  0.403846   

                       F1 Score  \
df_normalised          0.766673   
df_normalised_char     0.798219   
df_normalised_quality  0.407522   

                                                        Confusion Matrix  
df_normalised          [[14, 1, 1, 0, 0], [0, 10, 0, 3, 0], [5, 2, 9,...  
df_normalised_char     [[11, 1, 3, 1, 0], [0, 10, 0, 3, 0], [2, 1, 13...  
df_normalised_qu

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


## Unification

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def unify_features(test_sample, train_data, train_labels):
    test_features = test_sample.values
    matching_indices = np.all(train_data == test_features, axis=1)
    if matching_indices.any():
        matching_labels = train_labels[matching_indices]
        return np.unique(matching_labels)
    return ["unknown"]

def evaluate_unification(df, random_state=33):
    df = df.copy()
    df.fillna(0, inplace=True)
    
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
   
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # Ensure y_encoded is 1D
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    cv_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    confusion_matrices = []
    
    ros = RandomOverSampler(random_state=33)

    smote = SMOTE(random_state=33)

    pipeline = Pipeline([('ros', ros), ('smote', smote)])

    X_resampled, y_resampled = pipeline.fit_resample(X, y_encoded)
    
    for train_index, test_index in kf.split(X_resampled):
        X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
        y_train, y_test = y_resampled[train_index], y_resampled[test_index]
        
        predicted_labels = []
        for i in range(X_test.shape[0]):
            label = unify_features(X_test.iloc[i], X_train, y_train)
            predicted_labels.append(label)
        
        predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
        valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        if not valid_indices:
            continue
        
        valid_predictions = np.array([predicted_labels[i] for i in valid_indices])

        valid_truth = np.array([y_test[i] for i in valid_indices])
        
        cv_accuracy = accuracy_score(valid_truth, valid_predictions)
       
        cv_accuracies.append(cv_accuracy)
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=33)
    predicted_labels = []
    for i in range(X_test.shape[0]):
        label = unify_features(X_test.iloc[i], X_train, y_train)
        predicted_labels.append(label)
    predicted_labels = [item for sublist in predicted_labels for item in sublist]
        
    valid_indices = [i for i, label in enumerate(predicted_labels) if label != "unknown" and i < len(y_test)]
        
    valid_predictions = np.array([predicted_labels[i] for i in valid_indices])
    valid_truth = np.array([y_test[i] for i in valid_indices])
        
    precision = precision_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    f1 = f1_score(valid_truth, valid_predictions, average='weighted', zero_division=0)
    confusion_matrix_result = confusion_matrix(valid_truth, valid_predictions)    
    test_accuracy = accuracy_score(valid_truth, valid_predictions)

    return {
        'Cross-Validation Accuracy': np.mean(cv_accuracies),
        'Cross-Validation Error Rate': np.mean([1 - acc for acc in cv_accuracies]),
        'Test Accuracy': test_accuracy,
        'Test Error Rate': 1 - test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix_result
    }


datasets = {
    'df_normalised': df_normalised,
    'df_normalised_char': df_normalised_char,
    'df_normalised_quality': df_normalised_quality
}

results = {}

for name, df in datasets.items():
    results[name] = evaluate_unification(df)

results_df = pd.DataFrame(results).transpose()

print(results_df)


                      Cross-Validation Accuracy Cross-Validation Error Rate  \
df_normalised                          0.333918                    0.666082   
df_normalised_char                     0.200077                    0.799923   
df_normalised_quality                  0.187718                    0.812282   

                      Test Accuracy Test Error Rate Precision    Recall  \
df_normalised                  0.25            0.75  0.259081      0.25   
df_normalised_char              0.2             0.8  0.205202       0.2   
df_normalised_quality      0.262136        0.737864  0.267688  0.262136   

                       F1 Score  \
df_normalised          0.252381   
df_normalised_char     0.201614   
df_normalised_quality  0.263796   

                                                        Confusion Matrix  
df_normalised          [[3, 2, 3, 3, 3], [2, 3, 1, 2, 3], [3, 1, 4, 4...  
df_normalised_char     [[3, 1, 2, 7, 2], [3, 3, 3, 1, 3], [4, 1, 1, 4...  
df_normalised_qu