# Spam Email Classification Using Machine Learning
### Group Members:
Simon Lindqvist: siln22@student.bth.se

Abdalrahman Mohammed: abmm22@student.bth.se

### General setup

In [8]:
# --- Imports ---
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import jinja2 as j2
import scipy.stats as stats

# --- Import models ---
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# --- K fold and cross val stuff ---
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

# --- Other stuff ---
import time

# --- Load data ---
spambase_data_df = pd.read_csv("spambase.data", header=None)
x_data = spambase_data_df.iloc[:, :-1] # All columns except the last
y_data = spambase_data_df.iloc[:, -1] # Last column is the target

### Run stratified ten-fold cross-validation tests for each model


In [9]:
# Kfold setup
n_folds = 10
strat_k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=7)


Naive Bayes

In [10]:
# Run stratified kfold on the data with naive bayes classifier
naive_bayes_accuracy = []
naive_bayes_f1 = []
naive_bates_time = []

for train_index, test_index in strat_k_fold.split(x_data, y_data):
    x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    naive_bayes = GaussianNB()

    timer = time.time()

    naive_bayes.fit(x_train, y_train)

    delta_time = time.time() - timer

    y_pred = naive_bayes.predict(x_test)

    naive_bayes_accuracy.append(accuracy_score(y_test, y_pred))
    naive_bayes_f1.append(f1_score(y_test, y_pred))
    naive_bates_time.append(delta_time)


K-Nearest Neighbours

In [11]:
# Run stratified kfold on the data with knn classifier
knn_accuracy = []
knn_f1 = []
knn_time = []

for train_index, test_index in strat_k_fold.split(x_data, y_data):
    x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    knn = KNeighborsClassifier(n_neighbors=3)

    timer = time.time()

    knn.fit(x_train, y_train)

    delta_time = time.time() - timer

    y_pred = knn.predict(x_test)

    knn_accuracy.append(accuracy_score(y_test, y_pred))
    knn_f1.append(f1_score(y_test, y_pred))
    knn_time.append(delta_time)


Decision Tree

In [12]:
# Run stratified kfold on the data with decision tree classifier
decision_tree_accuracy = []
decision_tree_f1 = []
decision_tree_time = []

for train_index, test_index in strat_k_fold.split(x_data, y_data):
    x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    decision_tree = DecisionTreeClassifier()

    timer = time.time()

    decision_tree.fit(x_train, y_train)

    delta_time = time.time() - timer

    y_pred = decision_tree.predict(x_test)

    decision_tree_accuracy.append(accuracy_score(y_test, y_pred))
    decision_tree_f1.append(f1_score(y_test, y_pred))
    decision_tree_time.append(delta_time)


### Present the results exactly as in example 12.4 in course literature

In [13]:
all_model_accuracy = [naive_bayes_accuracy, knn_accuracy, decision_tree_accuracy]
all_model_f1 = [naive_bayes_f1, knn_f1, decision_tree_f1]
all_model_time = [naive_bates_time, knn_time, decision_tree_time]

# --- For accuracy ---

# Make new df for accuracy
models_accuracy_df = pd.DataFrame(columns=["Fold", "Naive Bayes", "KNN", "Decision Tree"])
models_accuracy_df["Fold"] = range(1, n_folds+1)
models_accuracy_df["Naive Bayes"] = naive_bayes_accuracy
models_accuracy_df["KNN"] = knn_accuracy
models_accuracy_df["Decision Tree"] = decision_tree_accuracy

# Add 11th row in fold column as mean
models_accuracy_df.loc[n_folds] = ["avg", np.mean(naive_bayes_accuracy), np.mean(knn_accuracy), np.mean(decision_tree_accuracy)]
# Add 12th row in fold column as std
models_accuracy_df.loc[n_folds+1] = ["std", np.std(naive_bayes_accuracy), np.std(knn_accuracy), np.std(decision_tree_accuracy)]

# Make into latex table using the to_latex function
models_accuracy_print_latex = models_accuracy_df.to_latex(index=False, header=True, float_format="%.3f", column_format="|l|c|c|c|", escape=False)

# Adjust LaTeX to add horizontal lines and match desired style
models_accuracy_print_latex = models_accuracy_print_latex.replace("\\toprule", "\\hline")
models_accuracy_print_latex = models_accuracy_print_latex.replace("\\bottomrule", "\\hline")

# --- For f1 ---

# Make new df for f1
models_f1_df = pd.DataFrame(columns=["Fold", "Naive Bayes", "KNN", "Decision Tree"])
models_f1_df["Fold"] = range(1, n_folds+1)
models_f1_df["Naive Bayes"] = naive_bayes_f1
models_f1_df["KNN"] = knn_f1
models_f1_df["Decision Tree"] = decision_tree_f1

# Add 11th row in fold column as mean
models_f1_df.loc[n_folds] = ["avg", np.mean(naive_bayes_f1), np.mean(knn_f1), np.mean(decision_tree_f1)]
# Add 12th row in fold column as std
models_f1_df.loc[n_folds+1] = ["std", np.std(naive_bayes_f1), np.std(knn_f1), np.std(decision_tree_f1)]

# Make into latex table using the to_latex function
models_f1_print_latex = models_f1_df.to_latex(index=False, header=True, float_format="%.3f", column_format="|l|c|c|c|", escape=False)

# Adjust LaTeX to add horizontal lines and match desired style
models_f1_print_latex = models_f1_print_latex.replace("\\toprule", "\\hline")
models_f1_print_latex = models_f1_print_latex.replace("\\bottomrule", "\\hline")


# --- For time ---

# Make new df for time
models_time_df = pd.DataFrame(columns=["Fold", "Naive Bayes", "KNN", "Decision Tree"])
models_time_df["Fold"] = range(1, n_folds+1)
models_time_df["Naive Bayes"] = naive_bates_time
models_time_df["KNN"] = knn_time
models_time_df["Decision Tree"] = decision_tree_time

# Add 11th row in fold column as mean
models_time_df.loc[n_folds] = ["avg", np.mean(naive_bates_time), np.mean(knn_time), np.mean(decision_tree_time)]
# Add 12th row in fold column as std
models_time_df.loc[n_folds+1] = ["std", np.std(naive_bates_time), np.std(knn_time), np.std(decision_tree_time)]

# Make into latex table using the to_latex function
models_time_print_latex = models_time_df.to_latex(index=False, header=True, float_format="%.3f", column_format="|l|c|c|c|", escape=False)

# Adjust LaTeX to add horizontal lines and match desired style
models_time_print_latex = models_time_print_latex.replace("\\toprule", "\\hline")
models_time_print_latex = models_time_print_latex.replace("\\bottomrule", "\\hline")


# --- Print the latex tables ---
print("Accuracy latex table:")
print(models_accuracy_print_latex)
print("\n")
print("F1 latex table:")
print(models_f1_print_latex)
print("\n")
print("Time latex table:")
print(models_time_print_latex)
print("\n")


Accuracy latex table:
\begin{tabular}{|l|c|c|c|}
\hline
Fold & Naive Bayes & KNN & Decision Tree \\
\midrule
1 & 0.826 & 0.807 & 0.909 \\
2 & 0.841 & 0.787 & 0.889 \\
3 & 0.826 & 0.793 & 0.917 \\
4 & 0.811 & 0.830 & 0.911 \\
5 & 0.846 & 0.813 & 0.911 \\
6 & 0.826 & 0.848 & 0.926 \\
7 & 0.839 & 0.822 & 0.924 \\
8 & 0.780 & 0.800 & 0.915 \\
9 & 0.780 & 0.796 & 0.893 \\
10 & 0.826 & 0.787 & 0.911 \\
avg & 0.820 & 0.808 & 0.911 \\
std & 0.022 & 0.019 & 0.011 \\
\hline
\end{tabular}



F1 latex table:
\begin{tabular}{|l|c|c|c|}
\hline
Fold & Naive Bayes & KNN & Decision Tree \\
\midrule
1 & 0.810 & 0.753 & 0.888 \\
2 & 0.827 & 0.732 & 0.861 \\
3 & 0.815 & 0.740 & 0.895 \\
4 & 0.800 & 0.780 & 0.887 \\
5 & 0.830 & 0.758 & 0.885 \\
6 & 0.814 & 0.800 & 0.908 \\
7 & 0.826 & 0.776 & 0.902 \\
8 & 0.769 & 0.753 & 0.896 \\
9 & 0.775 & 0.749 & 0.869 \\
10 & 0.813 & 0.718 & 0.887 \\
avg & 0.808 & 0.756 & 0.888 \\
std & 0.020 & 0.023 & 0.014 \\
\hline
\end{tabular}



Time latex table:
\begin{tabular}{

### Conduct friedman test for each performance measure

#### Accuracy

In [23]:
# --- Ranking stuff and latex table stuff ---
# Create ranking dataframe for the table
latex_rankings_df = pd.DataFrame({"Data Set": range(1, n_folds + 1), "Naive Bayes": naive_bayes_accuracy, "KNN": knn_accuracy, "Decision Tree": decision_tree_accuracy})
rankings_per_model = {"Naive Bayes": [], "KNN": [], "Decision Tree": []}

# Loop through all models and rank each row based on accuracy and put the ranking as a paranthesis next to the accuracy
for i, row in latex_rankings_df.iterrows():
    # Rank the current row
    rankings_current_row = row.rank(ascending=False, method="min").astype(float)
    # Insert paranthesis next to the accuracy in the dataframe
    for model in ["Naive Bayes", "KNN", "Decision Tree"]:
        rankings_per_model[model].append(rankings_current_row[model])
        # Change format of dataframe i, model to be in string format to accomodate paranthesis
        latex_rankings_df[model] = latex_rankings_df[model].astype(str)
        # Add paranthesis to the value
        latex_rankings_df.at[i, model] = f"{row[model]:.3f} ({int(rankings_current_row[model] - 1)})"

# Add mean and std rows to latex table
latex_rankings_df.loc[n_folds] = ["avg", f"{np.mean(naive_bayes_accuracy):.3f}", f"{np.mean(knn_accuracy):.3f}", f"{np.mean(decision_tree_accuracy):.3f}"]
# Add std row to latex table
latex_rankings_df.loc[n_folds+1] = ["std", f"{np.std(naive_bayes_accuracy):.3f}", f"{np.std(knn_accuracy):.3f}", f"{np.std(decision_tree_accuracy):.3f}"]
# Make into latex table using the to_latex function
accuracy_rankings_latex = latex_rankings_df.to_latex(index=False, header=True, float_format="%.4f", column_format="|l|c|c|c|", escape=False)
# Adjust LaTeX to add horizontal lines and match desired style
accuracy_rankings_latex = accuracy_rankings_latex.replace("\\toprule", "\\hline")
accuracy_rankings_latex = accuracy_rankings_latex.replace("\\bottomrule", "\\hline")

# --- Actual friedmann test stuff ---
# Vars for the friedman test
num_samples = n_folds
num_models = 3
alpha = 0.05
degrees_of_freedom = num_samples - 1

# Calculate avg rank
avg_rank_friedmann = (num_models + 1) / 2
print(f"Avg rank for accuracy: {avg_rank_friedmann}")

# Calculate first sum of squared differences as per the formula in book
sum_squared_diff = 0
for model in ["Naive Bayes", "KNN", "Decision Tree"]:
    Rj = sum(rankings_per_model[model]) / num_samples
    sum_squared_diff += (Rj - avg_rank_friedmann)**2
sum_squared_diff *= num_samples
print(f"Sum of squared differences for accuracy: {sum_squared_diff}")

# Calculate second sum of squared differences as per the formula in book
second_sum_squared_diff = 0
for model in ["Naive Bayes", "KNN", "Decision Tree"]:
    for i in range(num_samples):
        second_sum_squared_diff += (rankings_per_model[model][i] - avg_rank_friedmann)**2
second_sum_squared_diff *= (1 / (num_samples * (num_models - 1)))
print(f"Second sum of squared differences for accuracy: {second_sum_squared_diff}")

# Calculate friedman statistic
friedman_statistic = sum_squared_diff / second_sum_squared_diff
print(f"Friedman statistic for accuracy: {friedman_statistic}")

# Calculate critical value
critical_value = stats.chi2.ppf(1 - alpha, num_models - 1)
print(f"Critical value for accuracy: {critical_value}")

# Check if we reject the null hypothesis
if friedman_statistic > critical_value:
    print("Reject null hypothesis for friedman test (significant difference between models)")
else:
    print("Do not reject null hypothesis for friedman test (no significant difference between models)")


Avg rank for accuracy: 2.0
Sum of squared differences for accuracy: 45.2
Second sum of squared differences for accuracy: 2.5
Friedman statistic for accuracy: 18.080000000000002
Critical value for accuracy: 5.991464547107979
Reject null hypothesis for friedman test (significant difference between models)


#### F1 score

In [15]:
# --- Latex table for f1 scores ---
# Create ranking dataframe for the table
latex_rankings_df_f1 = pd.DataFrame({"Data Set": range(1, n_folds + 1), "Naive Bayes": naive_bayes_f1, "KNN": knn_f1, "Decision Tree": decision_tree_f1})
rankings_per_model_f1 = {"Naive Bayes": [], "KNN": [], "Decision Tree": []}

# Loop through all models and rank each row based on accuracy and put the ranking as a paranthesis next to the accuracy
for i, row in latex_rankings_df_f1.iterrows():
    # Rank the current row
    rankings_current_row = row.rank(ascending=False, method="min").astype(float)
    # Insert paranthesis next to the accuracy in the dataframe
    for model in ["Naive Bayes", "KNN", "Decision Tree"]:
        rankings_per_model_f1[model].append(rankings_current_row[model])
        # Change format of dataframe i, model to be in string format to accomodate paranthesis
        latex_rankings_df_f1[model] = latex_rankings_df_f1[model].astype(str)
        # Add paranthesis to the value
        latex_rankings_df_f1.at[i, model] = f"{row[model]:.3f} ({int(rankings_current_row[model] - 1)})"

# Add mean and std rows to latex table
latex_rankings_df_f1.loc[n_folds] = ["avg", f"{np.mean(naive_bayes_f1):.3f}", f"{np.mean(knn_f1):.3f}", f"{np.mean(decision_tree_f1):.3f}"]
# Add std row to latex table
latex_rankings_df_f1.loc[n_folds+1] = ["std", f"{np.std(naive_bayes_f1):.3f}", f"{np.std(knn_f1):.3f}", f"{np.std(decision_tree_f1):.3f}"]
# Make into latex table using the to_latex function
f1_rankings_latex = latex_rankings_df_f1.to_latex(index=False, header=True, float_format="%.4f", column_format="|l|c|c|c|", escape=False)
# Adjust LaTeX to add horizontal lines and match desired style
f1_rankings_latex = f1_rankings_latex.replace("\\toprule", "\\hline")
f1_rankings_latex = f1_rankings_latex.replace("\\bottomrule", "\\hline")

# --- Actual friedmann test stuff ---
# Vars for the friedman test
num_samples = n_folds
num_models = 3
alpha = 0.05

# Calculate avg rank
avg_rank_friedmann = (num_models + 1) / 2
print(f"Avg rank f1: {avg_rank_friedmann}")

# Calculate first sum of squared differences as per the formula in book
sum_squared_diff = 0
for model in ["Naive Bayes", "KNN", "Decision Tree"]:
    Rj = sum(rankings_per_model_f1[model]) / num_samples
    sum_squared_diff += (Rj - avg_rank_friedmann)**2
sum_squared_diff *= num_samples
print(f"Sum of squared differences f1: {sum_squared_diff}")

# Calculate second sum of squared differences as per the formula in book
second_sum_squared_diff = 0
for model in ["Naive Bayes", "KNN", "Decision Tree"]:
    for i in range(num_samples):
        second_sum_squared_diff += (rankings_per_model_f1[model][i] - avg_rank_friedmann)**2
second_sum_squared_diff *= (1 / (num_samples * (num_models - 1)))
print(f"Second sum of squared differences f1: {second_sum_squared_diff}")

# Calculate friedman statistic
friedman_statistic = sum_squared_diff / second_sum_squared_diff
print(f"Friedman statistic f1: {friedman_statistic}")

# Calculate critical value
critical_value = stats.chi2.ppf(1 - alpha, num_models - 1)
print(f"Critical value f1: {critical_value}")

# Check if we reject the null hypothesis
if friedman_statistic > critical_value:
    print("Reject null hypothesis for f1 (significant difference between models)")
else:
    print("Do not reject null hypothesis for f1 (no significant difference between models)")


Avg rank f1: 2.0
Sum of squared differences f1: 50.0
Second sum of squared differences f1: 2.5
Friedman statistic f1: 20.0
Critical value f1: 5.991464547107979
Reject null hypothesis for f1 (significant difference between models)


#### Time

In [16]:
# --- Latex table for time ---
# Create ranking dataframe for the table
latex_rankings_df_time = pd.DataFrame({"Data Set": range(1, n_folds + 1), "Naive Bayes": naive_bates_time, "KNN": knn_time, "Decision Tree": decision_tree_time})
rankings_per_model_time = {"Naive Bayes": [], "KNN": [], "Decision Tree": []}

# Loop through all models and rank each row based on accuracy and put the ranking as a paranthesis next to the accuracy
for i, row in latex_rankings_df_time.iterrows():
    # Rank the current row
    rankings_current_row = row.rank(ascending=True, method="min").astype(float)
    # Insert paranthesis next to the accuracy in the dataframe
    for model in ["Naive Bayes", "KNN", "Decision Tree"]:
        rankings_per_model_time[model].append(rankings_current_row[model])
        # Change format of dataframe i, model to be in string format to accomodate paranthesis
        latex_rankings_df_time[model] = latex_rankings_df_time[model].astype(str)
        # Add the paranthesis to the dataframe
        latex_rankings_df_time.at[i, model] = f"{row[model]:.3f} ({int(rankings_current_row[model])})"

# Add mean and std rows to latex table
latex_rankings_df_time.loc[n_folds] = ["avg", f"{np.mean(naive_bates_time):.3f}", f"{np.mean(knn_time):.3f}", f"{np.mean(decision_tree_time):.3f}"]
# Add std row to latex table
latex_rankings_df_time.loc[n_folds+1] = ["std", f"{np.std(naive_bates_time):.3f}", f"{np.std(knn_time):.3f}", f"{np.std(decision_tree_time):.3f}"]
# Make into latex table using the to_latex function
time_rankings_latex = latex_rankings_df_time.to_latex(index=False, header=True, float_format="%.4f", column_format="|l|c|c|c|", escape=False)
# Adjust LaTeX to add horizontal lines and match desired style
time_rankings_latex = time_rankings_latex.replace("\\toprule", "\\hline")
time_rankings_latex = time_rankings_latex.replace("\\bottomrule", "\\hline")

# --- Actual friedmann test stuff ---
# Vars for the friedman test
num_samples = n_folds
num_models = 3
alpha = 0.05
degrees_of_freedom = num_samples - 1

# Calculate avg rank
avg_rank_friedmann = (num_models + 1) / 2
print(f"Avg rank for time: {avg_rank_friedmann}")

# Calculate first sum of squared differences as per the formula in book
sum_squared_diff = 0
for model in ["Naive Bayes", "KNN", "Decision Tree"]:
    Rj = sum(rankings_per_model_time[model]) / num_samples
    sum_squared_diff += (Rj - avg_rank_friedmann)**2
sum_squared_diff *= num_samples
print(f"Sum of squared differences for time: {sum_squared_diff}")

# Calculate second sum of squared differences as per the formula in book
second_sum_squared_diff = 0
for model in ["Naive Bayes", "KNN", "Decision Tree"]:
    for i in range(num_samples):
        second_sum_squared_diff += (rankings_per_model_time[model][i] - avg_rank_friedmann)**2
second_sum_squared_diff *= (1 / (num_samples * (num_models - 1)))
print(f"Second sum of squared differences for time: {second_sum_squared_diff}")

# Calculate friedman statistic
friedman_statistic = sum_squared_diff / second_sum_squared_diff
print(f"Friedman statistic for time: {friedman_statistic}")

# Calculate critical value
critical_value = stats.chi2.ppf(1 - alpha, num_models - 1)
print(f"Critical value for time: {critical_value}")

# Check if we reject the null hypothesis
if friedman_statistic > critical_value:
    print("Reject null hypothesis for friedman test (significant difference between models)")
else:
    print("Do not reject null hypothesis for friedman test (no significant difference between models)")

Avg rank for time: 2.0
Sum of squared differences for time: 20.0
Second sum of squared differences for time: 1.0
Friedman statistic for time: 20.0
Critical value for time: 5.991464547107979
Reject null hypothesis for friedman test (significant difference between models)


#### Tables as in example 12.8 in the book

In [17]:
# --- Print the latex tables ---
print("Accuracy rankings latex table:")
print(accuracy_rankings_latex)
print("\n")
print("F1 rankings latex table:")
print(f1_rankings_latex)
print("\n")
print("Time rankings latex table:")
print(time_rankings_latex)
print("\n")

Accuracy rankings latex table:
\begin{tabular}{|l|c|c|c|}
\hline
Data Set & Naive Bayes & KNN & Decision Tree \\
\midrule
1 & 0.826 (2) & 0.807 (3) & 0.909 (1) \\
2 & 0.841 (2) & 0.787 (3) & 0.889 (1) \\
3 & 0.826 (2) & 0.793 (3) & 0.917 (1) \\
4 & 0.811 (3) & 0.830 (2) & 0.911 (1) \\
5 & 0.846 (2) & 0.813 (3) & 0.911 (1) \\
6 & 0.826 (3) & 0.848 (2) & 0.926 (1) \\
7 & 0.839 (2) & 0.822 (3) & 0.924 (1) \\
8 & 0.780 (3) & 0.800 (2) & 0.915 (1) \\
9 & 0.780 (3) & 0.796 (2) & 0.893 (1) \\
10 & 0.826 (2) & 0.787 (3) & 0.911 (1) \\
avg & 0.820 & 0.808 & 0.911 \\
std & 0.022 & 0.019 & 0.011 \\
\hline
\end{tabular}



F1 rankings latex table:
\begin{tabular}{|l|c|c|c|}
\hline
Data Set & Naive Bayes & KNN & Decision Tree \\
\midrule
1 & 0.810 (2) & 0.753 (3) & 0.888 (1) \\
2 & 0.827 (2) & 0.732 (3) & 0.861 (1) \\
3 & 0.815 (2) & 0.740 (3) & 0.895 (1) \\
4 & 0.800 (2) & 0.780 (3) & 0.887 (1) \\
5 & 0.830 (2) & 0.758 (3) & 0.885 (1) \\
6 & 0.814 (2) & 0.800 (3) & 0.908 (1) \\
7 & 0.826 (2) & 0.7

### Determine whether the average ranks as a whole display significant differences on the 0.05 alpha level and, if so, use the Nemenyi test to calculate the critical difference in order to determine which algorithms perform significantly different from each other.

In [18]:
# Determine whether the average ranks as a whole display significant differences on the 0.05 alpha level and, 
# if so, use the Nemenyi test to calculate the critical difference in order to determine which algorithms perform significantly different from each other.

# --- Nemenyi test stuff ---
# Calculate critical difference
q_alpha = 2.343 # According to course book page 356 since we have alpha = 0.05 and k = 3 (models). This does not take degrees of freedom into account

# Calculate critical difference
critical_difference = q_alpha * np.sqrt((num_models * (num_models + 1)) / (6 * num_samples))
print(f"Critical difference: {critical_difference}")


# --- Nemenyi test for accuracy ---
print("Nemenyi test for accuracy:")
# Loop through all models and compare them with each other
for i, model1 in enumerate(["Naive Bayes", "KNN", "Decision Tree"]):
    for j, model2 in enumerate(["Naive Bayes", "KNN", "Decision Tree"]):
        # If i is less than j, then we have not compared the models yet
        if i < j:
            # Calculate the difference between the models
            difference = abs(np.mean(rankings_per_model[model1]) - np.mean(rankings_per_model[model2]))
            # Check if the difference is significant
            if difference > critical_difference:
                print(f"{model1} and {model2} for accuracy are significantly different since the difference is {difference} which is greater than the critical difference of {critical_difference}")
            else:
                print(f"{model1} and {model2} for accuracy are not significantly different since the difference is {difference} which is less than the critical difference of {critical_difference}")
print("\n")

# --- Nemenyi test for f1 ---
print("Nemenyi test for f1:")
# Loop through all models and compare them with each other
for i, model1 in enumerate(["Naive Bayes", "KNN", "Decision Tree"]):
    for j, model2 in enumerate(["Naive Bayes", "KNN", "Decision Tree"]):
        # If i is less than j, then we have not compared the models yet
        if i < j:
            # Calculate the difference between the models
            difference = abs(np.mean(rankings_per_model_f1[model1]) - np.mean(rankings_per_model_f1[model2]))
            # Check if the difference is significant
            if difference > critical_difference:
                print(f"{model1} and {model2} for f1 are significantly different since the difference is {difference} which is greater than the critical difference of {critical_difference}")
            else:
                print(f"{model1} and {model2} for f1 are not significantly different since the difference is {difference} which is less than the critical difference of {critical_difference}")
print("\n")

# --- Nemenyi test for time ---
print("Nemenyi test for time:")
# Loop through all models and compare them with each other
for i, model1 in enumerate(["Naive Bayes", "KNN", "Decision Tree"]):
    for j, model2 in enumerate(["Naive Bayes", "KNN", "Decision Tree"]):
        # If i is less than j, then we have not compared the models yet
        if i < j:
            # Calculate the difference between the models
            difference = abs(np.mean(rankings_per_model_time[model1]) - np.mean(rankings_per_model_time[model2]))
            # Check if the difference is significant
            if difference > critical_difference:
                print(f"{model1} and {model2} for time are significantly different since the difference is {difference} which is greater than the critical difference of {critical_difference}")
            else:
                print(f"{model1} and {model2} for time are not significantly different since the difference is {difference} which is less than the critical difference of {critical_difference}")


Critical difference: 1.0478214542564015
Nemenyi test for accuracy:
Naive Bayes and KNN for accuracy are not significantly different since the difference is 0.20000000000000018 which is less than the critical difference of 1.0478214542564015
Naive Bayes and Decision Tree for accuracy are significantly different since the difference is 1.4 which is greater than the critical difference of 1.0478214542564015
KNN and Decision Tree for accuracy are significantly different since the difference is 1.6 which is greater than the critical difference of 1.0478214542564015


Nemenyi test for f1:
Naive Bayes and KNN for f1 are not significantly different since the difference is 1.0 which is less than the critical difference of 1.0478214542564015
Naive Bayes and Decision Tree for f1 are not significantly different since the difference is 1.0 which is less than the critical difference of 1.0478214542564015
KNN and Decision Tree for f1 are significantly different since the difference is 2.0 which is gr