## Goldsmiths University of London
### Authors...: Sandor Kanda (skand001) + Carlos Alves (cdeol003)
### Created...: 14/02/2023

## Data Mining Coursework

In [1]:
# Import necessary libraries for this coursework

# Import pandas for data manipulation
import pandas as pd

# Import numpy for numerical calculations
import numpy as np

# Import matplotlib for plotting
import matplotlib.pyplot as plt

In [2]:
# Load the training and test datasets
# Read the two sonar splits (train first, then test) from the working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("sonar_train.csv", "sonar_test.csv")
)

In [3]:
# Define a function to calculate the Minkowski distance
def minkowski_distance(a, b, q):
    """Return the Minkowski distance of order ``q`` between two vectors.

    Parameters
    ----------
    a, b : array-like of numbers
        The two vectors. Lists, tuples and object-dtype arrays are accepted;
        both are converted to float arrays before subtracting.
    q : int or float
        Order of the distance; must be strictly positive. ``q=1`` is the
        Manhattan distance, ``q=2`` the Euclidean distance, and an infinite
        ``q`` the Chebyshev (maximum coordinate) distance.

    Returns
    -------
    float
        ``(sum(|a_i - b_i| ** q)) ** (1 / q)``, or ``max(|a_i - b_i|)`` when
        ``q`` is infinite.

    Raises
    ------
    ValueError
        If ``q`` is not strictly positive (this also rejects NaN).
    """
    # `not q > 0` rather than `q <= 0` so that NaN is rejected as well
    if not q > 0:
        raise ValueError(f"q must be a positive number, got {q!r}")

    # Convert up front so plain Python sequences support element-wise subtraction
    diff = np.abs(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))

    # The general formula degenerates for q = inf (x ** inf, then ** 0),
    # so use the limit of the Minkowski distance instead.
    if np.isinf(q):
        return np.max(diff, initial=0.0)

    # Calculate the Minkowski distance between two vectors
    return np.sum(diff ** q) ** (1 / q)

In [4]:
# Define functions to calculate accuracy, recall, precision, and F1 measure:

# Define function accuracy = (TP + TN) / (TP + TN + FP + FN)
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. Both are converted to NumPy arrays so the
        comparison is element-wise even when two plain lists are passed
        (``list == list`` would otherwise yield a single bool).

    Returns
    -------
    float
        Number of matching positions divided by the number of labels;
        ``nan`` when there are no labels.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    # Accuracy is undefined without samples; return nan without dividing by zero
    if y_true.size == 0:
        return np.nan

    # Calculate and return the accuracy of the model
    return np.sum(y_true == y_pred) / y_true.size

# Define function recall = TP / (TP + FN)
def recall(y_true, y_pred):
    """Return the recall TP / (TP + FN), with 'M' as the positive class.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels ('M' or 'R'). Both are converted to NumPy
        arrays first: comparing a plain list with ``'M'`` gives one ``False``
        instead of an element-wise mask, which zeroes every count.

    Returns
    -------
    numpy.float64
        The recall, or 0.0 when there is no true-'M' sample counted
        (TP + FN == 0), where the ratio is undefined.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate and return the recall of the model
    tp = np.sum((y_true == 'M') & (y_pred == 'M'))
    fn = np.sum((y_true == 'M') & (y_pred == 'R'))

    positives = tp + fn
    if positives == 0:
        # Stay a NumPy float so arithmetic on the result keeps NumPy semantics
        return np.float64(0.0)
    return tp / positives

# Define function precision = TP / (TP + FP)
def precision(y_true, y_pred):
    """Return the precision TP / (TP + FP), with 'M' as the positive class.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels ('M' or 'R'). Both are converted to NumPy
        arrays first: comparing a plain list with ``'M'`` gives one ``False``
        instead of an element-wise mask, which zeroes every count.

    Returns
    -------
    numpy.float64
        The precision, or 0.0 when nothing was predicted as 'M'
        (TP + FP == 0), where the ratio is undefined.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate and return the precision of the model
    tp = np.sum((y_true == 'M') & (y_pred == 'M'))
    fp = np.sum((y_true == 'R') & (y_pred == 'M'))

    predicted_positives = tp + fp
    if predicted_positives == 0:
        # Stay a NumPy float so arithmetic on the result keeps NumPy semantics
        return np.float64(0.0)
    return tp / predicted_positives

# Define function F1 = 2 * (precision * recall) / (precision + recall)
def f1_measure(y_true, y_pred):
    """Return the F1 measure, the harmonic mean of precision and recall.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; converted to NumPy arrays before being
        handed to ``recall`` and ``precision``.

    Returns
    -------
    numpy.float64
        ``2 * recall * precision / (recall + precision)``, or 0.0 when that
        denominator is zero or NaN (no true positives).
    """
    # Convert once here so both helpers receive arrays, not plain lists
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate and return the F1 measure of the model
    r = recall(y_true, y_pred)
    p = precision(y_true, y_pred)

    # `not (...) > 0` catches both a zero and a NaN denominator,
    # avoiding a 0/0 result
    if not (r + p) > 0:
        return np.float64(0.0)
    return 2 * r * p / (r + p)

In [5]:
# Implement the simplest Nearest Neighbour algorithm with the Minkowski distance

# Define a function to calculate the nearest neighbour
def nearest_neighbour(train_data, test_data, q):
    """Predict the class of each test row from its nearest training row.

    Every test row is compared with every training row using
    ``minkowski_distance`` of order ``q``; the "Class" value of the closest
    training row becomes the prediction.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames holding a "Class" column plus the feature columns. Features
        are compared by position, so both frames are expected to list them
        in the same order.
    q : int or float
        Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean).

    Returns
    -------
    list
        One predicted class per row of ``test_data``, in row order. An entry
        is ``None`` if no distance compared below infinity (e.g. all NaN).

    Raises
    ------
    ValueError
        If ``train_data`` has no rows, since no neighbour can then exist.
    KeyError
        If either frame lacks the "Class" column.
    """
    # Split features and labels once. Doing this per row inside the loops
    # (iterrows + Series.drop) repeats the same work for every test row.
    train_features = train_data.drop(columns="Class").to_numpy(dtype=float)
    train_labels = train_data["Class"].tolist()
    test_features = test_data.drop(columns="Class").to_numpy(dtype=float)

    if len(train_labels) == 0:
        raise ValueError("train_data must contain at least one row")

    # Initialise a list to store the predicted classes
    y_pred = []

    # Iterate over each test data point
    for test_vector in test_features:

        # Initialise the minimum distance to infinity
        min_distance = float("inf")
        nearest_class = None

        # Iterate over each training data point
        for train_vector, train_label in zip(train_features, train_labels):

            # Calculate the Minkowski distance between the test and training data points
            distance = minkowski_distance(test_vector, train_vector, q)

            # Strict '<' keeps the earliest training row when distances tie
            if distance < min_distance:
                min_distance = distance
                nearest_class = train_label

        # Append the nearest class to the list of predicted classes
        y_pred.append(nearest_class)

    # Return the list of predicted classes
    return y_pred

In [None]:
# Run the algorithm for Manhattan distance (q=1) and Euclidean distance (q=2)

# Extract the true classes of the test data
y_true = test_data["Class"].values

# Run the algorithm for q=1 and q=2
for q in [1, 2]:

    # Calculate the predicted classes
    y_pred = nearest_neighbour(train_data, test_data, q)

    # Print the accuracy, recall, precision, and F1 measure
    print(f"q = {q}")
    print(f"Accuracy: {accuracy(y_true, y_pred):.4f}")
    print(f"Recall: {recall(y_true, y_pred):.4f}")
    print(f"Precision