## Goldsmiths University of London
### Authors...: Sandor Kanda (skand001) + Carlos Alves (cdeol003)
### Created...: 14/02/2023

## Data Mining Coursework

## Import the necessary libraries and load the datasets :

In [25]:
# Import pandas for data manipulation using dataframes
import pandas as pd

# Import numpy for numerical computation and statistical analysis
import numpy as np

# Import matplotlib for data visualisation
import matplotlib.pyplot as plt

# Read the sonar training split and test split (in that order) from their CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('sonar_train.csv', 'sonar_test.csv')
)


## Define a function to compute the Minkowski distance between two points :

In [26]:
# Define a function to compute the Minkowski distance between two points
def minkowski_distance(a, b, q):
    """Return the Minkowski distance of order q between the points a and b.

    The distance is (sum_i |a_i - b_i| ** q) ** (1 / q). q = 1 gives the
    Manhattan distance, q = 2 the Euclidean distance, and q = infinity the
    Chebyshev distance (the largest absolute coordinate difference).

    Parameters:
        a, b: Points that support elementwise subtraction (for example numpy
            arrays or pandas Series) and whose difference is numeric.
        q: Order of the distance. Must be greater than zero; may be infinite.

    Returns:
        The distance between a and b as a float.

    Raises:
        ValueError: If q is not greater than zero.
    """
    # Reject orders for which the formula is undefined (q = 0 would divide by zero in 1 / q);
    # written as "not q > 0" so that NaN is rejected as well
    if not q > 0:
        raise ValueError(f"q must be greater than 0, got {q}")

    # Absolute coordinate differences as a float array, so object-dtype inputs use float arithmetic
    differences = np.abs(np.asarray(a - b, dtype=float))

    # For q = infinity the power formula breaks down (1 / q is 0), so return the limit directly
    if np.isinf(q):
        return np.max(differences, initial=0.0)

    # Sum the differences raised to the qth power, then take the qth root of that sum
    return np.power(np.sum(np.power(differences, q)), 1 / q)


## Define the simplest Nearest Neighbor algorithm function : 

In [27]:
# Define a function to classify each test point with the class of its nearest training point
def nearest_neighbor(train_data, test_data, q):
    """Predict a class for every row of test_data with the 1-nearest-neighbour rule.

    Every column except the last one is used as a feature in both DataFrames
    (the columns are compared by position, so both frames must list their
    features in the same order). The predicted class of a test row is the
    'Class' value of the training row with the smallest Minkowski distance of
    order q; on ties the earliest training row wins.

    Parameters:
        train_data: DataFrame of labelled training rows with a 'Class' column.
        test_data: DataFrame of rows to classify, laid out like train_data.
        q: Order of the Minkowski distance passed to minkowski_distance.

    Returns:
        A list with one predicted class per test row, in row order. An entry
        is None when no training row had a distance below infinity (for
        example when train_data is empty).
    """
    # Extract the features and labels once, instead of rebuilding a row object
    # for every (test point, training point) pair inside the loops
    train_features = train_data.iloc[:, :-1].to_numpy(dtype=float)
    train_labels = train_data['Class'].to_numpy()
    test_features = test_data.iloc[:, :-1].to_numpy(dtype=float)

    # Initialise the list of predictions
    predictions = []

    # Loop through each test point
    for test_point in test_features:

        # Initialise the minimum distance to infinity and the nearest class to "none found yet"
        min_distance = float('inf')
        nearest_class = None

        # Loop through each training point together with its label
        for train_point, train_label in zip(train_features, train_labels):

            # Compute the distance between the test point and the training point
            distance = minkowski_distance(test_point, train_point, q)

            # Strict comparison keeps the earliest training point when distances tie
            if distance < min_distance:
                min_distance = distance
                nearest_class = train_label

        # Append the class of the nearest training point to the list of predictions
        predictions.append(nearest_class)

    # Return the list of predictions
    return predictions


## Define a function to compute the performance metrics (accuracy, recall, precision, and F1 measure) :

In [28]:
# Define a function to compute the accuracy, recall, precision, and F1 measure of the model
def compute_metrics(y_true, y_pred, positive_label='M', negative_label='R'):
    """Compute binary classification metrics from true and predicted labels.

    Pairs in which either label is neither positive_label nor negative_label
    are ignored. A metric whose denominator is zero (for example recall when
    there are no positive samples, or every metric on empty input) is
    reported as 0.0 instead of raising ZeroDivisionError.

    Parameters:
        y_true: Iterable of true labels.
        y_pred: Iterable of predicted labels, paired with y_true by position.
        positive_label: Label counted as the positive class (default 'M').
        negative_label: Label counted as the negative class (default 'R').

    Returns:
        A tuple (accuracy, recall, precision, f1_measure).
    """
    # Initialise the true positive, true negative, false positive, and false negative counts
    tp, tn, fp, fn = 0, 0, 0, 0

    # Loop through each true and predicted label and update the matching count
    for true, pred in zip(y_true, y_pred):
        if true == positive_label and pred == positive_label:
            tp += 1
        elif true == positive_label and pred == negative_label:
            fn += 1
        elif true == negative_label and pred == positive_label:
            fp += 1
        elif true == negative_label and pred == negative_label:
            tn += 1

    # Divide, falling back to 0.0 when the denominator is zero
    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    # Compute the accuracy, recall, precision, and F1 measure
    accuracy = ratio(tp + tn, tp + tn + fp + fn)
    recall = ratio(tp, tp + fn)
    precision = ratio(tp, tp + fp)
    f1_measure = ratio(2 * precision * recall, precision + recall)

    # Return the accuracy, recall, precision, and F1 measure
    return accuracy, recall, precision, f1_measure


## Run the Nearest Neighbor algorithm with Manhattan (q=1) and Euclidean (q=2) distances :

In [29]:
# Evaluate the classifier with the Manhattan (q = 1) and the Euclidean (q = 2) distance
for q in (1, 2):

    # Classify the test set, then score the predictions against the true classes
    predictions = nearest_neighbor(train_data, test_data, q)
    accuracy, recall, precision, f1_measure = compute_metrics(test_data['Class'], predictions)

    # Print the report for this q, one metric per line, followed by an empty line
    print(
        f"Results for q = {q}:",
        f"  Accuracy...: {accuracy:.2%}",
        f"  Recall.....: {recall:.2%}",
        f"  Precision..: {precision:.2%}",
        f"  F1 Measure.: {f1_measure:.2%}",
        "",
        sep="\n",
    )


Results for q = 1:
  Accuracy...: 88.41%
  Recall.....: 94.59%
  Precision..: 85.37%
  F1 Measure.: 89.74%

Results for q = 2:
  Accuracy...: 89.86%
  Recall.....: 97.30%
  Precision..: 85.71%
  F1 Measure.: 91.14%



## Run the Nearest Neighbor algorithm for q values from 1 to 20 and display the results in a chart :

In [30]:
# Candidate orders q = 1, ..., 20 of the Minkowski distance
q_values = list(range(1, 21))

# Initialise the lists to store the performance metrics, one entry per value of q
accuracies, recalls, precisions, f1_measures = [], [], [], []

# Evaluate the classifier for each value of q
for q in q_values:

    # Classify the test set with the current q
    predictions = nearest_neighbor(train_data, test_data, q)

    # Compute the accuracy, recall, precision, and F1 measure
    accuracy, recall, precision, f1_measure = compute_metrics(test_data['Class'], predictions)

    # Append the performance metrics to the lists
    accuracies.append(accuracy)
    recalls.append(recall)
    precisions.append(precision)
    f1_measures.append(f1_measure)

# Plot the performance metrics for each value of q
plt.plot(q_values, accuracies, label='Accuracy')
plt.plot(q_values, recalls, label='Recall')
plt.plot(q_values, precisions, label='Precision')
plt.plot(q_values, f1_measures, label='F1 Measure')
plt.xlabel('q Value')
plt.ylabel('Performance Metrics')
plt.legend()
plt.title('Performance Metrics for Different q Values')
plt.show()

# Find the best value of q by looking it up in q_values, so the result stays
# correct if q_values no longer starts at 1 or uses a different step;
# argmax returns the first maximum, so the smallest such q wins on ties
best_index = int(np.argmax(accuracies))
best_q = q_values[best_index]
best_accuracy = accuracies[best_index]

# Print the best value of q
print(f"The best accuracy is achieved for q = {best_q} with an accuracy of {best_accuracy:.2%}")
