## Goldsmiths University of London
### Authors...: Sandor Kanda (skand001) + Carlos Alves (cdeol003)
### Created...: 14/02/2023

## Data Mining Coursework

## PART 1: 
### This task is based on the Sonar real data seen previously in class. Several objects, which can be rocks or metal cylinders, are scanned with sonar signals at different angles and under different conditions. 60 measurements are recorded for each object (one record per object, one measurement per column); these are the predictors, called A1, A2, …, A60. The label associated with each record is the letter "R" if the object is a rock and "M" if it is a metal cylinder; this is the outcome variable, called Class. Two datasets are provided: a training dataset in the sonar_train.csv file and a test dataset in the sonar_test.csv file.

# Setup

In [None]:
# Import relevant libraries for the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Get the Data

In [None]:
# Load the credit default test and train splits from CSV files in the working directory
# NOTE(review): Part 1 is about the sonar data; presumably these frames belong to a later part — confirm
df_credit_test = pd.read_csv('creditdefault_test.csv')
df_credit_train = pd.read_csv('creditdefault_train.csv')

In [None]:
# Load the sonar test and train splits from CSV files in the working directory
df_sonar_test = pd.read_csv('sonar_test.csv')
df_sonar_train = pd.read_csv('sonar_train.csv')

## Take a Quick Look at the Data Structure

In [None]:
# Preview the first rows of the sonar test dataframe
df_sonar_test.head()

In [None]:
# Display a concise summary (columns, non-null counts, dtypes) of the sonar test dataframe
df_sonar_test.info()

In [None]:
# Provide a statistical summary of the sonar test dataframe (transposed so that each column of the data becomes a row)
df_sonar_test.describe().T

## Data Cleanup

In [None]:
# Count the missing values per column in the sonar test dataframe
# (the original cell inspected df_credit_test, which is not the dataset used in this part)
df_sonar_test.isnull().sum()

In [None]:
# Count the missing values per column in the sonar train dataframe
# (the original cell inspected df_credit_train, which is not the dataset used in this part)
df_sonar_train.isnull().sum()

In [None]:
# Bar chart of how often each Class label occurs in the sonar train dataframe
train_class_counts = df_sonar_train['Class'].value_counts()
ax = train_class_counts.plot(kind='bar')
ax.set_title('Distribution of the target variable in the dataframe sonar train')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of how often each Class label occurs in the sonar test dataframe
test_class_counts = df_sonar_test['Class'].value_counts()
ax = test_class_counts.plot(kind='bar')
ax.set_title('Distribution of the target variable in the dataframe sonar test')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Split the sonar train dataframe into the target vector (the Class column)
# and the feature matrix (every remaining column)
y_train = df_sonar_train['Class']
X_train = df_sonar_train.drop(columns='Class')

## Training the K-NN model on the Training set using the euclidean distance metric

### Using the Library from sklearn:

In [None]:
# Bring in scikit-learn's k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour model that measures closeness with the euclidean distance
classifier = KNeighborsClassifier(metric='euclidean', n_neighbors=1)

# Train the model on the sonar training predictors and labels
classifier.fit(X=X_train, y=y_train)

In [None]:
# Evaluation helpers from scikit-learn
from sklearn.metrics import confusion_matrix, classification_report

# Predict the Class of every record in the sonar test dataframe
y_pred = classifier.predict(df_sonar_test.drop(columns='Class'))

# Compare the true test labels with the predictions
cm = confusion_matrix(df_sonar_test['Class'], y_pred)
cr = classification_report(df_sonar_test['Class'], y_pred)

# Show the confusion matrix (rows: actual class, columns: predicted class)
print(cm)

# Accuracy = correctly classified records (matrix diagonal) over all predictions,
# printed as a percentage with 2 decimal places
correct_predictions = cm[0,0] + cm[1,1]
print('\n>> Accuracy score: {:.2f}%'.format(correct_predictions/len(y_pred) * 100))


# Evaluation metrics to measure the performance of the model:

In [None]:
# Draw the confusion matrix as a heatmap and caption it with TP, TN, FP and FN
import seaborn as sns
import matplotlib.pyplot as plt

# Read the four cells of the 2x2 confusion matrix; the class in row/column 1
# is treated as the positive class
TN, FP = cm[0,0], cm[0,1]
FN, TP = cm[1,0], cm[1,1]

# Heatmap with the integer counts written inside each cell
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# One caption per count, stacked below the plot in axes-relative coordinates
captions = [
    (-0.2, "True Positives: {}", TP),
    (-0.3, "True Negatives: {}", TN),
    (-0.4, "False Positives: {}", FP),
    (-0.5, "False Negatives: {}", FN),
]
for y_offset, template, count in captions:
    plt.text(0.5, y_offset, template.format(count), size=12, ha="center",
             transform=plt.gca().transAxes)

# Show the plot
plt.show()


## Create a K-Nearest Neighbors classifier without using the scikit-learn library


### Implements the KNN algorithm using Euclidean distance metric:

In [None]:
# Import the numpy library
import numpy as np

# Define a K-Nearest Neighbors classifier implemented with NumPy only
class KNearestNeighbors:
    """K-nearest-neighbours classifier using the Euclidean distance metric.

    The training data is stored as given by ``fit``; ``predict`` labels each
    query point with the most common label among its ``k`` closest training
    points.

    Parameters
    ----------
    k : int, default 1
        Number of nearest training points that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    # Initialize the class with the number of nearest neighbors
    def __init__(self, k=1):
        if k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(k))
        self.k = k

    def fit(self, X, y):
        """Store the training features ``X`` and their labels ``y``.

        ``X`` is expected to hold one training point per row and ``y`` one
        label per training point (arrays, DataFrames and Series all work).
        """
        self.X_train = X
        self.y_train = y

    def _predict_one(self, point, train_features, train_labels):
        """Return the majority label among the k training points nearest to ``point``."""
        from collections import Counter

        # Calculate the euclidean distance between the test point and all training points
        distances = np.sqrt(np.sum((train_features - point) ** 2, axis=1))

        # Sort the distances and keep the positions of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]

        # Positional lookup on a plain array, so a non-default pandas index
        # on the training labels cannot cause a wrong or failed lookup
        k_nearest_labels = train_labels[k_indices].tolist()

        # Return the most common class label
        return Counter(k_nearest_labels).most_common(1)[0][0]

    def predict(self, X):
        """Predict the label of one point or of every row of a matrix.

        Parameters
        ----------
        X : array-like
            Either a single point (1-D, one value per feature) or a 2-D
            collection with one point per row.

        Returns
        -------
        A single label when ``X`` is 1-D, otherwise a NumPy array with one
        predicted label per row of ``X``.
        """
        # Work on plain arrays so that pandas index/column alignment cannot
        # interfere with the element-wise subtraction
        train_features = np.asarray(self.X_train, dtype=float)
        train_labels = np.asarray(self.y_train)
        queries = np.asarray(X, dtype=float)

        # A single test point keeps the original behaviour: one label back
        if queries.ndim == 1:
            return self._predict_one(queries, train_features, train_labels)

        # Each test row is classified independently against the training set
        return np.array([
            self._predict_one(row, train_features, train_labels)
            for row in queries
        ])


In [None]:
# Create an instance of the custom KNearestNeighbors class with k=1 and store the training data in it
# (this rebinds the name `classifier`, which previously held the scikit-learn model)
classifier = KNearestNeighbors(k=1)
classifier.fit(X_train, y_train)

In [None]:
# Predict the target variable for the dataframe sonar test using the custom classifier
y_pred = classifier.predict(df_sonar_test.drop('Class', axis=1))


### Check that the y_pred has the same results as using the sklearn library

In [None]:
# Import the KNeighborsClassifier class
from sklearn.neighbors import KNeighborsClassifier

# Create the reference classifier using the euclidean distance metric with 1 nearest neighbour
# (this rebinds `classifier`; `y_pred` still holds the custom classifier's predictions)
classifier = KNeighborsClassifier(n_neighbors=1, metric='euclidean')

# Fit the classifier to the data
classifier.fit(X_train, y_train)

# Predict with the scikit-learn model so there is a reference to compare against;
# the original cell fitted the model but never used it
y_pred_sklearn = classifier.predict(df_sonar_test.drop('Class', axis=1))

# Count how many of the custom predictions agree with the scikit-learn predictions
n_matching = int(np.sum(np.asarray(y_pred) == np.asarray(y_pred_sklearn)))
print('>> Predictions matching scikit-learn: {}/{}'.format(n_matching, len(y_pred_sklearn)))

# Import the confusion matrix and classification report
from sklearn.metrics import confusion_matrix, classification_report

# Create the confusion matrix for the custom classifier's predictions
cm = confusion_matrix(df_sonar_test['Class'], y_pred)

# Create the classification report for the custom classifier's predictions
cr = classification_report(df_sonar_test['Class'], y_pred)

# Print the confusion matrix
print(cm)

# Print accuracy score with 2 decimal places
print('\n>> Accuracy score: {:.2f}%'.format((cm[0,0] + cm[1,1])/len(y_pred) * 100))

In [None]:
# Derive accuracy, precision, recall and F1 score from the confusion matrix,
# treating the class in row/column 1 as the positive class
true_pos, false_pos, false_neg = cm[1,1], cm[0,1], cm[1,0]

accuracy = (cm[0,0] + true_pos)/len(y_pred)
precision = true_pos/(true_pos + false_pos)
recall = true_pos/(true_pos + false_neg)
f1_score = 2 * precision * recall / (precision + recall)

# Report every metric as a percentage with 2 decimal places
metric_lines = (
    ('\n>> Accuracy...: {:.2f}%', accuracy),
    ('>> Precision..: {:.2f}%', precision),
    ('>> Recall.....: {:.2f}%', recall),
    ('>> F1 score...: {:.2f}%', f1_score),
)
for template, value in metric_lines:
    print(template.format(value * 100))


In [None]:
# Bar chart comparing the four evaluation metrics
import matplotlib.pyplot as plt

# One bar per metric, in a fixed order
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 score']
metric_values = [accuracy, precision, recall, f1_score]
plt.bar(metric_names, metric_values)

# Label and display the chart
plt.title('Accuracy, Precision, Recall, and F1 score')
plt.xlabel('Metric')
plt.ylabel('Score')
plt.show()
