# LOAD DATASET
---

In [5]:
import numpy as np
import pandas as pd


def convert_if_possible(val):
    """Parse ``val`` as a number when ``float(val)`` accepts it.

    Returns an ``int`` when the parsed value has no fractional part (so both
    ``"3"`` and ``"3.0"`` become ``3``), a ``float`` when it does, and ``val``
    itself, unchanged, when ``float(val)`` raises ``ValueError``.
    """
    try:
        number = float(val)
    except ValueError:
        # Not numeric text: hand the input back untouched.
        return val
    return int(number) if number.is_integer() else number


MODE = "breast_cancer"

train_file = "bc_train.csv"
test_file = "bc_test.csv"

# Columns 1 through 9 are passed through convert_if_possible so that
# whole-number entries in the file (written without a ".") come back as ints
# rather than floats when pandas builds the DataFrame.
test_df = pd.read_csv(
    test_file,
    sep=" ",
    converters=dict.fromkeys(range(1, 10), convert_if_possible),
)
train_df = pd.read_csv(
    train_file,
    sep=" ",
    converters=dict.fromkeys(range(1, 10), convert_if_possible),
)

# Neural Network
---

In [6]:
from FCNN import (
    initialize_network,
    predict,
    train_network,
    read_configuration_file,
    accuracy_score,
    F1_score,
)

# Flatten each DataFrame row into a plain list. The final entry of every row
# is cast to int; all preceding entries are kept exactly as loaded.
train_dataset = [
    list(record[:-1]) + [int(record[-1])]
    for record in train_df.itertuples(index=False, name=None)
]
test_dataset = [
    list(record[:-1]) + [int(record[-1])]
    for record in test_df.itertuples(index=False, name=None)
]


NUM_EPOCHS = 100

# Every position in a row except the last one counts as a network input.
n_inputs = len(train_dataset[0]) - 1
n_outputs = 4 if MODE != "breast_cancer" else 2
# NOTE(review): the positional 1 is presumably the hidden-layer size — confirm
# against FCNN.initialize_network.
network = initialize_network(n_inputs, 1, n_outputs)
# The 0.1 matches the "lrate=0.100" that the training log reports per epoch.
trained_network = train_network(network, train_dataset, 0.1, NUM_EPOCHS, n_outputs)

print(f"All of the hyperparameters of this NN are: {trained_network}")

>epoch=0, lrate=0.100, error=194.408
>epoch=1, lrate=0.100, error=196.044
>epoch=2, lrate=0.100, error=194.910
>epoch=3, lrate=0.100, error=190.286
>epoch=4, lrate=0.100, error=182.767
>epoch=5, lrate=0.100, error=172.916
>epoch=6, lrate=0.100, error=160.860
>epoch=7, lrate=0.100, error=147.502
>epoch=8, lrate=0.100, error=134.286
>epoch=9, lrate=0.100, error=122.324
>epoch=10, lrate=0.100, error=112.073
>epoch=11, lrate=0.100, error=103.515
>epoch=12, lrate=0.100, error=96.422
>epoch=13, lrate=0.100, error=90.521
>epoch=14, lrate=0.100, error=85.566
>epoch=15, lrate=0.100, error=81.357
>epoch=16, lrate=0.100, error=77.737
>epoch=17, lrate=0.100, error=74.586
>epoch=18, lrate=0.100, error=71.815
>epoch=19, lrate=0.100, error=69.353
>epoch=20, lrate=0.100, error=67.151
>epoch=21, lrate=0.100, error=65.165
>epoch=22, lrate=0.100, error=63.365
>epoch=23, lrate=0.100, error=61.724
>epoch=24, lrate=0.100, error=60.221
>epoch=25, lrate=0.100, error=58.836
>epoch=26, lrate=0.100, error=57.556

In [7]:
# TESTING ON UNSEEN DATA

# The expected label is the last entry of every row. Extract the label lists
# once so the accuracy and F1 calls share them instead of rebuilding them.
test_labels = [row[-1] for row in test_dataset]
train_labels = [row[-1] for row in train_dataset]

# predict() is handed the complete row (label included), as in the original
# loops; only the list-building idiom changed.
test_predictions = [predict(trained_network, row) for row in test_dataset]
train_predictions = [predict(trained_network, row) for row in train_dataset]

accuracy_on_test_data = accuracy_score(test_labels, test_predictions)
accuracy_on_train_data = accuracy_score(train_labels, train_predictions)

f1_test = F1_score(test_labels, test_predictions)
f1_train = F1_score(train_labels, train_predictions)

# Per the recorded output, accuracy is reported on a 0-100 scale while the F1
# score is reported on a 0-1 scale.
print(f"Accuracy on training data: {accuracy_on_train_data}")
print(f"F1 score on training data: {f1_train}")
print()
print(f"Accuracy on test data: {accuracy_on_test_data}")
print(f"F1 score on test data: {f1_test}")

Accuracy on training data: 97.32441471571906
F1 score on training data: 0.9732441471571907

Accuracy on test data: 97.01492537313433
F1 score on test data: 0.9701492537313433


# Clustering (AgglomerativeClustering; KMeans and DBSCAN alternatives are commented out)

In [None]:
import itertools

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans


def do_clustering(test_df, train_df):
    """Cluster ``test_df`` into two groups and plot feature-pair scatter plots.

    Both frames have their columns renamed in place to ``feature_0``,
    ``feature_1``, ... with the last column renamed to ``Y`` (the target).
    The rows of ``test_df`` without ``Y`` are clustered with
    ``AgglomerativeClustering(n_clusters=2)``, the cluster ids are scored
    against ``Y`` with ``accuracy_score``, and every pair among the first five
    feature columns is drawn as a scatter plot coloured by cluster id.

    Args:
        test_df: DataFrame that is clustered and plotted; its last column is
            treated as the target.
        train_df: DataFrame with the same number of columns as ``test_df``.
            It only receives the new column names; it is not clustered.

    Returns:
        None. The cluster ids are printed and the figure is shown.
    """
    df = test_df  # the frame that is actually clustered and plotted
    # range() is required: iterating directly over the int returned by len()
    # raises "TypeError: 'int' object is not iterable".
    column_names = [f"feature_{i}" for i in range(len(df.columns))]
    column_names[-1] = "Y"  # set the last column to be the Y col (target)
    # add column names to both dataframes (in-place rename, visible to callers)
    test_df.columns = column_names
    train_df.columns = column_names
    pairs = 5  # number of leading feature columns to combine into plots
    # Only real features are eligible for plotting, never the target column.
    feature_names = column_names[:-1]

    # clusters = KMeans(n_clusters=2, init="k-means++", max_iter=1000, random_state=0).fit(X)
    # clusters = DBSCAN(eps=0.5, min_samples=5).fit(X) # this does -1 or 0
    X = df.drop("Y", axis=1)
    clusters = AgglomerativeClustering(n_clusters=2).fit(X)
    Y = df["Y"]
    print(clusters.labels_)
    # Pass plain lists, matching how accuracy_score is called for the neural
    # network predictions.
    # NOTE(review): cluster ids are arbitrary, so cluster 0 need not correspond
    # to class 0; the score may be inverted relative to the true agreement.
    accuracy = accuracy_score(Y.tolist(), clusters.labels_.tolist())

    # All unique pairs of the first #'pairs' feature columns
    pairs_of_columns = list(itertools.combinations(feature_names[:pairs], 2))

    n_pairs = len(pairs_of_columns)
    # floor(sqrt(n_pairs)) + 1 cells per side always yields at least n_pairs cells.
    grid_size = int(n_pairs**0.5) + 1
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so
    # flatten() below is always valid.
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(20, 20), squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.4)
    flat_axes = axes.flatten()

    # Plot each pair in its respective subplot
    for ax, (a, b) in zip(flat_axes, pairs_of_columns):
        ax.scatter(df[a], df[b], c=clusters.labels_, s=3)
        ax.set_xlabel(a)
        ax.set_ylabel(b)

    # Hide any unused subplots
    for ax in flat_axes[n_pairs:]:
        ax.axis("off")

    # Header text: name the estimator that actually ran. accuracy_score already
    # returns a percentage (the neural-network cell prints values such as
    # 97.3), so it must not be multiplied by 100 again.
    title = (
        f"{type(clusters).__name__} (n=2) Clustering of Breast Cancer Data Feature Combinations. "
        f"\n\nBINARY CLASSIFICATION ACCURACY: {accuracy}%"
    )

    # Add a title and a subtitle
    fig.suptitle(title, fontsize=20)

    # Show the plot
    plt.show()


# Cluster test_df (the first argument is the frame that gets clustered) and
# display scatter plots for the 10 pairwise combinations of its first 5 columns.
do_clustering(test_df, train_df)

# NOTE(review): this is an exact repeat of the call above, so it clusters the
# test data a second time. If the training data was intended here, the
# arguments need to change — confirm.
do_clustering(test_df, train_df)