<a href="https://colab.research.google.com/github/abdelmotlb/Unsupervised-Algorithms/blob/main/Sports_Activity_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <font color='orange' size='7px'> ***Global Variables***</font>

In [None]:
dataset_path = '/content/data'

# <font color='orange' size='7px'> ***Kaggle Data Configuration***</font>

## Hold content folder by kaggle


In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/'

##  *Dataset Landing*


In [None]:
!kaggle datasets download -d obirgul/daily-and-sports-activities

Downloading daily-and-sports-activities.zip to /content
 99% 165M/167M [00:01<00:00, 137MB/s]
100% 167M/167M [00:01<00:00, 132MB/s]


In [None]:
import zipfile

# unzip all files
with zipfile.ZipFile('daily-and-sports-activities.zip', 'r') as zip_ref:
  zip_ref.extractall()

# <font color='orange' size='7px'> ***Dataset Loader***</font>

## *Helper Functions*

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def formulatedImage(pixels):
  return np.array(pixels).reshape(-1)

def get_truth_value_for_segment(activity_folder, subject_folder, segment_file):
  return int(activity_folder[1:])

def plot_image(image_array, resized):
  '''
  parameter: 1d-array, bool value
  '''
  if resized:
    plt.imshow(image_array.reshape(112 // 2, 92 // 2), cmap='gray')  # Use 'cmap=None' if the image is colored
  else:
    plt.imshow(image_array.reshape(112, 92), cmap='gray')
  plt.axis('off')  # Turn off axis
  plt.show()

## *Data Loader*

In [None]:
import imageio as img
import pandas as pd

def basic_load_for_dataset(absolute_directory_path):
  '''
  return naive-data: shape = (19, 8, 60, 125, 45) and naive-truth: shape = (19, 8, 60)
  '''

  naive_data = []
  naive_truth = []
  for activity_folder in sorted(os.listdir(absolute_directory_path)):
      activity_path = os.path.join(absolute_directory_path, activity_folder)
      activity_data = []
      activity_truth = []
      for subject_folder in sorted(os.listdir(activity_path)):
          subject_path = os.path.join(activity_path, subject_folder)
          segment_data = []
          segment_truth = []
          for segment_file in sorted(os.listdir(subject_path)):
              segment_path = os.path.join(subject_path, segment_file)
              df = pd.read_csv(segment_path, header=None)
              segment_data.append(df.values.tolist())
              segment_truth.append(get_truth_value_for_segment(activity_folder, subject_folder, segment_file))
          activity_data.append(segment_data)
          activity_truth.append(segment_truth)
      naive_data.append(activity_data)
      naive_truth.append(activity_truth)
  return np.array(naive_data), np.array(naive_truth)

## *Data splitter*

In [None]:
def split_data(naive_data, naive_truth, ratio):
  '''
  parameter: array: shape = (19, 8, 60, 125, 45) & array: shape = (19, 8, 60) & ratio of type float.
  return:
    4 arrays of shape =
      (19, 8, 60 * ratio, 125, 45) &
      (19, 8, 60 * ratio) &
      (19, 8, 60 * (1 - ratio), 125, 45) &
      (19, 8, 60 * (1 - ratio)
  '''
  dimension_to_split_at = 2
  splitted_dimension_size = naive_data.shape[dimension_to_split_at]
  split_size = int(ratio * splitted_dimension_size)

  training_data = naive_data[:, :, :split_size, :, :]
  training_truth = naive_truth[:, :, :split_size]
  testing_data = naive_data[:, :, split_size:, :, :]
  testing_truth = naive_truth[:, :, split_size:]

  return training_data, training_truth, testing_data, testing_truth

## *Make dataset in memory*

In [None]:
naive_data, naive_truth = basic_load_for_dataset(dataset_path)

# <font color='orange' size='7px'> ***Preprocessing***</font>

## *Reformulate dataset using mean*

In [None]:
def dataset_after_mean_reformulation(train_data, train_truth):
  '''
  parameter: array of shape = (19, 8, number_of_segments, 125, 45)
  return:
    array of shape (19 * 8 * number_of_segments, 45) which is the data matrix needed for each model..
    array of shape (19 * 8 * number_of_segments) which is the label vector..
  '''

  # get number_of_segments
  number_of_segments = train_data.shape[2]

  # compress first 3 dimensions because we don't need them.
  collapsed_arr = train_data.reshape((19 * 8 * number_of_segments, 125, 45))
  collapsed_truth = train_truth.reshape((19 * 8 * number_of_segments))

  # mean_of_each_set will be a numpy array of shape (19 * 8 * number_of_segments, 45)
  mean_of_each_set = np.mean(collapsed_arr, axis=1)

  return mean_of_each_set, collapsed_truth

## *Reformulate dataset using dimensionality reduction*

In [None]:
from sklearn.decomposition import PCA

def dataset_after_reduction_reformulation(train_data, train_truth):
  '''
  parameter: array of shape = (19, 8, number_of_segments, 125, 45)
  return:
    pca model and
    array of shape (19 * 8 * number_of_segments, number_components_of_pca(125 * 45)) which is the data matrix needed for each model..
    array of shape (19 * 8 * number_of_segments) which is the label vector..
  '''

  # get number_of_segments
  number_of_segments = train_data.shape[2]

  # reshape the naive_data to be on standard shape n * d...
  collapsed_arr = train_data.reshape((19 * 8 * number_of_segments, 125 * 45))
  collapsed_truth = train_truth.reshape((19 * 8 * number_of_segments))

  # create pca object
  desired_number_of_components = 45
  pca = PCA(n_components=desired_number_of_components)

  # Fit PCA to your data
  pca.fit(collapsed_arr)

  # Transform your data to the new feature space
  transformed_data = pca.transform(collapsed_arr)

  return pca, transformed_data, collapsed_truth

# <font color='orange' size='7px'> ***Start point for algorithms***</font>

## *Mean Dataset*

In [None]:
def get_mean_dataset(naive_data, naive_truth, ratio):
  '''
  parameter: basic naive -in memory- dataset(19, 8, 60, 125, 45), basic naive -in memory- ground_truth(19, 8, 60), ratio between 0 and 1
  return:
    train_dataset (19 * 8 * number_of_segments_training, 45) &
    train_truth (19 * 8 * number_of_segments_training)
    test_dataset (19, 8, number_of_segments_testing, 125, 45)
    test_truth (19, 8, number_of_segments_testing)
  '''
  train_dataset, train_truth, test_dataset, test_truth = split_data(naive_data, naive_truth, ratio)
  train_mean_dataset, train_mean_truth = dataset_after_mean_reformulation(train_dataset, train_truth)
  return train_mean_dataset, train_mean_truth, test_dataset, test_truth

## *Reduced Dataset*

In [None]:
def get_reduced_dataset(naive_data, naive_truth, ratio):
  '''
  parameter: basic naive -in memory- dataset(19, 8, 60, 125, 45), ratio between 0 and 1
  return:
    pca model,
    training data of shape (19 * 8 * number_of_segments_training, number_components_of_pca(125 * 45)) which is the data matrix needed for each model,
    training ground truth of shape(19 * 8 * number_of_segments_training)
    test_dataset (19, 8, number_of_segments_testing, 125, 45)
    test_truth (19, 8, number_of_segments_testing)
  '''
  train_dataset, train_truth, test_dataset, test_truth = split_data(naive_data, naive_truth, ratio)
  pca, train_reduced_dataset, train_reduced_truth = dataset_after_reduction_reformulation(train_dataset, train_truth)
  return pca, train_reduced_dataset, train_reduced_truth, test_dataset, test_truth

# <font color='orange' size='7px'> ***Evaluation***</font>

In [None]:
def count(i, j, predictions, truth):
  count = 0
  for idx in range(len(predictions)):
      if predictions[idx] == i and truth[idx] == j:
          count += 1
  return count

def map(predictions, truth):
  truth_classes_num = np.unique(np.array(truth)).shape[0] + 1
  predicted_classes_num = np.unique(np.array(predictions)).shape[0] + 1

  matrix = np.zeros((predicted_classes_num, truth_classes_num))

  for i in range(predicted_classes_num):
    for j in range(truth_classes_num):
      matrix[i][j] = count(i, j, predictions, truth)

  return matrix

def total_precision(matrix):
    precision_classes = np.zeros(len(matrix))
    total = 0
    total_occurences = np.sum(matrix)

    for i in range(len(precision_classes)):
      max_occurence = np.max(matrix[i])
      class_occurence = np.sum(matrix[i])
      if np.sum(matrix[i]):
        precision_classes[i] = max_occurence / np.sum(matrix[i])
      if total_occurences != 0:
        total += max_occurence / total_occurences

    return total, precision_classes

def total_recall(matrix):
    recall_classes = np.zeros(len(matrix))
    total = 0

    for i in range(len(recall_classes)):
      max_ind = np.argmax(matrix[i])
      val = np.sum(matrix[:, max_ind])
      if val != 0:
        recall_classes[i] = matrix[i][max_ind] / val
      total += recall_classes[i] / len(matrix)

    return total, recall_classes

def f_measure(precision, recall):
    total = 0
    for i in range(len(precision)):
      if precision[i] + recall[i] > 0:
        total += 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])

    return total / (len(precision) - 1)

def entropy(matrix):
    entropy_classes = np.zeros(len(matrix))
    total = 0
    total_occurences = np.sum(matrix)

    for i in range(len(entropy_classes)):
      cluster_sum = np.sum(matrix[i])
      for j in range(len(matrix[i])):
        if cluster_sum > 0 and matrix[i][j] / cluster_sum > 0:
         entropy_classes[i] -= matrix[i][j] / cluster_sum * np.log2(matrix[i][j] / cluster_sum)
      if total_occurences != 0:
        total += entropy_classes[i] * (cluster_sum / total_occurences)

    return total, entropy_classes


def evaluate(predictions, truth):
  contingency_matrix = map(predictions, truth)
  total_pre, precision_classes = total_precision(contingency_matrix)
  total_rec, recall_classes = total_recall(contingency_matrix)
  f_score = f_measure(precision_classes, recall_classes)
  entropy_value, entropy_classes = entropy(contingency_matrix)
  print("\tPrecision : ", total_pre)
  print("\tRecall : ", total_rec)
  print("\tF-score : ", f_score)
  print("\tEntropy : ", entropy_value)


# <font color='orange' size='7px'> ***K-Means***</font>

## Training

In [None]:
# def genrate_randome_centroids(dataset, clusters_num):
#   centroids = np.zeros((clusters_num, dataset.shape[1]))
#   # get bound for each feature
#   min_values = np.min(dataset, axis=0)
#   max_values = np.max(dataset, axis=0)
#   part = (max_values - min_values) / clusters_num
#   for i in range(clusters_num):
#     min_bound = min_values + part * i
#     centroids[i] = np.random.rand() * (part) + min_bound
#   return centroids

# def genrate_randome_centroids(dataset, clusters_num):
#   centroids = np.zeros((clusters_num, dataset.shape[1]))
#   # get bound for each feature
#   min_values = np.min(dataset, axis=0)
#   max_values = np.max(dataset, axis=0)
#   for i in range(clusters_num):
#     centroids[i] = np.random.rand() *  (max_values - min_values) + min_values
#   return centroids

def genrate_randome_centroids(dataset, clusters_num):
  return dataset[np.random.randint(0, dataset.shape[0], clusters_num)]

# def genrate_randome_centroids(dataset, clusters_num):
#   centroids = np.zeros((clusters_num, dataset.shape[1]))
#   for i in range(clusters_num):
#     centroids[i] = dataset[np.random.randint(i * 384, i * 384 + 384)]
#   centroids = np.array(centroids)
#   return centroids

def predict_cluster_for_point(point, centroids):
  distances = []
  for centroid in centroids:
    distances.append(np.linalg.norm(point - centroid))
  return np.argmin(distances)

def small_difference(centroids, old_centroids, err):
  for i in range(centroids.shape[0]):
    if np.linalg.norm(centroids[i] - old_centroids[i]) > err:
      return False
  return True

def k_means_training(dataset, clusters_num, err):
  centroids = genrate_randome_centroids(dataset, clusters_num)
  old_centroids = np.zeros((clusters_num, dataset.shape[1]))
  while not small_difference(centroids, old_centroids, err):
    old_centroids = np.array(centroids)
    clusters = [[] for _ in range(clusters_num)]
    for point in dataset:
      assigned_cluster = predict_cluster_for_point(point, centroids)
      clusters[assigned_cluster].append(point)
    for i in range(clusters_num):
      if len(clusters[i]) != 0:
        centroids[i] = np.mean(clusters[i])
  return centroids


## Prediction

In [None]:
def k_means_testing(testing_dataset, centroids):
  testing_samples_size = testing_dataset.shape[0]
  predictions = np.zeros((testing_samples_size))
  for i in range(testing_samples_size):
    predictions[i] = predict_cluster_for_point(testing_dataset[i], centroids)
  predictions = np.array(predictions, dtype=int)
  return predictions;


## K-Means Caller

### Train and predict

#### Mean reformulation

In [None]:
training_with_mean_reformulation, train_truth, testing_with_mean_reformulation, test_truth = get_mean_dataset(naive_data,naive_truth, 0.8)
testing_with_mean_reformulation, test_truth = dataset_after_mean_reformulation(testing_with_mean_reformulation, test_truth)
clusters_number = [8, 13, 19, 28,38]
# clusters_number = [19]
predictions = []
predictions_training = []
for k in clusters_number:
  centroids = k_means_training(training_with_mean_reformulation, k, 0.000001)
  predictions.append(k_means_testing(training_with_mean_reformulation, centroids))
  predictions_training.append(k_means_testing(testing_with_mean_reformulation, centroids))
predictions = np.array(predictions)
predictions_training = np.array(predictions_training)

#### PCA reformulation

In [None]:
pca, training_with_pca_reformulation, train_truth, testing_with_pca_reformulation, test_truth = get_reduced_dataset(naive_data,naive_truth, 0.8)
collapsed_arr = testing_with_pca_reformulation.reshape((19 * 8 * 12, 125 * 45))
test_truth = test_truth.reshape((19*8*12, 1))
testing_with_pca_reformulation = pca.transform(collapsed_arr)
clusters_number = [8, 13, 19, 28,38]
predictions_pca = []
predictions_testing_pca = []
for k in clusters_number:
  centroids = k_means_training(training_with_pca_reformulation, k, 0.000001)
  predictions_pca.append(k_means_testing(training_with_pca_reformulation, centroids))
  predictions_testing_pca.append(k_means_testing(testing_with_pca_reformulation, centroids))
predictions_pca = np.array(predictions_pca)
testing_with_pca_reformulation = np.array(testing_with_pca_reformulation)

##K-means Evaluation

### K-means mean refoemulation evaluation

#### Training set evaluation

In [None]:
for i in range(predictions.shape[0]):
  print(f"At K = {clusters_number[i]}")
  evaluate(predictions[i], train_truth)

At K = 8
	Precision :  0.22957785087719296
	Recall :  0.48466435185185175
	F-score :  0.3539006059195943
	Entropy :  3.0771250420461045
At K = 13
	Precision :  0.25095942982456143
	Recall :  0.34058779761904767
	F-score :  0.29804493962441314
	Entropy :  2.9440527383473016
At K = 19
	Precision :  0.27069627192982454
	Recall :  0.2571614583333334
	F-score :  0.26634731883155655
	Entropy :  2.850923682753519
At K = 28
	Precision :  0.2915296052631579
	Recall :  0.19100215517241376
	F-score :  0.22959330040016426
	Entropy :  2.758075311247441
At K = 38
	Precision :  0.3018092105263159
	Recall :  0.14703525641025644
	F-score :  0.19532504555932612
	Entropy :  2.702933322057741


#### Testing evaluation

In [None]:
for i in range(predictions_training.shape[0]):
  print(f"At K = {clusters_number[i]}")
  evaluate(predictions_training[i], test_truth)

At K = 8
	Precision :  0.21655701754385964
	Recall :  0.45717592592592593
	F-score :  0.34029599410747347
	Entropy :  3.1048410108876943
At K = 13
	Precision :  0.25109649122807015
	Recall :  0.34077380952380953
	F-score :  0.29781880917778103
	Entropy :  2.9196753542935396
At K = 19
	Precision :  0.26206140350877194
	Recall :  0.24895833333333334
	F-score :  0.2512336040873487
	Entropy :  2.7965193573303355
At K = 28
	Precision :  0.29769736842105265
	Recall :  0.19504310344827586
	F-score :  0.23213003856031675
	Entropy :  2.6282322115984798
At K = 38
	Precision :  0.319078947368421
	Recall :  0.15544871794871792
	F-score :  0.20614760522246867
	Entropy :  2.536616140589109


### K-means PCA reformulation evaluarion

#### Train set evalauation

In [None]:
for i in range(predictions_pca.shape[0]):
  print(f"At K = {clusters_number[i]}")
  evaluate(predictions_pca[i], train_truth)

At K = 8
	Precision :  0.18188048245614036
	Recall :  0.38396990740740744
	F-score :  0.28262746279314055
	Entropy :  3.4759785748171113
At K = 13
	Precision :  0.19682017543859648
	Recall :  0.26711309523809523
	F-score :  0.23558196932084322
	Entropy :  3.3726498157649374
At K = 19
	Precision :  0.2036732456140351
	Recall :  0.19348958333333333
	F-score :  0.2010279630004128
	Entropy :  3.3442758419186354
At K = 28
	Precision :  0.22491776315789472
	Recall :  0.1473599137931035
	F-score :  0.17028498842083106
	Entropy :  3.285713004096409
At K = 38
	Precision :  0.23862390350877188
	Recall :  0.11625267094017094
	F-score :  0.15275700511630697
	Entropy :  3.2419586306784987


#### Test evaluation

In [None]:
for i in range(predictions_pca.shape[0]):
  print(f"At K = {clusters_number[i]}")
  evaluate(predictions_testing_pca[i], test_truth)

At K = 8
	Precision :  0.1836622807017544
	Recall :  0.3877314814814815
	F-score :  0.29413956802253954
	Entropy :  3.429138283319883
At K = 13
	Precision :  0.2028508771929824
	Recall :  0.275297619047619
	F-score :  0.24179666721830845
	Entropy :  3.305044356638809
At K = 19
	Precision :  0.21217105263157895
	Recall :  0.20156249999999998
	F-score :  0.21049259230909514
	Entropy :  3.2358689539971905
At K = 28
	Precision :  0.22532894736842102
	Recall :  0.14762931034482757
	F-score :  0.17377712862496061
	Entropy :  3.163233120289483
At K = 38
	Precision :  0.25712719298245607
	Recall :  0.12526709401709404
	F-score :  0.16613142199496866
	Entropy :  3.052770250624569


# <font color='orange' size='7px'> ***Normalized Cut***</font>

## Algorithm

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph

def normalize_eigenvectors(eigenvectors):
  '''
  parameter: The eigenvectors of the noramlized laplacian matrix
  return: The noramlized eigen vectors
  '''
  normalized = []
  for i in range(len(eigenvectors)):
    ui = eigenvectors[i]
    product = 1;
    for x in ui:
      product = product + x * x
    z = 1 / np.sqrt(product)
    normalized.append(z * ui)

  return normalized

def compute_similarity_matrix(data, gamma):
  '''
  parameter:
    - The data matrix containing all the data points
    - The gamma value that will be used for rbf function
  return:
    - The similarity matrix resulted from applying the rbf kernel function
      with specified gamma value on all pairwise points
  '''
  n = data.shape[0]
  similarity_matrix = np.zeros((n, n))
  for i in range(n):
      for j in range(i+1, n):
          distance = np.linalg.norm(data[i] - data[j])
          similarity_matrix[i, j] = similarity_matrix[j, i] = np.exp(-gamma * distance ** 2)
  return similarity_matrix

def k_way_normalized_cut(data, k, gamma):
  '''
  parameter:
    - The data matrix containing all the data points
    - The gamma value that will be used for generation of similarity matrix
    - The number of clusters that will be generated
  return:
    - The Labels of each data point in the data matrix
  '''
  # Compute similarity matrix using RBF kernel
  similarity_matrix = compute_similarity_matrix(data, gamma)

  # Compute the degree matrix
  degree_matrix = np.diag(np.sum(similarity_matrix, axis=1))

  # Compute the Laplacian matrix
  laplacian_matrix = degree_matrix - similarity_matrix

  # Compute the normalized Laplacian matrix (B matrix)
  for i in range(len(laplacian_matrix)):
    laplacian_matrix[i][i] /= degree_matrix[i][i]
  normalized_laplacian_matrix = laplacian_matrix

  # Compute the first k normalized eigenvectors
  _, eigenvectors = np.linalg.eig(normalized_laplacian_matrix)
  eigenvectors = eigenvectors[:, :k]
  eigenvectors = np.real(eigenvectors)

  # Compute the normalized eigenvectors
  normalized_eigenvectors = normalize_eigenvectors(eigenvectors)

  # Perform Kmeans on the normalized eigenvectors and get the labels of each point
  labels = KMeans(n_clusters=k, random_state=0).fit(normalized_eigenvectors).labels_

  return labels

## Evaluation on mean dataset

Fit the train of mean dataset in the model

In [None]:
# load the mean dataset
train_mean_dataset, train_mean_truth, test_mean_dataset, test_mean_truth = get_mean_dataset(naive_data, naive_truth, 0.8)

# Reformulate the test data
test_mean_dataset, test_mean_truth = dataset_after_mean_reformulation(test_mean_dataset, test_mean_truth)

# perfrom normalized cut on test mean dataset
test_predictions = k_way_normalized_cut(test_mean_dataset, k=19, gamma=0.00001) + 1



Evaluate the clusters using different measures

In [None]:
print("Evaluation on mean dataset: ")
evaluate(test_predictions, test_mean_truth)

Evaluation on mean dataset: 
	Precision :  0.4281798245614035
	Recall :  0.40677083333333325
	F-score :  0.42622502940551155
	Entropy :  2.02945440362094


## Evaluation on PCA dataset

Fit the train of PCA dataset in the model

In [None]:
# load the PCA dataset
pca, train_pca_dataset, train_pca_truth, test_pca_dataset, test_pca_truth = get_reduced_dataset(naive_data, naive_truth, 0.8)

# Reformulate the PCA dataset
collapsed_arr = test_pca_dataset.reshape((19 * 8 * 12, 125 * 45))
collapsed_truth = test_pca_truth.reshape((19 * 8 * 12))

# Reduce the dimensinos of test pca
test_pca_dataset = pca.transform(collapsed_arr)

# perfrom normalized cut on test PCA dataset
test_pca_predictions = k_way_normalized_cut(test_pca_dataset, k=19, gamma=0.00001) + 1

Evaluate the clusters using different measures

In [None]:
print("Evaluation on PCA dataset")
evaluate(test_pca_predictions, collapsed_truth)

Evaluation on PCA dataset
	Precision :  0.43695175438596484
	Recall :  0.41510416666666666
	F-score :  0.44091666672300056
	Entropy :  2.0631747137227254


# <font color='orange' size='7px'> ***DBSCAN***</font>

## Global lists

In [None]:
global_counter = 0

## Helpful Functions

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def get_core_indices(D, n, ε, min_points):
  neighbors_model = NearestNeighbors(radius=ε, metric="euclidean")
  neighbors_model.fit(D)
  neighborhoods = neighbors_model.radius_neighbors(D, return_distance=False)
  n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
  core_samples = np.asarray(n_neighbors >= min_points, dtype=np.uint8)
  core_sample_indices_ = np.where(core_samples)[0]
  return core_sample_indices_

def connect_to_adj_cores(
    D, n, ε,
    basic_core_node_index, cluster_to_assign,
    core_indices_set, cluster_id
  ):

  stack = [basic_core_node_index]
  while stack:
    current_core_node_index = stack.pop()
    for instance_index in range(n):
      if ((np.linalg.norm(D[instance_index] - D[current_core_node_index])) > ε) or (cluster_id[instance_index] != -1): continue
      cluster_id[instance_index] = cluster_to_assign
      if instance_index in core_indices_set: stack.append(instance_index)

## Algorithm

In [None]:
def DBSCAN_algorithm(D, ε, min_points):
  global global_counter

  # initialization attributes:
  n = D.shape[0]
  d = D.shape[1]
  cluster_id = [-1] * n

  # get core points:
  core_indices = get_core_indices(D, n, ε, min_points)
  core_indices_set = set(core_indices)

  # build connected components:
  cluster_counter = -1
  for i in range(len(core_indices)):
    if cluster_id[core_indices[i]] != -1: continue
    cluster_counter = cluster_counter + 1
    cluster_id[core_indices[i]] = cluster_counter
    connect_to_adj_cores(D, n, ε, core_indices[i], cluster_counter, core_indices_set, cluster_id)

  return cluster_id

## Training Evaluation

In [None]:
# our main code test for mean dataset
train_mean_dataset, train_mean_truth, test_dataset, test_truth = get_mean_dataset(naive_data, naive_truth, 0.8)
labels = DBSCAN_algorithm(train_mean_dataset, ε = 0.9, min_points = 48)
labels = np.array(labels)
evaluate(labels, train_mean_truth)

	Precision :  0.9999999999999997
	Recall :  0.3181818181818182
	F-score :  0.46190476190476193
	Entropy :  0.0


## Testing Evaluation

### Mean

In [None]:
# our main code test for mean dataset
train_mean_dataset, train_mean_truth, test_dataset, test_truth = get_mean_dataset(naive_data, naive_truth, 0.8)
test_mean_dataset, test_mean_truth = dataset_after_mean_reformulation(test_dataset, test_truth)
labels_mean_predictions = DBSCAN_algorithm(test_mean_dataset, ε = 2, min_points = 16)
labels_mean_predictions = np.array(labels_mean_predictions)
print(labels_mean_predictions)
print(np.unique(labels_mean_predictions))
evaluate(labels_mean_predictions, train_mean_truth)

# # python testing
# from sklearn.cluster import DBSCAN
# train_mean_dataset, train_mean_truth, test_dataset, test_truth = get_mean_dataset(naive_data, naive_truth, 0.8)
# test_mean_dataset, test_mean_truth = dataset_after_mean_reformulation(test_dataset, test_truth)
# db = DBSCAN(eps=2, min_samples=16).fit(test_mean_dataset)
# print(db.labels_)
# print(np.unique(db.labels_))

# our main code test for pca dataset
# pca, train_pca_dataset, train_pca_truth, test_pca_dataset, test_pca_truth = get_reduced_dataset(naive_data, naive_truth, 0.8)
# collapsed_arr = test_pca_dataset.reshape((19 * 8 * 12, 125 * 45))
# collapsed_truth = test_pca_truth.reshape((19 * 8 * 12))
# test_pca_dataset = pca.transform(collapsed_arr)

[-1 -1 -1 ... -1 -1 -1]
[-1  0  1  2  3  4  5  6  7]
	Precision :  0.7338709677419355
	Recall :  0.25505216095380034
	F-score :  0.3925399282554809
	Entropy :  0.8098210808911355


### PCA

In [None]:
# load the PCA dataset
pca, train_pca_dataset, train_pca_truth, test_pca_dataset, test_pca_truth = get_reduced_dataset(naive_data, naive_truth, 0.8)

# Reformulate the PCA dataset
collapsed_arr = test_pca_dataset.reshape((19 * 8 * 12, 125 * 45))
test_pca_truth = test_pca_truth.reshape((19 * 8 * 12))
test_pca_dataset = pca.transform(collapsed_arr)

In [None]:
# perfrom normalized cut on test PCA dataset
test_pca_predictions = DBSCAN_algorithm(test_pca_dataset, ε = 1, min_points = 10)
evaluate(test_pca_predictions, test_pca_truth)

# from sklearn.cluster import DBSCAN
# db = DBSCAN(eps=1, min_samples=10).fit(test_pca_dataset)
# print(db.labels_)
# print(np.unique(db.labels_))

	Precision :  0.9999999999999998
	Recall :  0.19999999999999998
	F-score :  0.3438625582469251
	Entropy :  0.0


In [None]:
# perfrom normalized cut on test PCA dataset
test_pca_predictions = DBSCAN_algorithm(test_pca_dataset, ε = 1, min_points = 10)
print(test_pca_predictions)
print(np.unique(test_pca_predictions))


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, -1