# Unsupervised Learning: Clustering Lab





In [2]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score
from scipy.io import arff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. (50%) Implement the k-means clustering algorithm and the HAC (Hierarchical Agglomerative Clustering) algorithm.

### 1.1.1 HAC

### Code requirements 
- HAC should support both single link and complete link options.
- HAC automatically generates all clusterings from n to 1.  To simplify the amount of output you may want to implement a mechanism to specify for which k values actual output will be generated.


---
The output should include the following:
- The number of clusters (k).
- The silhouette score of the full clustering. (You can either write and use your own silhouette_score function (extra credit) or use sklearn's)


For each cluster report include:


- The centroid id.
- The number of instances tied to that centroid. 
---

In [47]:
class HACClustering(BaseEstimator,ClassifierMixin):

    def __init__(self,k=3,link_type='single'): ## add parameters here
        """
        Args:
            k = how many final clusters to have
            link_type = single or complete. when combining two clusters use complete link or single link
        """
        self.link_type = link_type
        self.k = k
        self.clusters = []
        
    def fit(self, X, y=None):
        """ Fit the data; In this lab this will make the K clusters :D
        Args:
            X (array-like): A 2D numpy array with the training data
            y (array-like): An optional argument. Clustering is usually unsupervised so you don't need labels
        Returns:
            self: this allows this to be chained, e.g. model.fit(X,y).predict(X_test)
        """
        cluster_arr = [np.asarray([item]) for item in X]
        self.clusters = cluster_arr
        is_done = False
        while not is_done:
          # print('clusters: ', len(self.clusters))
          if len(self.clusters) == self.k:
            is_done = True
            break
          if self.link_type == 'single':
            self.clusters = self.single_cluster(self.clusters)
          else:
            self.clusters = self.cluster_complete_link(self.clusters)
        return self

    def score(self):
      centroids = []
      # print number of clusters
      print(len(self.clusters))

      sse = []
      total_sse = 0
      # find centroids
      for i in range(len(self.clusters)):
        centroid = np.mean(self.clusters[i], axis=0)
        centroids.append(centroid)
        # find cluster sse
        tmp = np.linalg.norm(self.clusters[i] - centroid)
        squared_err = tmp**2
        sse.append(np.sum(squared_err))
        total_sse += squared_err

      # print results
      print(total_sse, '\n')
      for i in range(len(self.clusters)):
        print(centroids[i])
        print(len(self.clusters[i]))
        print(sse[i], '\n')
    
    def single_cluster(self, clusters):
      distances = []
      cluster_indexes = []
      for i in range(len(clusters)):
        tmp_dist = []
        for j in range(len(clusters)):
          min_dist = np.inf
          if i == j:
            tmp_dist.append(np.inf)
            continue
          if len(clusters[i]) > 1:
            # check if j is also a cluster
            if len(clusters[j]) > 1:
            # calc distance between the two clusters
              for k in range(len(clusters[i])):
                for l in range(len(clusters[j])):
                  dist = np.linalg.norm(clusters[i][k] - clusters[j][l])
                  if dist < min_dist:
                    min_dist = dist
            else: 
              for k in range(len(clusters[i])):
                dist = np.linalg.norm(clusters[i][k] - clusters[j])
                if dist < min_dist:
                  min_dist = dist
          else: 
            if len(clusters[j]) > 1:
            # i is single cluster, j has more than one
              for k in range(len(clusters[j])):
                dist = np.linalg.norm(clusters[i] - clusters[j][k])
                if dist < min_dist:
                  min_dist = dist
            else:
              min_dist = np.linalg.norm(clusters[i] - clusters[j])
          tmp_dist.append(min_dist)
        min_val = min(tmp_dist)
        index = tmp_dist.index(min(tmp_dist))
        distances.append(min_val)
        cluster_indexes.append(index)
      
      best_dist = min(distances)
      dist_index = distances.index(best_dist)
      prev_index = cluster_indexes[dist_index]
      
      comb_arr = clusters[prev_index]
      comb_arr = np.append(clusters[dist_index], np.array(clusters[prev_index]), axis=0)
      clusters[dist_index] = comb_arr
      new_clusters = np.delete(clusters, prev_index)
      # clusters[prev_index].pop()
      return new_clusters

    def cluster_complete_link(self, clusters):
      distances = []
      cluster_indexes = []
      for i in range(len(clusters)):
        tmp_dist = []
        for j in range(len(clusters)):
          min_dist = np.inf
          if i == j:
            tmp_dist.append(np.inf)
            continue
          if len(clusters[i]) > 1:
            # check if j is also a cluster
            if len(clusters[j]) > 1:
            # calc distance between the two clusters
              for k in range(len(clusters[i])):
                for l in range(len(clusters[j])):
                  dist = np.linalg.norm(clusters[i][k] - clusters[j][l])
                  if dist > min_dist:
                    min_dist = dist
            else: 
              for k in range(len(clusters[i])):
                dist = np.linalg.norm(clusters[i][k] - clusters[j])
                if dist > min_dist:
                  min_dist = dist
          else: 
            if len(clusters[j]) > 1:
            # i is single cluster, j has more than one
              for k in range(len(clusters[j])):
                dist = np.linalg.norm(clusters[i] - clusters[j][k])
                if dist > min_dist:
                  min_dist = dist
            else:
              min_dist = np.linalg.norm(clusters[i] - clusters[j])
          tmp_dist.append(min_dist)
        min_val = min(tmp_dist)
        index = tmp_dist.index(min(tmp_dist))
        distances.append(min_val)
        cluster_indexes.append(index)
      
      best_dist = min(distances)
      dist_index = distances.index(best_dist)
      prev_index = cluster_indexes[dist_index]
      
      comb_arr = clusters[prev_index]
      comb_arr = np.append(clusters[dist_index], np.array(clusters[prev_index]), axis=0)
      clusters[dist_index] = comb_arr
      new_clusters = np.delete(clusters, prev_index)
      return new_clusters
                
    
    def print_clusters(self):
      """
        Used for grading.
        print("Num clusters: {:d}\n".format(k))
        print("Silhouette score: {:.4f}\n\n".format(silhouette_score))
        for each cluster and centroid:
          print(np.array2string(centroid,precision=4,separator=","))
          print("{:d}\n".format(size of cluster))
      """
      pass

### 1.1.2 Debug 

Debug your model by running it on the [Debug Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/abalone.arff)


---
The dataset was modified to be a lot smaller. The last datapoint should be on line 359 or the point 0.585,0.46,0.185,0.922,0.3635,0.213,0.285,10. The remaining points should be commented out.


- Make sure to include the output class (last column) as an additional input feature
- Normalize Data
- K = 5
- Use 4 decimal places and DO NOT ROUND when reporting silhouette score and centroid values.


---
Solutions in files:

[Debug HAC Single (Silhouette).txt](https://raw.githubusercontent.com/cs472ta/CS472/master/debug_solutions/Debug%20HAC%20Single%20Link%20%28Silhouette%29.txt)

[Debug HAC Complete (Silhouette).txt](https://raw.githubusercontent.com/cs472ta/CS472/master/debug_solutions/Debug%20HAC%20Complete%20Link%20%28Silhouette%29.txt)

In [4]:
def normalize_data(inputs):
  xmin = inputs.min(axis=0)
  xmax = inputs.max(axis=0)
  return (inputs-xmin)/(xmax-xmin)

In [48]:
# Debug Here
!curl -s https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/abalone.arff --output debug.arff
# Train on training set
debug_data = arff.loadarff('debug.arff')
debug_np = np.array(debug_data[0])
debug_norm = np.array(normalize_data(pd.DataFrame(debug_np)))

clf = HACClustering(k=5, link_type='complete')
res = clf.fit(debug_norm)
res.score()

  arr = asarray(arr)


5
0.183020684438721 

[0.97297297 0.96629213 0.85526316 0.88224206 0.83113208 0.94574369
 0.65075377 0.58823529]
2
0.11098582696147233 

[0.88738739 0.85955056 0.77631579 0.6202381  0.52476415 0.71375117
 0.53266332 0.47058824]
2
0.03216775470062799 

[0.62612613 0.64606742 0.56578947 0.3610119  0.43584906 0.38961646
 0.23115578 0.29411765]
2
0.025182529101247787 

[0.72522523 0.73033708 0.65789474 0.37619048 0.36462264 0.49204864
 0.28643216 0.32352941]
2
0.006923140753305043 

[0.77927928 0.75842697 0.73684211 0.37271825 0.36698113 0.38727783
 0.29648241 0.35294118]
2
0.007761432922067825 



### 1.1.3 Evaluation

We will evaluate your model based on its print_clusters() output using [Evaluation Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/seismic-bumps_train.arff)

- Make sure to include the output class (last column) as an additional input feature
- Normalize Data
- K = 5
- Use 4 decimal places and DO NOT ROUND when reporting silhouette score and centroid values.

#### 1.1.3.1 Complete Link

In [None]:
# Load evaluation data

# Train on evaluation data using complete link

# Print clusters

#### 1.1.3.1 Single Link

In [None]:
# Load evaluation data

# Train on evaluation data using single link

# Print clusters

### 1.2.1 K-Means

### Code requirements 
- Ability to choose k and specify k initial centroids
- Use Euclidean Distance as metric
- Ability to handle distance ties
- Include output label as a cluster feature


---
The output should include the following:
- The number of clusters (k).
- The silhouette score of the full clustering. (You can either write and use your own silhouette_score function (extra credit) or use sklearn's)


For each cluster report include:


- The centroid id.
- The number of instances tied to that centroid. 
---
You only need to handle continuous features

In [4]:
class KMEANSClustering(BaseEstimator,ClassifierMixin):

    def __init__(self,k=3,debug=False): ## add parameters here
        """
        Args:
            k = how many final clusters to have
            debug = if debug is true use the first k instances as the initial centroids otherwise choose random points as the initial centroids.
        """
        self.k = k
        self.debug = debug

    def fit(self, X, y=None):
        """ Fit the data; In this lab this will make the K clusters :D
        Args:
            X (array-like): A 2D numpy array with the training data
            y (array-like): An optional argument. Clustering is usually unsupervised so you don't need labels
        Returns:
            self: this allows this to be chained, e.g. model.fit(X,y).predict(X_test)
        """
        return self
    
    def print_clusters(self):
        """
            Used for grading.
            print("Num clusters: {:d}\n".format(k))
            print("Silhouette score: {:.4f}\n\n".format(silhouette_score))
            for each cluster and centroid:
                print(np.array2string(centroid,precision=4,separator=","))
                print("{:d}\n".format(size of cluster))
        """
        pass

### 1.2.2 Debug 

Debug your model by running it on the [Debug Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/abalone.arff)


- Train until convergence
- Make sure to include the output class (last column) as an additional input feature
- Normalize Data
- K = 5
- Use the first k instances as the initial centroids
- Use 4 decimal places and DO NOT ROUND when reporting silhouette score and centroid values




---
Solutions in files:

[Debug K Means (Silhouette).txt](https://raw.githubusercontent.com/cs472ta/CS472/master/debug_solutions/Debug%20K%20Means%20%28Silhouette%29.txt)

In [None]:
# Load debug data

# Train on debug data

# Print clusters

### 1.2.3 Evaluation

We will evaluate your model based on its print_clusters() output using [Evaluation Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/seismic-bumps_train.arff)
- Train until convergence
- Make sure to include the output class (last column) as an additional input feature
- Normalize Data
- K = 5
- Use the first k instances as the initial centroids
- Use 4 decimal places and DO NOT ROUND when reporting silhouette score and centroid values

In [None]:
# Load evaluation data

# Train on evaluation data

# Print clusters

## 2.1.1 (7.5%) Clustering the Iris Classification problem - HAC

Load the Iris Dataset [Iris Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/iris.arff)

- Use single-link and complete link clustering algorithms
- State whether you normalize your data or not (your choice).  
- Show your results for clusterings using k = 2-7.  
- Graph the silhouette score for each k and discuss your results (i.e. what kind of clusters are being made).
---

In [None]:
# Iris Classification using single-link

In [None]:
# Iris Classification using complete-link

Discuss differences between single-link and complete-link

## 2.1.2 (5%) Clustering the Iris Classification problem - HAC

Requirements:
- Repeat excercise 2.1.1 and include the output label as one of the input features.

In [None]:
# Clustering Labels using single-link

In [None]:
# Clustering Labels using complete-link

Discuss any differences between the results from 2.1.1 and 2.1.2.

## 2.2.1 (7.5%) Clustering the Iris Classification problem: K-Means

Load the Iris Dataset [Iris Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/iris.arff)

Run K-Means on the Iris dataset using the output label as a feature and without using the output label as a feature

Requirements:
- State whether you normalize your data or not (your choice).  
- Show your results for clusterings using k = 2-7.  
- Graph the silhouette score for each k and discuss your results (i.e. what kind of clusters are being made).
---

In [None]:
# Iris Classification without output label

In [None]:
# Iris Classification with output label

Compare results and differences between using the output label and excluding the output label

## 2.2.2 (5%) Clustering the Iris Classification problem: K-Means

Requirements:
- Use the output label as an input feature
- Run K-Means 5 times with k=4, each time with different initial random centroids and discuss any variations in the results. 

In [None]:
#K-Means 5 times

Discuss any variations in the results

## 3.1 (12.5%) Run the SK versions of HAC (both single and complete link) on iris including the output label and compare your results with those above.
Use the silhouette score for this iris problem(k = 2-7).  You may write your own code to do silhouette (optional extra credit) or you can use sklearn.metrics.silhouette_score. Please state if you coded your own silhouette score function to receive the extra credit points (described below). Discuss how helpful Silhouette appeared to be for selecting which clustering is best. You do not need to supply full Silhouette graphs, but you could if you wanted to.

Requirements
- Use the Sillhouette score for this iris problem (k= 2-7) 
- Use at least one other scoring function from [sklearn.metrics](https://scikit-learn.org/stable/modules/model_evaluation.html) and compare the results. State which metric was used. 
- Possible sklean metrics include (* metrics require ground truth labels):
    - adjusted_mutual_info_score*
    - adjusted_rand_score*
    - homogeneity_score*
    - completeness_score*
    - fowlkes_mallows_score*
    - calinski_harabasz_score
    - davies_bouldin_score
- Experiment using different hyper-parameters. Discuss Results

In [None]:
# Load sklearn



*Record impressions*

## 3.2 (12.5%) Run the SK version of k-means on iris including the output label and compare your results with those above. 

Use the silhouette score for this iris problem(k = 2-7). You may write your own code to do silhouette (optional extra credit) or you can use sklearn.metrics.silhouette_score. Please state if you coded your own silhouette score function to receive the extra credit points (described below). Discuss how helpful Silhouette appeared to be for selecting which clustering is best. You do not need to supply full Silhouette graphs, but you could if you wanted to.

Requirements
- Use the Sillhouette score for this iris problem (k= 2-7) 
- Use at least one other scoring function form sklearn.metrics and compare the results. State which metric was used
- Experiment different hyper-parameters. Discuss Results

In [None]:
# Load sklearn 



*Record impressions*

## 4. (Optional 5% extra credit) For your silhouette experiment above, write and use your own code to calculate the silhouette scores, rather than the SK or other version. 


*Show findings here*

In [None]:
# Copy function Below