# CS676 Algorithms for Data Science
## Final Project

Aayushi Verma

In [1]:
# importing packages for data cleaning, visualization, and EDA
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing 
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import sys  
sys.path.insert(0, '/Users/av15397n/Documents/GitHub/CS676-Algorithms-For-Data-Science/Final Project/')

from utils import *

In [3]:
# reading the raw CSV data file
penguins = pd.read_csv("data/penguins.csv")

In [4]:
penguins.head()

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [5]:
penguins.shape

(344, 9)

In [6]:
penguins.describe()

Unnamed: 0,rowid,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,344.0,342.0,342.0,342.0,342.0,344.0
mean,172.5,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,99.448479,5.459584,1.974793,14.061714,801.954536,0.818356
min,1.0,32.1,13.1,172.0,2700.0,2007.0
25%,86.75,39.225,15.6,190.0,3550.0,2007.0
50%,172.5,44.45,17.3,197.0,4050.0,2008.0
75%,258.25,48.5,18.7,213.0,4750.0,2009.0
max,344.0,59.6,21.5,231.0,6300.0,2009.0


In [7]:
penguins.isnull().sum()

rowid                 0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [8]:
df = penguins[["bill_length_mm","bill_depth_mm"]].dropna()

In [9]:
df

Unnamed: 0,bill_length_mm,bill_depth_mm
0,39.1,18.7
1,39.5,17.4
2,40.3,18.0
4,36.7,19.3
5,39.3,20.6
...,...,...
339,55.8,19.8
340,43.5,18.1
341,49.6,18.2
342,50.8,19.0


In [10]:
df = pd.DataFrame(
    {
        'x':[4,8,15,24,24],
        'y':[4,4,8,4,12]
    }
)

In [11]:
df

Unnamed: 0,x,y
0,4,4
1,8,4
2,15,8
3,24,4
4,24,12


In [170]:
def min_value_location(df):
    row_idx_of_min_values_in_df = df.idxmin(axis=0,numeric_only=True).values.tolist()
    col_idx_of_min_values_in_df = df.idxmin(axis=1,numeric_only=True).values.tolist()
    lst_of_min_values_in_df = df.min(numeric_only=True).values.tolist()
    min_value = min(lst_of_min_values_in_df)
    idx_of_min_value = lst_of_min_values_in_df.index(min_value)
    idx_of_min_row_in_df = row_idx_of_min_values_in_df[idx_of_min_value]
    idx_of_min_col_in_df = col_idx_of_min_values_in_df[idx_of_min_value]
    new_cluster_location = [idx_of_min_row_in_df,int(idx_of_min_col_in_df)]
    return new_cluster_location, min_value

def l1(given_point,x,y):
    # Function to determine Manhattan distance, L1.
    d_l1 = []
    d = len(x) # assumes len(x) = len(y)
    for i in range(d):
        d_sum = (given_point[0] - x[i]) + (given_point[1] - y[i])
        d_l1.append(d_sum)
    return d_l1

def l2(given_point,x,y):
    # Function to determine Euclidean distance, L2.
    d_l2 = []
    d = len(x) # assumes len(x) = len(y)
    for i in range(d):
        d_sum = np.sqrt(((given_point[0] - x[i]) ** 2) + ((given_point[1] - y[i]) ** 2))
        d_l2.append(d_sum)
    return d_l2

def l3(given_point, x, y, p):
    # Function to determine Minkowski distance, L3.
    d_l3 = []
    d = len(x) # assumes len(x) = len(y)
    for i in range(d):
        d_sum = (((given_point[0] - x[i]) ** p) + ((given_point[1] - y[i]) ** p)) ** (1 / p)
        d_l3.append(d_sum)
    return d_l3

# step 1: initialize df
def initialize_cluster(df, distance_measure='l2'):
    df2 = []
    for i in range(len(df)):
        given_point = df.iloc[i]

        if distance_measure == 'l2':
            df2.append(l2(given_point,df['x'],df['y']))
        else:
            # dummy clause for now
            df2.append(l2(given_point,df['x'],df['y']))

    df3 = pd.DataFrame(df2)
    df3 = df3.drop(0, axis=1)

    blank_value = pd.NA
    for i in range(len(df3)):
        df3.loc[i:,i] = blank_value
    df3.drop(4,axis=0,inplace=True) # generatlize this [-1]
    # df3.drop(0, axis=1,inplace=True)
    return df3

def complete_linkage(df1):
    return df1.max()

def old_cluster(df1, new_cluster):
    df2 = df1.loc[[new_cluster[0],new_cluster[1]],:]
    df2.drop(new_cluster[0],axis=1,inplace=True)

    df1.drop([new_cluster[0]+1,new_cluster[1]+1],axis=1,inplace=True)
    df1.drop(new_cluster,axis=0,inplace=True)
    return df1, df2

def modifying_rows_cols(df1, df2):
    if 0 in df1.columns:
        df1.drop(0, axis=1,inplace=True)
    df1[df1.columns[-1]+1] = pd.Series()
    df1 = pd.concat(
        [df1, pd.Series(index=[df1.index[-1]+1], data=pd.NA).to_frame(name=df1.index[-1]+1).T], 
        axis=0, ignore_index=False
        )
    return df1, df2

def linkage_insertion(df1, df2, linkage_type):
    if linkage_type == 'complete':
        df1[df1.columns[-1]] = complete_linkage(df2)
    else:
        # dummy clause for now
        df1[df1.columns[-1]] = complete_linkage(df2)
    return df1

def clustering_iteration(df1, linkage_type='complete'):
    new_cluster, min_value = min_value_location(df1)
    df1, df2 = old_cluster(df1, new_cluster)
    df1, df2 = modifying_rows_cols(df1, df2)
    df1 = linkage_insertion(df1, df2, linkage_type)

    return df1

In [232]:
df3 = initialize_cluster(df, distance_measure='l2')
df3

Unnamed: 0,1,2,3,4,0
0,4.0,11.7047,20.0,21.540659,
1,,8.062258,16.0,17.888544,
2,,,9.848858,9.848858,
3,,,,8.0,


In [233]:
df7 = clustering_iteration(df3, linkage_type='complete')
df7

Unnamed: 0,3,4,5
2,9.848858,9.848858,11.7047
3,,8.0,20.0
4,,,21.540659


In [234]:
new_cluster, min_value = min_value_location(df7)
new_cluster, min_value

([3, 4], 8.0)

In [235]:
df1 = df7.copy()
df1

Unnamed: 0,3,4,5
2,9.848858,9.848858,11.7047
3,,8.0,20.0
4,,,21.540659


In [227]:
new_cluster[0]

3

In [236]:
df2 = df1.loc[[new_cluster[0],new_cluster[1]],:]
# df2.drop(new_cluster[0],axis=1,inplace=True)
df2

Unnamed: 0,3,4,5
3,,8.0,20.0
4,,,21.540659


In [237]:
df2.drop(new_cluster[0],axis=1,inplace=True)
df2

Unnamed: 0,4,5
3,8.0,20.0
4,,21.540659


In [238]:
df1

Unnamed: 0,3,4,5
2,9.848858,9.848858,11.7047
3,,8.0,20.0
4,,,21.540659


In [229]:
df1.drop([new_cluster[0]+1,new_cluster[1]+1],axis=1,inplace=True)
df1.drop(new_cluster,axis=0,inplace=True)
df1

Unnamed: 0,3
2,9.848858


In [230]:
df1[df1.columns[-1]+1] = pd.Series()
df1 = pd.concat(
        [df1, pd.Series(index=[df1.index[-1]+1], data=pd.NA).to_frame(name=df1.index[-1]+1).T], 
        axis=0, ignore_index=False
        )
df1

Unnamed: 0,3,4
2,9.848858,
3,,


In [231]:
df1[df1.columns[-1]] = complete_linkage(df2)
df1

Unnamed: 0,3,4
2,9.848858,
3,,


In [221]:
df8 = clustering_iteration(df7, linkage_type='complete')
df8

Unnamed: 0,3,4
2,9.848858,
3,,


In [278]:
import numpy as np

def agglomerative_clustering(data, num_clusters):
    """
    Performs agglomerative hierarchical clustering on the given data, and returns the
    labels for each data point indicating which cluster it belongs to.

    Args:
        data (ndarray): A 2D numpy array containing the data to cluster.
        num_clusters (int): The desired number of clusters.

    Returns:
        ndarray: A 1D numpy array containing the cluster labels for each data point.
    """
    n = data.shape[0]
    distances = np.zeros((n, n))
    np.fill_diagonal(distances, np.inf)

    # Calculate pairwise distances between all data points
    for i in range(n):
        for j in range(i + 1, n):
            distances[i][j] = np.linalg.norm(data[i] - data[j])
            distances[j][i] = distances[i][j]

    # Initialize each data point as its own cluster
    clusters = [[i] for i in range(n)]

    # Keep merging clusters until the desired number of clusters is reached
    while len(clusters) > num_clusters:
        # Find the two closest clusters
        min_distance = np.inf
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance = 0
                for p in clusters[i]:
                    for q in clusters[j]:
                        distance += distances[p][q]
                distance /= len(clusters[i]) * len(clusters[j])
                if distance < min_distance:
                    min_distance = distance
                    merge_clusters = (i, j)

        # Merge the two closest clusters
        clusters[merge_clusters[0]] += clusters[merge_clusters[1]]
        del clusters[merge_clusters[1]]

    # Assign each data point to a cluster
    labels = np.zeros(n, dtype=int)
    for i in range(num_clusters):
        for j in clusters[i]:
            labels[j] = i

    return labels


In [282]:
blah = df.to_numpy()

In [285]:
agglomerative_clustering(blah, num_clusters=3)

array([0, 0, 1, 2, 2])

# STARTING OVER

In [272]:
def min_value_location(df):
    try:
        row_idx_of_min_values_in_df = df.idxmin(axis=0,numeric_only=True).values.tolist()
    except:
        pass
    try:
        col_idx_of_min_values_in_df = df.idxmin(axis=1,numeric_only=True).values.tolist()
    except:
        pass
    lst_of_min_values_in_df = df.min(numeric_only=True).values.tolist()
    min_value = min(lst_of_min_values_in_df)
    idx_of_min_value = lst_of_min_values_in_df.index(min_value)
    idx_of_min_row_in_df = row_idx_of_min_values_in_df[idx_of_min_value]
    idx_of_min_col_in_df = col_idx_of_min_values_in_df[idx_of_min_value]
    new_cluster_location = [idx_of_min_row_in_df,int(idx_of_min_col_in_df)]
    return new_cluster_location, min_value

def initialize_cluster(df, distance_measure='l2'):
    df2 = []
    for i in range(len(df)):
        given_point = df.iloc[i]

        if distance_measure == 'l2':
            df2.append(l2(given_point,df['x'],df['y']))
        else:
            # dummy clause for now
            df2.append(l2(given_point,df['x'],df['y']))
    
    df2 = pd.DataFrame(np.triu(df2))
    df2.replace(0, pd.NA, inplace=True)
    return df2

In [240]:
df

Unnamed: 0,x,y
0,4,4
1,8,4
2,15,8
3,24,4
4,24,12


In [276]:
df2 = initialize_cluster(df, distance_measure='l2')
df2

Unnamed: 0,0,1,2,3,4
0,,4.0,11.7047,20.0,21.540659
1,,,8.062258,16.0,17.888544
2,,,,9.848858,9.848858
3,,,,,8.0
4,,,,,


In [274]:
df2.idxmin(axis=0,numeric_only=True).values.tolist()

[]

In [277]:
df2.idxmin(axis=1,numeric_only=True).values.tolist()

ValueError: attempt to get argmin of an empty sequence

In [273]:
new_cluster_location, min_value = min_value_location(df2)
new_cluster_location, min_value

ValueError: min() arg is an empty sequence

In [253]:
row_idx_of_min_values_in_df = df2.idxmin(axis=0,numeric_only=True).values.tolist()
col_idx_of_min_values_in_df = df2.idxmin(axis=1,numeric_only=True).values.tolist()
lst_of_min_values_in_df = df2.min(numeric_only=True).values.tolist()
row_idx_of_min_values_in_df, col_idx_of_min_values_in_df, lst_of_min_values_in_df

([0, 1, 2, 3, 4], [0, 0, 0, 0, 0], [0.0, 0.0, 0.0, 0.0, 0.0])

In [323]:
def distance_matrix_creation(data):
    n = data.shape[0]
    distances = np.zeros((n, n))
    np.fill_diagonal(distances, np.inf)
    return distances, n

def np_l1(x,y):
      return np.abs((x[0] - y[0])) + np.abs((x[1] - y[1]))
def np_l2(x,y):
      return np.sqrt(((x[0] - y[0]) ** 2) + ((x[1] - y[1]) ** 2))
def np_lp(x,y,p):
      return ((np.abs(x[0] - y[0]) ** p) + (np.abs(x[1] - y[1]) ** p)) ** p

def distance_calculations(data, n, distances, distance_measure='l2', p=None):
    for i in range(n):
        for j in range(i + 1, n):
            if distance_measure == 'l1':
                distances[i][j] = np_l1(data[i], data[j])
                distances[j][i] = distances[i][j]
            elif distance_measure == 'l2':
                distances[i][j] = np_l2(data[i], data[j])
                distances[j][i] = distances[i][j]
            else:
                distances[i][j] = np_lp(data[i], data[j], p=1)
                distances[j][i] = distances[i][j]
    return distances

def cluster_merging(distances, clusters, num_clusters):
    while len(clusters) > num_clusters:
        # Find the two closest clusters
        min_distance = np.inf
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance = 0
                for p in clusters[i]:
                    for q in clusters[j]:
                        distance += distances[p][q]
                distance /= len(clusters[i]) * len(clusters[j])
                if distance < min_distance:
                    min_distance = distance
                    merge_clusters = (i, j)

    # Merge the two closest clusters
    clusters[merge_clusters[0]] += clusters[merge_clusters[1]]
    del clusters[merge_clusters[1]]

    return clusters

def cluster_assignment(clusters, num_clusters):
    labels = np.zeros(n, dtype=int)
    for i in range(num_clusters):
        for j in clusters[i]:
            labels[j] = i
    return labels

def hierarchal_clustering_agglomerative(data, num_clusters, distance_measure):
    distances, n = distance_matrix_creation(data)
    distances = distance_calculations(data, n, distances, distance_measure, p=None)
    clusters = [[i] for i in range(n)]
    clusters = cluster_merging(distances, clusters, num_clusters)
    labels = cluster_assignment(clusters, num_clusters)
    return labels

In [324]:
data = df.to_numpy()

In [325]:
labels = hierarchal_clustering_agglomerative(data, num_clusters=3, distance_measure='l1')

KeyboardInterrupt: 