In [1]:
import numpy as np
from numpy.linalg import norm
import csv

In [2]:
class matrix:
    """Small wrapper around a NumPy array stored in ``array_2d``.

    Provides CSV loading, per-column range scaling, (weighted) Euclidean
    distances from one row to every row of another matrix, and a value
    frequency count for single-column matrices.
    """

    def __init__(self, filename=None, array_2d=None):
        """Create a matrix from a CSV file, from an existing array, or empty.

        Args:
            filename: Path of a CSV file to load. Takes precedence over
                ``array_2d`` when truthy.
            array_2d: Array (or array-like such as a list of rows) to wrap.
                An existing ndarray is stored as-is, without copying.
        """
        # Start from an empty array so the attribute always exists.
        self.array_2d = np.array([])

        if filename:
            self.load_from_csv(filename)
        elif array_2d is not None:
            # asanyarray returns ndarrays (and subclasses) unchanged, but turns
            # plain lists into arrays so the other methods can slice them.
            self.array_2d = np.asanyarray(array_2d)

    # 1) Loading CSV file
    def load_from_csv(self, filename):
        """Replace ``array_2d`` with the numeric contents of a CSV file.

        Every field is converted with ``float``; blank lines are skipped.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a field cannot be converted to ``float``.
        """
        # newline='' is what the csv module asks for when reading files.
        with open(filename, 'r', newline='') as file:
            reader = csv.reader(file)
            # csv.reader yields [] for a blank line; keeping those rows would
            # make the data ragged, so they are dropped.
            data = [[float(value) for value in row] for row in reader if row]
        self.array_2d = np.array(data)

    # 2) Standardize the matrix
    def standardise(self):
        """Rescale every column to ``(x - mean) / (max - min)``.

        A column whose values are all equal has zero range; it is set to 0
        instead of dividing by zero. Floating-point arrays are updated in
        place. Any other dtype is first replaced by a float copy, because
        writing fractional results into an integer array would truncate them.
        An empty or non-2-D array is left untouched.
        """
        if self.array_2d.ndim != 2 or self.array_2d.size == 0:
            return
        if not np.issubdtype(self.array_2d.dtype, np.inexact):
            self.array_2d = self.array_2d.astype(float)

        for j in range(self.array_2d.shape[1]):
            col = self.array_2d[:, j]
            spread = np.max(col) - np.min(col)
            if spread != 0:
                self.array_2d[:, j] = (col - np.mean(col)) / spread
            else:
                # Constant column: every value equals the mean.
                self.array_2d[:, j] = 0.0

    # 3) Calculate Euclidean distance
    def get_distance(self, other_matrix, row_i):
        """Euclidean distance from row ``row_i`` to each row of ``other_matrix``.

        Returns:
            A column array with one distance per row of ``other_matrix``.
        """
        # The selected row broadcasts against every row of the other matrix.
        diff = self.array_2d[row_i, :] - other_matrix.array_2d
        dist = np.sqrt(np.sum(diff**2, axis=1))
        return dist.reshape(-1, 1)

    # 4) Calculate weighted Euclidean distance
    def get_weighted_distance(self, other_matrix, weights, row_i):
        """Weighted Euclidean distance from row ``row_i`` to each row of ``other_matrix``.

        Args:
            other_matrix: Matrix whose rows are compared against row ``row_i``.
            weights: Matrix whose ``array_2d`` multiplies the squared
                differences (it must broadcast against them).
            row_i: Index of the row of this matrix to measure from.

        Returns:
            A column array with one distance per row of ``other_matrix``.
        """
        diff = self.array_2d[row_i, :] - other_matrix.array_2d
        weighted_diff = weights.array_2d * (diff**2)
        dist = np.sqrt(np.sum(weighted_diff, axis=1))
        return dist.reshape(-1, 1)

    # 5) Count frequency of unique values (for a single-column matrix)
    def get_count_frequency(self):
        """Map each distinct value to its number of occurrences.

        Returns:
            A dict ``{value: count}`` when the matrix has exactly one column,
            otherwise ``0`` (including for the empty default matrix, which is
            one-dimensional).
        """
        if self.array_2d.ndim != 2 or self.array_2d.shape[1] != 1:
            return 0
        return dict(zip(*np.unique(self.array_2d, return_counts=True)))

In [3]:
# Function to get initial random weights
def get_initial_weights(m):
    """Draw ``m`` random weights and scale them so they sum to 1.

    Args:
        m: Number of weights to generate.

    Returns:
        A ``(1, m)`` array of uniform random draws divided by their total.
    """
    raw = np.random.rand(1, m)
    total = np.sum(raw)
    return raw / total

# Function to get centroids
def get_centroids(data, S, K):
    """Compute the mean row of every cluster.

    Args:
        data: Matrix whose ``array_2d`` holds one observation per row.
        S: Matrix whose first column holds the cluster label of each row.
        K: Number of clusters; labels ``0 .. K-1`` are looked up.

    Returns:
        A ``matrix`` with ``K`` rows. A cluster with no members keeps an
        all-zero row.
    """
    n_features = data.array_2d.shape[1]
    centroids = np.zeros((K, n_features))
    labels = S.array_2d[:, 0]

    for cluster in range(K):
        members = data.array_2d[labels == cluster]
        if members.shape[0] == 0:
            # Nothing assigned to this cluster: leave its row at zero.
            continue
        centroids[cluster, :] = np.mean(members, axis=0)

    return matrix(array_2d=centroids)

# Function to calculate separation within clusters
def get_separation_within(data, centroids, S, K):
    """Sum, per column, the squared deviations of rows from their cluster centroid.

    Args:
        data: Matrix whose ``array_2d`` holds one observation per row.
        centroids: Matrix whose row ``k`` is the centroid of cluster ``k``.
        S: Matrix whose first column holds the cluster label of each row.
        K: Number of clusters; labels ``0 .. K-1`` are visited.

    Returns:
        A ``(1, n_columns)`` array; entry ``j`` is the total over all clusters
        of ``(value - centroid)**2`` in column ``j``.
    """
    separation_within = np.zeros((1, data.array_2d.shape[1]))
    labels = S.array_2d[:, 0]

    for k in range(K):
        # Build each cluster's row selection once and handle all columns
        # together, instead of re-masking for every column and element.
        group_data = data.array_2d[labels == k]
        deviations = group_data - centroids.array_2d[k, :]
        # An empty cluster contributes a row of zeros here.
        separation_within[0, :] += np.sum(deviations ** 2, axis=0)

    return separation_within

In [4]:

# Function to calculate separation between clusters
def get_separation_between(data, centroids, S, K):
    """Sum, per column, the squared deviations of centroids from the overall mean.

    Args:
        data: Matrix whose ``array_2d`` holds one observation per row.
        centroids: Matrix whose row ``k`` is the centroid of cluster ``k``.
        S: Cluster labels. Accepted for a signature parallel to
            ``get_separation_within`` but not read here.
            NOTE(review): each cluster counts once regardless of its size;
            confirm that no size weighting is intended.
        K: Number of centroid rows to include (rows ``0 .. K-1``).

    Returns:
        A ``(1, n_columns)`` array; entry ``j`` is the sum over clusters of
        ``(centroid - column mean)**2`` in column ``j``.
    """
    separation_between = np.zeros((1, data.array_2d.shape[1]))
    # The column means do not depend on the cluster, so compute them once.
    overall_mean = np.mean(data.array_2d, axis=0)

    for k in range(K):
        deviation = centroids.array_2d[k, :] - overall_mean
        separation_between[0, :] += deviation ** 2

    return separation_between

# Function to create groups
def get_groups(data, K):
    """Assign every row of ``data`` to one of ``K`` clusters.

    Starts from ``K`` distinct randomly chosen rows as centroids and one set
    of random feature weights, then alternates between assigning each row to
    its nearest centroid (weighted Euclidean distance) and recomputing each
    centroid as the mean of its members, until the assignment stops changing.

    Args:
        data: Matrix whose ``array_2d`` holds one observation per row.
        K: Number of clusters.

    Returns:
        A single-column ``matrix`` holding the cluster label of each row
        (stored as floats).

    Raises:
        ValueError: From ``np.random.choice`` when ``K`` exceeds the number
            of rows, since the initial centroids are drawn without replacement.
    """
    n_rows = data.array_2d.shape[0]
    S = matrix(array_2d=np.zeros((n_rows, 1)))

    # Fancy indexing copies the chosen rows, so updating the centroids below
    # never writes into the data itself.
    seed_rows = np.random.choice(n_rows, K, replace=False)
    centroids_matrix = matrix(array_2d=data.array_2d[seed_rows, :])

    # NOTE(review): the weights are drawn once and stay fixed for the whole
    # loop; get_new_weights() is never called from here. Confirm whether a
    # per-iteration weight update was intended.
    weights_matrix = matrix(array_2d=get_initial_weights(data.array_2d.shape[1]))
    old_S = None

    # old_S is None only before the first pass, which must always run.
    while old_S is None or not np.array_equal(S.array_2d, old_S):
        old_S = np.copy(S.array_2d)

        # Assignment step: nearest centroid for every row.
        for i in range(n_rows):
            distances = data.get_weighted_distance(centroids_matrix, weights_matrix, i)
            S.array_2d[i, 0] = np.argmin(distances)

        # Update step: a cluster left without members keeps its old centroid.
        labels = S.array_2d[:, 0]
        for k in range(K):
            group_data = data.array_2d[labels == k]
            if group_data.shape[0] > 0:
                centroids_matrix.array_2d[k, :] = np.mean(group_data, axis=0)

    return S

In [5]:
# Function to calculate new weights
def get_new_weights(data, centroids, old_weights, S, K):
    """Rescale the feature weights from the current clustering.

    Each old weight is multiplied by the square root of the ratio between the
    within-cluster and between-cluster separation of its column, and the
    result is normalised to sum to 1.

    NOTE(review): a column whose between-cluster separation is zero makes the
    ratio a division by zero; confirm how that case should be handled.

    Args:
        data: Matrix of observations.
        centroids: Matrix of cluster centroids.
        old_weights: Current weights; must broadcast against a
            ``(1, n_columns)`` array.
        S: Matrix of cluster labels.
        K: Number of clusters.

    Returns:
        The updated weights, normalised so that they sum to 1.
    """
    within = get_separation_within(data, centroids, S, K)
    between = get_separation_between(data, centroids, S, K)
    scaled = old_weights * (within / between) ** 0.5
    total = np.sum(scaled)
    return scaled / total

# Function to run test
def run_test():
    """Cluster the test dataset repeatedly and print the cluster sizes.

    Loads 'test(1)_anubavam_dataset.csv', then for every K from 2 to 10 runs
    get_groups 20 times and prints ``K=<label frequency dict>`` for each run.
    """
    dataset = matrix('test(1)_anubavam_dataset.csv')
    for k in range(2, 11):
        for _ in range(20):
            groups = get_groups(dataset, k)
            print('{}={}'.format(k, groups.get_count_frequency()))

In [6]:
# Run the clustering experiment; every run prints one "K={label: count}" line.
run_test()

2={0.0: 101, 1.0: 77}
2={0.0: 84, 1.0: 94}
2={0.0: 43, 1.0: 135}
2={0.0: 74, 1.0: 104}
2={0.0: 121, 1.0: 57}
2={0.0: 91, 1.0: 87}
2={0.0: 46, 1.0: 132}
2={0.0: 121, 1.0: 57}
2={0.0: 98, 1.0: 80}
2={0.0: 83, 1.0: 95}
2={0.0: 60, 1.0: 118}
2={0.0: 90, 1.0: 88}
2={0.0: 102, 1.0: 76}
2={0.0: 86, 1.0: 92}
2={0.0: 118, 1.0: 60}
2={0.0: 56, 1.0: 122}
2={0.0: 115, 1.0: 63}
2={0.0: 86, 1.0: 92}
2={0.0: 114, 1.0: 64}
2={0.0: 83, 1.0: 95}
3={0.0: 93, 1.0: 1, 2.0: 84}
3={0.0: 52, 1.0: 64, 2.0: 62}
3={0.0: 57, 1.0: 65, 2.0: 56}
3={0.0: 1, 1.0: 97, 2.0: 80}
3={0.0: 1, 1.0: 81, 2.0: 96}
3={0.0: 71, 1.0: 48, 2.0: 59}
3={0.0: 59, 1.0: 79, 2.0: 40}
3={0.0: 66, 1.0: 51, 2.0: 61}
3={0.0: 80, 1.0: 55, 2.0: 43}
3={0.0: 57, 1.0: 58, 2.0: 63}
3={0.0: 96, 1.0: 81, 2.0: 1}
3={0.0: 85, 1.0: 92, 2.0: 1}
3={0.0: 50, 1.0: 62, 2.0: 66}
3={0.0: 67, 1.0: 48, 2.0: 63}
3={0.0: 54, 1.0: 71, 2.0: 53}
3={0.0: 22, 1.0: 82, 2.0: 74}
3={0.0: 53, 1.0: 54, 2.0: 71}
3={0.0: 73, 1.0: 34, 2.0: 71}
3={0.0: 54, 1.0: 66, 2.0: 58}
3={

-----