<a href="https://colab.research.google.com/github/amresh224006/ALL-problems/blob/main/Anubavam_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Create the matrix class and load the CSV data
import numpy as np
import csv

class Matrix:
    def __init__(self, filename=None):
        self.array_2d = None
        if filename:
            self.load_from_csv(filename)

    def load_from_csv(self, filename):
        with open(filename, newline='') as csvfile:
            data = list(csv.reader(csvfile))
            self.array_2d = np.array(data, dtype=float)

    def standardise(self):
        # Standardisation of the matrix
        array = self.array_2d
        for j in range(array.shape[1]):
            col = array[:, j]
            mean_col = np.mean(col)
            max_col = np.max(col)
            min_col = np.min(col)
            array[:, j] = (col - mean_col) / (max_col - min_col)
        self.array_2d = array

    def get_distance(self, other_matrix, row_i):
        distances = []
        row_a = self.array_2d[row_i, :]
        for row_b in other_matrix.array_2d:
            dist = np.linalg.norm(row_a - row_b)
            distances.append([dist])
        return np.array(distances)

    def get_weighted_distance(self, other_matrix, weights, row_i):
        distances = []
        row_a = self.array_2d[row_i, :]
        for row_b in other_matrix.array_2d:
            weighted_dist = np.sqrt(np.sum(weights.array_2d * (row_a - row_b) ** 2))
            distances.append([weighted_dist])
        return np.array(distances)

    def get_count_frequency(self):
        if self.array_2d.shape[1] != 1:
            return 0
        unique, counts = np.unique(self.array_2d, return_counts=True)
        return dict(zip(unique, counts))

# Test loading the file
m = Matrix('/content/Data (2).csv')
print(m.array_2d)


[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]


In [2]:
#Define the additional functions
def get_initial_weights(m):
    weights = np.random.rand(1, m)
    weights /= np.sum(weights)  # Normalize to ensure the sum equals 1
    return weights

def get_centroids(matrix, S, K):
    centroids = []
    for k in range(1, K+1):
        cluster_points = matrix.array_2d[S.array_2d[:, 0] == k]
        if len(cluster_points) > 0:
            centroids.append(np.mean(cluster_points, axis=0))
    return np.array(centroids)

def get_separation_within(matrix, centroids, S, K):
    separation_within = np.zeros((1, matrix.array_2d.shape[1]))
    for j in range(matrix.array_2d.shape[1]):
        for k in range(1, K+1):
            cluster_points = matrix.array_2d[S.array_2d[:, 0] == k]
            centroid_k = centroids[k-1, j]
            separation_within[0, j] += np.sum((cluster_points[:, j] - centroid_k) ** 2)
    return separation_within

def get_separation_between(matrix, centroids, S, K):
    separation_between = np.zeros((1, matrix.array_2d.shape[1]))
    for j in range(matrix.array_2d.shape[1]):
        for k in range(1, K+1):
            Nk = np.sum(S.array_2d[:, 0] == k)
            centroid_k = centroids[k-1, j]
            separation_between[0, j] += Nk * np.linalg.norm(matrix.array_2d[:, j] - centroid_k) ** 2
    return separation_between


In [4]:
def get_groups(matrix, K):
    n, m = matrix.array_2d.shape  # n = number of rows, m = number of columns

    # Step 2: Initialize weights (1 row, m columns)
    weights = get_initial_weights(m)

    # Step 4: Create matrix S with n rows and 1 column
    S = np.zeros((n, 1))

    # Step 5: Randomly select K rows from the matrix as initial centroids
    random_indices = np.random.choice(n, K, replace=False)
    centroids = matrix.array_2d[random_indices, :]

    prev_S = np.zeros(S.shape)  # To check for convergence

    while not np.array_equal(S, prev_S):
        prev_S = S.copy()

        # Step 7: Assign each row to the closest centroid
        for i in range(n):
            distances = [np.sqrt(np.sum(weights * (matrix.array_2d[i, :] - centroids[k, :]) ** 2)) for k in range(K)]
            S[i] = np.argmin(distances) + 1  # Assign the closest centroid's index (1-based)

        # Step 9: Update the centroids
        for k in range(K):
            cluster_points = matrix.array_2d[S[:, 0] == k + 1]
            if len(cluster_points) > 0:
                centroids[k, :] = np.mean(cluster_points, axis=0)

    return MatrixWithS(S)

class MatrixWithS:
    def __init__(self, S):
        self.array_2d = S

    def get_count_frequency(self):
        unique, counts = np.unique(self.array_2d, return_counts=True)
        return dict(zip(unique, counts))



In [5]:
 #Implement run_test
def run_test():
    m = Matrix('/content/Data (2).csv')
    for k in range(2, 11):
        for i in range(20):
            S = get_groups(m, k)
            print(str(k) + '=' + str(S.get_count_frequency()))

# Example of calling the test
run_test()


2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}

In [6]:
def get_new_weights(matrix, centroids, old_weights, S, K):
    # Get separation within and separation between
    separation_within = get_separation_within(matrix, centroids, S, K)
    separation_between = get_separation_between(matrix, centroids, S, K)

    # Initialize the new weights
    new_weights = np.zeros(old_weights.shape)

    # Calculate the sum of the ratio b_j/a_j across all features
    sum_b_over_a = np.sum(separation_between / separation_within)

    # Update each weight according to the formula
    for j in range(old_weights.shape[1]):
        new_weights[0, j] = 0.5 * (old_weights[0, j] + (separation_between[0, j] / separation_within[0, j]) / sum_b_over_a)

    return new_weights


In [7]:
def get_groups(matrix, K):
    n, m = matrix.array_2d.shape  # n = number of rows, m = number of columns

    # Step 2: Initialize weights (1 row, m columns)
    weights = get_initial_weights(m)

    # Step 4: Create matrix S with n rows and 1 column
    S = np.zeros((n, 1))

    # Step 5: Randomly select K rows from the matrix as initial centroids
    random_indices = np.random.choice(n, K, replace=False)
    centroids = matrix.array_2d[random_indices, :]

    prev_S = np.zeros(S.shape)  # To check for convergence

    while not np.array_equal(S, prev_S):
        prev_S = S.copy()

        # Step 7: Assign each row to the closest centroid using weighted distance
        for i in range(n):
            distances = [np.sqrt(np.sum(weights * (matrix.array_2d[i, :] - centroids[k, :]) ** 2)) for k in range(K)]
            S[i] = np.argmin(distances) + 1  # Assign the closest centroid's index (1-based)

        # Step 9: Update the centroids based on the cluster assignments
        for k in range(K):
            cluster_points = matrix.array_2d[S[:, 0] == k + 1]
            if len(cluster_points) > 0:
                centroids[k, :] = np.mean(cluster_points, axis=0)

        # Step 10: Update weights using the separation within and between clusters
        weights = get_new_weights(matrix, centroids, weights, S, K)

    return MatrixWithS(S)


In [8]:
def run_test():
    m = Matrix('/content/Data (2).csv')
    for k in range(2, 11):
        for i in range(20):
            S = get_groups(m, k)
            print(str(k) + '=' + str(S.get_count_frequency()))

# Example of calling the test
run_test()


2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
2={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
3={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
4={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}
5={0.0: 178}