In [142]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import glob
from itertools import product
import pandas as pd
import logging
import os
import pathlib
from sklearn.base import BaseEstimator, ClusterMixin

# Set up logger
logger = logging.getLogger(__name__)
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [143]:
SCRIPT_DIR = pathlib.Path(os.getcwd()).absolute()
DATA_DIR = os.path.join(SCRIPT_DIR.parent, "data")
PREPROCESSED_DATA_DIR = f'{DATA_DIR}/1_preprocessed/'

In [144]:
# FuzzyCMeansParamsGrid = {
#     "n_clusters": [2, 3, 4],
#     "fuzzyness": [1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5],
#     "suppression_factor": [0.1, 0.2, 0.3, 0.4, 0.5]
# }

FuzzyCMeansParamsGrid = {
    "n_clusters": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    "fuzzyness": [1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5],
}

In [145]:
class FuzzyCMeans(ClusterMixin, BaseEstimator):
    def __init__(self, n_clusters: int, fuzzyness: float, suppression_rule='theta', suppression_param=0.5):
        self.n_clusters = n_clusters
        self.fuzzyness = fuzzyness
        self.suppression_rule = suppression_rule  # 'theta', 'rho', 'beta', 'kappa', 'tau', 'sigma', 'xi'
        self.suppression_param = suppression_param

    def fit(self, X):
        X = X.to_numpy()
        n_samples, n_features = X.shape

        print(n_samples, n_features)
        
        # Initialize cluster prototypes (randomly for this example)
        print(np.random.choice(n_samples, self.n_clusters, replace=False))
        self.cluster_prototypes_ = X[np.random.choice(n_samples, self.n_clusters, replace=False)]

        # Initialize fuzzy membership matrix
        U = self._initialize_membership(X)

        # Main loop (alternating optimization)
        while True:
            previous_prototypes = np.copy(self.cluster_prototypes_)

            # 4. Compute distances
            distances = self._compute_distances(X)
            # print('distances', distances)

            # 5. Update fuzzy membership matrix
            U = self._update_membership(distances)
            # print('after fuzzyness', U)

            # 6. Apply suppression (context-sensitive)
            U = self._apply_suppression(U, distances)
            # print('after suppression', U)

            # 7. Update cluster prototypes
            self.cluster_prototypes_ = self._update_prototypes(X, U)

            # print('cluster protos', self.cluster_prototypes_)
            # print('previous protos', previous_prototypes)

            # 8. Check for convergence
            diff = np.linalg.norm(self.cluster_prototypes_ - previous_prototypes)
            # print('diff',diff)
            if diff < 1e-4:
                break


        self.is_fitted_ = True
        self.clusters_ = np.argmax(U, axis=1)
        self.centroids_ = self.cluster_prototypes_
        
        return self
    
    def fit_predict(self, data):
        """
        Fit the model and return cluster labels.
        """
        self.fit(data)
        return self.clusters_

    def _initialize_membership(self, X):
        # print('initialize membership')
        U = np.random.rand(len(X), self.n_clusters)
        U = U / np.sum(U, axis=1, keepdims=True)  # Normalize to satisfy probabilistic constraint
        return U

    def _compute_distances(self, X):
        # print('compute distances')
        distances = np.zeros((len(X), self.n_clusters))
        for i in range(self.n_clusters):
            # print(self.cluster_prototypes_)
            distances[:, i] = np.linalg.norm(X - self.cluster_prototypes_[i], axis=1)
        return distances

    def _update_membership(self, distances):
        # print('update membership')
        m = self.fuzzyness
        # print('m-1', m-1)
        U = np.power(distances, -2 / (m - 1-1e-6))
        U = np.where(np.isinf(U), 1.0, U) # handles possible inf values
        U = U / np.sum(U, axis=1, keepdims=True)  # Normalize

        return U

    def _apply_suppression(self, U, distances):
        # print('apply suppression')
        suppressed_U = np.zeros_like(U)
        winner_indices = np.argmax(U, axis=1)

        for k in range(len(U)):
            w = winner_indices[k]  # Winner cluster index
            alpha_k = self._compute_suppression_rate(U[k, w], distances[k], w)

            suppressed_U[k, :] = alpha_k * U[k, :]
            suppressed_U[k, w] = 1 - alpha_k + alpha_k * U[k, w]

        return suppressed_U

    def _compute_suppression_rate(self, u_w, distances_k, w):
        # print('compute suppression rate')
        m = self.fuzzyness
        rule = self.suppression_rule
        param = self.suppression_param

        if rule == 'theta':
            return 1 / (1 - u_w + u_w * (1 - param) ** (2 / (1 - m)))
        elif rule == 'rho':
            return 1 / (1 - u_w + param ** (2 / (1 - m)) * u_w ** ((3 - m) / (1 - m)))
        elif rule == 'beta':
            return 1 / (1 + u_w * (u_w ** (2 * param / (1 - m) / (1 - param)) - 1))
        elif rule == 'kappa':
            return 1 / (1 - u_w + u_w * (0.5 - (2 * param - 1) / 2 * np.sin(np.pi * u_w)) ** (2 / (1 - m)))
        elif rule == 'tau':
            return (1 - param) / (1 + u_w * param)
        elif rule == 'sigma':
            return (1 - u_w ** param) / (1 - u_w)
        elif rule == 'xi':
            return (1 - (np.sin(np.pi * u_w / 2)) ** param) / (1 - u_w)
        else:
            raise ValueError("Invalid suppression rule")

    def _update_prototypes(self, X, U):
        # print('update prototypes')
        m = self.fuzzyness
        U_m = np.power(U, m)
        new_prototypes = np.dot(U_m.T, X) / np.sum(U_m, axis=0, keepdims=True).T
        return new_prototypes

In [146]:
data_path = f"{DATA_DIR}/1_preprocessed/synthetic.csv"
preprocessed_data  = pd.read_csv(data_path)
preprocessed_data.shape

(1000, 3)

In [147]:

features_data = preprocessed_data.iloc[:, :-1]
# features_data = features_data.sample(n=5)
features_data.shape

(1000, 2)

In [148]:
dataset = 'synthetic'
model_name = 'fuzzy_cmeans'
from tools.config import PREPROCESSED_DATA_DIR, CLUSTERED_DATA_DIR

In [149]:
params_grid = FuzzyCMeansParamsGrid
i = 0
for params in product(*params_grid.values()): 
    param_dict = dict(zip(params_grid.keys(), params))
    print(param_dict)
    model = FuzzyCMeans(**param_dict)

    clustered_data_dir = CLUSTERED_DATA_DIR / dataset / model_name
    os.makedirs(clustered_data_dir, exist_ok=True)

    clusters = model.fit_predict(features_data)


    clustered_data = pd.concat(
        [preprocessed_data.iloc[:, :-1], pd.Series(clusters, name="cluster")], axis=1
    )


    clustered_data_path = clustered_data_dir / f"{','.join(f'{k}={v}' for k, v in param_dict.items())}.csv"
    clustered_data.to_csv(clustered_data_path, index=False)

    if i == 5:
        break
    i += 1



{'n_clusters': 2, 'fuzzyness': 1.5}
1000 2
[294 915]
{'n_clusters': 2, 'fuzzyness': 2}
1000 2
[348 149]
{'n_clusters': 2, 'fuzzyness': 2.5}
1000 2
[  3 409]
{'n_clusters': 2, 'fuzzyness': 3}
1000 2
[346 657]
{'n_clusters': 2, 'fuzzyness': 3.5}
1000 2
[279 256]
{'n_clusters': 2, 'fuzzyness': 4}
1000 2
[715 469]
