In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings("ignore")

In [None]:
def create_consenus_matrix(N, partitions):
    """Build the co-association (consensus) matrix of an ensemble of partitions.

    Entry ``(i, j)`` counts the partitions in which objects ``i`` and ``j``
    carry the same label, so every diagonal entry equals ``len(partitions)``.

    Parameters
    ----------
    N : int
        Number of objects; only the first ``N`` labels of each partition are used.
    partitions : sequence of array-like
        One label vector per partition, each with at least ``N`` entries.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(N, N)`` holding the co-occurrence counts.

    Raises
    ------
    IndexError
        If a partition holds fewer than ``N`` labels.
    """
    consensus_matrix = np.zeros((N, N))

    for partition in partitions:
        labels = np.asarray(partition)
        if len(labels) < N:
            raise IndexError(
                f"partition has {len(labels)} labels, expected at least {N}"
            )
        labels = labels[:N]
        # Broadcasting compares every pair (i, j) of this partition at once;
        # the boolean result is added as 0/1 to the running counts.
        consensus_matrix += labels[:, None] == labels[None, :]
    return consensus_matrix

def modularity_shift(N, matrix):
    """Subtract the modularity null model from a similarity matrix, in place.

    Each entry ``(i, j)`` of the leading ``N x N`` block is reduced by
    ``row_sum[i] * col_sum[j] / total``, where the sums are taken over the
    matrix as passed in. For a symmetric matrix row and column sums coincide.

    The matrix is modified in place AND returned (same object); callers in
    this notebook keep using the array they passed in, so this side effect
    is part of the contract.

    Parameters
    ----------
    N : int
        Size of the leading square block to shift.
    matrix : numpy.ndarray
        Square similarity matrix with at least ``N`` rows and columns.

    Returns
    -------
    numpy.ndarray
        The same ``matrix`` object after shifting. If the total sum is zero
        the matrix is returned unchanged, since the shift would be 0/0.
    """
    row_sums = matrix.sum(axis=1)
    col_sums = matrix.sum(axis=0)
    overall_sum = matrix.sum()

    # An all-zero (or zero-sum) matrix has no defined null model; leaving it
    # untouched avoids filling it with NaN.
    if overall_sum == 0:
        return matrix

    expected = np.outer(row_sums[:N], col_sums[:N]) / overall_sum
    # Slice assignment keeps the update in place on the caller's array.
    matrix[:N, :N] = matrix[:N, :N] - expected
    return matrix

def scale_shift(N,matrix):
    """Subtract the mean entry from a similarity matrix, in place.

    The mean is ``matrix.sum() / (N * N)``; it is subtracted from every entry
    of the leading ``N x N`` block. The matrix is modified in place and the
    same object is returned.

    Parameters
    ----------
    N : int
        Size of the square block; also used as the divisor for the mean.
    matrix : numpy.ndarray
        Square similarity matrix with at least ``N`` rows and columns.

    Returns
    -------
    numpy.ndarray
        The same ``matrix`` object after shifting (unchanged when ``N <= 0``).
    """
    if N <= 0:
        return matrix

    # Parentheses matter: ``sum / N * N`` would evaluate to ``sum`` itself.
    average = matrix.sum() / (N * N)
    # Slice assignment keeps the update in place on the caller's array.
    matrix[:N, :N] = matrix[:N, :N] - average
    return matrix
    

In [None]:
def agsu(N, all_partitions, criterion='av'):
    """Agglomerative consensus clustering over an ensemble of partitions.

    Builds the consensus matrix of ``all_partitions``, applies the modularity
    shift, and then repeatedly merges the pair of clusters with the largest
    positive score. Merging stops when no pair has a positive score or when
    only two clusters are left.

    Parameters
    ----------
    N : int
        Number of objects being clustered.
    all_partitions : sequence of array-like
        Label vectors of the ensemble, one per partition.
    criterion : str, default 'av'
        With ``'av'`` the summed between-cluster similarity is divided by the
        combined size of the two clusters; any other value (e.g. ``'sum'``)
        uses the summed similarity as is.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``N`` with the consensus cluster index of
        each object.
    """
    # Get consensus matrix
    consensus_matrix = create_consenus_matrix(N, all_partitions)

    # Modularity-shift. Work on the returned matrix explicitly rather than
    # relying on the input having been modified in place.
    similarity = modularity_shift(N, consensus_matrix)

    # Partitioning initialisation: every object starts as its own cluster
    clusters = [[i] for i in range(len(similarity))]

    while len(clusters) > 2:
        k = len(clusters)
        # Upper-triangle pairs (s < t) in row-major order, i.e. the same
        # order a nested ``for s ... for t > s`` scan would visit them.
        rows, cols = np.triu_indices(k, k=1)
        scores = similarity[rows, cols]
        if criterion == 'av':
            sizes = np.array([len(cluster) for cluster in clusters])
            scores = scores / (sizes[rows] + sizes[cols])

        # A NaN score must never be selected (it compares false to everything).
        scores = np.where(np.isnan(scores), -np.inf, scores)

        # argmax returns the first maximum, matching a strict ``>`` scan.
        best = int(np.argmax(scores))
        if not scores[best] > 0:
            break
        best_s, best_t = int(rows[best]), int(cols[best])

        clusters[best_s] += clusters[best_t]
        del clusters[best_t]

        # Fold row/column t into s, then drop t from the matrix.
        similarity[best_s, :] += similarity[best_t, :]
        similarity[:, best_s] += similarity[:, best_t]
        similarity = np.delete(similarity, best_t, axis=0)
        similarity = np.delete(similarity, best_t, axis=1)

    cluster_labels = np.empty(N, dtype=int)

    for cluster_idx, cluster in enumerate(clusters):
        cluster_labels[cluster] = cluster_idx

    return cluster_labels

def agsa(N, all_partitions, criterion='av'):
    """Agglomerative consensus clustering; same procedure as ``agsu``.

    This function's body was a line-for-line copy of ``agsu``, so it now
    delegates to it to keep the two from drifting apart.

    NOTE(review): if ``agsa`` was meant to implement a different merge
    criterion than ``agsu``, that difference was never coded; confirm the
    intent and implement it here if needed.

    Parameters
    ----------
    N : int
        Number of objects being clustered.
    all_partitions : sequence of array-like
        Label vectors of the ensemble, one per partition.
    criterion : str, default 'av'
        Passed through to ``agsu`` unchanged.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``N`` with the consensus cluster index of
        each object.
    """
    return agsu(N, all_partitions, criterion)

def get_test_data(N,K, m):
    """Draw a random array of shape ``(N, K, m)`` from NumPy's global RNG.

    Values are uniform samples in ``[0, 1)``; seed ``np.random`` beforehand
    for reproducible output.
    """
    shape = (N, K, m)
    return np.random.random_sample(shape)

## Experiment

In [None]:
# Load the training table; it must provide 'id' and 'target' columns.
df = pd.read_csv('train.csv')

# Keep only the 1% "test" side of a stratified split as the working sample.
_, X, _, y = train_test_split(
    df.drop(['id', 'target'], axis=1),
    df['target'],
    test_size=0.01,
    stratify=df['target'],
    random_state=0,
)

# Convert each label to a zero-based integer taken from its last character
# (assumes labels end in a single digit -- TODO confirm against the data).
y = y.reset_index(drop=True)
y = y.apply(lambda label: int(label[-1]) - 1)

In [None]:

# Results
# One row per (n_clusters, algorithm, n_features, M, criterion) combination.
results = []

N = len(X)
for n_clusters in [4, 8]:
    for km in [KMeans(n_clusters=n_clusters, random_state=0),
                   MiniBatchKMeans(n_clusters=n_clusters, random_state=0)]:

        # Reference partition: this estimator fitted on ALL features (not the
        # class labels in ``y``). It depends only on the estimator, which uses
        # a fixed random_state, so it is computed once per estimator instead
        # of once per (n_features, M) pair.
        ground_truth_labels = km.fit_predict(X)

        for n_features in [60, 80]:
            for M in [10, 40]:

                # Re-seed so every configuration draws the same feature subsets.
                np.random.seed(0)
                all_partitions = []

                for i in range(M):
                    feature_indices = np.random.choice(X.shape[1], size=n_features, replace=False)
                    all_partitions.append(km.fit_predict(X.iloc[:, feature_indices]))

                # Mean agreement of the individual ensemble members with the
                # reference; independent of the consensus criterion.
                mean_ari = np.mean([adjusted_rand_score(ground_truth_labels, partition)
                                    for partition in all_partitions])

                for criterion in ['sum', 'av']:
                    pred_labels = agsu(N, all_partitions, criterion)

                    cons_ari = adjusted_rand_score(ground_truth_labels, pred_labels)
                    delta = cons_ari - mean_ari

                    results.append([n_clusters, km, n_features, M, criterion, cons_ari, mean_ari, delta, len(np.unique(pred_labels))])


columns = ['Number of clusters in initial partitioning', 'Algorithm', 'Number of features', 'Number of partitionings', 'Criteria', 'ARI ', 'Average ARI of ensamble income partitiong', 'Difference', 'Number of clusters in consensus partitioning']
result_df = pd.DataFrame(results, columns=columns)
