## Finding Low Accuracy Cluster

In [1]:
# LIBRARIES
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# PARAMETERS
# CSV read by the data-loading cell; it must contain the columns
# 'y_true', 'y_pred_score' and 'y_pred' next to the feature columns.
input_filepath = 'prepared_data/german.csv'
# CSV written by the last cell (input rows plus the 'clust_mask' column).
output_filepath = 'clustered_data/german.csv'

# Trial count handed to the clustering functions as their trial_cnt argument.
trial_cnt = 10

In [3]:
# GET DATA
df = pd.read_csv(input_filepath)

# Label / prediction columns; every other column is treated as a feature.
y_cols = ['y_true','y_pred_score','y_pred']
X_df =  df.loc[:, ~df.columns.isin(y_cols)]

# Standardise the feature matrix before it is passed to the clustering
# functions as X_scaled.
X = X_df.to_numpy()
X_scaled = StandardScaler().fit_transform(X)

In [4]:
# Goal: find the cluster of samples on which the model's predictions are
# least accurate.

    
# Create several different clusters, then check the accuracy of all of them
def basicClustering(x,y_true,y_pred,k,trial_cnt):
    """Run k-means several times and keep the trial with the worst cluster.

    Each trial fits ``KMeans(k)`` on ``x``, computes the prediction accuracy
    inside every cluster, and prints the accuracy and size of that trial's
    least accurate cluster. The trial whose least accurate cluster has the
    lowest accuracy overall is returned.

    Args:
        x: 2-D feature array to cluster (one row per sample).
        y_true: true labels, one per row of ``x``.
        y_pred: predicted labels, one per row of ``x``.
        k: number of clusters requested from k-means.
        trial_cnt: number of independent k-means runs.

    Returns:
        Tuple ``(result_accuracy, result_clusters)``: the list of ``k``
        per-cluster accuracies of the best trial and its fitted ``KMeans``
        object. If ``trial_cnt`` is 0 the placeholders ``([1]*k, 1)`` are
        returned.
    """
    # Work on plain arrays so the boolean cluster masks select labels by row
    # position, whatever index a pandas input may carry.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    result_accuracy = [1] * k
    result_clusters = 1
    have_result = False

    for t in range(trial_cnt):
        # A label that k-means leaves without samples keeps accuracy 1 and
        # size 0, so it can never win as the least accurate cluster and
        # min() never has to compare against None.
        accuracy = [1] * k
        size = [0] * k

        # Cluster the data that was passed in, not a module-level global.
        clusters = KMeans(k).fit(x)

        # calculate the accuracy for each of the clusters
        for i in np.unique(clusters.labels_):
            mask = clusters.labels_ == i
            accuracy[i] = accuracy_score(y_true[mask], y_pred[mask])
            size[i] = int(np.count_nonzero(mask))

        trial_min = min(accuracy)
        k_min = accuracy.index(trial_min)

        # Always keep the first trial so a real KMeans object is returned even
        # when every cluster is perfectly accurate; afterwards keep a trial
        # only if its worst cluster beats the best one seen so far.
        if not have_result or trial_min < min(result_accuracy):
            result_accuracy = accuracy
            result_clusters = clusters
            have_result = True

        print('trial '+str(t)+': '+str(accuracy[k_min])+' ('+str(size[k_min])+')')
    return (result_accuracy, result_clusters)


In [5]:
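# Sub-clustering: re-cluster the least accurate cluster recursively to isolate
# a smaller group of samples with even lower accuracy.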

def getLeastAccurateCluster(x,y_true,y_pred,k,n_min):
    """Cluster ``x`` once and return the cluster with the lowest accuracy.

    Clusters with fewer than ``n_min`` samples are given accuracy 1, so they
    are only selected when no cluster scores below 1.

    Args:
        x: 2-D feature array to cluster (one row per sample).
        y_true: true labels, one per row of ``x``.
        y_pred: predicted labels, one per row of ``x``.
        k: number of clusters requested from k-means.
        n_min: minimum cluster size for its accuracy to be evaluated.

    Returns:
        Tuple ``(mask, accuracy, center)``: boolean array marking the rows of
        the selected cluster, that cluster's accuracy, and its k-means centre.
    """
    # Plain arrays so boolean masks select labels by row position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    clusters = KMeans(k).fit(x)
    labels = clusters.labels_

    # Default of 1 covers both clusters that are too small and labels that
    # k-means left without any sample (which would otherwise stay None and
    # make min() raise).
    accuracy = [1] * k
    present = [int(i) for i in np.unique(labels)]
    for i in present:
        mask = labels == i
        if np.count_nonzero(mask) >= n_min:
            accuracy[i] = accuracy_score(y_true[mask], y_pred[mask])

    # Choose only among labels that own at least one sample; ties go to the
    # lowest label, matching list.index(min(...)).
    k_min = min(present, key=lambda i: accuracy[i])
    mask = labels == k_min
    center = clusters.cluster_centers_[k_min,:]

    return (mask, accuracy[k_min], center)


def getLeastAccurateCluster_Recursive(x,y_true,y_pred,k,n_min):
    """Recursively narrow down to the least accurate sub-cluster of ``x``.

    The least accurate cluster of ``x`` is found with
    ``getLeastAccurateCluster``. While that cluster has more than ``n_min``
    samples it is clustered again, and the sub-cluster replaces it whenever
    the sub-cluster's accuracy is strictly lower.

    Args:
        x: 2-D feature array (one row per sample).
        y_true: true labels, one per row of ``x``.
        y_pred: predicted labels, one per row of ``x``.
        k: number of clusters requested from k-means at every level.
        n_min: minimum cluster size; a cluster is split further only while it
            holds more than ``n_min`` samples.

    Returns:
        Tuple ``(mask, accuracy, center)``: boolean array over the rows of
        ``x`` marking the selected samples, their accuracy, and the k-means
        centre of the selected (sub-)cluster.
    """
    n = x.shape[0]
    # Plain arrays so boolean masks select labels by row position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    first_mask,first_accuracy,first_center = getLeastAccurateCluster(x,y_true,y_pred,k,n_min)

    return_mask = first_mask
    return_accuracy = first_accuracy
    return_center = first_center

    selected = int(np.count_nonzero(first_mask))
    # Recurse only when the cluster can still be split:
    #  - more than n_min samples (original stopping rule),
    #  - strictly fewer than n, otherwise the same data would be clustered
    #    again and the recursion would never end,
    #  - at least k samples, because k-means cannot fit k clusters to fewer.
    if n_min < selected < n and selected >= k:
        # Positions in x of the rows handed to the next level.
        sub_i = np.flatnonzero(first_mask)

        sub_mask,sub_accuracy,sub_center = getLeastAccurateCluster_Recursive(
            x[first_mask,:], y_true[first_mask], y_pred[first_mask], k, n_min)

        if sub_accuracy < return_accuracy:
            # Translate the sub-level mask back to row positions of x.
            return_mask = np.full(n,False)
            return_mask[sub_i[sub_mask]] = True
            return_accuracy = sub_accuracy
            return_center = sub_center

    return (return_mask,return_accuracy,return_center)


def subClustering(x,y_true,y_pred,k,n_min,trial_cnt):
    """Repeat the recursive search and keep the least accurate group found.

    Each trial calls ``getLeastAccurateCluster_Recursive`` and prints the
    accuracy and size of the group it found; the group with the lowest
    accuracy over all trials is returned.

    Args:
        x: 2-D feature array (one row per sample).
        y_true: true labels, one per row of ``x``.
        y_pred: predicted labels, one per row of ``x``.
        k: number of clusters requested from k-means at every level.
        n_min: minimum cluster size used by the recursive search.
        trial_cnt: number of independent trials to run.

    Returns:
        Tuple ``(result_accuracy, result_center, result_mask)``: accuracy,
        k-means centre and boolean row mask of the best group. If
        ``trial_cnt`` is 0 the placeholders ``(1, 1, [])`` are returned.
    """
    result_accuracy = 1
    result_center = 1
    result_mask = []
    have_result = False

    for t in range(trial_cnt):
        mask,accuracy,center = getLeastAccurateCluster_Recursive(x,y_true,y_pred,k,n_min)

        # Always keep the first trial so a full-length mask is returned even
        # when no group scores below 1; afterwards keep only strict
        # improvements.
        if not have_result or accuracy < result_accuracy:
            result_accuracy = accuracy
            result_center = center
            result_mask = mask
            have_result = True

        print('trial '+str(t)+': '+str(accuracy)+' ('+str(int(np.count_nonzero(mask)))+')')
    return (result_accuracy,result_center,result_mask)




In [6]:
# Number of k-means clusters and trial count passed to basicClustering.
k = 5
trial_cnt = 10

# True labels and model predictions, one entry per row of df.
y_true = df['y_true']
y_pred = df['y_pred']


# Per-cluster accuracies and fitted KMeans object returned by basicClustering.
clust_accuracy, clusters = basicClustering(X_scaled,y_true,y_pred,k,trial_cnt)

# Index of the least accurate cluster within the returned accuracies.
k_min = clust_accuracy.index(min(clust_accuracy))
# clusters.cluster_centers_[k_min,:]
# k_min

trial 0: 0.6470588235294118 (51)
trial 1: 0.6363636363636364 (11)
trial 2: 0.660377358490566 (53)
trial 3: 0.6727272727272727 (55)
trial 4: 0.660377358490566 (53)
trial 5: 0.6834532374100719 (139)
trial 6: 0.7227722772277227 (101)
trial 7: 0.728 (125)
trial 8: 0.6851851851851852 (108)
trial 9: 0.6896551724137931 (87)


In [7]:
# Number of k-means clusters, trial count and minimum cluster size passed to
# subClustering.
k = 5
trial_cnt = 10
n_min = 10

# True labels and model predictions, one entry per row of df.
y_true = df['y_true']
y_pred = df['y_pred']


# Accuracy, k-means centre and boolean row mask of the least accurate group.
accuracy,center,mask = subClustering(X_scaled,y_true,y_pred,k,n_min,trial_cnt)

# accuracy is a single number here, so the mean is just that value; the
# expression only displays it as the cell output.
np.mean(accuracy)


trial 0: 0.5 (12)
trial 1: 0.45454545454545453 (11)
trial 2: 0.4375 (16)
trial 3: 0.5 (12)
trial 4: 0.46153846153846156 (13)
trial 5: 0.5 (12)
trial 6: 0.4 (10)
trial 7: 0.46153846153846156 (13)
trial 8: 0.5 (10)
trial 9: 0.5 (12)


0.4

In [8]:
# Flag the rows that belong to the least accurate group found by subClustering.
df['clust_mask']=mask

# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column - confirm that downstream readers expect it.
df.to_csv(output_filepath)