In [34]:
import numpy as np
from pathlib import Path
from matplotlib import pyplot as plt
from matplotlib import cm
import utility
import pandas as pd
import json
import seaborn as sns
import scipy.stats

# O(n)
def bottomk_indices(v, k):
    return np.argpartition(v, k)[:k]

def not_indices(indices, shape):
    new_indices = np.ones(shape, dtype=bool)
    new_indices[indices] = False
    return new_indices

def normalize(x):
    min = np.min(x)
    max = np.max(x)
    if min == max:
        return x
    return ((x - min) /
            (max - min))

def clamp(x):
    return np.clip(x, 0, 1)

In [2]:
# parameters
number_of_neurons_in_network = 100
bottomk_ratio = 0.1
learning_rate = 0.1
training_iterations = 14

In [3]:
class Network:
    
    def __init__(self, D):
        # D are the values of the neurons
        self.D = D
        # W are the weights of the neurons
        self.W = np.ones(D.shape[0]) / 2
        self.bottomk_number = round(bottomk_ratio * number_of_neurons_in_network)
    
    @classmethod
    def generate_initial(cls, X):
        n_dims = X.shape[1]
        ranges = []
        for col_idx in range(n_dims):
            col_values = X[:, col_idx]
            min = np.min(col_values)
            max = np.max(col_values)
            ranges.append((min, max))

        final_transposed = []
        for min, max in ranges:
            final_transposed.append(np.random.uniform(min, max, number_of_neurons_in_network))

        return cls(np.array(final_transposed).T)
    
    def plot(self, X, edge=True):
        fig, ax = plt.subplots()
        ax.scatter(X[:, 0], X[:, 1], color="red")
        facecolors = np.array([0, 1, 0]) * np.ones(len(self.W)).reshape(len(self.W), 1).astype('int')
        facecolors = np.hstack((facecolors, self.W.reshape(-1, 1)))
        ax.scatter(self.D[:, 0], self.D[:, 1], facecolor=facecolors, edgecolor='b' if edge else None)
        return ax
    
    def train_iter(self, raw_points):
                # experimental training loop
        shape = raw_points.shape

         # inhibition amount

        inhibition_ratio  = 1 - self.get_raw_activation()
        self.bottomk_number = round((bottomk_ratio * number_of_neurons_in_network) +
                                    (bottomk_ratio * number_of_neurons_in_network) *
                                    inhibition_ratio * 2)

        # print(((bottomk_ratio * number_of_neurons_in_network) ** inhibition_ratio) / (bottomk_ratio * number_of_neurons_in_network))

        # data point similarity
        for data_point in raw_points:
            difference_vectors =  self.D - data_point.reshape(1, shape[1])
            euclidean_distances = np.linalg.norm(difference_vectors, ord=2, axis=1)
            nearby_indices = bottomk_indices(euclidean_distances, self.bottomk_number)
            close_neurons_distances = euclidean_distances[nearby_indices]
            
            # hebbian part, move them closer, this is very important
            self.D[nearby_indices] = self.D[nearby_indices] - learning_rate * self.W[nearby_indices].reshape(-1, 1) * difference_vectors[nearby_indices]
            
            # reduce weights of far away points, this seems fairly important to kill neurons that are not doing anything
            weight_change_reduction_indices = not_indices(nearby_indices, self.W.shape)
            self.W[weight_change_reduction_indices] = clamp(self.W[weight_change_reduction_indices] - 0.01)

            # increase weight of the closest point
            #i = np.argmin(euclidean_distances)
            #self.W[i] = clamp(self.W[i] + 0.2)
            self.W[nearby_indices] = clamp(self.W[nearby_indices] + self.W[nearby_indices] * 0.5)
            

            
            # tan(x) based inhibition
            #normalized_neuron_distances = normalize(close_neurons_distances)
        #     weight_changes = np.tan(normalized_neuron_distances - 0.5) * self.W[nearby_indices]
        #     print(weight_changes)
        #     self.W[nearby_indices] = clamp(self.W[nearby_indices] + weight_changes)
            
        # neuron proximity based inhibition
        for idx in range(0, len(self.D)):
            neuron = self.D[idx]
            neuron_weight = self.W[idx]
            difference_vectors_from_other_neurons = self.D - neuron.reshape(1, shape[1])
            euclidean_distances = np.linalg.norm(difference_vectors_from_other_neurons, ord=2, axis=1)
            # plus one because the neuron itself is in the dataset
            near_neuron_indices = bottomk_indices(euclidean_distances, self.bottomk_number + 1)
            weight_change_neuron_indicies = near_neuron_indices[near_neuron_indices != idx]
            normalized_distances = normalize(euclidean_distances[weight_change_neuron_indicies])
            weight_changes = (1 - normalized_distances) * neuron_weight
            self.W[weight_change_neuron_indicies] = clamp(self.W[weight_change_neuron_indicies] - weight_changes)

                
          
        # pruning step
        # neurons_to_prune = []
        # for idx in range(0, len(self.D)):
        #     if self.W[idx] < 0.001:
        #         neurons_to_prune.append(idx)

        #neurons_to_keep = not_indices(neurons_to_prune, self.W.shape)
        #self.D = self.D[neurons_to_keep]
        #self.W = self.W[neurons_to_keep]
        #bottomk_number = round(len(self.D) * bottomk_ratio)
        #bottomk_number = round(bottomk_number * (1 - bottomk_ratio))
            
            # reduce weights of nearby points
            # self.W[nearby_indices] = clamp(self.W[nearby_indices] - (0.01 * inhibition_ratio), 0, 1)
            
    def get_cluster_representation(self, raw_points):
        cluster_centroid_values = self.D[self.W > 0.5]
        cluster_indicies = []
        for point in raw_points:
            euclidean_distances = np.linalg.norm(point.reshape(1, -1) - cluster_centroid_values, ord=2, axis=1)
            cluster_indicies.append(np.argmin(euclidean_distances))
        return np.hstack((raw_points, np.array(cluster_indicies).reshape(-1, 1)))

    
    def get_raw_activation(self):
        return np.sum(self.W) / self.W.shape[0]
    
    def fit(self, raw_points):
        for _ in range(training_iterations):
            self.train_iter(raw_points)
        return self.get_cluster_representation(raw_points)
    
    def __repr__(self):
        return f"<Network D={len(self.D)}>"


In [None]:
human_data_paths = list(Path("/chompsky/techno/development/work/clustering-data-cleaning/build/normalized").glob('*.json'))
records = []
for idx, human_data_path in enumerate(human_data_paths):
    for item in json.loads(human_data_path.read_bytes()):
        stimulus = item['clusters']
        del item['clusters']
        raw_data = utility.conv_mat_representation(stimulus)
        raw_points = raw_data[:, :2]
        network = Network.generate_initial(raw_points)
        cluster_rep = network.fit(raw_points)
        fm = utility.calc_fowlkes_mallows(cluster_rep, raw_data)
        item['model_fm'] = fm
        item['model_number_of_clusters'] = len(set(cluster_rep[:, 2]))
        item['model_clusters'] = cluster_rep
        records.append(item)
        print(item['participant_id'], idx + 1)


8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
9 2
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3
19 3

12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
12 15
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
3 16
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 17
7 

10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10020 26
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
10004 27
1

10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10007 34
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
10013 35
1

10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10018 42
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
10017 43
1

In [5]:
df = pd.DataFrame.from_dict(records)
df

Unnamed: 0,startDateTime,startTimestamp,endTimestamp,base_uuid,unique_uuid,group,number_of_points,std_vaccumed_z_score,flipped,block,set,experiment_version,participant_id,numberOfTries,trial_number,fm
0,1581632876274,1488138.000,1505018.000,823dc71a-6c2e-42ea-910b-4e53f7bd4a90,26891aa4-8a20-4b0a-9331-0584e8c29593,disperse,20,1.046253,False,1,1,1,8,,1,0.551553
1,1581632900434,1512298.000,1520089.000,fccbc907-4526-43e2-8ded-142b804b471a,c18abb70-6723-42d2-a3d7-d53daac372b4,clustered,10,-1.971192,False,1,1,1,8,,2,1.000000
2,1581632909849,1521714.000,1531642.000,e0b28b0d-8194-475b-98a6-6931d1ff1004,d7ecfd90-6c52-432d-b90b-849eb2692bd8,clustered,20,-1.979870,False,1,1,1,8,,3,0.834058
3,1581632920961,1532826.000,1538714.000,b3697dbc-04ee-4762-a98d-5773cf0c7e84,e2be293d-0bbf-42a5-8fe5-748c9f003aa5,clustered,10,-2.039006,False,1,1,1,8,,4,1.000000
4,1581632928488,1540354.000,1553090.000,37ecaaa0-dbb1-4b98-ad55-ce73229d5866,66045880-1a51-471d-ac44-1564eb2f9453,clustered,20,-1.964858,False,1,1,1,8,,5,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5259,1591622537952,3303212.260,3311594.870,42d361fa-4736-4e88-983e-5a08e9ba6071,76f583c2-d8cc-446d-8ef7-0d469b75f031,disperse,20,1.001153,True,4,2,3,10035,1.0,108,0.546594
5260,1591622548928,3314188.370,3324170.460,49f28baf-c341-4fb1-a75b-0ff70240de39,8f3ce56e-8e2b-4702-8fb1-0a1a409bdb65,disperse,20,0.952234,True,4,2,3,10035,1.0,109,0.666667
5261,1591622560420,3325680.550,3331786.545,6b397277-e2cf-4165-9f1a-32aeef65ebbc,68f60018-7175-401c-a4fd-1b1eacea4eba,clustered,10,-2.031332,False,4,2,3,10035,1.0,110,0.612372
5262,1591622567698,3332958.345,3341058.410,e9bc8099-ff66-4017-abd4-33cd8dbcc2d6,3cf902d3-13c6-44bb-ac24-e071d8046825,clustered,25,-2.014654,False,4,2,3,10035,1.0,111,0.680389


In [None]:
g = sns.FacetGrid(df, col="group")
g.map(sns.histplot, "model_fm")

In [27]:
model_fms = df.groupby(['participant_id', 'base_uuid']).agg({'fm': 'mean'})
model_fms = model_fms.reset_index()
model_fms

Unnamed: 0,participant_id,base_uuid,model_fm
0,1,0f18f0ad-f23d-4fc9-a761-3920c107550b,0.542184
1,1,0f9a9c80-cdab-4071-8d56-27965b585a10,0.586873
2,1,2bb1495b-1ef6-4943-bb48-662371f50b30,0.648052
3,1,2f24428f-632d-4374-b0ca-cb3d0ae04bc5,0.634442
4,1,2f452eee-c0bb-437c-bd4e-f35ee9fe9ff6,0.499383
...,...,...,...
2627,10038,f54baffc-bc26-4c4d-b940-824d08daf230,0.957905
2628,10038,fa83ff0a-5066-4c01-bdf6-909bc3dda074,0.444091
2629,10038,fbb6c70e-fe6e-4634-8737-975d829a36cc,0.480631
2630,10038,fccbc907-4526-43e2-8ded-142b804b471a,0.908248


In [28]:
human_fms = pd.read_csv('human_data_fm.csv')
human_fms = human_fms.rename({"fowlkes_mallows": "human_fm"})

Unnamed: 0,participant_id,base_uuid,fowlkes_mallows,flipped,group
0,1,0f18f0ad-f23d-4fc9-a761-3920c107550b,0.673496,True,clustered
1,1,0f9a9c80-cdab-4071-8d56-27965b585a10,0.727161,True,disperse
2,1,2bb1495b-1ef6-4943-bb48-662371f50b30,0.953463,True,clustered
3,1,2f24428f-632d-4374-b0ca-cb3d0ae04bc5,0.767561,True,disperse
4,1,2f452eee-c0bb-437c-bd4e-f35ee9fe9ff6,0.754342,False,clustered
...,...,...,...,...,...
2627,10038,f54baffc-bc26-4c4d-b940-824d08daf230,0.915811,True,clustered
2628,10038,fa83ff0a-5066-4c01-bdf6-909bc3dda074,0.853057,False,clustered
2629,10038,fbb6c70e-fe6e-4634-8737-975d829a36cc,1.000000,False,disperse
2630,10038,fccbc907-4526-43e2-8ded-142b804b471a,1.000000,True,clustered


In [31]:
merged = pd.merge(model_fms, human_fms, on=['participant_id', 'base_uuid'])

In [53]:
tmp = merged[['model_fm', 'fowlkes_mallows']].dropna().to_numpy()

In [54]:
# scipy.stats.linregress(merged['model_fm'].to_numpy(), merged['fowlkes_mallows'].to_numpy())
scipy.stats.linregress(tmp)

LinregressResult(slope=0.38374838759547, intercept=0.513529934268943, rvalue=0.3530252149460526, pvalue=4.529671316760011e-78, stderr=0.019835455000573595)