In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api
from pathlib import Path
import json
import scipy

import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import plot_ceres_residuals

In [None]:
# Notebook-wide matplotlib defaults: Inter typeface, 12 pt text, and tight
# bounding boxes whenever a figure is saved.
matplotlib.rcParams.update({
    'font.family': 'Inter',
    'font.size': 12,
    'savefig.bbox': 'tight',
})

In [None]:
def get_partial_eta_squared(lm):
    """Return the partial eta squared of every term in a fitted regression.

    For each term the effect size is ``SS_effect / (SS_effect + SS_residual)``,
    with the sums of squares taken from ``anova_lm`` run with its default
    settings on ``lm``.

    Parameters
    ----------
    lm : statsmodels RegressionResultsWrapper
        A fitted linear model, e.g. the result of ``smf.ols(...).fit()``.

    Returns
    -------
    pandas.Series
        One value per model term (the residual row is excluded), named
        ``'partial_eta_squared'``.

    Raises
    ------
    TypeError
        If ``lm`` is not a regression results wrapper.
    """
    if not isinstance(lm, statsmodels.regression.linear_model.RegressionResultsWrapper):
        raise TypeError('Invalid argument, need regression model')

    aov = statsmodels.api.stats.anova_lm(lm)

    # The last row of the ANOVA table is the residual; every row before it is
    # a model term.
    ss_effect = aov["sum_sq"].iloc[:-1]
    ss_residual = aov["sum_sq"].iloc[-1]

    # Dividing by the grand total would give plain eta squared; the partial
    # variant only relates each effect to itself plus the error term.
    partial_eta_squared = ss_effect / (ss_effect + ss_residual)
    partial_eta_squared.name = 'partial_eta_squared'

    return partial_eta_squared

In [None]:
# Load the clustering pairs, drop rows with missing values, and mark every
# experiment version other than 1 as online.
df = (
    pd.read_json("data/clustering-pairs.json")
    .dropna()
    .assign(online=lambda pairs: pairs['experiment_version'] != 1)
)
df.head()

In [None]:
# Median of the duration_stim_1 column across all rows.
first_stim_durations = df['duration_stim_1']
first_stim_durations.median()

In [None]:
# Overall distribution of the Fowlkes-Mallows index.
fmi_values = df["fowlkes_mallows_index"]
plt.hist(fmi_values)

In [None]:
# Mean, median and standard deviation of the Fowlkes-Mallows index, in that order.
fmi = df["fowlkes_mallows_index"]
for statistic in (fmi.mean(), fmi.median(), fmi.std()):
    print(statistic)

In [None]:
# One Fowlkes-Mallows histogram per (group, number_of_points) combination:
# groups down the rows, point counts across the columns.
g = sns.FacetGrid(data=df, row="group", col="number_of_points")
g.map(sns.histplot, "fowlkes_mallows_index")

In [None]:


# Median Fowlkes-Mallows index per (number_of_points, group), drawn as one line
# per group and saved to disk.
tmp = (
    df.groupby(['number_of_points', 'group'])
    .agg({'fowlkes_mallows_index': 'median'})
    .reset_index()
)
fig, ax = plt.subplots(figsize=(6, 3.5))

# Raw group value -> legend label.
group_labels = {"clustered": "Clustered", "disperse": "Dispersed"}
for group, label in group_labels.items():
    filtered = tmp[tmp["group"] == group].sort_values("number_of_points")
    ax.plot(filtered["number_of_points"], filtered["fowlkes_mallows_index"], label=label)

ax.set_ylim(0.5, 1)
fig.legend(bbox_to_anchor=(0.95, 0.93))
fig.tight_layout()
ax.set_xlabel("Number of Points")
ax.set_ylabel("Median Fowlkes Mallows index")
plt.savefig('internal_clustering_reliability.png', dpi=600)


In [None]:
# Bar chart of the Fowlkes-Mallows index by `flipped`, split by group.
sns.catplot(
    data=df,
    x="flipped",
    y="fowlkes_mallows_index",
    hue="group",
    kind="bar",
)


In [None]:
# How much does the number of clusters differ between stim_1 and stim_2 of
# each pair?
df["number_of_clusters_diff"] = df["number_of_clusters_stim_1"] - df["number_of_clusters_stim_2"]
df["abs_number_of_clusters_diff"] = df["number_of_clusters_diff"].abs()

# Display names for the two groups, stored as an ordered categorical so the
# facets appear as Clustered, then Dispersed.
structure_labels = {'disperse': 'Dispersed', 'clustered': "Clustered"}
df['cluster_structure'] = pd.Categorical(
    df['group'].map(structure_labels),
    categories=['Clustered', 'Dispersed'],
    ordered=True,
)
tmp = df["number_of_clusters_diff"]

# One histogram (bin width 1) of the absolute difference per cluster structure.
g = sns.FacetGrid(data=df, col='cluster_structure', hue='cluster_structure')
g.set_titles(col_template='{col_name}')
g.map(sns.histplot, 'abs_number_of_clusters_diff', binwidth=1)

# Replace the per-facet x labels with a single shared label for the figure.
fig = g.figure
fig.supxlabel("Absolute difference in number of clusters")
g.set_xlabels("")
fig.tight_layout()

fig.savefig('number-of-clusters-difference-histogram.png', dpi=600)


In [None]:

# Absolute difference in number of clusters as a function of number_of_points.
sns.barplot(
    x=df['number_of_points'],
    y=df['number_of_clusters_diff'].abs(),
)

In [None]:
# Mean signed cluster-count difference per participant, point count and group.
tmp = (
    df.groupby(['participant_id', 'number_of_points', 'group'])
    .agg({'number_of_clusters_diff': 'mean'})
    .reset_index()
)

# NOTE(review): abs() in the formula is applied to the already-averaged signed
# difference, so opposite-signed trials cancel before the absolute value is
# taken. Confirm this is intended rather than averaging
# abs_number_of_clusters_diff.
cluster_diff_model = smf.ols(
    "abs(number_of_clusters_diff) ~ 1 + number_of_points + group",
    data=tmp,
)
cluster_diff_model.fit().summary()

In [None]:

tmpcol = df["number_of_clusters_diff"]

# Share of rows where both stimuli received exactly the same number of clusters.
print((tmpcol == 0).sum() / len(df))

# Share of rows within plus or minus one cluster of each other
# (62% according to the original note).
print(tmpcol.between(-1, 1).sum() / len(df))

In [None]:
# Load the cross-participant clustering comparisons and display the frame.
cross_participants_path = Path("data") / "clustering-cross-participants.csv"
cdf = pd.read_csv(cross_participants_path)
cdf

In [None]:
# Median Fowlkes-Mallows index per participant_id_1, lowest first.
median_by_participant = cdf.groupby('participant_id_1').agg({'fowlkes_mallows_index': 'median'})
median_by_participant.sort_values('fowlkes_mallows_index')

In [None]:
# Mean of the per-participant median Fowlkes-Mallows indices.
sorted_participant_medians = (
    cdf.groupby('participant_id_1')
    .agg({'fowlkes_mallows_index': 'median'})
    .sort_values('fowlkes_mallows_index')
)
sorted_participant_medians['fowlkes_mallows_index'].mean()

In [None]:
# Mean cross-participant Fowlkes-Mallows index per cluster structure.
tmp = (
    cdf.groupby(['cluster_structure'])
    .agg({'fowlkes_mallows_index': 'mean'})
    .reset_index()
)
tmp

In [None]:
# Median cross-participant Fowlkes-Mallows index per cluster structure and
# number of points, drawn as one line per structure and saved to disk.
tmp = (
    cdf.groupby(['cluster_structure', 'number_of_points'])
    .agg({'fowlkes_mallows_index': 'median'})
    .reset_index()
)

fig, ax = plt.subplots(figsize=(6, 3.5))

for structure, label in (("clustered", "Clustered"), ("disperse", "Dispersed")):
    subset = tmp[tmp["cluster_structure"] == structure]
    ax.plot(subset['number_of_points'], subset['fowlkes_mallows_index'], label=label)

ax.set_ylim(0.5, 1)
ax.set_xlabel('Number of Points')
ax.set_ylabel('Median Fowlkes Mallows Index')

fig.legend(loc='upper right', bbox_to_anchor=(0.95, 0.93))
fig.tight_layout()

plt.savefig('external_clustering_reliability.png', dpi=600)

In [None]:
# Distribution of the cross-participant Fowlkes-Mallows index.
cross_fmi = cdf['fowlkes_mallows_index']
cross_fmi.hist()

## Publication statistical analyses

In [None]:
# Median Fowlkes-Mallows index per participant, point count, group and flip
# state; the regression below tests the effect of `flipped` on those medians.
tmp = (
    df.groupby(['participant_id', 'number_of_points', 'group', 'flipped'])
    .agg({'fowlkes_mallows_index': 'median'})
    .reset_index()
)
# smf.ols("fowlkes_mallows_index ~ 1 + number_of_points + group", data=tmp).fit().summary()
model = smf.ols("fowlkes_mallows_index ~ 1 + flipped", data=tmp).fit()
print(model.summary())

# Descriptive means are taken from the row-level frame `df`, not from the
# aggregated medians the model was fitted on.
print(df.groupby('flipped').agg({'fowlkes_mallows_index': 'mean'}))
print(get_partial_eta_squared(model))


In [None]:
# Mean Fowlkes-Mallows index per group.
df.groupby('group').agg({'fowlkes_mallows_index': 'mean'})

In [None]:
# Flatten every JSON file in the normalized trials directory into one list;
# each file is expected to decode to an iterable of trial records.
trials = []
for trial_file in Path("data/normalized_clustering_trials/").glob("*.json"):
    trials.extend(json.loads(trial_file.read_bytes()))

In [None]:
def get_points(trial):
    """Return all points of ``trial`` as one flat list, cluster after cluster."""
    points = []
    for cluster in trial['clusters']:
        points.extend(cluster['points'])
    return points

def point_distance(x, y):
    """Return the Euclidean distance between two points given as ``{'x', 'y'}`` mappings."""
    delta_x = x['x'] - y['x']
    delta_y = x['y'] - y['y']
    return (delta_x ** 2 + delta_y ** 2) ** 0.5

def trial_cluster_sets(clusters):
    """Return one set of ``(x, y)`` tuples per cluster, in cluster order."""
    return [
        {(point['x'], point['y']) for point in cluster['points']}
        for cluster in clusters
    ]

def find_cluster_index(css, point):
    """Return the index of the first set in ``css`` holding ``point``'s ``(x, y)``, or ``None``."""
    matches = (
        position
        for position, members in enumerate(css)
        if (point['x'], point['y']) in members
    )
    return next(matches, None)

def is_same_cluster(css, p1, p2):
    """Return whether ``p1`` and ``p2`` resolve to the same cluster index in ``css``.

    Two points that are both absent from every set also compare equal, because
    both lookups then yield ``None``.
    """
    return find_cluster_index(css, p1) == find_cluster_index(css, p2)

# For every trial, record each ordered pair of distinct points: their distance,
# whether both points sit in the same cluster, and the trial's identifiers.
items = []

for trial in trials:
    ps = get_points(trial)
    css = trial_cluster_sets(trial['clusters'])

    for p1 in ps:
        for p2 in ps:
            if p1['x'] == p2['x'] and p1['y'] == p2['y']:
                # A point paired with itself (or with a coordinate duplicate).
                continue
            items.append((
                point_distance(p1, p2),
                is_same_cluster(css, p1, p2),
                trial['group'],
                trial['participant_id'],
                trial['base_uuid'],
            ))

# Note: this rebinds `df`, replacing the clustering-pairs frame loaded earlier.
df = pd.DataFrame(
    items,
    columns=('distance', 'same', 'cluster_structure', 'participant_id', 'base_uuid'),
).sort_values('distance')
df.head()
        

In [None]:

# Share of point pairs placed in the same cluster as a function of distance
# (100 equal-width distance bins), one curve per cluster structure.
fig, ax = plt.subplots()

for structure, label in (('disperse', 'Dispersed stimuli'), ('clustered', 'Clustered stimuli')):
    tmp1 = df[df['cluster_structure'] == structure]
    tmp = (
        tmp1.groupby(pd.cut(tmp1['distance'], 100))
        .agg(same=('same', 'mean'), x_val=('distance', 'mean'))
        .dropna()
    )
    ax.plot(tmp['x_val'], tmp['same'], label=label)

ax.legend()



In [None]:

# Aggregate into 700 distance bins per cluster structure, then regress the
# same-cluster share on mean distance, structure and their interaction.
distance_bins = pd.cut(df['distance'], 700)
tmp = (
    df.groupby([distance_bins, 'cluster_structure'])
    .agg(same=('same', 'mean'), x_val=('distance', 'mean'))
    .dropna()
    .reset_index()
)

tmp['cluster_structure'] = pd.Categorical(tmp['cluster_structure'])

distance_model = smf.ols('same ~ 1 + x_val * cluster_structure', data=tmp)
distance_model.fit().summary()


In [None]:
# Same aggregation as a 700-bin distance histogram, additionally split by
# cluster structure and participant.
distance_bins = pd.cut(df['distance'], 700)
tmp = (
    df.groupby([distance_bins, 'cluster_structure', 'participant_id'])
    .agg(same=('same', 'mean'), x_val=('distance', 'mean'))
    .dropna()
    .reset_index()
)

tmp['cluster_structure'] = pd.Categorical(tmp['cluster_structure'])
tmp

In [None]:

# Publication figure: observed share of same-cluster point pairs by distance
# for each cluster structure, overlaid with three fitted curves.
fig, ax = plt.subplots()

for structure, label in (
    ('clustered', 'Participants: Clustered stimuli'),
    ('disperse', 'Participants: Dispersed stimuli'),
):
    tmp1 = df[df['cluster_structure'] == structure]
    tmp = (
        tmp1.groupby(pd.cut(tmp1['distance'], 100))
        .agg(same=('same', 'mean'), x_val=('distance', 'mean'))
        .dropna()
    )
    ax.plot(tmp['x_val'], tmp['same'], label=label)

# Pooled 700-bin aggregate; later cells read `tmp` in this form.
tmp = df.groupby(pd.cut(df['distance'], 700)).agg(same=('same', 'mean'), x_val=('distance', 'mean')).dropna()

# The fit coefficients below are hard-coded; presumably they were copied from
# the regression cells further down - TODO confirm and keep in sync.
max_distance = np.max(tmp['x_val'])
fit_x = np.linspace(0, max_distance, len(tmp))

# Normal distribution line
ax.plot(fit_x, scipy.stats.norm.pdf(fit_x, 0, 144.34) * 332.021 + 0.05835314332011721, label='Normal fit', linestyle='dashed', alpha=0.5)

# Negative exponential
ax.plot(fit_x, np.exp(-0.00498114 * fit_x + 0.007978), label='Negative exponential fit', linestyle='dashdot', alpha=0.5)

# Power fit. It starts at 64 rather than 0 (a negative power diverges at 0).
# The same grid is used for x and y; previously y was evaluated on a grid
# starting at 65 while x started at 64, so the plotted points did not
# correspond to each other.
power_x = np.linspace(64, max_distance, len(tmp))
ax.plot(power_x, (power_x ** -1.3093910596756446) * np.exp(5.484323759255021), label='Power fit', linestyle='dotted', alpha=0.5)

ax.set_xlabel('Distance between points')
ax.set_ylabel('P(points in same cluster)')

fig.legend(bbox_to_anchor=(0.95, 0.95))

fig.tight_layout()

fig.savefig('distance_between_points_analysis.png', dpi=600)



In [None]:
# Scan candidate standard deviations (140 to 150 in steps of 0.01): for each,
# regress the observed same-cluster share on a zero-centred normal density
# evaluated at the bin distances, and keep the fit with the highest R^2.
tmp1 = tmp.sort_values('x_val').dropna()
candidate_sds = np.arange(140, 150, 0.01)

linresults = [
    scipy.stats.linregress(scipy.stats.norm.pdf(tmp1['x_val'], 0, sd), tmp1['same'])
    for sd in candidate_sds
]

# Columns: sd, R^2, slope, intercept, p-value.
tmp_tests = np.array([
    [sd, fit.rvalue ** 2, fit.slope, fit.intercept, fit.pvalue]
    for sd, fit in zip(candidate_sds, linresults)
])

best_index = tmp_tests[:, 1].argsort()[-1]
best_row = tmp_tests[best_index]

linresults[best_index]

In [None]:
# Linear regression of -log(same) on distance (negative exponential model).
tmp1 = tmp.sort_values('x_val').dropna()
negative_log_same = -1 * np.log(tmp1['same'])
scipy.stats.linregress(tmp1['x_val'], negative_log_same)

In [None]:
# Linear regression of log(same) on log(distance) (power-law model).
tmp1 = tmp.sort_values('x_val').dropna()
log_distance = np.log(tmp1['x_val'])
log_same = np.log(tmp1['same'])
scipy.stats.linregress(log_distance, log_same)

In [None]:
def bucketed_means(inp, n_buckets):
    """Return the mean of ``inp`` within each of ``n_buckets`` equal-width bins.

    Parameters
    ----------
    inp : array-like of numbers
        Values to bucket; the bins span the range of these values.
    n_buckets : int
        Number of equal-width bins passed to ``pd.cut``.

    Returns
    -------
    pandas.Series
        Mean of the values in each bin, in ascending bin order. Bins that
        contain no values are omitted.
    """
    # Work on a plain array so the result does not depend on the index or the
    # name of a Series argument.
    values = np.asarray(inp)
    # Use the caller's bucket count and the caller's data; the earlier version
    # always cut into 100 bins and averaged the global `tmp` instead of `inp`.
    groups = pd.cut(values, n_buckets)
    tdf = (
        pd.DataFrame({"rvs": values})
        .groupby(groups, observed=False)
        .agg({'rvs': 'mean'})
        .reset_index()
        .dropna()
    )
    return tdf['rvs']

# Kernel density estimate of a standard half-normal sample, shown on [0, 4].
fig, ax = plt.subplots()

# Fixed seed so the sample, and therefore the figure, is identical on every run.
halfnorm_sample = scipy.stats.halfnorm.rvs(size=100000, random_state=0)
sns.kdeplot(halfnorm_sample, bw_adjust=0.1, ax=ax)

ax.set_xlim(0, 4)




In [None]:
# Pooled same-cluster share over 700 distance bins, with distance divided by
# 200 and the share multiplied by 0.8.
# NOTE(review): the scale factors look chosen to overlay this curve on the
# half-normal density above - confirm.
tmp = df.groupby(pd.cut(df['distance'], 700)).agg(same=('same', 'mean'), x_val=('distance', 'mean'))
scaled_distance = tmp['x_val'] / 200
scaled_same = tmp['same'] * 0.8
plt.plot(scaled_distance, scaled_same)

In [None]:
def apply_func(dfs):
    """Compare how the first two trials in ``dfs`` cluster the same points.

    For every ordered pair of distinct points taken from the first trial's
    clusters, a pair counts as reliable when both trials agree on whether the
    two points belong to the same cluster.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Rows of one group; must contain at least two rows with a ``clusters``
        column. Only the first two rows are used.

    Returns
    -------
    pandas.DataFrame
        One row per point pair with columns ``distance`` and ``reliability``.
    """
    clusters1 = dfs.iloc[0]['clusters']
    clusters2 = dfs.iloc[1]['clusters']
    points = [p for c in clusters1 for p in c['points']]
    css1 = trial_cluster_sets(clusters1)
    css2 = trial_cluster_sets(clusters2)

    items = []

    # Iterate over this group's own points. The earlier version looped over the
    # global `ps`, i.e. the points of whichever trial an earlier cell's loop
    # happened to process last.
    for p1 in points:
        for p2 in points:
            if p1['x'] == p2['x'] and p1['y'] == p2['y']:
                continue
            d = point_distance(p1, p2)
            same1 = is_same_cluster(css1, p1, p2)
            same2 = is_same_cluster(css2, p1, p2)
            items.append(dict(distance=d, reliability=same1 == same2))

    return pd.DataFrame.from_records(items)
    
    

# Run apply_func on each (participant_id, base_uuid) group of trials and
# collect the per-pair results into one frame.
grouped_trials = pd.DataFrame.from_records(trials).groupby(['participant_id', 'base_uuid'])
rdf = grouped_trials.apply(apply_func).reset_index()
rdf.head()

In [None]:
# Mean reliability as a function of point distance, over 30 equal-width bins.
reliability_bins = pd.cut(rdf['distance'], 30)
tmp = rdf.groupby(reliability_bins).agg(
    x_val=('distance', 'mean'),
    reliability=('reliability', 'mean'),
)
plt.plot(tmp['x_val'], tmp['reliability'])