In [43]:
import numpy as np
import pandas as pd
import scipy.stats as stats

import pickle
from pathlib import Path

In [44]:
# Name of the graph metric to analyse. It selects the input pickle
# (DIR / "<metricName>.pkl") and prefixes every "<metricName>_ANOVA.txt"
# report written by the cells below.
metricName = "betweenCentrality" # CHANGE THIS

In [45]:
# Input folder holding the pickled graph-metric results and output root for
# the ANOVA reports. Both are resolved against the current working directory
# (the input was previously the relative string "./data/consensusResults2").
DIR = Path.cwd().joinpath("data", "consensusResults2")
OUTPUT_DIR = Path.cwd().joinpath("data", "anova")

In [46]:
# Read the pickled results for the selected metric into `graphMetric`.
# pickle can execute arbitrary code while loading, so only point this at
# files produced by a trusted run of this project.
with (DIR / f"{metricName}.pkl").open("rb") as pickle_file:
    graphMetric = pickle.load(pickle_file)

In [47]:
### Normal ANOVA Testing ###
#
# For every cluster index, run a one-way ANOVA (scipy.stats.f_oneway) whose
# groups are that cluster's metric values at each analysed iteration count,
# and write the F statistic and p-value to a text report.

# NOTE: iter10 has been ignored because of the low number of clusters (4)
iteration_counts = (25, 50, 75, 100, 150, 200, 250, 300, 350, 400, 450, 500)

# One entry per iteration count. Each entry holds that run's clusters, indexed
# from 0, and each cluster exposes its metric values through .values().
list_of_iters = [graphMetric[n_iter] for n_iter in iteration_counts]

# Only cluster indices that exist in every iteration can be compared, so the
# smallest cluster count bounds the loop below.
max_num_of_clusters = min(len(iteration) for iteration in list_of_iters)

# Print max number of clusters
print(f"Max number of clusters to iterate through: {max_num_of_clusters}")

# Create the report folder first so a fresh checkout does not fail in open().
initial_report_path = OUTPUT_DIR / "initial" / f"{metricName}_ANOVA.txt"
initial_report_path.parent.mkdir(parents=True, exist_ok=True)

# Save statistics to file
with open(initial_report_path, "w") as f:
    for i in range(max_num_of_clusters):
        f.write(f"Cluster: {i+1}\n")
        # Cast each cluster's metric values to a list: one ANOVA group per iteration count.
        list_of_clusters = [list(iteration[i].values()) for iteration in list_of_iters]

        statistic, pvalue = stats.f_oneway(*list_of_clusters)
        f.write(f"Statistic: {statistic}; P-Value: {pvalue}\n")
        f.write("\n")

print(metricName + " ANOVA testing complete.")

Max number of clusters to iterate through: 5
betweenCentrality ANOVA testing complete.


In [48]:
### Sliding Window ANOVA Testing ###
#
# Same per-cluster one-way ANOVA as the "normal" test, but run separately on
# every window of `sliding_window_size` consecutive iteration counts.

# NOTE: iter10 has been ignored because of the low number of clusters (4)
iteration_counts = (25, 50, 75, 100, 150, 200, 250, 300, 350, 400, 450, 500)

# One entry per iteration count. Each entry holds that run's clusters, indexed
# from 0, and each cluster exposes its metric values through .values().
list_of_iters = [graphMetric[n_iter] for n_iter in iteration_counts]
list_of_iter_tags = [str(n_iter) for n_iter in iteration_counts]

# Only cluster indices that exist in every iteration can be compared, so the
# smallest cluster count bounds the loop below.
max_num_of_clusters = min(len(iteration) for iteration in list_of_iters)

# Print max number of clusters
print(f"Max number of clusters to iterate through: {max_num_of_clusters}")

# Number of consecutive iteration counts compared in one ANOVA.
# NOTE(review): the previous labels named the iteration one past the end of
# the window; if windows were meant to span size + 1 iterations, bump this.
sliding_window_size = 3
assert sliding_window_size >= 2, "an ANOVA needs at least two groups per window"

# Create the report folder first so a fresh checkout does not fail in open().
sliding_report_path = OUTPUT_DIR / "sliding" / f"{metricName}_ANOVA.txt"
sliding_report_path.parent.mkdir(parents=True, exist_ok=True)

# Save statistics to file
with open(sliding_report_path, "w") as f:
    for i in range(max_num_of_clusters):
        f.write(f"[CLUSTER {i+1}]\n\n")
        # Cast each cluster's metric values to a list: one ANOVA group per iteration count.
        list_of_clusters = [list(iteration[i].values()) for iteration in list_of_iters]

        # "+ 1" so the last full window (ending at the final iteration) is included.
        for j in range(len(list_of_clusters) - sliding_window_size + 1):
            window_of_clusters = list_of_clusters[j:j+sliding_window_size]
            window_tags = list_of_iter_tags[j:j+sliding_window_size]
            # Label with the first and last iteration actually inside the window.
            f.write(f"Cluster {i+1} from {window_tags[0]} to {window_tags[-1]}\n")

            # Test only the groups in this window, not the full iteration range.
            statistic, pvalue = stats.f_oneway(*window_of_clusters)
            f.write(f"Statistic: {statistic}; P-Value: {pvalue}\n")
            f.write("\n")

print(metricName + " ANOVA testing complete.")

Max number of clusters to iterate through: 5
betweenCentrality ANOVA testing complete.


In [49]:
# How the per-cluster sizes quoted in the notes below were obtained
# (needs consensusResults.npy loaded as `consensusResults`):
# labels_unique, counts = np.unique(consensusResults[i][1], return_counts=True)
# print(labels_unique, counts)

# Steps:
# - Start at iter=75
# - For each sliding window of size=3, find which iterations need swapping
#   (check the commented code above)
#     - to get those counts, you need to load in consensusResults.npy
# - Swap clusters 4&5 first
# - Swap clusters 3&4 next
# - Do ANOVA on this sliding window
#
# Goal is to transform clusters that look like this (300.out):
# [1 2 3 4 5] [1705 1286  444 1078 1083]
#
# into this (we consider this "normal") (350.out):
# [1 2 3 4 5] [1708 1287 1260  265 1076]

### Sliding Window ANOVA Testing 2 ###
#
# Sliding-window ANOVA on cluster labels that have first been re-ordered for
# selected iterations, so that the same index is compared across iterations.

# NOTE: iter10, iter25 and iter50 have been ignored
iteration_counts = (75, 100, 150, 200, 250, 300, 350, 400, 450, 500)
list_of_iter_tags = [str(n_iter) for n_iter in iteration_counts]

# Iterations whose cluster labels are re-ordered before testing.
# NOTE(review): the notes above call 350.out the "normal" layout and show
# 300.out as the one needing a swap, yet 350 is relabelled here and 300 is
# not — confirm this list against the cluster counts.
iterations_to_relabel = (100, 350, 400, 450)

# 0-based position pairs, applied in order: swap clusters 4 & 5, then 3 & 4.
cluster_swaps = [(3, 4), (2, 3)]

list_of_iters = []
for n_iter in iteration_counts:
    clusters = graphMetric[n_iter]
    if n_iter in iterations_to_relabel:
        # Swap on a copy: mutating graphMetric in place would apply the swaps
        # again on every re-run of this cell and leak into the other cells.
        # Assumes clusters are addressable by position 0..len-1 — TODO confirm.
        clusters = [clusters[k] for k in range(len(clusters))]
        for first, second in cluster_swaps:
            clusters[first], clusters[second] = clusters[second], clusters[first]
    list_of_iters.append(clusters)

# Only cluster indices that exist in every iteration can be compared, so the
# smallest cluster count bounds the loop below.
max_num_of_clusters = min(len(iteration) for iteration in list_of_iters)

# Print max number of clusters
print(f"Max number of clusters to iterate through: {max_num_of_clusters}")

# Number of consecutive iteration counts compared in one ANOVA.
# NOTE(review): the steps above mention windows of size 3 while this is 2,
# and the previous labels named the iteration one past the end of the window;
# confirm which window length is intended.
sliding_window_size = 2
assert sliding_window_size >= 2, "an ANOVA needs at least two groups per window"

# Create the report folder first so a fresh checkout does not fail in open().
sliding2_report_path = OUTPUT_DIR / "sliding2" / f"{metricName}_ANOVA.txt"
sliding2_report_path.parent.mkdir(parents=True, exist_ok=True)

# Save statistics to file
with open(sliding2_report_path, "w") as f:
    for i in range(max_num_of_clusters):
        f.write(f"[CLUSTER {i+1}]\n\n")
        # Cast each cluster's metric values to a list: one ANOVA group per iteration count.
        list_of_clusters = [list(iteration[i].values()) for iteration in list_of_iters]

        # "+ 1" so the last full window (ending at the final iteration) is included.
        for j in range(len(list_of_clusters) - sliding_window_size + 1):
            window_of_clusters = list_of_clusters[j:j+sliding_window_size]
            window_tags = list_of_iter_tags[j:j+sliding_window_size]
            # Label with the first and last iteration actually inside the window.
            f.write(f"Cluster {i+1} from {window_tags[0]} to {window_tags[-1]}\n")

            # Test only the groups in this window, not the full iteration range.
            statistic, pvalue = stats.f_oneway(*window_of_clusters)
            f.write(f"Statistic: {statistic}; P-Value: {pvalue}\n")
            f.write("\n")

print(metricName + " ANOVA testing complete.")

Max number of clusters to iterate through: 5
betweenCentrality ANOVA testing complete.
