In [None]:
import sys; sys.path.append('..')

In [None]:
from fractions import Fraction
from collections import defaultdict, Counter
import pickle
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations
import math
from tqdm import tqdm
from pathlib import Path
import seaborn as sns
from util import max_size2possible_t, ds_names, ds_names_short, colors_group

In [None]:
# Shared plotting configuration for the per-dataset figures below.
plt.rcParams['font.size'] = 30
colormap = plt.get_cmap('viridis')

# Input directory (core pickles) and output directory (size statistics);
# both are relative paths, resolved against the current working directory.
p_core = Path('../data/core')
p_size = Path('../data/size')
p_size.mkdir(exist_ok=True)
# Output directory for the figures; it does not depend on the dataset, so it
# is created once instead of on every loop iteration.
p_figures = Path('figures')
p_figures.mkdir(exist_ok=True)

for ds_name in ds_names:
    p_core_ds = p_core / ds_name
    p_size_ds = p_size / ds_name
    p_size_ds.mkdir(exist_ok=True)

    # k -> [(t, n_kt), ...] where n_kt is the number of entries in the mapping
    # loaded for threshold t whose value is at least k.
    k2t_nkts = defaultdict(list)
    N = None  # size of the first mapping loaded; used as the log base below
    for c_t in tqdm(list(p_core_ds.glob('c*.pkl'))):
        # The threshold is parsed from the file name: drop the first two
        # characters and the '.pkl' suffix, then read '-' as a fraction bar.
        t = Fraction(c_t.name[2:-4].replace('-', '/'))
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        with c_t.open('rb') as f:
            v2c_t = pickle.load(f)
        if N is None:
            N = len(v2c_t)
        cnt_c_t = Counter(v2c_t.values())

        # Suffix sums over the value histogram: walking k downwards, the
        # running total is the number of entries with value >= k. This is
        # linear in the largest value instead of quadratic.
        k_max_t = max(cnt_c_t, default=0)
        n_at_least = 0
        nkts_desc = []
        for kk in range(k_max_t, 0, -1):
            n_at_least += cnt_c_t.get(kk, 0)
            nkts_desc.append(n_at_least)
        # Insert in ascending k so the dict keeps the key order 1, 2, 3, ...
        for kk, nkt in enumerate(reversed(nkts_desc), start=1):
            k2t_nkts[kk].append((t, nkt))
    print('k2t_nkts constructed!')

    for k, t_nkts in tqdm(k2t_nkts.items()):
        # Sort by t FIRST, then drop repeated (t, n_kt) pairs. Files are
        # globbed in arbitrary order, so de-duplicating neighbours before
        # sorting would miss duplicates that are not adjacent on disk.
        t_nkts_sorted = sorted(t_nkts)
        k2t_nkts[k] = [
            t_nkt
            for i, t_nkt in enumerate(t_nkts_sorted)
            if i == 0 or t_nkt != t_nkts_sorted[i - 1]
        ]
    print('k2t_nkts optimized!')
    with (p_size_ds / 'k2t_nkts.pkl').open('wb') as f:
        pickle.dump(k2t_nkts, f)

    # To skip the recomputation above, the cached result can be reloaded:
    # with (p_size_ds / 'k2t_nkts.pkl').open('rb') as f:
    #     k2t_nkts = pickle.load(f)

    plt.clf()
    for k, t_nkts in tqdm(k2t_nkts.items()):
        if k == 1:
            # k == 1 is drawn as a single full-height bar in the top colour.
            plt.bar(1, 1., 1, color=colormap(1.))
            continue
        # Stack one segment per threshold: the segment (t_prev, tt] is coloured
        # by log_N(n_kt) for the upper threshold tt.
        t_prev = 0
        for tt, nnkt in t_nkts:
            # log base 1 is undefined; with a single entry the relative size
            # can only be "everything", i.e. 1.
            rel_size = math.log(nnkt, N) if N > 1 else 1.
            plt.bar(k, tt - t_prev, 1, bottom=t_prev, color=colormap(rel_size))
            t_prev = tt
    plt.xlabel('k')
    plt.xscale('log')
    plt.ylabel('t')
    # plt.title('Hypercore sizes, ' + ds_name)
    plt.savefig(p_figures / '{ds}.png'.format(ds=ds_name), bbox_inches='tight')
    # plt.savefig(p_figures / '{ds}.pdf'.format(ds=ds_name), bbox_inches='tight')
    plt.show()
    # Release the figure so memory does not grow with the number of datasets.
    plt.close()


In [None]:
p_incident = Path('../data/incident')
p_size = Path('../data/size')
S = 10000  # number of log-spaced k samples per dataset
T = 100  # number of evenly spaced t samples in [0, 1]
ds_num = len(ds_names)
HSMD_matrix = np.zeros((ds_num, ds_num))
ds2rel_size = dict()  # dataset index -> (S, T) matrix of relative sizes
# enumerate gives the position directly; ds_names.index() would be a linear
# scan and would return the wrong position if a name occurred twice.
for ds_index, ds_name in enumerate(tqdm(ds_names)):
    p_size_ds = p_size / ds_name
    p_incident_ds = p_incident / ds_name
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with (p_incident_ds / 'i2edges.pkl').open('rb') as f:
        i2edges = pickle.load(f)
    n = len(i2edges)  # log base for the relative sizes
    # NOTE(review): max_size / possible_t are not used in this cell; kept in
    # case later cells read them - confirm and remove if not.
    max_size = max(len(e) for e in i2edges.values())
    possible_t = max_size2possible_t[max_size]
    with (p_size_ds / 'k2t_nkts.pkl').open('rb') as f:
        k2t_nkts = pickle.load(f)
    k_max = max(k2t_nkts)
    # k is sampled on a log scale from 1 to k_max, t linearly on [0, 1].
    k_samples = [math.ceil(k_max ** x) for x in np.linspace(0, 1, S)]
    t_samples = np.linspace(0, 1, T)
    # -1 marks grid cells that no (t, n_kt) pair covers.
    rel_size_matrix = -np.ones((S, T))
    for i_k, k in enumerate(k_samples):
        # .get avoids inserting empty lists into a loaded defaultdict.
        i_t = 0
        for t, nkt in k2t_nkts.get(k, ()):
            if i_t >= T:
                # Every t sample is filled. The original `break` only left the
                # inner while-loop, so a further pair indexed t_samples[T].
                break
            rel_size = math.log(nkt, n)
            # Each t sample takes the value of the first stored threshold
            # that is >= the sample (pairs are expected in ascending t).
            while i_t < T and t_samples[i_t] <= t:
                rel_size_matrix[i_k, i_t] = rel_size
                i_t += 1
    ds2rel_size[ds_index] = rel_size_matrix
# Pairwise distance: root-mean-square of the element-wise difference of the
# two relative-size grids, with each difference clipped to [-1, 1].
for i_1, i_2 in tqdm(list(combinations(range(ds_num), 2))):
    rel_size_1 = ds2rel_size[i_1]
    rel_size_2 = ds2rel_size[i_2]
    HSMD_matrix[i_1][i_2] = HSMD_matrix[i_2][i_1] = np.sqrt(np.mean(np.square(np.clip(rel_size_1 - rel_size_2, -1, 1))))

# Persist the pairwise distance matrix next to the notebook.
p_HSMD = Path('HSMD.pkl')
with open(p_HSMD, 'wb') as f:
    pickle.dump(HSMD_matrix, f)

# Heatmap of the pairwise distance matrix, one row/column per dataset.
plt.rcParams['font.size'] = 15
plt.clf()
ax = sns.heatmap(
    HSMD_matrix,
    cmap='RdBu',
    xticklabels=False,
    yticklabels=ds_names_short,
)
# Colour each row label with the colors_group entry found at the position of
# its short name in ds_names_short.
for ticklabel in ax.get_yticklabels():
    ds_index = ds_names_short.index(ticklabel.get_text())
    ticklabel.set_color(colors_group[ds_index])
# Save both a raster and a vector copy of the same figure.
for out_name in ('HSMD.png', 'HSMD.pdf'):
    ax.figure.savefig(out_name, bbox_inches='tight')
plt.show()

# Summary statistics over all unordered dataset pairs. Two datasets count as
# the same domain when their names share the text before the first '-'.
domain_of = [name.split('-')[0] for name in ds_names]
dist_total, dist_within, dist_cross = [], [], []
for i_1, i_2 in tqdm(list(combinations(range(ds_num), 2))):
    HSMD = HSMD_matrix[i_1][i_2]
    same_domain = domain_of[i_1] == domain_of[i_2]
    (dist_within if same_domain else dist_cross).append(HSMD)
    dist_total.append(HSMD)
for label, dists in (
    ('Global average: {:.3f}', dist_total),
    ('Within-domain average: {:.3f}', dist_within),
    ('Cross-domain average: {:.3f}', dist_cross),
):
    print(label.format(np.mean(dists)))