# Расчет общих микросостояний на основе всех имеющихся

In [1]:
import pandas as pd

from helper import *
from neurokit2.stats.cluster import cluster
import scipy
import numpy as np
import sklearn


In [2]:
def _microstates_segment_runsegmentation(data, microstates, gfp, n_microstates):
    # Find microstate corresponding to each datapoint
    activation = microstates.dot(data)
    segmentation = np.argmax(np.abs(activation), axis=0)
    polarity = np.sign(np.choose(segmentation, activation))

    # Get Global Explained Variance (GEV)
    gev, gev_all = _cluster_quality_gev(
        data.T, microstates, segmentation, sd=gfp, n_clusters=n_microstates
    )
    return segmentation, polarity, gev, gev_all

def _cluster_quality_gev(data, clusters, clustering, sd=None, n_clusters=4):
    """Global Variance Explained (GEV)"""
    if sd is None:
        sd = np.nanstd(data, axis=1)
    map_corr = _correlate_vectors(data.T, clusters[clustering].T)

    gev_all = np.zeros(n_clusters)
    for state in range(n_clusters):
        idx = clustering == state
        gev_all[state] = np.nansum((sd[idx] * map_corr[idx]) ** 2) / np.nansum(sd**2)

    gev = np.nansum(gev_all)
    #    gev = np.sum((sd * map_corr) ** 2) / np.sum(sd**2)
    return gev, gev_all


def _correlate_vectors(A, B, axis=0):
    """Compute pairwise correlation of multiple pairs of vectors.
    Fast way to compute correlation of multiple pairs of vectors without
    computing all pairs as would with corr(A,B). Borrowed from Oli at Stack
    overflow.

    Note the resulting coefficients vary slightly from the ones
    obtained from corr due differences in the order of the calculations.
    (Differences are of a magnitude of 1e-9 to 1e-17 depending of the tested
    data).

    Parameters
    ----------
    A : array
        The first collection of vectors of shape (n, m)
    B : array
        The second collection of vectors of shape (n, m)
    axis : int
        The axis that contains the elements of each vector. Defaults to 0.

    Returns
    -------
    corr : array
        For each pair of vectors, the correlation between them with shape (m, )

    """
    An = A - np.nanmean(A, axis=axis)
    Bn = B - np.nanmean(B, axis=axis)
    An /= np.linalg.norm(An, axis=axis)
    Bn /= np.linalg.norm(Bn, axis=axis)
    return np.nansum(An * Bn, axis=axis)

In [3]:
raw_data_folder_path = "./raw_data/"
preprocessed_data_folder_path = "./preprocessed_data/"
save_data_folder_path = "./save_data/"
images_folder_path = "./images/"
statistics_folder_path = "statistics/"
mhw_objects_folder_path = "mhw_objects/"

folders = Folders(
    end_folder = "",
    raw_data = raw_data_folder_path,
    preprocessed_data = preprocessed_data_folder_path,
    save_data = save_data_folder_path,
    images = images_folder_path,
    statistics=statistics_folder_path,
    mhw_objects=mhw_objects_folder_path
)


filenames014 = [
    "ACP_INP0014_REST1_1pnt_1vis",
    "ACP_INP0014_REST2_1pnt_1vis",
    "ACP_INP0014_REST3_1pnt_1vis",
    "ACP_INP0014_REST1_1pnt_2vis",
    "ACP_INP0014_REST2_1pnt_2vis",
    "ACP_INP0014_REST3_1pnt_2vis"
]
path014 = "rest_14/"

filenames019 = [
    "INP0019_v1.4_REST1_R003_R003_08.11.23",
    "INP0019_v1.4_REST2_R003_R003_08.11.23",
    "INP0019_v1.4_REST3_R003_R003_08.11.23",
    "INP0019_v1.5_REST2_Op005_Op007_08.12.23",
    "INP0019_v1.5_REST3_Op005_Op007_08.12.23",
]
path019 = "rest_19/"

filenames036 = [
    "INP0036_v1.4_REST1_Op008_Op011_11.12.23",
    "INP0036_v1.4_REST2_Op008_Op011_11.12.23",
    "INP0036_v1.4_REST3_Op008_Op011_11.12.23",
    "INP0036_v1.5_REST1_Op005_Op010_12.12.23",
    "INP0036_v1.5_REST2_Op005_Op010_12.12.23",
    "INP0036_v1.5_REST3_Op005_Op010_12.12.23",
]
path036 = "rest_36/"

filenames045 = [
    "INP0045_v1.4_REST1_R003_R003_08.11.23",
    "INP0045_v1.4_REST2_R003_R003_08.11.23",
    "INP0045_v1.4_REST3_R003_R003_08.11.23",
    "INP0045_v1.5_rs11_Op005_Op008_Op011_17.11.23",
    "INP0045_v1.5_rs22_Op005_Op008_Op011_17.11.23",
    "INP0045_v1.5_rs23_Op005_Op008_Op011_17.11.23",
]
path045 = "rest_45/"

filenames064 = [
    "INP0064_v1.4_rs11_S008_R003_04.12.23",
    "INP0064_v1.4_rs12_Op008_R003_04.12.23",
    "INP0064_v1.4_rs13_Op008_R003_04.12.23",
    "INP0064_v1.5_rs21_Op005_Op006_Op008_05.12.23",
    "INP0064_v1.5_rs22_Op009_A002_13.12.23",
    "INP0064_v1.5_rs23_Op009_A002_13.12.23",
]
path064 = "rest_64/"

filenames = [filenames014, filenames019, filenames036, filenames045, filenames064]
paths = [path014, path019, path036, path045, path064]

In [4]:
ms_df = pd.DataFrame()
mhw = None

In [5]:
# for i in range(len(filenames)):
# # for i in range(1):
#     for filename in filenames[i]:
#         path = paths[i]
#         folders.end_folder = path
#         th_filename = filename + "_th"
#         print(f"Processing {filename}")
#         raw = mne.io.read_raw_eeglab(folders.preprocessed_data + folders.end_folder + filename + '.set')
#         mhw = MicrostateHelperWrapper(folders, raw, th_filename)
#         mhw = mhw.load()
#         ms_df = pd.concat([ms_df, pd.DataFrame.from_records(mhw.ms['Microstates'])])
ms_df = pd.read_csv(folders.save_data + "global_ms_df.csv")

In [6]:
# ms_df.reset_index(drop=True, inplace=True)

In [7]:
# replace NaN with 0
# ms_df.fillna(0, inplace=True)

In [8]:
# ms_df.to_csv(folders.save_data + "global_ms_df.csv", index=False)

In [9]:
# clustering, clusters, cluster_info = cluster(ms_df, n_clusters=4, method='kmod', verbose=True)

In [10]:
# for i in range(4):
#     mne.viz.plot_topomap(clusters[i], mhw.raw.info, show=True)


In [11]:
# _cluster_quality_gev(ms_df, clusters, clustering["Cluster"])

In [12]:
methods = [
    "kmod",
    "kmeans",
    "kmedoids",
    "pca",
    "ica",
    "aahc",
    "hierarchical",
    "spectral",
    "mixture",
    "mixturebayesian"
]
# method_df = pd.DataFrame(columns=methods)
# for i in range(100):
#     print(i)
#     val_arr = []
#     for method in methods:
#         clustering, clusters, cluster_info = cluster(ms_df, n_clusters=4, method=method)
#         print("===============" + method + "===================")
#         # for i in range(4):
#         #     mne.viz.plot_topomap(clusters[i], mhw.raw.info, show=True)
#         quality = _cluster_quality_gev(ms_df, clusters, clustering["Cluster"])
#         print(quality)
#         val_arr.append(quality[0])
#     method_df = pd.concat([method_df, pd.DataFrame([val_arr], columns=methods)])

# cols2 = ["GEV", "GEV1", "GEV2", "GEV3", "GEV4", "Method"]
# method_df2 = pd.DataFrame(columns=cols2)
# for i in range(100):
#     print(i)
#     val_arr = []
#     for method in methods:
#         clustering, clusters, cluster_info = cluster(ms_df, n_clusters=4, method=method)
#         print("===============" + method + "===================")
#         # for i in range(4):
#         #     mne.viz.plot_topomap(clusters[i], mhw.raw.info, show=True)
#         quality = _cluster_quality_gev(ms_df, clusters, clustering["Cluster"])
#         print(quality)
#         sorted_quality = np.sort(quality[1])
#         val_arr.append([quality[0], *sorted_quality, method])
#     method_df2 = pd.concat([method_df2, pd.DataFrame(val_arr, columns=cols2)])


In [13]:
# method_df*100

In [14]:
# method_df2.groupby("Method").mean()*100

## Сравнение методов для исходных данных

In [15]:
raw = mne.io.read_raw_eeglab(folders.preprocessed_data + path014 + filenames[0][0] + '.set')
raw.load_data(verbose=False)

  raw = mne.io.read_raw_eeglab(folders.preprocessed_data + path014 + filenames[0][0] + '.set')
  raw = mne.io.read_raw_eeglab(folders.preprocessed_data + path014 + filenames[0][0] + '.set')


0,1
Measurement date,Unknown
Experimenter,Unknown
Participant,Unknown

0,1
Digitized points,64 points
Good channels,61 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available

0,1
Sampling frequency,2048.00 Hz
Highpass,0.00 Hz
Lowpass,1024.00 Hz
Filenames,ACP_INP0014_REST1_1pnt_1vis.set
Duration,00:08:15 (HH:MM:SS)


In [22]:

method_df = pd.DataFrame(columns=methods)

In [24]:
methods = [
    "kmod",
    "kmeans",
    # "kmedoids",
    # "pca",
    # "ica",
    "aahc",
    "hierarchical",
    "spectral",
    # "mixture",
    # "mixturebayesian"
]
# for i in range(100):
#     print(i)
#     val_arr = []
#     for method in methods:
#         clustering, clusters, cluster_info = cluster(ms_df, n_clusters=4, method=method)
#         print("===============" + method + "===================")
#         # for i in range(4):
#         #     mne.viz.plot_topomap(clusters[i], mhw.raw.info, show=True)
#         quality = _cluster_quality_gev(ms_df, clusters, clustering["Cluster"])
#         print(quality)
#         val_arr.append(quality[0])
#     method_df = pd.concat([method_df, pd.DataFrame([val_arr], columns=methods)])



# for i in range(10):
#     print(i)

ms_arr = {
    "kmod": [],
    "kmeans": [],
    # "kmedoids": [],
    # "pca": [],
    # "ica": [],
    "aahc": [],
    "hierarchical": [],
    "spectral": [],
    # "mixture": [],
    # "mixturebayesian": []
}

for i in range(5):
    print(i)
    val_arr = []
    for method in methods:
        print("===============" + method + "===================")
        microstates = nk.microstates_segment(raw, n_microstates=4, method=method)
        print(microstates["GEV"])
        val_arr.append(microstates["GEV"])
        ms_arr[method].append(microstates)
    method_df = pd.concat([method_df, pd.DataFrame([val_arr], columns=methods)])
        

0
0.571851200151974
0.4651734985324701
0.5440460177937156
0.5426774585135402
0.5440637719775352
1
0.5718204271646933
0.46517326304043727
0.5478592636471709
0.5493552758747735
0.550902699561708
2
0.5718540871617479
0.4651752352963844
0.5455260919336623
0.5439933600681984
0.5436798395463109
3
0.5718299910762786
0.4651637519089025
0.5521246630997593
0.545953900569224
0.5482237216120895
4
0.5718546158240361
0.465122087112725
0.5487907935032499
0.5557541573041658
0.5440169097845067


In [25]:
method_df

Unnamed: 0,kmod,kmeans,aahc,hierarchical,spectral
0,0.571821,0.465164,0.546402,0.549554,0.544087
0,0.571851,0.465173,0.544046,0.542677,0.544064
0,0.57182,0.465173,0.547859,0.549355,0.550903
0,0.571854,0.465175,0.545526,0.543993,0.54368
0,0.57183,0.465164,0.552125,0.545954,0.548224
0,0.571855,0.465122,0.548791,0.555754,0.544017


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
for (key, value) in ms_arr.items():
    print(key)
    fig, axes = plt.subplots(len(value), 4, figsize=(10, 10))
    fig.suptitle(key)
    for i in range(len(value)):
        for j in range(4):
            mne.viz.plot_topomap(value[i]["Microstates"][j], value[i]["Info"], show=False, axes=axes[i][j])
    plt.show()

In [72]:
# Gev per microstate into df
cols = ["GEV1", "GEV2", "GEV3", "GEV4", "Method"]
gev_per_ms = pd.DataFrame(columns=cols)
for (key, value) in ms_arr.items():
    for i in range(len(value)):
        # row = [*value[i]["GEV_per_microstate"], key]
        # print(pd.DataFrame.from_records(row))
        gev_per_ms = pd.concat([gev_per_ms, pd.DataFrame([[*np.sort(value[i]["GEV_per_microstate"]), key]], columns=cols)])

  gev_per_ms = pd.concat([gev_per_ms, pd.DataFrame([[*np.sort(value[i]["GEV_per_microstate"]), key]], columns=cols)])


In [74]:
gev_per_ms.groupby("Method").mean()*100

Unnamed: 0_level_0,GEV1,GEV2,GEV3,GEV4
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aahc,5.102415,10.159927,14.812849,24.691746
hierarchical,5.923084,10.734701,13.695496,24.401402
kmeans,0.12204,0.9286,6.926416,38.539101
kmod,9.282875,12.267936,15.015788,20.617607
spectral,4.866677,9.955302,13.314257,26.481503


In [77]:
# ms_arr to pickle 
import pickle
with open(folders.save_data + "ms_arr.pkl", "wb") as f:
    pickle.dump(ms_arr, f)


In [40]:
# cluster_quality = nk.cluster_quality(ms_df, clustering, clusters, cluster_info)

In [41]:
# scipy.spatial.distance.cdist(mhw.ms["Microstates"], clusters, 'correlation')
# 2 0 3 2

In [42]:
# scipy.spatial.distance.cdist(mhw.ms["Microstates"], clusters, 'euclidean')
# 2 1 3 3

In [43]:
import functools
from neurokit2.stats.cluster_quality import _cluster_quality_distance
from neurokit2.stats.cluster import _cluster_getclusters
import warnings
from neurokit2 import check_random_state


def _cluster_kmod(
        data,
        n_clusters=4,
        max_iterations=1000,
        threshold=1e-6,
        random_state=None,
        optimize=False,
        **kwargs
):

    n_samples, n_channels = data.shape

    # Cache this value for later to compute residual
    data_sum_sq = np.sum(data**2)

    # Select random timepoints for our initial topographic maps
    rng = check_random_state(random_state)
    init_times = rng.choice(n_samples, size=n_clusters, replace=False)

    # Initialize random cluster centroids
    clusters = data[init_times, :]

    # Normalize row-wise (across EEG channels)
    clusters /= np.linalg.norm(clusters, axis=1, keepdims=True)  # Normalize the maps

    # Initialize iteration
    prev_residual = 0
    for i in range(max_iterations):

        # Step 3: Assign each sample to the best matching microstate
        activation = clusters.dot(data.T)
        segmentation = np.argmax(np.abs(activation), axis=0)
        # print(activation[:4],segmentation[:4])
        # Step 4: Recompute the topographic maps of the microstates, based on the
        # samples that were assigned to each state.
        for state in np.arange(n_clusters):

            # Get data fro specific state
            idx = segmentation == state
            data_state = data[idx, :]

            # Sanity check
            if np.sum(idx) == 0:
                clusters[state] = 0
                continue

            # Retrieve map values

            if optimize:
                # Method 2 - optimized segmentation
                state_vals = data_state.T.dot(activation[state, idx])
            else:
                # Method 1 - eighen value
                # step 4a
                Sk = np.dot(data_state.T, data_state)
                # step 4b
                eigen_vals, eigen_vectors = scipy.linalg.eigh(Sk)
                state_vals = eigen_vectors[:, np.argmax(np.abs(eigen_vals))]

            state_vals /= np.linalg.norm(state_vals)  # Normalize Map
            clusters[state, :] = state_vals  # Store map

        # Estimate residual noise (step 5)
        act_sum_sq = np.sum(np.sum(clusters[segmentation, :] * data, axis=1) ** 2)
        residual = np.abs(data_sum_sq - act_sum_sq)
        residual = residual / float(n_samples * (n_channels - 1))

        # Have we converged? Convergence criterion: variance estimate (step 6)
        if np.abs(prev_residual - residual) < (threshold * residual):
            break

        # Next iteration
        prev_residual = residual.copy()

    if i == max_iterations:
        warnings.warn(
            "Modified K-means algorithm failed to converge after " + str(i) + "",
            "iterations. Consider increasing 'max_iterations'.",
        )

    # De-normalize
    clusters_unnormalized = _cluster_getclusters(data, segmentation)
    prediction = _cluster_quality_distance(data, clusters_unnormalized, to_dataframe=True)
    prediction["Cluster"] = segmentation

    # Copy function with given parameters
    clustering_function = functools.partial(
        _cluster_kmod,
        n_clusters=n_clusters,
        max_iterations=max_iterations,
        threshold=threshold,
        random_state=random_state,
        **kwargs
    )

    # Info dump
    info = {
        "n_clusters": n_clusters,
        "clustering_function": clustering_function,
        "random_state": random_state,
        "clusters_normalized": clusters,
        "residual": residual,
    }

    return prediction, clusters_unnormalized, info

In [44]:
# clustering1, clusters1, cluster_info1 = _cluster_kmod(ms_df.values, n_clusters=4, method='kmod', verbose=True)

In [45]:
# np.bincount(clustering1['Cluster'])

In [46]:
# for i in range(4):
#     mne.viz.plot_topomap(clusters1[i], mhw.raw.info, show=True)

In [47]:
# clustering2, clusters2, cluster_info2 = cluster(ms_df.values, n_clusters=4, method='kmean', verbose=True)

In [48]:
# clustering2

In [49]:
# kmeans = sklearn.cluster.KMeans(n_clusters=4, max_iter=100000, tol=1e-9).fit(ms_df.values)

In [50]:
# np.bincount(kmeans.labels_)

In [51]:
# for i in range(4):
#     mne.viz.plot_topomap(kmeans.cluster_centers_[i], mhw.raw.info, show=True)

In [52]:
# kmeans2 = sklearn.cluster.BisectingKMeans(n_clusters=4, init='k-means++').fit(ms_df.values)

In [53]:
# pd.DataFrame.from_records([kmeans2.labels_]).T
# np.bincount(kmeans2.labels_)

In [54]:
# for i in range(4):
#     mne.viz.plot_topomap(kmeans2.cluster_centers_[i], mhw.raw.info, show=True)

In [55]:
# centroid, label = scipy.cluster.vq.kmeans2(ms_df.values, 4, minit='++', iter=1000)