### Data Diagnostics: Variability of the Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import helper_functions as hf
import plotly.graph_objects as go
from sklearn.metrics.cluster import contingency_matrix
from munkres import Munkres
import seaborn as sns
sns.set_theme()

In [None]:
# Read the frame stored in 'time_embedded_2103.h5'. Later cells treat every
# column except 'state' and 'dataset' as a feature.
time_embedded = pd.read_hdf(path_or_buf='time_embedded_2103.h5')

In [None]:
# Project the feature columns (everything except the 'state' and 'dataset'
# labels) onto the first three principal components.
pca = PCA(n_components=3)
pca_data = pca.fit_transform(time_embedded.loc[:, ~time_embedded.columns.isin(['state', 'dataset'])])

# `pca_data` is a plain array, so `data` gets a fresh 0..n-1 index. The label
# columns are therefore attached positionally (.values) instead of by index
# alignment: alignment would silently misassign labels, or raise, whenever
# `time_embedded` does not itself carry a default 0..n-1 index.
data = pd.DataFrame(pca_data)
data['state'] = time_embedded['state'].values
data['dataset'] = time_embedded['dataset'].values

In [None]:
# One sub-frame of `data` per distinct value of the 'dataset' column.
datasets = dict(tuple(data.groupby('dataset')))

# NOTE(review): plot_PCs_separately presumably returns a Dash-style app; if
# run_server blocks, the cells below will not execute until it is stopped.
# Confirm in helper_functions.
pc_app = hf.plot_PCs_separately(datasets)
pc_app.run_server(debug=True, port=8054)

### Color-code trajectories based on dataset 
Each data point is colored based on the dataset it belongs to. This helps in understanding the variability of the data across different datasets.

In [None]:
# One 3D line per dataset, drawn through its samples in PC space
# (columns 0, 1, 2 of each sub-frame are PC 1, PC 2, PC 3).
all_traces = [
    go.Scatter3d(x=df[0], y=df[1], z=df[2], mode="lines", name=name)
    for name, df in datasets.items()
]

fig = go.Figure(data=all_traces)

# Annotate each axis with the percentage of variance its component explains.
variances = pca.explained_variance_ratio_ * 100
scene = {
    "xaxis_title": f"PC 1 ({variances[0]:.2f}%)",
    "yaxis_title": f"PC 2 ({variances[1]:.2f}%)",
    "zaxis_title": f"PC 3 ({variances[2]:.2f}%)",
}

fig.update_layout(scene=scene)
fig.show()

### Clustering with KMeans 
We can try clustering our data without the dataset feature to see if the separation of data points is based on the dataset membership or not.

As a control, we cross-check the result against a dataset on which no preprocessing has been done.

In [None]:
from sklearn.cluster import KMeans
# Cluster the feature columns (labels excluded) into 23 groups.
# NOTE(review): 23 presumably equals the number of distinct datasets - confirm.
# random_state fixes the centroid initialisation so the cluster labels, and the
# AMI score derived from them, are reproducible across kernel restarts. The
# value 10 matches the seed used in the silhouette analysis.
kmeans = KMeans(n_clusters=23, random_state=10)
data['cluster'] = kmeans.fit_predict(time_embedded.loc[:, ~time_embedded.columns.isin(['state', 'dataset'])])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so `data` itself does not gain the extra column.
data_copy = data.copy()

# Turn the dataset identifiers into integer class labels; they serve as the
# reference labelling when scoring the clustering.
label_encoder = LabelEncoder()
dataset_codes = label_encoder.fit_transform(data_copy['dataset'])
data_copy['dataset_numeric'] = dataset_codes

In [None]:
# Preview the first five rows, including the new label columns.
data_copy.head(n=5)

**Evaluation** <br>
We can evaluate the clustering using the adjusted mutual information score, which calculates the mutual information between two clusterings and then normalizes this value by the expected mutual information of two random clusterings.

In [None]:
from sklearn.metrics.cluster import adjusted_mutual_info_score

# Reference labelling (dataset membership) and KMeans labelling for the
# preprocessed data.
labels_true_proc = data_copy['dataset_numeric']
labels_pred_proc = data_copy['cluster']

# Adjusted mutual information between the two labellings.
ami_proc = adjusted_mutual_info_score(labels_true_proc, labels_pred_proc)
print(ami_proc)

#### Control Dataset
We will now cluster our unpreprocessed data and see if the clusters are based on the dataset membership or not.

In [None]:
# Control: the same clustering on the frame stored in
# 'imputed_dataframe_0602.h5'. The frame is re-read in this cell, so the
# 'cluster' column added below never leaks into the features on a re-run.
imputed_dataframe = pd.read_hdf("imputed_dataframe_0602.h5")
# random_state fixes the centroid initialisation so the control score is
# reproducible and comparable run-to-run with the preprocessed score.
kmeans2 = KMeans(n_clusters=23, random_state=10)
imputed_dataframe['cluster'] = kmeans2.fit_predict(imputed_dataframe.loc[:, ~imputed_dataframe.columns.isin(['state', 'dataset'])])
labels_pred_unproc = imputed_dataframe['cluster']

In [None]:
# Copy the control frame (it already holds the 'cluster' column) and add the
# integer-encoded dataset membership as the reference labelling.
imputed_dataframe_copy = imputed_dataframe.copy()
label_encoder2 = LabelEncoder()
dataset_codes_unproc = label_encoder2.fit_transform(imputed_dataframe_copy['dataset'])
imputed_dataframe_copy['dataset_numeric'] = dataset_codes_unproc
labels_true_unproc = imputed_dataframe_copy['dataset_numeric']

In [None]:
# Adjusted mutual information between cluster labels and dataset membership
# for the control data.
ami_unproc = adjusted_mutual_info_score(
    imputed_dataframe_copy['cluster'],
    imputed_dataframe_copy['dataset_numeric'],
)
print(ami_unproc)

**Result**<br>
Since the adjusted mutual information score between the clustering of our data and the dataset membership is relatively low (also compared to the unpreprocessed data), we can assume that the separation of data points is not based on the dataset membership and that the dataset feature might not explain the variability of the trajectories.


#### Cluster Label Correspondence
We want to match the predicted cluster labels to the dataset labels to see if the clustering is consistent with the dataset membership.

**Preprocessed Data**<br>


In [None]:
# Pair every dataset label (contingency row) with one cluster label
# (contingency column) so that the total overlap is maximal.
m = Munkres()
contmat = contingency_matrix(labels_true_proc, labels_pred_proc)
# Munkres minimises cost, so counts are turned into costs by subtracting them
# from the largest count. A plain list of lists is passed, which is the input
# form the munkres documentation shows.
assignment = m.compute((contmat.max() - contmat).tolist())
# Each pair is (row, column): 'val' is the dataset label, 'map' the cluster
# paired with it. This assumes row/column positions equal the label values,
# i.e. both labellings use every integer from 0 upwards - TODO confirm.
mapping = pd.DataFrame(assignment, columns=['val', 'map'])
# Relabel each predicted cluster with the dataset it was paired with, i.e.
# look up by cluster ('map') and read off the dataset ('val'). Indexing by
# 'val' instead applies the inverse permutation, which the AMI check below
# cannot detect because AMI is invariant under relabelling.
labels_pred_proc_mapped = labels_pred_proc.map(mapping.set_index('map')['val'])
print("Validity Check (should be same number as before):",adjusted_mutual_info_score(labels_pred_proc_mapped, labels_true_proc))

**Unpreprocessed Data**<br>

In [None]:
# Pair every dataset label (contingency row) with one cluster label
# (contingency column) so that the total overlap is maximal.
m = Munkres()
contmat = contingency_matrix(labels_true_unproc, labels_pred_unproc)
# Munkres minimises cost, so counts are turned into costs by subtracting them
# from the largest count. A plain list of lists is passed, which is the input
# form the munkres documentation shows.
assignment = m.compute((contmat.max() - contmat).tolist())
# Each pair is (row, column): 'val' is the dataset label, 'map' the cluster
# paired with it. This assumes row/column positions equal the label values,
# i.e. both labellings use every integer from 0 upwards - TODO confirm.
mapping = pd.DataFrame(assignment, columns=['val', 'map'])
# Relabel each predicted cluster with the dataset it was paired with, i.e.
# look up by cluster ('map') and read off the dataset ('val'). Indexing by
# 'val' instead applies the inverse permutation, which the AMI check below
# cannot detect because AMI is invariant under relabelling.
labels_pred_unproc_mapped = labels_pred_unproc.map(mapping.set_index('map')['val'])
print("Validity Check (should be same number as before):",adjusted_mutual_info_score(labels_pred_unproc_mapped, labels_true_unproc))

### Contingency Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Dataset-vs-cluster counts for the preprocessed data, using the relabelled
# clusters.
contmat_proc = contingency_matrix(labels_true_proc, labels_pred_proc_mapped)

fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.clf()
# The colour scale is clamped to [0, 100]; larger counts share the top colour.
res = sns.heatmap(
    contmat_proc,
    fmt='.2f',
    cmap="YlGnBu",
    vmin=0.0,
    vmax=100.0,
)
plt.show()

In [None]:
# Dataset-vs-cluster counts for the control data, using the relabelled
# clusters.
contmat_unproc = contingency_matrix(labels_true_unproc, labels_pred_unproc_mapped)

fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.clf()
# The colour scale is clamped to [0, 100]; larger counts share the top colour.
res = sns.heatmap(
    contmat_unproc,
    fmt='.2f',
    cmap="YlGnBu",
    vmin=0.0,
    vmax=100.0,
)
plt.show()

#### Silhouette Coefficient
We will also calculate the silhouette coefficient to evaluate the quality of the clusters. The silhouette score ranges from -1 to 1, where a higher value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

In [None]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score


# Feature matrix: every column of the time-embedded frame except the labels.
X = time_embedded.loc[:, ~time_embedded.columns.isin(['state', 'dataset'])]

# Candidate cluster counts: 2 through 18 inclusive.
range_n_clusters = list(range(2, 19))

for n_clusters in range_n_clusters:
    # One figure with a single axis per candidate cluster count.
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # Silhouette values lie in [-1, 1]; the x-axis shows only [-0.1, 1].
    ax1.set_xlim([-0.1, 1])
    # Room for every sample plus a 10-row gap around each cluster's band.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Fixed seed so the clustering is the same on every run.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Mean silhouette over all samples: one summary number per cluster count.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

    # Per-sample silhouette values, drawn below as one band per cluster.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Values of cluster i in ascending order (boolean indexing yields a
        # copy, so the in-place sort does not touch the full array).
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = len(ith_cluster_silhouette_values)
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(i / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number, placed at the vertical middle of its band.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Next band starts 10 rows above this one.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Dashed red line marks the mean silhouette value.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # bands are labelled by the text above, not by ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

# Called once, after all figures have been created.
plt.show()

### Biplots

In [None]:
# Copy the frame and replace the 'dataset' identifiers with integer codes so
# the column can be used as a numeric feature / colour value below.
time_embedded_copy = time_embedded.copy()
label_encoder = LabelEncoder()
dataset_codes_te = label_encoder.fit_transform(time_embedded_copy['dataset'])
time_embedded_copy['dataset'] = dataset_codes_te
# Make every column name a string.
time_embedded_copy.columns = time_embedded_copy.columns.astype(str)

In [None]:
# Keep only the rows whose state is 'ventral'.
ventral_data = time_embedded_copy[time_embedded_copy['state'] == 'ventral']

# PCA on the ventral rows. 'state' and 'cluster' are excluded, so the encoded
# 'dataset' column takes part as a feature.
ventral_feature_mask = ~ventral_data.columns.isin(['state', 'cluster'])
pcav = PCA(n_components=3)
ventral_pcs = pcav.fit_transform(ventral_data.loc[:, ventral_feature_mask])
# Rows of components_ are the principal axes in feature space.
ventral_components = pcav.components_

In [None]:
import matplotlib.pyplot as plt

def myplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot of the first two components on the current figure.

    Parameters
    ----------
    score : array, shape (n_samples, >= 2)
        Projected samples; only columns 0 and 1 are plotted.
    coeff : array, shape (n_features, >= 2)
        Loadings, one row per original feature; only columns 0 and 1 are drawn.
    labels : sequence of str, optional
        Text for each loading arrow. Defaults to "Var1", "Var2", ...
    colors : array-like, optional
        Per-sample colour values for the scatter. Defaults to the module-level
        ``ventral_data['dataset']``, which is what the function always used
        before this parameter existed.
    """
    if colors is None:
        colors = ventral_data['dataset']

    xs = score[:, 0]
    ys = score[:, 1]
    # Divide each score axis by its range so the point cloud spans one unit.
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs * scalex, ys * scaley, c=colors)

    for i in range(coeff.shape[0]):
        # Loading 740 is drawn at 1x, every other loading at 5x.
        # NOTE(review): 740 is presumably the position of the encoded
        # 'dataset' column among the features - confirm and name the constant.
        scale = 1 if i == 740 else 5
        tip_x = coeff[i, 0] * scale
        tip_y = coeff[i, 1] * scale
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        if labels is None:
            plt.text(tip_x, tip_y, "Var" + str(i + 1), color='black', ha='center', va='center')
        else:
            # Custom labels sit at the arrow tip as well; they were always
            # placed at 5x, which detached the label from the 1x arrow.
            plt.text(tip_x, tip_y, labels[i], color='b', ha='center', va='center')
# Frame the current axes before drawing: both axes fixed to [-1, 1].
ax = plt.gca()
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.set_xlabel("PC{}".format(1))
ax.set_ylabel("PC{}".format(2))
ax.grid()

# Three score columns and three loading columns are passed, but myplot only
# reads the first two of each.
myplot(ventral_pcs[:, 0:3], ventral_components[0:3, :].T)
plt.show()

In [None]:
# Same PCA on the ventral rows, this time with the 'dataset' column excluded
# from the features as well.
ventral_feature_mask2 = ~ventral_data.columns.isin(['state', 'dataset', 'cluster'])
pcav2 = PCA(n_components=3)
ventral_pcs2 = pcav2.fit_transform(ventral_data.loc[:, ventral_feature_mask2])
# Rows of components_ are the principal axes in feature space.
ventral_components2 = pcav2.components_

In [None]:
import matplotlib.pyplot as plt

# `myplot` is defined once, in the first biplot cell; the byte-identical
# second definition that used to live here has been removed so the two copies
# cannot drift apart.

# Frame the current figure before drawing: both axes fixed to [-1, 1].
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()

# Biplot for the PCA fitted without the 'dataset' column. Three score and
# loading columns are passed, but myplot only reads the first two of each.
myplot(ventral_pcs2[:,0:3],np.transpose(pcav2.components_[0:3, :]))
plt.show()

### Quantification of variability within state

We want to perform PCA on the ventral-state points and identify the direction (eigenvector) that captures a large share of the variance *across* different trajectories, rather than only the variance within a single trajectory.

In [None]:
# Despite its name, `ventral_te` is a copy of the full frame (all states);
# the ventral rows are picked out afterwards via `turn_vec`.
ventral_te = time_embedded_copy.copy()

# Three-component PCA on the feature columns only.
te_feature_mask = ~ventral_te.columns.isin(['state', 'dataset', 'cluster'])
pca_te = PCA(n_components=3)
ventral_te_pc = pca_te.fit_transform(ventral_te.loc[:, te_feature_mask])
ventral_te_pc_df = pd.DataFrame(ventral_te_pc)

# Per-row state labels, and the dataset labels attached positionally.
turn_vec = time_embedded_copy['state'].values
ventral_te_pc_df['dataset'] = data['dataset'].values

In [None]:
# Split the samples into runs of consecutive 'ventral' rows. For each run,
# store its PC coordinates in `intervals` and the dataset of its first row in
# `dataset_names`.
# NOTE(review): a run is delimited by state changes only, so a ventral run
# that continues across a dataset boundary is kept as one interval and
# labelled with the first dataset - confirm this is intended.
dataset_names = []
intervals = []
pc_columns = ~ventral_te_pc_df.columns.isin(['dataset'])  # loop-invariant
n_samples = ventral_te_pc.shape[0]
is_ventral = False
# Iterate one step past the last sample so that a run which is still open at
# the end of the data is closed and recorded too (it was dropped before).
for i in range(n_samples + 1):
    ventral_now = i < n_samples and turn_vec[i] == 'ventral'
    if ventral_now and not is_ventral:
        # First sample of a new ventral run.
        start_idx = i
        is_ventral = True
    elif is_ventral and not ventral_now:
        # The run ended on the previous sample.
        end_idx = i - 1
        # .loc slices by label and includes both ends; labels equal positions
        # here because ventral_te_pc_df was built from a plain array.
        array = ventral_te_pc_df.loc[start_idx:end_idx, pc_columns]
        dataset_names.append(ventral_te_pc_df['dataset'].loc[start_idx])
        intervals.append(array)
        is_ventral = False

In [None]:
# For every interval take the row whose index label is the (floored) median
# of the interval's index, i.e. the sample in the middle of the run.
all_medians = [
    run.loc[int(np.median(run.index, axis=0)), :]
    for run in intervals
]

# PCA across the mid-points of all intervals; keep the scores along the first
# component and the component directions.
pca_median = PCA(n_components=3)
median_frame = pd.DataFrame(all_medians)
median_pcs = pca_median.fit_transform(median_frame)
median_pc1 = median_pcs[:, 0]
median_comps = pca_median.components_

In [None]:
# One row per interval: its score on the first component of the mid-point PCA
# and the dataset the interval starts in.
med_df = pd.DataFrame({'Median': median_pc1, 'dataset': dataset_names})
med_df.head()

In [None]:
# 2D histogram of the mid-point scores (x) per dataset (y), coloured by dataset.
sns.histplot(
    data=med_df,
    x="Median",
    y="dataset",
    hue='dataset',
    bins=250,
    legend=False,
)

In [None]:
# Distribution of the mid-point scores over all datasets pooled together.
plt.hist(x=median_pc1, bins=100)

In [None]:
# PC coordinates of every sample plus its state, handed to the plotting helper.
vtp = pd.DataFrame(ventral_te_pc)
vtp['state'] = turn_vec

fig = hf.plot_PCs(vtp)

# Mark the mid-point of every ventral interval. All points go into a single
# trace instead of one trace per point, which keeps the figure light and the
# legend readable.
if len(all_medians) > 0:
    median_points = pd.DataFrame(all_medians)
    fig.add_trace(go.Scatter3d(x=median_points[0],
                               y=median_points[1],
                               z=median_points[2],
                               mode='markers',
                               name='Interval mid-points',
                               marker=dict(color='black', size=3)))

# `abs_coord_max` was used here without being defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): reconstructed from the name as the largest absolute coordinate
# along each PC of the plotted data - confirm this is the intended scale.
abs_coord_max = np.abs(ventral_te_pc).max(axis=0)

# Stretch the first component of the mid-point PCA so that the line drawn
# below spans the extent of the data along PC 1.
scaled_ventral_components = np.zeros(median_comps.shape)
scaled_ventral_components[0] = median_comps[0] * abs_coord_max[0]

# Line through the origin along that direction, in both senses.
fig.add_trace(go.Scatter3d(x=[-scaled_ventral_components[0, 0], scaled_ventral_components[0, 0]],
                           y=[-scaled_ventral_components[0, 1], scaled_ventral_components[0, 1]],
                           z=[-scaled_ventral_components[0, 2], scaled_ventral_components[0, 2]],
                           mode='lines', name='Principal Component 1',
                           line=dict(color='black', width=3)))

fig.update_xaxes(type='linear')
fig.update_yaxes(type='linear')
fig.update_layout(title='PCA of time-embedded data')
fig.show()

# Appendix I: More Clustering

In [None]:
# Cluster the same feature columns into 5 groups; results go into a copy of
# `data` so the 23-cluster labels stored in `data` are left untouched.
# NOTE: this rebinds the name `kmeans` used for the 23-cluster model above.
# random_state fixes the centroid initialisation so the score below is
# reproducible; 10 matches the seed used in the silhouette analysis.
kmeans = KMeans(n_clusters=5, random_state=10)
state_clustering = data.copy()
state_clustering['cluster'] = kmeans.fit_predict(time_embedded.loc[:, ~time_embedded.columns.isin(['state', 'dataset'])])

In [None]:

# Integer-encode the dataset identifiers as the reference labelling.
label_encoder = LabelEncoder()
dataset_codes_state = label_encoder.fit_transform(state_clustering['dataset'])
state_clustering['dataset_numeric'] = dataset_codes_state

# NOTE: these two names were first bound in the 23-cluster evaluation and are
# rebound here to the 5-cluster results; re-run the earlier cells before
# re-running the label-correspondence section.
labels_true_proc = state_clustering['dataset_numeric']
labels_pred_proc = state_clustering['cluster']

# Adjusted mutual information between dataset membership and the 5 clusters.
ami_state = adjusted_mutual_info_score(labels_true_proc, labels_pred_proc)
print(ami_state)