### Data Diagnostics: Variability of the Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from neumand.thesis_figures.variability_visualization_tools import biplot, silhouette_plots
import helper_functions as hf
import plotly.graph_objects as go
from sklearn.metrics.cluster import contingency_matrix
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import TimeSeriesSplit, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import make_scorer
from sklearn import metrics
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.cm as cm
from sklearn.metrics import confusion_matrix


# Apply seaborn's default theme once, so the matplotlib/seaborn figures below share one style.
sns.set_theme()

# Loading Data

In [None]:
# Both HDF5 files are given as bare file names, i.e. relative to the working directory.
PREPROCESSED_H5 = 'time_embedded_2103.h5'
UNPREPROCESSED_H5 = "imputed_dataframe_0602.h5"

# NOTE(review): no key is passed for the preprocessed file, which presumably means it holds
# a single stored object -- confirm against the file.
preprocessed_data = pd.read_hdf(PREPROCESSED_H5)
unpreprocessed_data = pd.read_hdf(UNPREPROCESSED_H5, key="data")

# Visualization

### Individual datasets in a shared PC space (Preprocessed Data)


In [None]:
# Label/metadata columns that must never be fed to the PCA. 'dataset_numeric' is listed as well
# because this cell adds it to `preprocessed_data` at the bottom: without it, re-running the
# cell would silently include the encoded dataset label as a feature.
prep_feature_mask = ~preprocessed_data.columns.isin(['state', 'dataset', 'dataset_numeric'])

# Project the feature columns onto the first three principal components.
pca = PCA(n_components=3)
prep_pca_data = pd.DataFrame(pca.fit_transform(preprocessed_data.loc[:, prep_feature_mask]))

# Copy the labels by position. The PCA frame has a fresh 0..n-1 index, so assigning the Series
# directly would align on index labels and scramble or blank out rows whenever
# `preprocessed_data` does not carry a default RangeIndex.
prep_pca_data['state'] = preprocessed_data['state'].to_numpy()
prep_pca_data['dataset'] = preprocessed_data['dataset'].to_numpy()

# Encode the dataset column as integer ids and store them on both frames.
label_encoder = LabelEncoder()
encoded_datasets = label_encoder.fit_transform(prep_pca_data['dataset'])
prep_pca_data['dataset_numeric'] = encoded_datasets
preprocessed_data['dataset_numeric'] = encoded_datasets

In [None]:
# Split the PCA frame into one sub-frame per dataset, keyed by the dataset name.
datasets = dict(iter(prep_pca_data.groupby('dataset')))

# `plot_PCs_separately` is a project helper; the object it returns is served on port 8054.
prep_pcs_app = hf.plot_PCs_separately(datasets)
prep_pcs_app.run_server(debug=True, port=8054)

### Individual datasets in a shared PC space (Unpreprocessed Data)


In [None]:
# Label/metadata columns that must never be fed to the PCA. 'dataset_numeric' is listed as well
# because this cell adds it to `unpreprocessed_data` at the bottom: without it, re-running the
# cell would silently include the encoded dataset label as a feature.
unprep_feature_mask = ~unpreprocessed_data.columns.isin(['state', 'dataset', 'dataset_numeric'])

# Project the feature columns onto the first three principal components.
unprep_pca = PCA(n_components=3)
unprep_pca_data = pd.DataFrame(unprep_pca.fit_transform(unpreprocessed_data.loc[:, unprep_feature_mask]))

# Copy the labels by position. The PCA frame has a fresh 0..n-1 index, so assigning the Series
# directly would align on index labels and scramble or blank out rows whenever
# `unpreprocessed_data` does not carry a default RangeIndex.
unprep_pca_data['state'] = unpreprocessed_data['state'].to_numpy()
unprep_pca_data['dataset'] = unpreprocessed_data['dataset'].to_numpy()

# Encode the dataset column as integer ids and store them on both frames.
label_encoder = LabelEncoder()
encoded_datasets = label_encoder.fit_transform(unprep_pca_data['dataset'])
unprep_pca_data['dataset_numeric'] = encoded_datasets
unpreprocessed_data['dataset_numeric'] = encoded_datasets

In [None]:
# Split the unpreprocessed PCA frame into one sub-frame per dataset, keyed by the dataset name.
datasets_unprep = dict(iter(unprep_pca_data.groupby('dataset')))

# Served on its own port (8055) so it can run next to the preprocessed view.
unprep_pcs_app = hf.plot_PCs_separately(datasets_unprep)
unprep_pcs_app.run_server(debug=True, port=8055)

### Color-code trajectories based on dataset 
Each data point is colored based on the dataset it belongs to. This helps in understanding the variability of the data across different datasets.

In [None]:
# One 3-D line trace per dataset; the dataset name becomes the legend entry.
all_traces = [
    go.Scatter3d(x=pcs[0], y=pcs[1], z=pcs[2], mode="lines", name=dataset_name)
    for dataset_name, pcs in datasets.items()
]

# Label each axis with the percentage of variance explained by that component.
variances = pca.explained_variance_ratio_ * 100
axis_titles = [f"PC {k} ({variances[k - 1]:.2f}%)" for k in (1, 2, 3)]
scene = dict(zip(("xaxis_title", "yaxis_title", "zaxis_title"), axis_titles))

fig = go.Figure(data=all_traces)
fig.update_layout(scene=scene)
fig.show()

### Biplot within a state

Biplots show us how strongly a variable influences a principal component.

In [None]:
# Keep only the rows labelled as ventral turns.
is_ventral_turn = preprocessed_data['state'] == 'ventral'
ventral_data = preprocessed_data.loc[is_ventral_turn, :]
# Column names are cast to strings before they are matched against the label names below.
ventral_data.columns = ventral_data.columns.astype(str)

# Drop every label/metadata column so the PCA only sees feature columns.
ventral_label_columns = ['state', 'dataset', 'dataset_numeric', 'cluster']
ventral_features = ventral_data.loc[:, ~ventral_data.columns.isin(ventral_label_columns)]

pcav2 = PCA(n_components=3)
ventral_pcs2 = pcav2.fit_transform(ventral_features)
# Rows of components_ are the directions of maximum variance in the data.
ventral_components2 = pcav2.components_

In [None]:
# Scores: ventral samples in PC space. Loadings: components transposed, one row per variable.
ventral_scores = ventral_pcs2[:, :3]
ventral_loadings = pcav2.components_[:3, :].T
# `biplot` is a project helper; the dataset ids are passed as its third argument.
biplot(ventral_scores, ventral_loadings, ventral_data['dataset_numeric'])
plt.show()

# Clustering for dataset membership

### KMeans with preprocessed data (in PC space)

We can try clustering our data without the dataset feature to see if the separation of data points is based on the dataset membership or not.
We could cross check with a dataset where no preprocessing has been done.

**Evaluation** <br>
We can evaluate the clustering using the adjusted mutual information score, which calculates the mutual information between two clusterings and then normalizes this value by the expected mutual information of two random clusterings.

In [None]:
# KMeans starts from random centroids; a fixed random_state makes the reported score
# reproducible across runs and comparable with the other seeded clusterings.
# NOTE(review): 23 clusters presumably matches the number of datasets -- confirm.
AMI = cross_val_score(
    KMeans(n_clusters=23, random_state=0),
    prep_pca_data.loc[:, ~prep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])],
    prep_pca_data['dataset_numeric'],
    cv=StratifiedKFold(n_splits=5),
    # Score each fold by the adjusted mutual information between clusters and dataset ids.
    scoring=make_scorer(adjusted_mutual_info_score),
)
print(f"AMI: {np.mean(AMI)}")

In [None]:
# Out-of-fold cluster assignments, used for the contingency matrix further down.
# random_state is fixed because KMeans is otherwise re-initialised randomly on every run,
# which makes these labels irreproducible.
prep_pred_labels = cross_val_predict(
    KMeans(n_clusters=23, random_state=0),
    prep_pca_data.loc[:, ~prep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])],
    prep_pca_data['dataset_numeric'],
    cv=StratifiedKFold(n_splits=5),
)

### Control: KMeans with unpreprocessed dataset (in PC space)
We will now cluster our unpreprocessed data and see if the clusters are based on the dataset membership or not.

In [None]:
# Control run on the unpreprocessed PCA data. KMeans starts from random centroids; a fixed
# random_state makes the reported score reproducible across runs.
AMI_unpreprocessed = cross_val_score(
    KMeans(n_clusters=23, random_state=0),
    unprep_pca_data.loc[:, ~unprep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])],
    unprep_pca_data['dataset_numeric'],
    cv=StratifiedKFold(n_splits=5),
    # Score each fold by the adjusted mutual information between clusters and dataset ids.
    scoring=make_scorer(adjusted_mutual_info_score),
)
print(f"AMI for unpreprocessed data: {np.mean(AMI_unpreprocessed)}")

In [None]:
# Out-of-fold cluster assignments for the unpreprocessed control, used for the contingency
# matrix further down. random_state is fixed because KMeans is otherwise re-initialised
# randomly on every run, which makes these labels irreproducible.
unprep_pred_labels = cross_val_predict(
    KMeans(n_clusters=23, random_state=0),
    unprep_pca_data.loc[:, ~unprep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])],
    unprep_pca_data['dataset_numeric'],
    cv=StratifiedKFold(n_splits=5),
)

**Result**<br>
Since the adjusted mutual information score between the clustering of our data and the dataset membership is relatively low (also compared to the unpreprocessed data), we can assume that the separation of data points is not based on the dataset membership and that the dataset feature might not explain the variability of the trajectories.

#### Contingency Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Rows: true dataset ids; columns: KMeans cluster ids. The colour scale is clipped to 0..100.
prep_contingency = contingency_matrix(prep_pca_data['dataset_numeric'], prep_pred_labels)

fig, ax = plt.subplots(figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
res = sns.heatmap(prep_contingency, fmt='.2f', cmap="YlGnBu", vmin=0.0, vmax=100.0, ax=ax)
plt.show()

In [None]:
# Same contingency view for the unpreprocessed control; the colour scale is clipped to 0..100.
unprep_contingency = contingency_matrix(unprep_pca_data['dataset_numeric'], unprep_pred_labels)

fig, ax = plt.subplots(figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
res = sns.heatmap(unprep_contingency, fmt='.2f', cmap="YlGnBu", vmin=0.0, vmax=100.0, ax=ax)
plt.show()

#### Silhouette Coefficient
We will also calculate the silhouette coefficient to evaluate the quality of the clusters. The silhouette score ranges from -1 to 1, where a higher value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

In [None]:

# Feature matrix for the silhouette analysis: the preprocessed data without its label columns.
silhouette_label_columns = ['state', 'dataset', 'dataset_numeric']
X = preprocessed_data.loc[:, ~preprocessed_data.columns.isin(silhouette_label_columns)]
# `silhouette_plots` is a project helper; it receives the feature matrix only.
silhouette_plots(X)


# Classification

We will now try to classify the preprocessed data based on the dataset membership.

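Note: `cross_val_predict` does not work with `TimeSeriesSplit`, because its test folds do not cover every sample (the first chunk of the series is only ever used for training). We will therefore use custom code from Stack Exchange (Marco Cerliani).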

### State Classification on Original (Unpreprocessed) Data with TimeSeriesSplit

In [None]:
# Accuracy takes no averaging argument; the other three metrics are weighted averages.
acc = make_scorer(metrics.accuracy_score)
prec, reca, f1 = (
    make_scorer(score_func, average='weighted')
    for score_func in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

# Mapping passed as `scoring=` to every cross_validate call below.
scoring = dict(accuracy=acc, precision=prec, recall=reca, f1=f1)

In [None]:
tscv = TimeSeriesSplit()

# Features only. 'dataset_numeric' is excluded too: the PCA cell above adds it to
# `unpreprocessed_data`, and leaving it in would hand the dataset id to the state classifier
# (the preprocessed-data cells already exclude it).
X = unpreprocessed_data.loc[:, ~unpreprocessed_data.columns.isin(['state', 'dataset', 'cluster', 'dataset_numeric'])]

# Encode the state names as integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(unpreprocessed_data['state'])

# NOTE(review): TimeSeriesSplit treats the rows as one ordered series; the frame presumably
# concatenates several datasets -- confirm that this ordering is intended.
tscv_results_original = cross_validate(SVC(gamma='auto'), X, y, cv=tscv, scoring=scoring)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in tscv_results_original.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

#### with StratifiedKFold

In [None]:
scv = StratifiedKFold()

# Features only. 'dataset_numeric' is excluded too: the PCA cell above adds it to
# `unpreprocessed_data`, and leaving it in would hand the dataset id to the state classifier
# (the preprocessed-data cells already exclude it).
X = unpreprocessed_data.loc[:, ~unpreprocessed_data.columns.isin(['state', 'dataset', 'cluster', 'dataset_numeric'])]

# Encode the state names as integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(unpreprocessed_data['state'])

scv_results_original = cross_validate(SVC(gamma='auto'), X, y, cv=scv, scoring=scoring)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in scv_results_original.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

### State Classification on Preprocessed Data

In [None]:
tscv = TimeSeriesSplit()

# Feature matrix: the preprocessed data without label/metadata columns.
prep_label_columns = ['state', 'dataset', 'cluster', 'dataset_numeric']
X = preprocessed_data.loc[:, ~preprocessed_data.columns.isin(prep_label_columns)]

# Encode the state names as integer class labels.
label_encoder = LabelEncoder().fit(preprocessed_data['state'])
y = label_encoder.transform(preprocessed_data['state'])

# NOTE(review): despite the `scv_` prefix, these results come from the TimeSeriesSplit above.
scv_results_preprocessed = cross_validate(
    SVC(gamma='auto'),
    X,
    y,
    cv=tscv,
    scoring=scoring,
)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in scv_results_preprocessed.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

### State Classification on PCA Data

In [None]:
tscv = TimeSeriesSplit()

# Feature matrix: the principal-component columns of the preprocessed data.
pca_label_columns = ['state', 'dataset', 'cluster', 'dataset_numeric']
X = prep_pca_data.loc[:, ~prep_pca_data.columns.isin(pca_label_columns)]

# Encode the state names as integer class labels.
label_encoder = LabelEncoder().fit(prep_pca_data['state'])
y = label_encoder.transform(prep_pca_data['state'])

cv_results_pca = cross_validate(
    SVC(gamma='auto'),
    X,
    y,
    cv=tscv,
    scoring=scoring,
)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in cv_results_pca.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

#### Confusion Matrix

In [None]:
# Out-of-fold state predictions for the confusion matrix. `scv` is the StratifiedKFold
# defined in an earlier cell; X and y come from the PCA state-classification cell.
# `groups` is not passed: StratifiedKFold ignores it, so `groups=y` only suggested a
# grouped split that never happened.
y_pred = cross_val_predict(SVC(gamma='auto'), X, y, cv=scv)

In [None]:
# Named `conf_matrix` rather than `cm`: `cm` is the alias under which matplotlib.cm is
# imported at the top of the notebook, and rebinding it would break any later colormap use.
conf_matrix = confusion_matrix(y, y_pred)

fig, ax = plt.subplots()
# annot=True would annotate the cells; fmt='g' disables scientific notation in annotations.
sns.heatmap(conf_matrix, annot=False, fmt='g', ax=ax)

# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

## Dataset Membership Classification 


### New approach with StratifiedKFold (UNpreprocessed data)

In [None]:
scv = StratifiedKFold(n_splits=5)

# Target: the integer dataset id. Features: everything except label/metadata columns.
y_ds = unpreprocessed_data["dataset_numeric"]
X = unpreprocessed_data.loc[:, ~unpreprocessed_data.columns.isin(['state', 'dataset', 'cluster', 'dataset_numeric'])]

# `groups` is not passed: StratifiedKFold ignores it, so `groups=y_ds` had no effect on the
# splits and only suggested a grouped cross-validation that never happened.
cv_results_unprep = cross_validate(SVC(gamma='auto'), X, y_ds, cv=scv, scoring=scoring)
# Out-of-fold predictions for the confusion matrix below.
y_pred_unprep = cross_val_predict(SVC(gamma='auto'), X, y_ds, cv=scv)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in cv_results_unprep.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

In [None]:
# Named `conf_matrix` rather than `cm`: `cm` is the alias under which matplotlib.cm is
# imported at the top of the notebook, and rebinding it would break any later colormap use.
conf_matrix = confusion_matrix(y_ds, y_pred_unprep)

fig, ax = plt.subplots()
# annot=True would annotate the cells; fmt='g' disables scientific notation in annotations.
sns.heatmap(conf_matrix, annot=False, fmt='g', ax=ax)

# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

### Preprocessed Data

In [None]:
scv = StratifiedKFold(n_splits=5)

# Target: the integer dataset id. Features: everything except label/metadata columns.
y_ds_prep = preprocessed_data["dataset_numeric"]
prep_ds_label_columns = ['state', 'dataset', 'cluster', 'dataset_numeric']
X = preprocessed_data.loc[:, ~preprocessed_data.columns.isin(prep_ds_label_columns)]


In [None]:
# error_score='raise' makes a failing fold raise instead of being recorded as a score.
cv_results_prep = cross_validate(
    SVC(gamma='auto'),
    X,
    y_ds_prep,
    cv=scv,
    scoring=scoring,
    verbose=1,
    error_score='raise',
)

In [None]:
# Out-of-fold dataset predictions for the confusion matrix below.
# `groups` is not passed: StratifiedKFold ignores it, so `groups=y_ds_prep` had no effect.
y_pred_prep = cross_val_predict(SVC(gamma='auto'), X, y_ds_prep, cv=scv)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in cv_results_prep.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

In [None]:
# Named `conf_matrix` rather than `cm`: `cm` is the alias under which matplotlib.cm is
# imported at the top of the notebook, and rebinding it would break any later colormap use.
conf_matrix = confusion_matrix(y_ds_prep, y_pred_prep)

fig, ax = plt.subplots()
# annot=True would annotate the cells; fmt='g' disables scientific notation in annotations.
sns.heatmap(conf_matrix, annot=False, fmt='g', ax=ax)

# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

### On PCA

In [None]:
scv = StratifiedKFold(n_splits=5)

# Feature matrix: the principal-component columns of the preprocessed data.
pca_ds_label_columns = ['state', 'dataset', 'cluster', 'dataset_numeric']
X = prep_pca_data.loc[:, ~prep_pca_data.columns.isin(pca_ds_label_columns)]

In [None]:
# The labels must come from the same frame as X (prep_pca_data). `y_ds` holds the dataset ids
# of the *unpreprocessed* frame, whose rows need not match X in number or order.
# `groups` is dropped: StratifiedKFold ignores it, and `y` held state labels anyway.
cv_results_pca = cross_validate(
    SVC(gamma='auto'),
    X,
    prep_pca_data['dataset_numeric'],
    cv=scv,
    scoring=scoring,
)

In [None]:
# Out-of-fold dataset predictions on the PCA features. The labels must come from the same
# frame as X (prep_pca_data); `y_ds` holds the dataset ids of the *unpreprocessed* frame,
# whose rows need not match X in number or order.
# `groups` is dropped: StratifiedKFold ignores it, and `y` held state labels anyway.
y_pred_pca = cross_val_predict(SVC(gamma='auto'), X, prep_pca_data['dataset_numeric'], cv=scv)

In [None]:
# Print the mean over the folds for every entry of the cross_validate result.
for metric_name, fold_values in cv_results_pca.items():
    print(metric_name, ":", np.asarray(fold_values).mean())

In [None]:
# The true labels are the dataset ids of the PCA frame the predictions were made for; `y_ds`
# belongs to the unpreprocessed frame. The result is named `conf_matrix` rather than `cm`
# because `cm` is the alias under which matplotlib.cm is imported at the top of the notebook.
conf_matrix = confusion_matrix(prep_pca_data['dataset_numeric'], y_pred_pca)

fig, ax = plt.subplots()
# annot=True would annotate the cells; fmt='g' disables scientific notation in annotations.
sns.heatmap(conf_matrix, annot=False, fmt='g', ax=ax)

# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');


In [None]:
# Dataset-membership classification repeated separately inside every behavioural state.
states = prep_pca_data['state'].unique().tolist()
pc_columns = ~prep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])

for state in states:
    # Rows of the current state: PC coordinates as features, dataset ids as targets.
    rows_in_state = prep_pca_data['state'] == state
    state_features = prep_pca_data.loc[rows_in_state, pc_columns]
    state_targets = prep_pca_data.loc[rows_in_state, 'dataset_numeric']

    cv_results_pca = cross_validate(
        SVC(gamma='auto'),
        state_features,
        state_targets,
        cv=StratifiedKFold(n_splits=5),
        scoring=scoring,
    )
    print(f"Results for {state}:")
    for name, values in cv_results_pca.items():
        print(name, ":", np.mean(list(values)))

    # Out-of-fold predictions against the true dataset ids of the same rows.
    labels_pred_proc = cross_val_predict(
        SVC(gamma='auto'),
        state_features,
        state_targets,
        cv=StratifiedKFold(n_splits=5),
    )
    labels_true_proc = prep_pca_data.loc[rows_in_state, 'dataset_numeric']

    fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.clf()
    state_contingency = contingency_matrix(labels_true_proc, labels_pred_proc)
    res = sns.heatmap(state_contingency, fmt='.2f', cmap="YlGnBu", vmin=0.0, vmax=100.0)
    plt.show()

# Recurrence Plots

In [None]:
# NOTE(review): `data` is not defined in any cell of this notebook, so this cell fails after
# Restart & Run All. It presumably refers to one of the frames loaded above -- confirm which.
# All columns except the labels are converted to a plain array.
recurrence_label_columns = ['state', 'dataset']
te = data.loc[:, ~data.columns.isin(recurrence_label_columns)].to_numpy()

In [None]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between the first 20000 samples.
# The broadcast form `te[:, np.newaxis] - te` materialises an (n, n, n_features) array before
# the norm is taken; pdist computes the same distances without that intermediate.
# The square (n, n) result is still large for n = 20000.
pd_Y = squareform(pdist(te[:20000, :]))
plt.matshow(pd_Y, cmap='Greys')
plt.show()

## Using Loop

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

def recurrence_plot(data, threshold=0.1):
    """
    Generate a binary recurrence plot from a time series.

    Two time points i and j are recurrent when the Euclidean distance between their
    samples is at most ``threshold``. The number of recurrent pairs (diagonal included)
    is printed before returning.

    :param data: Time series with time along the first axis. Each sample may be a
        scalar or an array; trailing dimensions are flattened before the distance
        is computed.
    :param threshold: Largest distance at which two samples still count as recurrent
    :return: (N, N) integer array holding 1 for recurrent pairs and 0 otherwise
    """
    samples = np.asarray(data)
    n_samples = len(samples)
    recurrence = np.zeros((n_samples, n_samples), dtype=int)
    if n_samples == 0:
        # Nothing to compare; keep the printed count consistent with the general case.
        print(0)
        return recurrence

    # One row per time point, whatever the shape of a single sample is.
    n_features = int(np.prod(samples.shape[1:]))
    samples = samples.reshape(n_samples, n_features)

    # Fill one row at a time: vectorised over j, and only an (N, n_features) temporary is
    # needed instead of a full floating-point distance matrix.
    for i in range(n_samples):
        distances = np.linalg.norm(samples - samples[i], axis=1)
        recurrence[i] = distances <= threshold

    print(int(recurrence.sum()))
    return recurrence

In [None]:
# Recurrence plot of column 0 (the first principal component, per the original note).
# Only labels up to 6000 are used: run time and memory for the full dataset are too high.
# NOTE(review): `data` is not defined in any cell of this notebook -- confirm which frame is meant.
first_component = np.array(data.loc[:6000, 0])
recurrence = recurrence_plot(first_component, threshold=0.8)

fig, ax = plt.subplots(figsize=(8, 8))
recurrence_image = ax.imshow(recurrence, cmap='Greys', origin='lower')
ax.set_title('Recurrence Plot')
ax.set_xlabel('Time')
ax.set_ylabel('Time')
fig.colorbar(recurrence_image, ax=ax, label='Recurrence')
plt.show()

In [None]:
from scipy.spatial.distance import pdist, squareform
# Square matrix of pairwise distances between the first 10000 samples, drawn as a heatmap.
first_samples = te[:10000, :]
dist = squareform(pdist(first_samples))
sns.heatmap(dist, cmap="mako")
plt.show()


In [None]:
# Keep only the beginning of every dataset so the distance matrix stays small.
# NOTE(review): `data` is not defined in any cell of this notebook -- confirm which frame is meant.
groups = data.groupby('dataset')

# drop=True matters: a plain reset_index() adds the old index as an extra 'index' column,
# which would then be treated as a feature in `te_trunc` and distort every distance.
# `.loc[:900]` is label-based and inclusive, so up to 901 rows are kept per dataset.
all_dfs = [group.reset_index(drop=True).loc[:900, :] for _, group in groups]
data_truncated = pd.concat(all_dfs)

# Feature array: every column except the label columns.
te_trunc = np.asarray(data_truncated.loc[:, ~data_truncated.columns.isin(['state', 'dataset'])])


In [None]:
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import seaborn as sns
# Square matrix of pairwise distances between all truncated samples, drawn as a heatmap.
dist = squareform(pdist(te_trunc))
sns.heatmap(dist, cmap="mako")
plt.show()

# Appendix I: More Clustering

### 5 clusters and comparing with state membership

In [None]:
# Five clusters on the preprocessed PCA data, compared against the state labels.
# random_state is fixed because KMeans is otherwise re-initialised randomly on every run.
AMI_states = cross_val_score(
    KMeans(n_clusters=5, random_state=0),
    prep_pca_data.loc[:, ~prep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])],
    prep_pca_data['state'],
    cv=StratifiedKFold(n_splits=5),
    scoring=make_scorer(adjusted_mutual_info_score),
)
# The old label said "unpreprocessed data", but this score is computed on the preprocessed
# PCA data against the state labels.
print(f"AMI between clusters and states (preprocessed data): {np.mean(AMI_states)}")

There is little correspondence between the clusters and the state membership. This suggests that the clustering is not based on the state membership.

### 23 clusters but within each state

In [None]:
# KMeans with 23 clusters, run separately inside every state of the preprocessed PCA data.
states = prep_pca_data['state'].unique().tolist()
prep_pc_columns = ~prep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])

for state in states:
    # Features AND labels are restricted to the same rows. Passing the full label column
    # together with the state subset gives inputs of different lengths, which scikit-learn rejects.
    state_rows = prep_pca_data['state'] == state
    X_state = prep_pca_data.loc[state_rows, prep_pc_columns]
    labels_true_proc = prep_pca_data.loc[state_rows, 'dataset_numeric']

    # random_state is fixed so the score and the predicted labels below are reproducible.
    AMI = cross_val_score(
        KMeans(n_clusters=23, random_state=0),
        X_state,
        labels_true_proc,
        cv=StratifiedKFold(n_splits=5),
        scoring=make_scorer(adjusted_mutual_info_score),
    )
    # This is the preprocessed data; the state is included so the lines can be told apart.
    print(f"AMI for preprocessed data ({state}): {np.mean(AMI)}")

    labels_pred_proc = cross_val_predict(
        KMeans(n_clusters=23, random_state=0),
        X_state,
        labels_true_proc,
        cv=StratifiedKFold(n_splits=5),
    )

    fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.clf()
    res = sns.heatmap(contingency_matrix(labels_true_proc, labels_pred_proc), fmt='.2f', cmap="YlGnBu", vmin=0.0, vmax=100.0)
    plt.show()

In [None]:
# Control: the same per-state KMeans analysis on the unpreprocessed PCA data.
# States, row masks, features and labels all come from `unprep_pca_data`; a mask built from
# `prep_pca_data` refers to a different frame and cannot be used to select its rows.
states = unprep_pca_data['state'].unique().tolist()
unprep_pc_columns = ~unprep_pca_data.columns.isin(['state', 'dataset', 'dataset_numeric'])

for state in states:
    # Features AND labels are restricted to the same rows, so their lengths agree.
    state_rows = unprep_pca_data['state'] == state
    X_state = unprep_pca_data.loc[state_rows, unprep_pc_columns]
    labels_true_proc = unprep_pca_data.loc[state_rows, 'dataset_numeric']

    # random_state is fixed so the score and the predicted labels below are reproducible.
    AMI = cross_val_score(
        KMeans(n_clusters=23, random_state=0),
        X_state,
        labels_true_proc,
        cv=StratifiedKFold(n_splits=5),
        scoring=make_scorer(adjusted_mutual_info_score),
    )
    # The state is included so the printed lines can be told apart.
    print(f"AMI for unpreprocessed data ({state}): {np.mean(AMI)}")

    labels_pred_proc = cross_val_predict(
        KMeans(n_clusters=23, random_state=0),
        X_state,
        labels_true_proc,
        cv=StratifiedKFold(n_splits=5),
    )

    fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.clf()
    res = sns.heatmap(contingency_matrix(labels_true_proc, labels_pred_proc), fmt='.2f', cmap="YlGnBu", vmin=0.0, vmax=100.0)
    plt.show()
    

# Appendix II: Quantification of variability within state with Median Points

We want to perform PCA on the ventral-state points and identify the direction (eigenvector) that captures a large share of the variance across different trajectories, not just the variance within a single trajectory.

In [None]:
ventral_te = prep_pca_data.copy()
turn_vec = prep_pca_data['state'].values

# Only the numeric PC coordinates may reach the PCA below. Excluding 'dataset' alone leaves
# the string column 'state' and the label 'dataset_numeric' in every median row, which the
# PCA cannot fit.
feature_columns = ~ventral_te.columns.isin(['state', 'dataset', 'dataset_numeric'])

# Locate every contiguous run of 'ventral' samples. Padding the flags with a zero on both
# sides makes a run that touches the first or last sample produce a start/end edge too, so a
# ventral interval reaching the end of the data is no longer dropped.
# NOTE(review): a run that continues across a dataset boundary is treated as one interval
# and attributed to the dataset of its first sample -- confirm this is intended.
ventral_flags = np.asarray(turn_vec == 'ventral', dtype=np.int8)
edges = np.diff(np.concatenate(([0], ventral_flags, [0])))
run_starts = np.flatnonzero(edges == 1)      # position of the first sample of each run
run_ends = np.flatnonzero(edges == -1) - 1   # position of the last sample of each run

dataset_names = []
intervals = []
all_medians = []
for start_idx, end_idx in zip(run_starts, run_ends):
    interval = ventral_te.iloc[start_idx:end_idx + 1].loc[:, feature_columns]
    intervals.append(interval)
    dataset_names.append(ventral_te['dataset'].iloc[start_idx])
    # Representative point of the interval: its middle sample (the lower middle for an even length).
    all_medians.append(interval.iloc[(end_idx - start_idx) // 2])

if not all_medians:
    raise ValueError("No 'ventral' intervals found in prep_pca_data['state'].")

# PCA across the per-interval median points.
pca_median = PCA(n_components=3)
median_pcs = pca_median.fit_transform(pd.DataFrame(all_medians))
median_pc1 = median_pcs[:, 0]
median_comps = pca_median.components_

# Coordinate of every median point along the first component, tagged with its dataset.
med_df = pd.DataFrame(median_pc1, columns=['Median'])
med_df["dataset"] = dataset_names
sns.histplot(data=med_df, x="Median", y="dataset", hue='dataset', bins=250, legend=False)


In [None]:
# Distribution of the median points along the first principal component of the median PCA.
fig, ax = plt.subplots()
ax.hist(median_pc1, bins=100)

In [None]:

# NOTE(review): `ventral_te_pc` is not defined in any cell of this notebook, so this cell fails
# after Restart & Run All. It presumably holds the PC coordinates of the data -- confirm.
vtp = pd.DataFrame(ventral_te_pc)
vtp['state'] = turn_vec

# `plot_PCs` is a project helper; traces are added to the figure it returns.
fig = hf.plot_PCs(vtp)

# Mark the median point of every ventral interval with a small black marker.
for median_point in all_medians:
    median_marker = go.Scatter3d(
        x=[median_point[0]],
        y=[median_point[1]],
        z=[median_point[2]],
        mode='markers',
        marker=dict(color='black', size=3),
    )
    fig.add_trace(median_marker)

# Scale the first median-PCA component to the largest absolute coordinate of the data;
# the remaining rows of the scaled array stay zero.
max_coord = np.abs(ventral_te_pc).max(axis=1).max()
scaled_ventral_components = np.zeros(median_comps.shape)
scaled_ventral_components[0] = median_comps[0] * max_coord

# Draw that component as a black line through the origin, from -tip to +tip.
pc1_tip = scaled_ventral_components[0]
pc1_line = go.Scatter3d(
    x=[-pc1_tip[0], pc1_tip[0]],
    y=[-pc1_tip[1], pc1_tip[1]],
    z=[-pc1_tip[2], pc1_tip[2]],
    mode='lines',
    name='Principal Component 1',
    line=dict(color='black', width=3),
)
fig.add_trace(pc1_line)

fig.update_xaxes(type='linear')
fig.update_yaxes(type='linear')
fig.update_layout(title='PCA of time-embedded data')
fig.show()