# <b>Data Diagnostics I </b> *✲ﾟ*｡✧٩(･ิᴗ･ิ๑)۶*✲ﾟ*｡✧

In this notebook we will explore taking the min-max or percentile normalization between datasets and also derivatives and see how our data changes, i.e. the distribution of each variable, check the principal components, etc.

In [None]:
import helper_functions as hf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pynumdiff as pdiff
import pandas as pd
from collections import defaultdict
from sklearn.covariance import MinCovDet
from sklearn.preprocessing import RobustScaler
from scipy.stats import mstats, boxcox

from tqdm import tqdm
import importlib
# Pick up any edits made to helper_functions since it was first imported.
importlib.reload(hf)

# All three inputs are pickles read through the project's own loader.
load_pickle = hf.wbstruct_dataframes.loading_pkl
imputed_dataframe = load_pickle('imputed_dataframe.pkl')
dataframes = load_pickle('dataframes.pkl')
turn_vec = load_pickle('turn_vec.pkl')

### Resampling (or Up-/Downsampling)
Our datasets have different lengths, so we have to resample them to a common length. Most recordings range from 3200 to 3780 time points, as we can see in the figure below, but there is one dataset with 4146 and one with 5450 time points. 8 datasets have exactly 3529 time points. We will therefore down- or upsample every dataset to this number via linear interpolation (placing new samples on the straight line between two neighbouring data points), as implemented in numpy.

In [None]:
# Per-dataset overview plot produced by the project helper.
hf.visualize_fps(
    dataframes,
    title="frame rate of each dataset",
    xlabel="dataset",
    ylabel="frame rate",
    coloring="tab:red",
)

In [None]:
frames_num = 3529

# length_dict maps each dataset name to the number of rows in its dataframe.
length_dict = defaultdict()
length_dict.update((name, len(frame)) for name, frame in dataframes.items())

# Resample a copy of the imputed data so that every dataset ends up with
# frames_num (3529) frames.
resampled_dataframe = hf.resample(
    imputed_dataframe.copy(),
    (length_dict.values()),
    absolute_frames=frames_num,
)

In [None]:
%%capture

for key in dataframes.keys():
    length_dict[key] = frames_num

# we plot all the resampled traces 
saving_path="C:\\Users\\LAK\\Documents\\plots\\resampled_plots\\"

hf.plot_from_stacked_imputed(length_dict, resampled_dataframe, resampled_dataframe, saving_path)

### Truncation

We noticed some edge effects in the data, i.e. the first and last 100 time points are not very reliable. We will therefore truncate the data to the middle 3329 time points.

In [None]:
# Every dataset enters truncation with the resampled length of 3529 rows.
lengths = [3529] * len(dataframes)
truncated_dataframe = hf.truncate(resampled_dataframe, lengths)

# From here on each dataset is recorded with 3329 rows in length_dict.
frames_num = 3329
length_dict.update(dict.fromkeys(dataframes.keys(), frames_num))

In [None]:
%%capture
# we plot all the resampled traces 
saving_path="C:\\Users\\LAK\\Documents\\plots\\truncated_plots\\"

hf.plot_from_stacked_imputed(length_dict, truncated_dataframe, truncated_dataframe, saving_path)

### Normalization between datasets

As seen above, we have to deal with different scales across datasets, so a natural next step is to normalize the data across datasets to make them comparable. We will do this by taking the min-max normalization between datasets. This means that we will take the minimum and maximum value of each variable across all datasets and then normalize each dataset to this range. This will be done on the time derivatives of the resampled data.

We can also try the percentile normalization between datasets. This means that we will take the 5th and 95th percentile of each variable across all datasets and then normalize each dataset to this range. This will be done on the time derivatives of the resampled data.

### Based on Quantiles: RobustScaler

In [None]:
# Scale-only robust normalization (no centering) based on the 5th-95th
# percentile range.
scaler = RobustScaler(
    with_centering=False,
    with_scaling=True,
    quantile_range=(5, 95),
)

# First pass: normalize per dataset via the project helper.
quartiled_separate = hf.normalize_per_dataset(truncated_dataframe, length_dict, scaler)

# Second pass: refit the same scaler on the stacked result to normalize
# across datasets, keeping the column labels.
stacked_scaled = scaler.fit_transform(quartiled_separate)
quartiled_data = pd.DataFrame(stacked_scaled, columns=quartiled_separate.columns)

In [None]:
# Centered robust scaling of the truncated data, fitted on all datasets at once.
# The column labels are taken from the frame that is actually transformed
# (truncated_dataframe) rather than from resampled_dataframe, so labels and
# values cannot drift apart if truncation ever changes the columns.

# NOTE(review): quantile_range=(1, 9) scales by the 1st-9th percentile range;
# the plot folders are named "10_90", so (10, 90) may have been intended — confirm.
robust_scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(1, 9))
quartiled_data1 = pd.DataFrame(
    robust_scaler.fit_transform(truncated_dataframe),
    columns=truncated_dataframe.columns,
)

# Same procedure with the 5th-95th percentile range.
robust_scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(5, 95))
quartiled_data2 = pd.DataFrame(
    robust_scaler.fit_transform(truncated_dataframe),
    columns=truncated_dataframe.columns,
)

In [None]:
from sklearn.preprocessing import quantile_transform

In [None]:
# Draw the normalized AVAR trace over the whole stacked time axis (all
# datasets back to back) and save the figure to disk.
fig, ax = plt.subplots(figsize=(40, 10))
avar_trace = quartiled_data['AVAR'].T
ax.plot(avar_trace, color="tab:blue")
ax.set_xlabel("time")
ax.set_ylabel("AVAR")
ax.set_title("AVAR across all datasets")
fig.savefig("normalized_AVAR_alldatasets.png")

In [None]:
%%capture
%matplotlib widget
saving_path="C:\\Users\\LAK\\Documents\\plots\\normalized_plots_10_90\\"

hf.plot_from_stacked_imputed(length_dict, quartiled_data, quartiled_data, saving_path)

In [None]:
%%capture
%matplotlib widget
saving_path="C:\\Users\\LAK\\Documents\\plots\\normalized_plots_10_90_separate\\"

hf.plot_from_stacked_imputed(length_dict, quartiled_separate, quartiled_separate, saving_path)

### PCA on normalized data

In [None]:
%%capture
turn_vec =hf.get_behavioural_states(truncated_dataframe) # this step has to be revisited

In [None]:
# Persist the state vector computed from the truncated data for later reuse.
turn_vec.to_pickle("turn_vec_truncated.pkl")

In [None]:
# Three-component PCA of the data normalized per dataset and then across
# datasets (quartiled_data).
pca_quartile = hf.PCA(n_components=3)
pc_scores = pca_quartile.fit_transform(quartiled_data)
imputed_pc_quartile = pd.DataFrame(pc_scores)

window_size = 10

# Smooth each component with a 10-sample sliding average (for visualization only).
for pc in range(3):
    imputed_pc_quartile[pc] = np.convolve(imputed_pc_quartile[pc], np.ones(window_size)/window_size, mode='same')

# Attach the behavioural state of every time point and plot.
imputed_pc_quartile['state'] = turn_vec.values
hf.plot_PCs(imputed_pc_quartile, imputed_pc_quartile['state'], 'PCA_quartiled.html')
#hf.plot_PC_gif(imputed_pc_quartile,imputed_pc_quartile['state'],'PCA_quartiled.gif')

In [None]:
# Three-component PCA of the centered, 5-95 percentile scaled data
# (quartiled_data2). The variable names are kept from the previous cell, so
# running this cell replaces that cell's PCA results in memory.
pca_quartile = hf.PCA(n_components=3)
imputed_pc_quartile = pd.DataFrame(pca_quartile.fit_transform(quartiled_data2))

window_size = 10

# Smooth each component with a 10-sample sliding average (for visualization only).
smoothing_kernel = np.ones(window_size) / window_size
for pc in range(3):
    imputed_pc_quartile[pc] = np.convolve(imputed_pc_quartile[pc], smoothing_kernel, mode='same')

imputed_pc_quartile['state'] = turn_vec.values

# Write to a file name of its own: the previous cell already saves
# 'PCA_quartiled.html', and reusing that name would overwrite its plot.
hf.plot_PCs(imputed_pc_quartile, imputed_pc_quartile['state'], 'PCA_quartiled_centered_5_95.html')
#hf.plot_PC_gif(imputed_pc_quartile,imputed_pc_quartile['state'],'PCA_quartiled_centered_5_95.gif')


##### Mahalanobis Distances with normalized data

In [None]:
from scipy.stats import mstats, boxcox

# Shift the whole frame by |global minimum| + 0.01 before the Box-Cox
# transform, which requires strictly positive input.
global_min = quartiled_data.min().min()
quartile_copy = quartiled_data.copy() + abs(global_min) + 0.01

# Transform every column separately and remember the fitted lambda of each.
quartile_transformed = hf.pd.DataFrame()
all_lambdas = []
for col in quartile_copy.columns:
    transformed_col, best_lambda = boxcox(quartile_copy[col])
    quartile_transformed[col] = transformed_col
    all_lambdas.append(best_lambda)

In [None]:
from sklearn.covariance import MinCovDet

# Robust location/covariance estimate of the Box-Cox transformed data;
# the fixed random_state keeps the fit reproducible.
robust_estimator = MinCovDet(random_state=0)
cov = robust_estimator.fit(quartile_transformed)

In [None]:
# Robust Mahalanobis distance of every observation (row) from the MinCovDet
# location, using the MinCovDet covariance.
t = cov.location_
C = cov.covariance_

# Invert the covariance once instead of once per row, then evaluate the
# quadratic form (x - t)^T C^-1 (x - t) for all rows in a single einsum.
C_inv = np.linalg.inv(C)
centered = quartile_transformed.to_numpy() - t
squared_distances = np.einsum('ij,jk,ik->i', centered, C_inv, centered)

# Note: despite the "MD2" in the name, the square root is taken, so this
# holds the distance itself rather than the squared distance.
MD2_robust = pd.Series(np.sqrt(squared_distances), index=quartile_transformed.index)

In [None]:

# One-column frame holding the distances; to_frame names the column
# explicitly and does not depend on the name of the source Series.
MD2_robust2 = MD2_robust.copy().to_frame(name='MD2')

# Label every row with the dataset it belongs to. The stacked frame holds the
# datasets back to back, in the order and with the lengths given by length_dict.
# Half-open positional slices are used so that a dataset's label never spills
# into the first row of the next one (label-based .loc slices include the end).
dataset_labels = np.full(len(MD2_robust2), None, dtype=object)
start_index = 0
for key, obs_count in length_dict.items():
    end_index = start_index + obs_count
    dataset_labels[start_index:end_index] = key
    start_index = end_index

MD2_robust2['dataset'] = dataset_labels


In [None]:
# Scatter plot of the robust Mahalanobis distance per observation, coloured
# by dataset.
fig = hf.px.scatter(
    y=MD2_robust2["MD2"],
    x=range(len(MD2_robust2)),
    title='Robust Mahalanobis Distances on Quartile Normalized Data',
    labels={'x':'Observation', 'y':'MD'},
    color=MD2_robust2["dataset"],
    color_continuous_scale='viridis',
)
fig.update_traces(marker_size=3)
fig.update_layout(legend_title='Dataset')
fig.show()

### Time Derivatives - Total Variation Regularization
Since we are interested in the shape of our data and want to eliminate noise as much as possible, we will take the time derivative of our data. 
To this end we will use an iterative total variation regularization method to compute the first order derivative of our data. Finite difference methods estimate derivatives by looking at the changes in the values over small intervals dt. This time step size dt is the reciprocal of the sampling frequency, which is 2.9-3.5 volumes per second for Rebecca's data and about 3 volumes per second for Kerem's data.
We will apply this on each dataset individually.

In [None]:
# Derivatives of the cross-dataset normalized data via the project helper.
# NOTE(review): the positional arguments 1 and 0.01 are presumably the number
# of iterations and gamma (cf. the "It1_Gam01" pickle name) — confirm against
# hf.compute_derivatives.
resampled_derivatives = hf.compute_derivatives(quartiled_data, length_dict,1,0.01) 

In [None]:
# Cache the derivatives computed with arguments (1, 0.01) on disk.
resampled_derivatives.to_pickle("resampled_derivatives_It1_Gam01.pkl")

In [None]:
# Same computation with the third argument set to 2 and to 5 (presumably the
# iteration count — confirm against hf.compute_derivatives); last argument stays 0.01.
resampled_derivatives_It2= hf.compute_derivatives(quartiled_data, length_dict, 2, 0.01)
resampled_derivatives_It5 = hf.compute_derivatives(quartiled_data, length_dict, 5, 0.01)

In [None]:
# Third argument fixed at 2; last argument (presumably gamma — confirm) set to
# 0.001 and to 0.1.
# NOTE(review): the name "Gam10" does not match the value 0.1 passed here.
resampled_derivatives_Gam001 = hf.compute_derivatives(quartiled_data, length_dict, 2, 0.001)
resampled_derivatives_Gam10 = hf.compute_derivatives(quartiled_data, length_dict, 2, 0.1)

In [None]:
# Derivatives of the centered, 5-95 percentile scaled data (quartiled_data2)
# with arguments (5, 0.001), cached on disk right away.
resampled_derivatives_595_It5_Gam001 = hf.compute_derivatives(quartiled_data2, length_dict, 5, 0.001)
resampled_derivatives_595_It5_Gam001.to_pickle("resampled_derivatives_595_It5_Gam001.pkl")

In [None]:
# Load the iteration-2 derivatives from the on-disk cache when it exists.
# Nothing in this notebook writes that file, so on a fresh run the cache is
# created from the frame computed in the earlier cell instead of failing
# with FileNotFoundError.
it2_cache_file = "resampled_derivatives_It2.pkl"
try:
    resampled_derivatives_It2 = pd.read_pickle(it2_cache_file)
except FileNotFoundError:
    resampled_derivatives_It2.to_pickle(it2_cache_file)

In [None]:
%%capture
%matplotlib widget
# Output folders for the total-variation plots; neither variable is used by
# any other cell visible in this notebook.
saving_path2="C:\\Users\\LAK\\Documents\\plots\\totalvariation_plots\\Iteration2Gamma0.01\\"
# NOTE(review): the name says "10" but the folder is "Iteration5Gamma0.01".
saving_path10="C:\\Users\\LAK\\Documents\\plots\\totalvariation_plots\\Iteration5Gamma0.01\\"
    


In [None]:
# Integrate the derivatives back into traces, one dataset (frames_num rows) at
# a time, and shift every column by |its minimum| + 0.01.
resampled_derivatives_cumsum = resampled_derivatives.copy()

dt = 1/3 # time step: 1/(frame rate)
n_datasets = len(dataframes.keys())
start_index = 0
for _ in tqdm(range(n_datasets), desc="Computing derivatives"):
    end_index = start_index + frames_num
    rows = slice(start_index, end_index)
    # cumulative sum down the rows of this dataset's block
    integrated = np.cumsum(resampled_derivatives_cumsum[rows])
    resampled_derivatives_cumsum[rows] = integrated + abs(integrated.min()) + 0.01
    start_index = end_index

In [None]:


# Draw the re-integrated AVAR trace over the whole stacked time axis (all
# datasets back to back) and save the figure to disk.
fig, ax = plt.subplots(figsize=(40, 10))
avar_integrated = resampled_derivatives_cumsum['AVAR'].T
ax.plot(avar_integrated, color="tab:blue")
ax.set_xlabel("time")
ax.set_ylabel("AVAR")
ax.set_title("AVAR across all datasets")
fig.savefig("resampled_AVAR_alldatasets_It5.png")

In [None]:
# Three-component PCA of the re-integrated total-variation derivatives.
pca = hf.PCA(n_components=3)
tv_scores = pca.fit_transform(resampled_derivatives_cumsum)
temporal_PCs_totalvariation = pd.DataFrame(tv_scores)

In [None]:
%matplotlib widget
window_size = 10

# Applyin a 10-sample sliding average for smoother visualizations!
temporal_PCs_totalvariation[0] = np.convolve(temporal_PCs_totalvariation[0], np.ones(window_size)/window_size, mode='same')
temporal_PCs_totalvariation[1] = np.convolve(temporal_PCs_totalvariation[1], np.ones(window_size)/window_size, mode='same')
temporal_PCs_totalvariation[2] = np.convolve(temporal_PCs_totalvariation[2], np.ones(window_size)/window_size, mode='same')

temporal_PCs_totalvariation['state'] = turn_vec.values
hf.plot_PCs(temporal_PCs_totalvariation,temporal_PCs_totalvariation['state'],'PCA_derivatives_totalvariation.html')
hf.plot_PC_gif(temporal_PCs_totalvariation,temporal_PCs_totalvariation['state'],'PCA_totalvariation.gif')

In [None]:
# Integrate the iteration-2 derivatives back into traces, one dataset
# (frames_num rows) at a time, shifting every column by |its minimum| + 0.01.
# This frame was referenced below but never defined anywhere in the notebook,
# which made the cell fail with a NameError.
resampled_derivatives_It2_cumsum = resampled_derivatives_It2.copy()
start_index = 0
for _ in range(len(dataframes)):
    end_index = start_index + frames_num
    integrated = np.cumsum(resampled_derivatives_It2_cumsum[start_index:end_index])
    resampled_derivatives_It2_cumsum[start_index:end_index] = integrated + abs(integrated.min()) + 0.01
    start_index = end_index

# Three-component PCA of the re-integrated iteration-2 derivatives.
pca = hf.PCA(n_components=3)
temporal_PCs_totalvariation = pd.DataFrame(pca.fit_transform(resampled_derivatives_It2_cumsum))

# Smooth each component with a 10-sample sliding average (for visualization only).
window_size = 10
smoothing_kernel = np.ones(window_size) / window_size
for pc in range(3):
    temporal_PCs_totalvariation[pc] = np.convolve(temporal_PCs_totalvariation[pc], smoothing_kernel, mode='same')

temporal_PCs_totalvariation['state'] = turn_vec.values
hf.plot_PCs(temporal_PCs_totalvariation,temporal_PCs_totalvariation['state'],'PCA_derivatives_totalvariation_It2.html')
hf.plot_PC_gif(temporal_PCs_totalvariation,temporal_PCs_totalvariation['state'],'PCA_totalvariation_It2.gif')

### Butterworth Smoothing

In [None]:
# Butterworth-smoothed finite-difference derivative of the normalized data,
# computed per dataset and per column. The result frame starts as a copy of
# quartiled_data and is overwritten block by block.
butterworth_derivatives = quartiled_data.copy()
dt = 1/3 # time step: 1/(frame rate)

# Parameters handed to pynumdiff's butterdiff.
# NOTE(review): per the pynumdiff docs these are [filter order, cutoff frequency] — confirm.
butter_params = [3, 0.09]

start_index = 0
for _ in dataframes:
    end_index = start_index + frames_num
    for col_index in range(butterworth_derivatives.shape[1]):
        # Differentiate the normalized trace itself. The input used to be read
        # from resampled_derivatives, i.e. the total-variation derivative, so
        # the output was a derivative of a derivative. A plain numpy array is
        # passed so the routine never sees the offset pandas index of the slice.
        trace = quartiled_data.iloc[start_index:end_index, col_index].to_numpy()
        # x_hat: smoothed trace, dxdt_hat: estimated derivative
        x_hat, dxdt_hat = pdiff.smooth_finite_difference.butterdiff(trace, dt, butter_params, options={'iterate': False})
        butterworth_derivatives.iloc[start_index:end_index, col_index] = dxdt_hat
    start_index = end_index

In [None]:
%%capture
%matplotlib widget
saving_path="C:\\Users\\LAK\\Documents\\butterworth_plots\\"


start_index = 0
count = 0

# we will unstack the dataframe and plot the traces for each dataset
for obs_count in list(length_dict.values()):

    # we take the number of observations from the length dictionary and add it to the start index
    end_index = start_index + obs_count
    res_data_df = butterworth_derivatives.iloc[start_index:end_index]

    fig = hf.plot_traces.make_grid_plot_from_two_dataframes(
            res_data_df, res_data_df)
    # fig, ax = plot_traces.make_grid_plot_from_dataframe(df_imputed)

    # save all plots in a folder
    pathname = saving_path + list(length_dict.keys())[count] + ".png"
    fig.savefig(pathname)
    plt.close(fig)
    start_index = end_index
    count += 1

In [None]:
# Integrate the Butterworth derivatives back into traces, one dataset
# (frames_num rows) at a time, shifting every column by |its minimum| + 0.01.
# The result frame starts as a copy of the Butterworth derivatives themselves,
# so index and columns match and nothing here depends on the total-variation results.
resampled_derivatives_butter_cumsum = butterworth_derivatives.copy()

dt = 1/3 # time step: 1/(frame rate)
start_index = 0
for _ in tqdm(range(len(dataframes)), desc="Computing derivatives"):
    # frames_num is the per-dataset length; the undefined name `pts` used here
    # before raised a NameError.
    end_index = start_index + frames_num
    integrated_bt = np.cumsum(butterworth_derivatives[start_index:end_index])
    resampled_derivatives_butter_cumsum[start_index:end_index] = integrated_bt + abs(integrated_bt.min()) + 0.01
    start_index = end_index

In [None]:
# Three-component PCA of the re-integrated Butterworth derivatives.
pca = hf.PCA(n_components=3)
butterworth_scores = pca.fit_transform(resampled_derivatives_butter_cumsum)
pca_butterworth = pd.DataFrame(butterworth_scores)

In [None]:
# avg is the same object as pca_butterworth (no copy is made), so the columns
# below are modified in place on pca_butterworth as well.
avg = pca_butterworth#.iloc[68595:72344]
avg["state"] = turn_vec.values#iloc[68595:72344].values

# Sliding average over window_size samples on each of the three components.
for pc in (0, 1, 2):
    avg[pc] = np.convolve(avg[pc], np.ones(window_size)/window_size, mode='same')

state_labels = avg["state"]
hf.plot_PCs(avg, state_labels, 'PCA_butterworth.html')
hf.plot_PC_gif(avg, state_labels, 'PCA_butterworth.gif')