# <b>Data Diagnostics II </b> *✲ﾟ*｡✧٩(･ิᴗ･ิ๑)۶*✲ﾟ*｡✧

In this notebook we will explore min-max and percentile normalization between datasets, and also derivatives, and see how our data changes: e.g. the distribution of each variable, the principal components, etc.

In [None]:
import importlib
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pynumdiff as pdiff
import seaborn as sns
from scipy.stats import mstats, boxcox
from sklearn.covariance import MinCovDet
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm

import helper_functions as hf
# Re-import helper_functions so that edits to the module are picked up without a kernel restart.
importlib.reload(hf)

# NOTE(review): the file names suggest both inputs are pickles — only load them from a trusted source.
# truncated_dataframe: the combined frame used by every normalization step below.
truncated_dataframe = pd.read_pickle('truncated_dataframe.pkl')
# dataframes: keyed collection whose keys are used below as the dataset names.
dataframes = hf.wbstruct_dataframes.loading_pkl('dataframes.pkl')

In [None]:
# Number of frames assigned to every dataset in the combined dataframe.
frames_num = 3329

# Map each dataset name to its frame count (identical for all datasets).
length_dict = defaultdict(None, dict.fromkeys(dataframes.keys(), frames_num))

# TODO: this step has to be revisited
turn_vec = hf.get_behavioural_states(truncated_dataframe)

### Normalization between datasets

As seen above, we have to deal with different scales across datasets, so a natural next step is to normalize the data across datasets to make them comparable. We will do this by taking the min-max normalization between datasets. This means that we will take the minimum and maximum value of each variable across all datasets and then normalize each dataset to this range. This will be done on the time derivatives of the resampled data.

We can also try the percentile normalization between datasets. This means that we will take the 5th and 95th percentile of each variable across all datasets and then normalize each dataset to this range. This will be done on the time derivatives of the resampled data.

In [None]:
# Raw (not yet normalized) AVAR trace over the whole combined dataframe, saved as a wide PNG.
fig, ax = plt.subplots(figsize=(40, 10))
# Transposing a Series returns the Series itself, so it is plotted directly.
ax.plot(truncated_dataframe['AVAR'], color="tab:blue")
ax.set(ylabel="AVAR", xlabel="time", title="AVAR across all datasets")
fig.savefig("raw_AVAR_alldatasets.png")

## Based on Quantiles: RobustScaler

### Individual Datasets And Then Combined with 0.10, 0.90 Quantiles

In [None]:
# Stage 1: per-dataset normalization.
# The scaler divides by the 10th-90th percentile range and does not center the data.
scaler = RobustScaler(
    quantile_range=(10, 90),
    with_scaling=True,
    with_centering=False,
)
quartiled_separate = hf.normalize_per_dataset(
    truncated_dataframe,
    length_dict,
    scaler,
)

In [None]:
# Stage 2: across-dataset normalization.
# A new scaler with the same settings is fitted on the already per-dataset-scaled frame.
scaler = RobustScaler(
    quantile_range=(10, 90),
    with_scaling=True,
    with_centering=False,
)
quartiled_data = pd.DataFrame(
    scaler.fit_transform(quartiled_separate),
    columns=quartiled_separate.columns,
)

In [None]:
# truncated_dataframe_new = quartiled_data.copy()
# nested_col = [[name] * dataset_len for name, dataset_len in length_dict.items()]
# truncated_dataframe_new['dataset'] = [x for xs in nested_col for x in xs]
# truncated_dataframe_new.groupby('dataset')['AVAR'].hist()


In [None]:
# AVAR trace after the two-stage RobustScaler normalization, saved as a wide PNG.
fig, ax = plt.subplots(figsize=(40, 10))
# Transposing a Series returns the Series itself, so it is plotted directly.
ax.plot(quartiled_data['AVAR'], color="tab:blue")
ax.set(ylabel="AVAR", xlabel="time", title="AVAR across all datasets")
fig.savefig("normalized_AVAR_alldatasets.png")

In [None]:
# Reduce the RobustScaler-normalized data to three components.
pca_quartile = hf.PCA(n_components=3)
imputed_pc_quartile = pd.DataFrame(pca_quartile.fit_transform(quartiled_data))

# Width (in samples) of the moving average; also read by the later PCA cells.
window_size = 10

# Moving-average smoothing of each component, for smoother visualizations only.
for i in range(3):
    imputed_pc_quartile[i] = np.convolve(
        imputed_pc_quartile[i],
        np.full(window_size, 1 / window_size),
        mode='same',
    )

# Attach the behavioural state of every frame and hand both to the plotting helper.
imputed_pc_quartile['state'] = turn_vec.values
hf.plot_PCs(imputed_pc_quartile, imputed_pc_quartile['state'], 'PCA_quartiled.html')
#hf.plot_PC_gif(imputed_pc_quartile,imputed_pc_quartile['state'],'PCA_quartiled.gif')

## Based on MinMax: MinMaxScaler

### Individual Datasets And Then Combined

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Stage 1 of the min-max normalization: per-dataset pass.
# presumably hf.normalize_per_dataset applies `scaler` to each dataset's rows separately,
# using the per-dataset lengths in `length_dict` — confirm in helper_functions.
scaler = MinMaxScaler()
minmax_separate = hf.normalize_per_dataset(truncated_dataframe, length_dict, scaler)

In [None]:
# Stage 2 of the min-max normalization: refit on the per-dataset-scaled frame as a whole.
# A fresh MinMaxScaler is created here (as the RobustScaler section does) so that this cell
# does not depend on whichever object `scaler` was last bound to by another cell.
scaler = MinMaxScaler()
minmax_data = pd.DataFrame(scaler.fit_transform(minmax_separate), columns=minmax_separate.columns)

In [None]:
# AVAR trace after the two-stage min-max normalization, saved as a wide PNG.
fig, ax = plt.subplots(figsize=(40, 10))
# Transposing a Series returns the Series itself, so it is plotted directly.
ax.plot(minmax_data['AVAR'], color="tab:blue")
ax.set(ylabel="AVAR", xlabel="time", title="AVAR across all datasets")
fig.savefig("minmaxed_AVAR_alldatasets.png")

In [None]:
# Reduce the min-max-normalized data to three components.
pca_quartile = hf.PCA(n_components=3)
imputed_pc_quartile = pd.DataFrame(pca_quartile.fit_transform(minmax_data))

# Moving-average smoothing (`window_size` samples, set in the RobustScaler PCA cell),
# applied for visualization only.
for i in range(3):
    imputed_pc_quartile[i] = np.convolve(imputed_pc_quartile[i], np.ones(window_size)/window_size, mode='same')

imputed_pc_quartile['state'] = turn_vec.values
# Use a dedicated output name: this cell previously wrote to 'PCA_quartiled.html',
# the same name the RobustScaler PCA cell passes to hf.plot_PCs, so that plot was overwritten.
hf.plot_PCs(imputed_pc_quartile,imputed_pc_quartile['state'],'PCA_minmaxed.html')
#hf.plot_PC_gif(imputed_pc_quartile,imputed_pc_quartile['state'],'PCA_minmaxed.gif')


## Based on Quantile Ranks: QuantileTransformer

### Individual Datasets And Then Combined

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Stage 1 of the quantile transformation: per-dataset pass (QuantileTransformer with default settings).
# presumably hf.normalize_per_dataset applies `scaler` to each dataset's rows separately,
# using the per-dataset lengths in `length_dict` — confirm in helper_functions.
scaler = QuantileTransformer()
quantile_separate = hf.normalize_per_dataset(truncated_dataframe, length_dict, scaler)

In [None]:
# Stage 2 of the quantile transformation: refit on the per-dataset-transformed frame as a whole.
# A fresh QuantileTransformer is created here (as the RobustScaler section does) so that this
# cell does not depend on whichever object `scaler` was last bound to by another cell.
scaler = QuantileTransformer()
quantile_transformed_data = pd.DataFrame(scaler.fit_transform(quantile_separate), columns=quantile_separate.columns)

In [None]:
# AVAR trace after the two-stage quantile transformation, saved as a wide PNG.
fig, ax = plt.subplots(figsize=(40, 10))
# Transposing a Series returns the Series itself, so it is plotted directly.
ax.plot(quantile_transformed_data['AVAR'], color="tab:blue")
ax.set(ylabel="AVAR", xlabel="time", title="AVAR across all datasets")
fig.savefig("quantile_AVAR_alldatasets.png")

In [None]:
# Reduce the quantile-transformed data to three components.
pca_quartile = hf.PCA(n_components=3)
imputed_pc_quartile = pd.DataFrame(pca_quartile.fit_transform(quantile_transformed_data))

# Moving-average smoothing of each component, for smoother visualizations only.
for i in range(3):
    imputed_pc_quartile[i] = np.convolve(
        imputed_pc_quartile[i],
        np.full(window_size, 1 / window_size),
        mode='same',
    )

# Attach the behavioural state of every frame and hand both to the plotting helper.
imputed_pc_quartile['state'] = turn_vec.values
hf.plot_PCs(imputed_pc_quartile, imputed_pc_quartile['state'], 'PCA_quantile_transformed.html')
#hf.plot_PC_gif(imputed_pc_quartile,imputed_pc_quartile['state'],'PCA_quartiled.gif')

## Histograms of the Data

In [None]:
%matplotlib widget
feature_names2 = truncated_dataframe.columns
for i in range(25):
    end_index = (i+1) * frames_num
    start_index = end_index - frames_num
    fig, axes = plt.subplots(nrows=15, ncols=5, figsize=(12, 23))
    fig.suptitle('Distribution of Some Features', y=1.02)

    for n, ax in enumerate(axes.flat):
        if n < len(feature_names2):
            sns.histplot(truncated_dataframe[start_index:end_index], x=feature_names2[n], ax=ax)
            ax.set(title=f'Histogram of {feature_names2[n]}')

    plt.tight_layout()
    fig.savefig(f'histograms/{list(dataframes.keys())[i]}.png')