### Data Diagnostics: Time Delay Embedding

In [None]:
import pandas as pd
import numpy as np
from gtda.time_series import TakensEmbedding
from sklearn.decomposition import PCA
import helper_functions as hf


In [None]:
# Load the input table from HDF5. Later cells read its 'state' and 'dataset'
# columns as labels and treat every other column as a feature.
dataframe = pd.read_hdf('quartiled_data_0602.h5')

In [None]:
# Delay-embed every column except the 'state'/'dataset' labels
# (dimension=10, time_delay=1). The selection is transposed before being
# handed to TakensEmbedding, so each feature column becomes one input row.
feature_columns = dataframe.loc[:, ~dataframe.columns.isin(['state', 'dataset'])]
TE = TakensEmbedding(time_delay=1, dimension=10, flatten=True)
transformed_data = TE.fit_transform(feature_columns.T)

# Join the per-feature embeddings side by side along axis=1.
# NOTE(review): presumably this yields one row per embedded time point with
# 10 columns per feature -- confirm against the giotto-tda output shape.
time_embedded = pd.DataFrame(np.concatenate(transformed_data, axis=1))

In [None]:
# Project the time-embedded features onto three principal components and
# attach the labels for plotting.
pca = PCA(n_components=3)
# Exclude the label columns explicitly, as the 7-component PCA further down
# does: a later cell appends 'dataset'/'state' to time_embedded, so re-running
# this cell afterwards would otherwise include the labels in the PCA input.
pca_data = pca.fit_transform(
    time_embedded.loc[:, ~time_embedded.columns.isin(['state', 'dataset'])]
)
data = pd.DataFrame(pca_data)
# Column assignment from a Series matches rows by index value.
# NOTE(review): the embedded table presumably has fewer rows than `dataframe`,
# so each embedded row receives the label stored at the same index -- confirm
# this is the intended labelling of the delay windows.
data['state'] = dataframe['state']
data['dataset'] = dataframe['dataset']

In [None]:
# Attach the label columns to the embedded features (rows matched by index);
# 'dataset' is added first, then 'state'.
for label_column in ('dataset', 'state'):
    time_embedded[label_column] = dataframe[label_column]

In [None]:
# Write the current contents of time_embedded to HDF5 under key 'df'.
time_embedded.to_hdf('time_embedded_2103.h5', key='df')

In [None]:
# Plot the 3-component projection. The third argument is each component's
# explained variance ratio scaled to percent.
pc3_variance_pct = pca.explained_variance_ratio_ * 100
hf.plot_PCs(data, 'PCA of Time Embedded Data', pc3_variance_pct)

### PCA with 7 components

In [None]:
import plotly.express as px

In [None]:
# Seven-component PCA on the embedded features only: the 'state'/'dataset'
# label columns are masked out before fitting.
embedded_features = time_embedded.loc[
    :, ~time_embedded.columns.isin(['state', 'dataset'])
]
pca7 = PCA(n_components=7)
pca7_scores = pca7.fit_transform(embedded_features)

# Wrap the scores in a frame and attach the labels (rows matched by index).
pca7_data = pd.DataFrame(pca7_scores)
pca7_data['state'] = dataframe['state']
pca7_data['dataset'] = dataframe['dataset']

In [None]:
# Store the per-component explained variance (in percent) under the
# 'preprocessed' key; a later cell reads the 'unpreprocessed' key from the
# same file for comparison.
pca7_variance_table = pd.DataFrame(
    {'Explained Variance': pca7.explained_variance_ratio_ * 100}
)
pca7_variance_table.to_hdf('explained_variance_2903.h5', key='preprocessed')

In [None]:
# Bar chart of the variance (in percent) explained by each of the seven PCs,
# with components numbered from 1.
fig = px.bar(
    x=list(range(1, 8)),
    y=pca7.explained_variance_ratio_ * 100,
    text_auto='.2s',
    labels={"x": "PC", "y": "explained variance (%)"},
    height=400,
)
fig.update_layout(
    title='Explained Variance of PCA of Preprocessed Data',
    showlegend=False,
)
fig.show()

## Comparison with unpreprocessed data

In [None]:
# Load the explained variance stored under the 'unpreprocessed' key, relabel
# its column, and put this notebook's 7-component result (in percent) next to it.
variances = pd.read_hdf(
    "explained_variance_2903.h5", key='unpreprocessed'
).rename(columns={'Explained Variance': 'Unpreprocessed'})
variances['Preprocessed'] = pca7.explained_variance_ratio_ * 100

In [None]:
import plotly.graph_objects as go

In [None]:
# Grouped bar chart comparing the per-component explained variance (%) of the
# preprocessed and unpreprocessed data; bar labels are rounded to 2 decimals.
pc_numbers = list(range(1, 8))
fig = go.Figure()
fig.add_trace(go.Bar(
    x=pc_numbers,
    y=variances['Preprocessed'],
    name='Preprocessed',
    text=variances['Preprocessed'].round(2),
    marker_color='lightseagreen',
))
fig.add_trace(go.Bar(
    x=pc_numbers,
    y=variances['Unpreprocessed'],
    name='Unpreprocessed',
    marker_color='lightpink',
    text=variances['Unpreprocessed'].round(2),
))
# Title corrected: this figure draws only per-component values, no cumulative
# curve, so it must not announce cumulative explained variance.
fig.update_layout(
    xaxis_title='PC',
    yaxis_title='Explained Variance (%)',
    title='Explained Variance of PCA of Preprocessed and Unpreprocessed Data',
    barmode='group',
    showlegend=True,
)
fig.show()

In [None]:
# Line plot of the cumulative explained variance (%) over the seven PCs for
# the preprocessed and unpreprocessed data.
pc_numbers = list(range(1, 8))
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=pc_numbers,
    y=variances['Preprocessed'].cumsum(),
    mode='lines+markers',
    name='Preprocessed',
    marker_color='lightseagreen',
))
fig.add_trace(go.Scatter(
    x=pc_numbers,
    y=variances['Unpreprocessed'].cumsum(),
    mode='lines+markers',
    name='Unpreprocessed',
    marker_color='lightpink',
))
# The y values are running sums, so the axis is labelled as cumulative.
# barmode is not set because this figure contains no bar traces.
fig.update_layout(
    xaxis_title='PC',
    yaxis_title='Cumulative Explained Variance (%)',
    title='Cumulative Explained Variance of PCA of Preprocessed and Unpreprocessed Data',
    showlegend=True,
)
fig.show()

In [None]:
# Control for time delay embedding: run the same 3-component PCA directly on
# the feature columns of `dataframe` (labels excluded), without embedding.
pca_raw = PCA(n_components=3)
pca_data_raw = pca_raw.fit_transform(dataframe.loc[:,~dataframe.columns.isin(['state','dataset'])])
data_raw = pd.DataFrame(pca_data_raw)
data_raw['state'] = dataframe['state']
# NOTE(review): unlike `data` above, no 'dataset' column is attached here --
# confirm hf.plot_PCs does not need it.
# Title corrected: this plot shows the non-embedded control, not the
# time-embedded data.
hf.plot_PCs(data_raw, 'PCA of Data without Time Delay Embedding', pca_raw.explained_variance_ratio_*100)