# Data exploration and preparation
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berni-lehner/structural_health_monitoring/blob/main/notebooks/data_exploration.ipynb)

In [None]:
# Fixed seed, presumably meant to make stochastic steps reproducible.
# NOTE(review): no cell visible in this notebook references it - confirm it is
# passed on wherever randomness is involved.
random_state = 42

In [None]:
import sys
import os
import time
import glob
from pathlib import Path
import sklearn
import pandas as pd
import numpy as np
from collections import Counter


import matplotlib.pyplot as plt
import seaborn as sns


IN_COLAB = 'google.colab' in sys.modules

if(IN_COLAB):
    !git clone https://github.com/berni-lehner/structural_health_monitoring.git
    sys.path.insert(0,"/content/structural_health_monitoring/src")
    sys.path.insert(0,"/content/structural_health_monitoring/data")    
else:
    sys.path.insert(0,"../src")
    sys.path.insert(0,"../data")
    


try:
    from zippeekiyay import namelist
except ImportError or ModuleNotFoundError:
    print('installing zippee-ki-yay...')
    !pip install git+https://github.com/berni-lehner/zippee-ki-yay.git

    from zippeekiyay import namelist

In [None]:
from DataDownloader import DataDownloader as ddl
from data_utils import load_raw_specs, FEATURE_LIST
from manifold_utils import tsne_embedding
from plot_utils import plot_embedding_targets
from plot_utils import plot_classwise_dist, plot_classwise_kde

In [None]:
# Archive to fetch and the folder it is unpacked into.
url = r"https://sandbox.zenodo.org/record/1115172/files/data_synthetic.zip"
DATA_PATH = Path(r"../data/synthetic/")

# Time the download/unpack step and report how it went.
start_time = time.perf_counter()
dl_succeed = ddl.download_and_unpack(url, DATA_PATH, cache=True)
elapsed = time.perf_counter() - start_time

print(f"time passed: {elapsed:.2f} s")
print(f"downloading successful: {dl_succeed}")

In [None]:
# general plot configuration: font sizes referenced by the rc settings below
SMALL_SIZE, MEDIUM_SIZE, LARGE_SIZE, HUGE_SIZE = 10, 16, 20, 24

# figure defaults: figure size, plus fontsize and weight of the figure title
plt.rc('figure', figsize=(22, 8), titlesize=HUGE_SIZE, titleweight='bold')
# legend fontsize
plt.rc('legend', fontsize=SMALL_SIZE)

# optional settings, currently disabled:
#plt.rc('font', size=MEDIUM_SIZE)          # default text sizes
#plt.rc('axes', titlesize=LARGE_SIZE)      # fontsize of the axes title
#plt.rc('axes', titleweight='bold')        # weight of the axes title
#plt.rc('axes', labelsize=MEDIUM_SIZE)     # fontsize of the x and y labels
#plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
#plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels

In [None]:
# cache file for faster data loading on later iterations
pickle_name = DATA_PATH / 'raw_specs.pkl'

# every CSV file found at any depth below DATA_PATH
file_names = [*DATA_PATH.glob('**/*.csv')]

In [None]:
%%time

# Load the CSV files collected above into one DataFrame. `cache_file` is the
# pickle used for faster loading on later runs; FEATURE_LIST is handed over as
# `y_col` (presumably the label columns - confirm in data_utils).
df = load_raw_specs(file_names=file_names,
                    cache_file=pickle_name,
                    y_col=FEATURE_LIST)

# Show the first rows as a quick sanity check.
df.head()

## Keep only the defect radius as the target variable

In [None]:
# Display the names in FEATURE_LIST (the list passed as `y_col` when loading).
FEATURE_LIST

In [None]:
# The single column kept as the prediction target.
target = 'y_radius'

# Columns to discard: every FEATURE_LIST entry except the target, plus 'file'.
drop_cols = list(FEATURE_LIST)
drop_cols.remove(target)
drop_cols += ['file']

df = df.drop(drop_cols, axis='columns')

In [None]:
# Display the columns that remain in df.
df.columns

In [None]:
# All columns except the last one.
# NOTE(review): relies on the target being the last column of df - confirm.
features = df.columns[:-1]

In [None]:
# make sure the class labels are sorted for further convenience
df = df.sort_values(by=target)

# count the rows per distinct target value and show which values occur
cntr = Counter(df[target])
cntr.keys()

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on all rows (use both inliers and outliers) ...
scaler = StandardScaler().fit(df[features])

# ... and overwrite the feature columns with their standardized values.
df[features] = scaler.transform(df[features])

In [None]:
# Class-wise distribution plot of df grouped by the target column; the x-limits
# are set to start at -300 with the upper limit passed as None.
# The trailing semicolon suppresses the cell's textual output.
fig = plot_classwise_dist(df=df, label_col=target)
fig.set(xlim=(-300, None));

In [None]:
# Same class-wise distribution plot, with the x-limits set to (-13, 32).
# The trailing semicolon suppresses the cell's textual output.
fig = plot_classwise_dist(df=df, label_col=target)
fig.set(xlim=(-13, 32));

In [None]:
# Class-wise KDE plot of df grouped by the target column.
# NOTE(review): the meaning of feature_idx=17 and focus=34 is defined in
# plot_utils - confirm there; 17 is also the column offset used when X is built
# for the t-SNE embedding.
fig = plot_classwise_kde(df, label_col=target, feature_idx=17, focus=34);

## Plot t-SNE embedding

### Convert to NumPy array

In [None]:
# Feature columns from position 17 onward form the embedding input.
embedding_cols = features[17:]

X = df[embedding_cols].values
y = df[target].values

# Show the dimensions of the input matrix.
X.shape

In [None]:
# Compute the t-SNE embedding of X with perplexity 40.
# NOTE(review): `random_state` from the top of the notebook is not passed here;
# confirm whether tsne_embedding seeds itself, otherwise runs may differ.
X_embedded = tsne_embedding(X, perplexity=40)

# Plot the embedding with the points labelled by y, drawn at alpha=0.2.
plot_embedding_targets(X_embedded, y, alpha=0.2)