# <b>Outlier Detection 2</b> *✲ﾟ*｡✧٩(･ิᴗ･ิ๑)۶*✲ﾟ*｡✧

Now that we have observed some patterns in the exploration_visualization notebook, we want to investigate where these patterns come from. We will run some basic statistical diagnostics, such as outlier detection, to check whether the ventral/dorsal data points that lie far away from the rest are indeed outliers and whether they all come from the same dataset.

In [None]:
import helper_functions as hf
import numpy as np
import pandas as pd
import plotly.express as px
from pyod.models.knn import KNN
import wbfm.utils.visualization.filtering_traces as filtering_traces

#### Loading Data

In [None]:
# Load the preprocessed recordings used by every cell below.
# Modifications already applied to the stored dataset:
#   - stacked dataframe of 23 datasets
#   - neurons that are NaN in more than 13 datasets were removed
#   - remaining NaNs were imputed with PPCA (see data_wrangling, section 2)
# NOTE(review): the path is relative, so the file is resolved against the
# current working directory of the kernel.
imputed_dataframe = pd.read_hdf("imputed_dataframe_0602.h5")

### 1. PyOD - k-Nearest Neighbors (KNN)

In [None]:
outliers = []

# Fit an independent, univariate KNN detector on every column except the
# "state" and "dataset" columns (presumably metadata rather than traces --
# confirm against the data_wrangling notebook).
for column in imputed_dataframe.columns:
    if column in ("state", "dataset"):
        continue

    knn_model = KNN(contamination=0.1)
    col = pd.DataFrame(imputed_dataframe[column])
    knn_model.fit(col)

    # Use the labels produced during fit instead of calling predict() on the
    # training data: predict() is meant for unseen samples, and re-querying
    # the training points lets every sample match itself as a neighbour,
    # which deflates its score relative to the threshold learned in fit().
    outliers_knn = knn_model.labels_
    col["outlier"] = np.where(outliers_knn == 1, "outlier", "no outlier")

    # Only the PVR trace is visualised; the other columns are fitted but not shown.
    if column == 'PVR':
        # Keep the figure and show it separately: Figure.show() returns None,
        # so chaining it would leave `fig` empty.
        fig = px.scatter(col, x=col.index, y=column, title=column, color='outlier', color_continuous_scale='viridis', marginal_y='histogram')
        fig.show()

### 2. Rolling Mean

In [None]:
# Work on a copy so that `imputed_dataframe` keeps the detected outliers.
inlier_dataframe = imputed_dataframe.copy()

for column in inlier_dataframe.columns:
    # Skip the same columns the KNN cell skips ("state" and "dataset");
    # presumably these are metadata, not traces -- confirm.
    if column in ("state", "dataset"):
        continue

    # detect outliers per neuron
    col = pd.DataFrame(inlier_dataframe[column])
    col_outliers = filtering_traces.remove_outliers_via_rolling_mean(col, window=40, std_factor=3, fill_value='outlier')
    # Entries the helper replaced by the fill value are labelled "outlier",
    # everything else "no outlier".
    col['outlier'] = col_outliers[column].apply(lambda x: x if x == 'outlier' else 'no outlier')

    if column == 'SMDVR':
        # Figure.show() returns None, so keep the figure before showing it.
        fig = px.scatter(col, x=col.index, y=column, title=column, color='outlier', color_continuous_scale='viridis', marginal_y='histogram')
        fig.show()

    # Blank out the flagged samples with a single .loc assignment. The chained
    # form `df[column][mask] = ...` writes to a temporary object and is a
    # silent no-op under pandas Copy-on-Write.
    outlier_mask = (col["outlier"] == "outlier").to_numpy()
    inlier_dataframe.loc[outlier_mask, column] = np.nan

    if column == 'SMDVR':
        # Same trace again, now with the flagged samples removed.
        fig = px.scatter(inlier_dataframe, x=col.index, y=column, title=column, color_continuous_scale='viridis', marginal_y='histogram')
        fig.show()

### 3. Isolation Forest

In [None]:
# Value handed to hf.apply_isolation_forest as its second argument.
# NOTE(review): presumably the expected fraction of outliers -- confirm in
# helper_functions.
contanimation = 0.025

# Run the helper once per "dataset" group, passing it that group's rows.
isolated_forest = (
    imputed_dataframe
    .groupby("dataset")
    .apply(lambda dataset_rows: hf.apply_isolation_forest(dataset_rows, contanimation))
)

In [None]:
# Scatter of the PVR trace coloured by the isolation-forest outlier label.
# The figure is built and shown in two steps because Figure.show() returns
# None; chaining it would bind `fig` to None instead of the figure.
# NOTE(review): groupby(...).apply may return a MultiIndex; verify that
# `isolated_forest.index` is a usable x-axis.
fig = px.scatter(isolated_forest, x=isolated_forest.index, y='PVR', title='PVR', color='outlier', color_continuous_scale='viridis', marginal_y='histogram')
fig.show()