# Data Processing for Analysis

---

### Import Libraries

In [1]:
import sys
import os
root = os.path.abspath('..')  
sys.path.append(root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from modules import processing, load, plots, analysis

---

### 1. Load and Visualize the Data

In [2]:
# Dataset identifier; it is reused further down to name every saved output file.
name = 'AW1D_YSI_20230826'
path = f'../data/raw/{name}.csv'

df = load.load_data(path)

# NOTE(review): the loaded object is indexed positionally; presumably item 0 is
# the vertical position and item 1 the conductivity - confirm in modules/load.
x_row, y_row = df[0], df[1]

In [3]:
# Plot mode forwarded to plots.plot_data for the first look at the loaded series.
mode = 'lines'
plots.plot_data(x_row, y_row, plot_mode=mode)

In [4]:
# Width in meters; passed to calculate_density and shown in the plot title.
width = 1

initial_density = analysis.calculate_density(x_row, y_row, width)

# One histogram bin per row of the density result.
plots.plot_histogram(
    initial_density,
    value_column='x_bin',
    weight_column='frequency',
    x_axis_title='Vertical Position [m]',
    y_axis_title='Frequency',
    title=f'Data density: {width} meter(s)',
    num_bins=len(initial_density),
)

---
### Process Data for Cleaning

1. Remove data with negative x values.
2. Remove outliers (currently disabled: the corresponding cells below are commented out).
3. Group duplicate x values and assign each group the average of its y values.
4. Apply a digital filter for smoothing.

In [5]:
# Cleaning step 1: filter the loaded series down to its non-negative part.
x_positive, y_positive = processing.filter_non_negative_values(
    x_row,
    y_row,
)

In [6]:
# Overlay the filtered series on the original one to inspect what was removed.
plots.plot_data(
    x_row,
    y_row,
    secondary_x=x_positive,
    secondary_y=y_positive,
    trace_names=['Original Data', 'Positive Data'],
)

In [10]:
# NOTE: outlier removal is disabled; the following steps use x_positive / y_positive directly.
#x_out, y_out = processing.remove_outliers_iqr(x_positive, y_positive)

In [None]:
#plots.plot_data(x_positive, y_positive, 
#                secondary_x=x_out, secondary_y=y_out,
#                trace_names=['Positive Data', 'Without outliers']
#                )

In [7]:
# Cleaning step 3: collapse repeated x values; `duplicates` is plotted further down.
x_ave, y_ave, duplicates = processing.average_grouped_by_x(
    x_positive,
    y_positive,
)

In [12]:
# Compare the loaded series with the de-duplicated one, drawn as markers.
plots.plot_data(
    x_row,
    y_row,
    secondary_x=x_ave,
    secondary_y=y_ave,
    trace_names=['Original Data', 'Without duplicates'],
    plot_mode='markers',
)

In [None]:
# Histogram of the duplicate summary returned by average_grouped_by_x.
plots.plot_histogram(
    duplicates,
    value_column='Duplicated Value',
    weight_column='Frequency',
    title="Duplicate data density",
    num_bins=35,
)

---

#### Save `rowdy`

> Run this cell only if you want to save the `rowdy` data, i.e. the non-negative data with duplicates averaged, before smoothing.


In [None]:
# Intermediate "rowdy" dataset: the de-duplicated (x_ave, y_ave) series,
# saved before any smoothing is applied.
df_rowdy = pd.DataFrame({'Vertical Position [m]': x_ave, 'Corrected sp Cond [uS/cm]': y_ave})

# DataFrame.to_csv does not create missing parent directories and raises
# OSError instead, so make sure the output folder exists (e.g. fresh checkout).
os.makedirs('../data/rowdy', exist_ok=True)
df_rowdy.to_csv(f'../data/rowdy/{name}_rowdy.csv', index=False)

---

In [None]:
# Cleaning step 4: smooth the averaged y values (presumably a Savitzky-Golay
# filter, judging by the helper's name - confirm in modules/processing).
y_smoothed = processing.apply_savgol_filter(y_ave, window_length=25, poly_order=2)

In [None]:
# Overlay the smoothed signal on the averaged one; both share the x_ave axis.
plots.plot_data(
    x_ave,
    y_ave,
    secondary_x=x_ave,
    secondary_y=y_smoothed,
    trace_names=['Average Data', 'Smoothed Data'],
    enable_error_y=False,
)

---
### Save Processed Data

> Execute only if saving the processed data is required


In [None]:
# Final table: averaged positions paired with the smoothed signal.
df_processed = pd.DataFrame(
    {
        'Vertical Position [m] (Processed)': x_ave,
        'Corrected sp Cond [uS/cm] (Processed)': y_smoothed,
    }
)
# Bare expression at the end of the cell so the notebook displays the table.
df_processed

In [15]:
# Save the processed table as CSV.
# DataFrame.to_csv does not create missing parent directories and raises
# OSError instead, so make sure the output folder exists first.
os.makedirs('../data/processed', exist_ok=True)
df_processed.to_csv(f'../data/processed/{name}_processed.csv', index=False)

---

### Calculate BIC (Bayesian Information Criterion)

> Since `piecewise-regression` starts its fits from randomly initialized breakpoints, the resulting BIC can vary between runs. The choice of the **number of breakpoints** must therefore be validated by repeating the calculation several times (`n_trials`).


In [None]:
# Breakpoint search on the smoothed signal; the markdown above explains why
# the calculation is repeated (n_trials).
results = analysis.best_n_breakpoints(
    x_ave,
    y_smoothed,
    max_breakpoints=10,
    n_trials=5,
)

---

### Save BIC Calculations

> Calculating the different piecewise linear fits is computationally intensive; the results are stored in `JSON` format to avoid repeating this process.



In [18]:
# Persist the BIC results so the breakpoint search does not have to be rerun.
results_df = pd.DataFrame(results)

# DataFrame.to_json does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('../data/results', exist_ok=True)
results_df.to_json(f'../data/results/{name}_results.json')