# Data Processing for Analysis

---

### Import Libraries

In [1]:
import sys
import os
# Add the parent of the current working directory to the import search path.
# The `from modules import ...` line below presumably relies on this
# (assumes the `modules` package lives one level up -- TODO confirm).
root = os.path.abspath('..')  
sys.path.append(root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from modules import processing, load, plots, analysis

---

### 1. Load and Visualize the Data

In [2]:
# Dataset selector: exactly one `name` assignment should be active.
# To process another file, comment out the active line and uncomment the
# desired one, then re-run the notebook from this cell.
#name = 'AW1D_YSI_20230826'
#name = 'AW2D_YSI_20230815'
#name = 'AW5D_YSI_20230824'
#name = 'AW6D_YSI_20230815'
#name = 'AW7D_YSI_20230814'
#name = 'BW1D_YSI_20230824'
#name = 'BW2D_YSI_20230819'
#name = 'BW3D_YSI_20230818'
#name = 'BW4D_YSI_20230816'
#name = 'BW5D_YSI_20230822'
#name = 'BW6D_YSI_20230826'
#name = 'BW7D_YSI_20230826'
#name = 'BW8D_YSI_20230823'
#name = 'BW9D_YSI_20230823'
name = 'BW10D_YSI_20230825'

# Raw CSV location, relative to the notebook's working directory.
path = f'../data/raw/{name}.csv'

df = load.load_data(path)

# The loader's result is indexed with 0 and 1: item 0 is used as the x series
# and item 1 as the y series for the rest of the notebook.
# NOTE(review): presumably vertical position [m] and specific conductance
# [uS/cm], going by the column labels used when saving -- confirm in load.load_data.
x_row = df[0]
y_row = df[1]

In [3]:
# Value forwarded to plots.plot_data as `plot_mode`.
mode='lines'

# Plot the loaded x/y series as-is, titled with the dataset name.
plots.plot_data(x_row, y_row, plot_mode=mode, title=name)

In [4]:
# Width handed to analysis.calculate_density, in meters
# (presumably the bin size along x -- see the histogram title below).
width = 1

# Density of the loaded data before any cleaning.
initial_density = analysis.calculate_density(x_row, y_row, width)

# One histogram bin per row of the density result.
plots.plot_histogram(
    initial_density,
    value_column='x_bin',
    weight_column='frequency',
    x_axis_title='Vertical Position [m]',
    y_axis_title='Frequency',
    title=f'Data density: {width} meter(s) / {name}',
    num_bins=len(initial_density),
)

---
### Process Data for Cleaning

1. Removal of negative data in x.
2. Removal of outliers (currently disabled: the corresponding cells below are commented out).
3. Grouping of duplicates by x and assigning the average value in y.
4. Digital filter for smoothing.

In [5]:
# Cleaning step 1: drop the negative data from the loaded series.
# The exact rule (x only, or x and y) is defined in
# processing.filter_non_negative_values -- not visible from this notebook.
x_positive, y_positive = processing.filter_non_negative_values(x_row, y_row)

In [6]:
# Overlay the filtered (non-negative) series on the loaded data to check
# what the first cleaning step removed.
plots.plot_data(
    x_row,
    y_row,
    secondary_x=x_positive,
    secondary_y=y_positive,
    trace_names=['Original Data', 'Positive Data'],
    title=name,
)

In [7]:
# Outlier removal (cleaning step 2) is currently disabled. The averaging step
# further down consumes x_positive / y_positive directly, so enabling this line
# also requires switching that call to x_out / y_out.
#x_out, y_out = processing.remove_outliers_iqr(x_positive, y_positive)

In [8]:
#plots.plot_data(x_positive, y_positive, 
#                secondary_x=x_out, secondary_y=y_out,
#                trace_names=['Positive Data', 'Without outliers']
#                )

In [9]:
# Cleaning step 3: group repeated x values and assign each the average y.
# The third return value, `duplicates`, is what the duplicate-density
# histogram later in the notebook plots.
x_ave, y_ave, duplicates = processing.average_grouped_by_x(x_positive, y_positive)

In [10]:
# Compare the loaded data with the de-duplicated (averaged) series,
# drawn as markers so individual points stay visible.
plots.plot_data(
    x_row,
    y_row,
    secondary_x=x_ave,
    secondary_y=y_ave,
    trace_names=['Original Data', 'Without duplicates'],
    plot_mode='markers',
    title=name,
)

## Duplicate density

In [11]:
# Histogram of the `duplicates` table returned by the averaging step:
# 'Duplicated Value' on the value axis, weighted by 'Frequency'.
plots.plot_histogram(
    duplicates,
    value_column='Duplicated Value',
    weight_column='Frequency',
    # Must be an f-string: without the `f` prefix the title shows the
    # literal text "{name}" instead of the dataset name.
    title=f"Duplicate data density / {name}",
    num_bins=35,
)

---

#### Save `rowdy`

> Run only if you want to save the `rowdy` data (positive values with duplicates averaged, before smoothing).


In [12]:
# Pre-smoothing dataset: the averaged x/y series under their export column labels.
df_rowdy = pd.DataFrame(
    {
        'Vertical Position [m]': x_ave,
        'Corrected sp Cond [uS/cm]': y_ave,
    }
)

# Uncomment to write the frame to ../data/rowdy/ (left disabled so a full
# re-run does not overwrite existing files).
#df_rowdy.to_csv(f'../data/rowdy/{name}_rowdy.csv', index=False)

In [13]:
# Cleaning step 4: smooth the averaged y values with the project's
# apply_savgol_filter helper. Only y is filtered; x_ave is reused unchanged
# as the x axis for the smoothed series.
y_smoothed = processing.apply_savgol_filter(
    y_ave,
    window_length=25,
    poly_order=2,
)

In [14]:
# Averaged series vs. its smoothed version on the same x axis, with y error
# bars switched off.
# NOTE(review): unlike the earlier plot_data calls, no `title=name` is passed
# here, so the figure does not identify the dataset -- confirm whether intended.
plots.plot_data(
    x_ave,
    y_ave,
    secondary_x=x_ave,
    secondary_y=y_smoothed,
    trace_names=['Average Data', 'Smoothed Data'],
    enable_error_y=False,
)

## Fresh cap

In [15]:
# Same averaged-vs-smoothed comparison, this time with the `fresh_cap`
# option of plots.plot_data enabled.
# NOTE(review): no `title=name` is passed, unlike the earlier plot_data
# calls -- confirm whether the dataset name should appear on this figure.
plots.plot_data(
    x_ave,
    y_ave,
    secondary_x=x_ave,
    secondary_y=y_smoothed,
    trace_names=['Average Data', 'Smoothed Data'],
    enable_error_y=False,
    fresh_cap=True,
)

---
### Save Processed Data

> Execute only if saving the processed data is required


In [16]:
# Final processed dataset: averaged x positions paired with the smoothed y values.
df_processed = pd.DataFrame(
    data={
        'Vertical Position [m]': x_ave,
        'Corrected sp Cond [uS/cm]': y_smoothed,
    }
)

# Bare expression so the notebook renders the frame as this cell's output.
df_processed

Unnamed: 0,Vertical Position [m],Corrected sp Cond [uS/cm]
0,0.002,1277.400991
1,0.009,1277.016436
2,0.025,1276.676159
3,0.035,1276.380162
4,0.044,1276.128444
...,...,...
6511,28.429,52631.762564
6512,28.430,52631.528088
6513,28.431,52631.214417
6514,28.432,52630.821553


In [17]:
# save as csv
#df_processed.to_csv(f'../data/processed/{name}_processed.csv', index=False)

---

### Calculate BIC (Bayesian Information Criterion)

> Since `piecewise-regression` calculates the BIC with random initialization, the choice of the **number of breakpoints** must be validated by performing the calculation multiple times (`n_trials`).


In [18]:
# results = analysis.best_n_breakpoints(x_ave, y_smoothed, 
#                     max_breakpoints=10, 
#                     n_trials=5 
#                     )

---

### Save BIC Calculations

> Calculating different piecewise linear fits is intensive; the information is stored in `JSON` format to avoid repeating this process.



In [19]:
# Disabled by default. Requires `results` from the best_n_breakpoints call in
# the BIC cell above, which is itself commented out -- enable both together.
#results_df = pd.DataFrame(results)
#results_df.to_json(f'../data/results/{name}_results.json')