### Load required packages 

In [None]:
using DataFrames: DataFrame, ncol, nrow
using Dates: Dates, DateTime, Time, unix2datetime, Hour, Minute, Microsecond
using NativeFileDialog: pick_file
using Statistics: median, mean, std
using Plots: Plots, plot, plot!, annotate!, hline!, @layout, text, plotly, font, scatter!
using Printf: @sprintf
using JLD2

include("model_functions.jl")

### Recover earlier separated data from file (Note: does not include Model data)

In [None]:
# Load the previously separated "good" and "bad" training records from a JLD2 file.
# (The trained model itself is NOT stored in this file - it is rebuilt in a later cell.)
current_path = joinpath(pwd(), "Training_data")
filterlist = "JLD2"
infil = pick_file(current_path; filterlist)

# pick_file() returns an empty string when the dialog is cancelled
isempty(infil) && error("No JLD2 training-data file selected.")

# Load the saved training matrices (one record per column)
##@load infil training_data training_date training_data_good training_date_good training_data_bad training_date_bad training_indicies_good training_indicies_bad training_labels # median_train std_train
@load infil training_data_good training_data_bad

println("Data and labels loaded successfully.")
println("Good data contains ", size(training_data_good, 2), " records")
println("Bad  data contains ", size(training_data_bad, 2), " records\n")

### Build the Model using a hybrid approach

In [None]:
using Flux, Statistics

# Helper functions for normalising records and scoring how well the model reconstructs them

"""
    min_max_normalize_matrix(X)

Rescale every column of `X` independently to the range [0, 1] using
`(x - min) / (max - min)`, where `min` and `max` are taken along dimension 1.
`X` may be a matrix (each column is scaled separately) or a vector (scaled as a whole).

A column whose values are all identical has a zero range; it is returned as all zeros
instead of the `NaN`s that a plain 0/0 division would produce.
"""
function min_max_normalize_matrix(X)
####################################

    min_vals = minimum(X, dims=1)  # Compute min for each column
    max_vals = maximum(X, dims=1)  # Compute max for each column

    ranges = max_vals .- min_vals

    # Guard against division by zero for constant columns (numerator is already 0 there)
    safe_ranges = ifelse.(iszero.(ranges), one(eltype(ranges)), ranges)

    return (X .- min_vals) ./ safe_ranges

end    # min_max_normalize_matrix()


"""
    calc_reconstruction_errors(data_matrix, model)

Return a `Vector{Float32}` holding one mean-squared reconstruction error per column
of `data_matrix`. Each column is passed through `model` and compared element-wise
with itself.
"""
function calc_reconstruction_errors(data_matrix, model)
#######################################################

    # One MSE per column; the typed comprehension converts every value to Float32
    return Float32[
        mean((model(column) .- column) .^ 2) for column in eachcol(data_matrix)
    ]

end    # calc_reconstruction_errors()


####################################################################
####################################################################
####################################################################

# Define your refined autoencoder model as you have it
# Symmetric dense autoencoder: 4608 -> 256 -> 128 -> 32 -> 128 -> 256 -> 4608.
# The input/output width of 4608 equals REC_LENGTH, which is defined in a later cell.
# The final layer has no activation function, so its output is not clamped by relu.
refined_model = Chain(
    Dense(4608, 256, relu),
    Dense(256, 128, relu),
    Dense(128, 32, relu),
    Dense(32, 128, relu),
    Dense(128, 256, relu),
    Dense(256, 4608)
)

# Concatenate and normalize the training data
# Good and bad records are joined column-wise, so the model is trained on both sets.
# min_max_normalize_matrix() scales each column (i.e. each record) independently.
training_data_combined = hcat(training_data_good, training_data_bad)
training_data_normalized = min_max_normalize_matrix(training_data_combined)
training_data_float32 = Float32.(training_data_normalized)

@time begin
    
    println("Building hybrid model now\n")
    flush(stdout)

    # Train the model
    # Loss is the mean-squared error between a batch and its own reconstruction
    loss(x) = Flux.mse(refined_model(x), x)
    opt = Adam()
    
    # The complete training matrix is presented 100 times (one full-batch step each)
    data = Iterators.repeated((training_data_float32,), 100)
    # NOTE(review): Flux.params / implicit-parameter train! is the older Flux API -
    #               confirm it is still supported by the installed Flux version.
    Flux.train!(loss, Flux.params(refined_model), data, opt)
    println("Model training complete.")

end

### Select a .BVA file to check for outliers

In [None]:
using Sockets

#global X_data = Matrix{Float32}(undef, 0, 0)

# Identify the machine so that the correct drive holding the card data can be chosen
hostname = gethostname()
println("The name of the computer is: ", hostname)

REC_LENGTH = 4608       # Number of WSE's in a Mk4 30-minute record
SAMPLE_FREQUENCY = 2.56 # Mk4 sample frequency in Hertz
SAMPLE_LENGTH = 1800    # record length in seconds
SAMPLE_RATE = Float64(1/SAMPLE_FREQUENCY) # sample spacing in seconds

#########################################################################################################################
##    confidence_interval = 2.576  # corresponds to a 99% confidence interval (for a normal distribution)
##    confidence_interval = 3.0    # corresponds to a 99.73% confidence interval (for a normal distribution)    
##    confidence_interval = 3.29   # corresponds to a 99.9% confidence interval (for a normal distribution)
#########################################################################################################################

# Widen screen for better viewing (both style rules are emitted once, regardless of host)
display(HTML("<style>.jp-Cell { width: 120% !important; }</style>"))
display("text/html", "<style>.container { width:100% !important; }</style>")

# The card data lives on a different drive depending on the machine in use
initial_path = hostname == "QUEENSLAND-BASIN" ? "E:\\Card Data\\" : "F:\\Card Data\\"

infil = pick_file(initial_path)

# pick_file() returns an empty string when the dialog is cancelled
isempty(infil) && error("No .BVA file selected.")

# Read the file and build the F23 DataFrame plus the raw Data vector
# (get_hex_array, get_matches, f23_first_row_check and get_heave are not defined in this
#  notebook - presumably they come from model_functions.jl)
f23_df, Data = get_hex_array(infil)

f23_df = get_matches(Data, f23_df)

# remove those vectors from F23 df that are not located in the Data vector df
f23_df = f23_first_row_check(f23_df)

# X_data holds one record per column; X_date holds the matching record start times
X_data, X_date = get_heave(Data, f23_df);

X_data = Float32.(X_data)

println(string(length(X_date))," records processed.\n")
println("\nNow run hybrid model against this data to check for outliers!")
flush(stdout)

### Run the hybrid model against data in the selected .BVA file

In [None]:
# Calculate reconstruction errors for good and bad training data separately
# (per-record mean-squared error; each set is min-max normalised on its own)
errors_good = calc_reconstruction_errors(Float32.(min_max_normalize_matrix(training_data_good)), refined_model)
errors_bad = calc_reconstruction_errors(Float32.(min_max_normalize_matrix(training_data_bad)), refined_model)

# Normalize new data (each record/column is scaled independently to [0, 1])
# Note: the former mean_train_selected / std_train_selected values were never used and
#       threw a BoundsError whenever the file held more records than training_data_good,
#       so they have been removed.
X_data_32 = Float32.(X_data)
X_new_normalized = min_max_normalize_matrix(X_data_32)

# Make predictions using the trained model
predicted_X_data = refined_model(X_new_normalized)

# Sum of squared differences per record (a 1 x N matrix), flattened to a vector
reconstruction_error = sum((X_new_normalized .- predicted_X_data) .^ 2, dims=1)
reconstruction_error_vector = vec(reconstruction_error)    # convert reconstruction_error matrix to vector
#==
Note: The reconstruction error is the difference between the original data and the reconstructed data. 
      This error reflects how well the model can replicate the original data. 
      A low reconstruction error suggests the model has captured the structure of the data well, 
          while a high reconstruction error suggests that the data is unusual or anomalous.
==#

# Rescale the reconstruction errors of this file to [0, 1]
# NOTE(review): errors_good / errors_bad are per-record MEANS on their own scale, whereas
#               these values are rescaled SUMS - the thresholds printed below are therefore
#               informational only and are not directly comparable with each other.
normalized_reconstruction_error = min_max_normalize_matrix(reconstruction_error_vector)

# Weighting errors for bad data
bad_weight_factor = 2.0  # Apply more weight to bad data
weighted_errors_bad = errors_bad .* bad_weight_factor

# Set separate thresholds (reported only; the classification below uses adaptive thresholds)
good_threshold = quantile(errors_good, 0.95)
weighted_bad_threshold = quantile(weighted_errors_bad, 0.995)

##threshold = mean(normalized_reconstruction_error) + 3 * std(normalized_reconstruction_error)
threshold = quantile(normalized_reconstruction_error, 0.995)

println("Threshold: ", threshold)
println("Good data threshold: ", good_threshold)
println("Bad data threshold: ", weighted_bad_threshold)

# Invert the scale: the record with the largest reconstruction error maps to 0,
# the record with the smallest reconstruction error maps to 1
inverted_reconstruction_error = 1.0 .- normalized_reconstruction_error

# Calculate adaptive thresholds based on median and standard deviation
inverted_median = median(inverted_reconstruction_error)
inverted_std = std(inverted_reconstruction_error)

# Adaptive uncertain and bad thresholds (these are the values used by the plotting cells)
uncertain_threshold = inverted_median + inverted_std
bad_threshold = inverted_median + 1.5 * inverted_std

# Identify outliers and uncertain points based on adaptive thresholds
# NOTE(review): a record is flagged when its INVERTED error is high, i.e. when its raw
#               reconstruction error is among the lowest in the file - confirm intended.
outliers = findall(inverted_reconstruction_error .> bad_threshold)
uncertain_indices = findall(x -> uncertain_threshold < x <= bad_threshold, inverted_reconstruction_error)

# Convert indices to dates if necessary
outlier_dates = X_date[outliers]
uncertain_dates = X_date[uncertain_indices]

# Output results
if !isempty(outlier_dates)
    println("Outliers detected at the following dates:")
    for date in outlier_dates
        println("    ", Dates.format(date, "yyyy-mm-dd HH:MM"))
    end
else
    println("No outliers detected.")
end

if !isempty(uncertain_dates)
    println("Uncertain data points detected at the following dates:")
    for date in uncertain_dates
        println("    ", Dates.format(date, "yyyy-mm-dd HH:MM"))
    end
else
    println("No uncertain data points detected.")
end

### Plot reconstruction errors

In [None]:
using Plots: vline!, hspan!, xlims, ylims
using Dates: Day, Month, Year, Hour, Minute
# First x-axis tick is placed at 00:00 on the day after the first record
start_date = DateTime(Year(X_date[1]), Month(X_date[1]), Day(X_date[1])) + Day(1)  # next day's 00:00
tick_interval = Hour(2)  # or Hour(6), Day(1), etc., depending on desired interval

# Generate tick positions and labels
ticks = collect(start_date:tick_interval:X_date[end])
tick_labels = Dates.format.(ticks, "dd HH:MM")  # Adjust format to show days and hours

# Plot title carries the name of the selected file (last component of the Windows path)
title="Hybrid Model reconstruction errors - "*split(infil,"\\")[end]

# Set plotting limits of Y-axis
y_max = 1.05

# The x-axis spans the first to the last record in the file.
# (Previously this used start_time / end_time, which are not defined at global scope
#  and raised an UndefVarError on a fresh kernel.)
plot( size=(1200,600), dpi=100, title=title, xlims=(X_date[1], X_date[end]), ylims=(0,y_max), 
    xticks=(ticks, tick_labels), xrotation=90, xtickfont=font(7), 
    yticks = false, ylabel=("Reconstruction Error"),
    framestyle=:box, fg_legend=:transparent, bg_legend=:transparent, 
    legend=:bottomleft, leftmargin=8Plots.mm, bottommargin=5Plots.mm,
    grid=true, gridlinewidth=0.125, gridstyle=:dot, gridcolor=:grey, gridalpha=0.5)

# The curve shown is the inverted, [0, 1]-scaled reconstruction error of each record
plot!(X_date, inverted_reconstruction_error, lw=2, lc=:grey, label="")

# Plot bands of error type
hspan!([bad_threshold, y_max], fillcolor=:red, fillalpha=0.125, label="Outlier area")
hspan!([uncertain_threshold, bad_threshold], fillcolor=:lightgrey, fillalpha=0.25, label="Uncertain area")
hspan!([0, uncertain_threshold], fillcolor=:green, fillalpha=0.125, label="Valid area")

# Plot location of outliers on X-axis
vline!(outlier_dates, ls=:dash, lc=:red, lw=0.5, label="Detected Outliers")


### Plot records with suspect data (as identified by the model)

In [None]:
using Statistics: quantile
using Plots: hspan!


"""
    do_plots(ii, start_time, heave)

Display a time-series plot of one 30-minute record.

# Arguments
- `ii`: index of the record; only used in the annotation text.
- `start_time`: `DateTime` of the first sample of the record.
- `heave`: vector of samples of the record, spaced `1 / SAMPLE_FREQUENCY` seconds apart.

The plot shows the samples, a band at Q1 - 1.5*IQR .. Q3 + 1.5*IQR, the limits returned
by `calc_confidence_limits`, and a marker on every sample returned by `modified_z_score`
(both helpers are defined outside this cell). The figure is shown with `display`.
"""
function do_plots(ii, start_time, heave)
####################################

    end_time = start_time + Minute(30)

    # Time stamp of every sample. The offsets are rounded to whole microseconds so that a
    # sample frequency which does not divide exactly cannot raise an InexactError.
    sample_offsets = Microsecond.(round.(Int, (0:length(heave)-1) ./ SAMPLE_FREQUENCY .* 1000000))
    xvals = start_time + sample_offsets

    # Tukey-style fences built from the inter-quartile range
    Q1 = quantile(heave, 0.25)
    Q3 = quantile(heave, 0.75)

    multiplier = 1.5

    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Plot initialization
    p1 = plot(size=(2000, 400), dpi=100, framestyle=:box, fg_legend=:transparent, bg_legend=:transparent, 
        legend=:topright, xtickfont=font(8), ytickfont=font(8), bottommargin=5Plots.mm, 
        grid=true, gridlinewidth=0.125, gridstyle=:dot, gridcolor=:grey, gridalpha=0.5)

    # Draw onto p1 explicitly rather than relying on the implicit "current plot"
    hspan!(p1, [lower_bound, upper_bound], fillcolor=:lightblue, fillalpha=0.125, label="IQR limits")

    # One x-axis tick per minute, labelled with the minute value only
    tm_tick = range(start_time, end_time, step=Minute(1))
    ticks = Dates.format.(tm_tick, "MM")

    # Fixed confidence multiplier
    confidence_interval = 3.29 # threshold at the 99.9th percentile level

    # Identify z_scores using modified z-score (the scores themselves are not needed here)
    z_score_indices, _ = modified_z_score(heave, confidence_interval)
    if !isempty(z_score_indices)
        scatter!(p1, xvals[z_score_indices], heave[z_score_indices], 
            markersize=4, markerstrokecolor=:red, markerstrokewidth=1, 
            markercolor=:white, markershape=:circle, label="Modified Z-score beyond 99.9% confidence limits")
    end

    # Plot confidence limits
    confidence_limits = calc_confidence_limits(heave, confidence_interval)
    hline!(p1, [confidence_limits[1], confidence_limits[2]], color=:red, lw=0.5, linestyle=:dash, label="99.9% confidence limits")

    # Plot heave data
    plot!(p1, xvals, heave, xlims=(xvals[1], xvals[end]), lw=0.5, lc=:blue, alpha=0.5, 
        xticks=(tm_tick, ticks), label="")

    # Annotate plot with the record index, its start time and the number of flagged samples
    num_outliers = length(z_score_indices)
    suspect_string = string("  ", string(ii)," ",Dates.format(start_time, "yyyy-mm-dd HH:MM"), " - ", num_outliers, " Possible outliers") # using Confidence Interval of ", 
##        @sprintf("%.2f", confidence_interval))
    annotate!(p1, xvals[1], maximum(heave) * 0.9, text(suspect_string, :left, 10, :blue))

    display(p1)

end    # do_plots()  


# Plot every record that was flagged as an outlier by the hybrid-model cell;
# `outliers` holds column indices into X_data / X_date.
for ii ∈ outliers

    # Initialize variables
    start_time = X_date[ii]         # start time of the flagged record
    global heave = X_data[:, ii]    # copy of the record's samples; kept as a global after the loop
    
    do_plots(ii, start_time, heave)
end

In [None]:
# Retrieve the y-axis limits
# (ylims() is called without arguments - presumably it queries the current plot.)
# Note: this overwrites the global y_max that was set in the reconstruction-error plot cell.
y_min, y_max = ylims()

In [None]:
# Scratch check: largest value of the inverted reconstruction error
maximum(inverted_reconstruction_error)