# Locate possible outliers in Heave, North, and West data in .BVA files

### Load required packages 

In [None]:
# List of packages (and their functions) used in the modules below.
# Each `using Pkg: names` line brings only the listed names into scope.
# NOTE(review): several listed names (e.g. those from DataFrames, Printf and Statistics)
# are not referenced in the cells visible here; presumably they are needed by the
# functions in model_functions.jl — confirm before pruning.
using DataFrames: DataFrame, ncol, nrow
using Dates: Day, Dates, DateTime, Hour, Microsecond, Minute, Month, Time, unix2datetime, Year
using Flux: Adam, Chain, Dense, Flux, mse, params, relu, train!
using JLD2: @load 
using NativeFileDialog: pick_file
using Plots:  annotate!, font, hline!, hspan!, plot, Plots, plotly, plot!, scatter!, text, vline!, xlims, ylims, @layout
using Printf: @sprintf
using Sockets: gethostname
using Statistics: mean, median, quantile, std

include("model_functions.jl");    # this contains the functions called by the modules below

# Tell the user which cell to run next
println("Now recover the training data from file")

### Recover previously separated training data from file (Note: does not include Model data)

In [None]:
# Load the training data from a JLD2 file chosen by the user.
# The file dialog opens in the "Training_data" folder below the current working directory;
# joinpath supplies the correct separator and avoids a doubled one when pwd() is a drive root.
current_path = joinpath(pwd(), "Training_data")
filterlist = "JLD2"
infil = pick_file(current_path; filterlist)

# pick_file gives back an empty string when the dialog is cancelled; stop with a clear
# message instead of letting @load fail on an empty file name.
isempty(infil) && error("No training data file selected.")

@time begin

    # Load the saved good and bad training data.
    # (training_data_bad was previously listed twice, which read the same dataset twice.)
    @load infil training_data_good training_data_bad

end

# Report how many records were loaded (one record per column of each matrix)
println("\nTraining Data loaded successfully.\n")
println("Good data contains ", size(training_data_good, 2), " records")
println("Bad  data contains ", size(training_data_bad, 2), " records\n")

println("\nNow build the model")

### Build the Model using a hybrid approach

In [None]:
#==
Calls:

    min_max_normalize_matrix()

==#

# Define autoencoder model.
# The layer widths narrow from 4608 inputs through 256 and 128 units to a 32-unit
# bottleneck, then widen again to 4608 outputs; the final layer has no activation.
hybrid_model = Chain(
    Dense(4608, 256, relu),
    Dense(256, 128, relu),
    Dense(128, 32, relu),
    Dense(32, 128, relu),
    Dense(128, 256, relu),
    Dense(256, 4608)
)

# Concatenate the good and bad training data column-wise, normalize, and convert to Float32
training_data_combined = hcat(training_data_good, training_data_bad)
training_data_normalized = min_max_normalize_matrix(training_data_combined)
training_data_float32 = Float32.(training_data_normalized)

hostname = gethostname()
println("The name of the computer is: ", hostname)

# Computer-specific actions: data drive, notebook width styling, and expected training time
if hostname == "QUEENSLAND-BASIN"
    initial_path = "E:\\Card Data\\"
    display("text/html", "<style>.container { width:100% !important; }</style>")
    println("Building hybrid model now - on this computer it takes about 30s\n")
else
    initial_path = "F:\\Card Data\\"
    display(HTML("<style>.jp-Cell { width: 120% !important; }</style>"))
    println("Building hybrid model now - on this computer it takes about 200s\n")
end
flush(stdout)   # make sure the message appears before the training run starts

@time begin

    # Train the model.
    # The loss is the mean squared error between the model output and its own input.
    loss(x) = Flux.mse(hybrid_model(x), x)
    opt = Adam()

    # The complete training matrix is presented 100 times, one update per presentation
    data = Iterators.repeated((training_data_float32,), 100)
    Flux.train!(loss, Flux.params(hybrid_model), data, opt)
    println("Model training complete.")

end

# The closing parenthesis was missing here, which made the whole cell fail to parse
println("\nNow select a .BVA file to check for outliers")

### Select a .BVA file to check for outliers

In [None]:
#==
Calls:

    get_hex_array()
    get_matches()
    f23_first_row_check()
    get_heave_north_west()

==#

# Identify the computer, then apply its notebook width styling and data drive
hostname = gethostname()

if hostname != "QUEENSLAND-BASIN"
    display(HTML("<style>.jp-Cell { width: 120% !important; }</style>"))
    initial_path = "F:\\Card Data\\"
else
    display("text/html", "<style>.container { width:100% !important; }</style>")
    initial_path = "E:\\Card Data\\"
end

# Mk4 record parameters
REC_LENGTH = 4608                     # Number of WSE's in a Mk4 30-minute record
SAMPLE_FREQUENCY = 2.56               # Mk4 sample frequency in Hertz
SAMPLE_LENGTH = 1800                  # record length in seconds
SAMPLE_RATE = 1 / SAMPLE_FREQUENCY    # sample spacing in seconds

# Start from an empty Float32 matrix; it is replaced below when the file holds f23 data
X_data = Array{Float32, 2}(undef, 0, 0)

# Reference values for a confidence interval (normal distribution):
##    confidence_interval = 2.576  # corresponds to a 99% confidence interval
##    confidence_interval = 3.0    # corresponds to a 99.73% confidence interval
##    confidence_interval = 3.29   # corresponds to a 99.9% confidence interval

# Let the user choose the file, then read it
infil = pick_file(initial_path)
f23_df, Data = get_hex_array(infil)

if isempty(f23_df)

    println("No f23 data to process!")

else

    # Match the f23 rows against Data, then drop those not located in the Data vector
    f23_df = f23_first_row_check(get_matches(Data, f23_df))

    X_data, X_date = get_heave_north_west(Data, f23_df)

    # Float32 copy of the data (the format required by the model)
    X_data_32 = Float32.(X_data)

    println(length(X_date), " records processed.\n")
    println("\nNow run hybrid model against this data to check for outliers!")
    flush(stdout)

end

### Run the hybrid model against data in the selected .BVA file

In [None]:
#==
Calls:

    detect_outliers()

==#

@time begin

    # Check each component in turn: slice 1 of the third axis feeds the heave results,
    # slice 2 the north results, and slice 3 the west results
    (outlier_heave, uncertain_heave, outlier_dates_heave,
     uncertain_dates_heave, good_thresh_heave, bad_thresh_heave) =
        detect_outliers(X_data[:, :, 1], X_date, training_data_good, training_data_bad, hybrid_model)

    (outlier_north, uncertain_north, outlier_dates_north,
     uncertain_dates_north, good_thresh_north, bad_thresh_north) =
        detect_outliers(X_data[:, :, 2], X_date, training_data_good, training_data_bad, hybrid_model)

    (outlier_west, uncertain_west, outlier_dates_west,
     uncertain_dates_west, good_thresh_west, bad_thresh_west) =
        detect_outliers(X_data[:, :, 3], X_date, training_data_good, training_data_bad, hybrid_model)

    # Merge the per-component results (used later to index X_date), dropping duplicates
    all_outlier = unique([outlier_heave; outlier_north; outlier_west])
    all_uncertain = unique([uncertain_heave; uncertain_north; uncertain_west])

    # Merge the per-component dates, dropping duplicates
    all_outlier_dates = unique([outlier_dates_heave; outlier_dates_north; outlier_dates_west])
    all_uncertain_dates = unique([uncertain_dates_heave; uncertain_dates_north; uncertain_dates_west])

end

# Report the findings for the selected file
println("\nFor ", infil, "\n")

if isempty(all_outlier_dates)
    println("No suspected outliers detected.\n")
else
    println(length(all_outlier_dates), " records contain suspected outliers at the following dates:\n")
    foreach(d -> println("    ", Dates.format(d, "yyyy-mm-dd HH:MM")), all_outlier_dates)
    print("\n")
end

if isempty(all_uncertain_dates)
    println("No uncertain data points detected.")
else
    println(length(all_uncertain_dates), " records contain uncertain data points at the following dates:\n")
    foreach(d -> println("    ", Dates.format(d, "yyyy-mm-dd HH:MM")), all_uncertain_dates)
end

println("\nNow run the plot routine to view the suspect records")

### Plot records with suspect data (as identified by the model)

In [None]:
#==
Calls:

    do_heave_north_west_plots()

==#

# Produce the Heave/North/West plots for every record flagged as an outlier
for rec in all_outlier

    # Entry of X_date belonging to the flagged record
    start_time = X_date[rec]

    do_heave_north_west_plots(rec, start_time, X_data)

end

### Plot reconstruction errors

In [None]:
# Plot the reconstruction error against record time, with coloured bands for the
# valid / uncertain / outlier ranges and dashed lines at the detected outliers.

# First tick sits at 00:00 of the day after the first record
start_date = DateTime(Year(X_date[1]), Month(X_date[1]), Day(X_date[1])) + Day(1)
tick_interval = Hour(2)  # or Hour(6), Day(1), etc., depending on desired interval

# Generate tick positions and labels
ticks = collect(start_date:tick_interval:X_date[end])
tick_labels = Dates.format.(ticks, "dd HH:MM")  # day-of-month plus hours and minutes

# Plot title ends with the file name (text after the last backslash of the path)
title = "Hybrid Model reconstruction errors - " * split(infil, "\\")[end]

# Set plotting limits of Y-axis
y_max = 1.05

# The X-axis spans the first to the last record time. The limits previously referred to
# start_time and end_time: end_time is not assigned anywhere in this notebook section and
# start_time is only assigned inside the body of the plotting loop, so X_date's own bounds
# (already used for the tick range above) are used instead.
plot(size=(1200, 600), dpi=100, title=title, xlims=(X_date[1], X_date[end]), ylims=(0, y_max),
    xticks=(ticks, tick_labels), xrotation=90, xtickfont=font(7),
    yticks=false, ylabel="Reconstruction Error",
    framestyle=:box, fg_legend=:transparent, bg_legend=:transparent,
    legend=:bottomleft, leftmargin=8Plots.mm, bottommargin=5Plots.mm,
    grid=true, gridlinewidth=0.125, gridstyle=:dot, gridcolor=:grey, gridalpha=0.5)

# NOTE(review): inverted_reconstruction_error, bad_threshold, uncertain_threshold and
# outlier_dates are not assigned in the cells visible here. They must come from elsewhere
# (detect_outliers above returns per-component thresholds and dates under different
# names; all_outlier_dates may be the intended replacement for outlier_dates) — confirm.
plot!(X_date, inverted_reconstruction_error, lw=2, lc=:grey, label="")

# Plot bands of error type
hspan!([bad_threshold, y_max], fillcolor=:red, fillalpha=0.125, label="Outlier area")
hspan!([uncertain_threshold, bad_threshold], fillcolor=:lightgrey, fillalpha=0.25, label="Uncertain area")
hspan!([0, uncertain_threshold], fillcolor=:green, fillalpha=0.125, label="Valid area")

# Plot location of outliers on X-axis
vline!(outlier_dates, ls=:dash, lc=:red, lw=0.5, label="Detected Outliers")
