# Locate possible outliers in Heave, North, and West data in .RDT files

### Load required packages

In [None]:
# List of packages (and their functions) used in the modules below
using DataFrames: DataFrame, ncol, nrow
using Dates: Day, Date, Dates, DateTime, Hour, Microsecond, Minute, Month, now, Time, unix2datetime, Year
#using FilePathsBase
using Flux: Adam, Chain, Dense, Flux, mse, params, relu, train!
using Glob: glob
using JLD2: @load, @save, jldopen
using NativeFileDialog: pick_file, pick_folder
using Plots:  annotate!, font, hline!, hspan!, plot, Plots, plotly, plot!, scatter!, text, vline!, xlims, ylims, @layout
using Printf: @sprintf
using Sockets: gethostname
using Statistics: mean, median, quantile, std
using Tk: Button, Frame, Tk, Toplevel, Treeview, bind, destroy, get_value, pack, scrollbars_add, tcl

# Pull in the helper functions that the cells below call
include(".\\Mk3_model_functions.jl");

println("Loading packages completed")

# The machine name selects both the notebook width tweak and the data drive
hostname = gethostname()

# Each branch applies its own CSS override, then yields the starting folder
# that is handed to the folder picker later on
initial_path = if hostname == "QUEENSLAND-BASIN"
    display("text/html", "<style>.container { width:100% !important; }</style>")
    "E:\\Card Data\\"
else
    display(HTML("<style>.jp-Cell { width: 120% !important; }</style>"))
    "F:\\Card Data\\"
end;

### Recover .RDT hybrid_model, optimiser, and training data from file

In [None]:
# Ask the user for the saved model file, starting in the "Model" folder
# underneath the current working directory
current_path = pwd() * "\\Model"
filterlist = "JLD2"
infil = pick_file(current_path; filterlist)

# pick_file hands back an empty string when the dialog is cancelled; stop here
# with a clear message rather than letting jldopen fail on an empty path
isempty(infil) && error("No JLD2 file selected")

# Upper-case the file extension only. Anchoring the pattern at the end of the
# string leaves any ".jld2" that occurs earlier in the path (for example in a
# folder name) untouched.
infil = replace(infil, r"\.jld2$" => ".JLD2")

println("File selected: $infil")

# List the keys stored in the JLD2 file so a missing variable is easy to spot
jldopen(infil, "r") do file
    println("Keys in saved hybrid_model file:", keys(file))
end

# Load the saved data into variables of the same names
@load infil hybrid_model opt training_data_good training_data_bad

# Show what was loaded
println("Model type: ", typeof(hybrid_model))         # Should be Chain
println("Optimizer type: ", typeof(opt))             # Should be an optimizer, e.g., Adam
println("Good training data type: ", typeof(training_data_good))  # Should be Array
println("Bad training data type: ", typeof(training_data_bad))    # Should be Array

println("\nModel and training data loaded")

### Select .RDT directory and read its files

In [None]:
#==
Calls:

    get_sorted_file_data()
    plot_all_directory()

==#

# Ask the user which directory to search, starting from initial_path
directory_path = pick_folder(initial_path)

# A cancelled dialog yields an empty string, and glob treats an empty prefix as
# the current working directory - stop instead of silently scanning the wrong place
isempty(directory_path) && error("No directory selected")

# Use glob to find the .RDT files in the selected directory
# NOTE(review): the pattern has no recursive component, so subdirectories do not
# appear to be searched - confirm against the intended behaviour
println("Reading all .RDT files in ",directory_path)
flush(stdout)
rdt_files = glob(".//*.RDT", directory_path)

# Remove the TMP.RDT file from the array
TMP_file = "TMP.RDT"
rdt_files = filter(file -> basename(file) != TMP_file, rdt_files)

# Extract sorted dates, sorted .RDT file names, and Hsig values
# NOTE(review): infil still holds the JLD2 model path chosen earlier - confirm
# that this is what get_sorted_file_data() expects as its first argument
sorted_dates, sorted_rdt_files, sorted_Hsig_values = get_sorted_file_data(infil, rdt_files)
date_array = Dates.format.(values(sorted_dates), "yyyy-mm-dd");

plot_all_directory(sorted_dates, sorted_Hsig_values)

### Select a .RDT file from menu

In [None]:
#==
Calls:

    select_date_from_list()
    decode_rdt_data()

==#

# Dates offered to the user, formatted as yyyy-mm-dd
dates_array = Dates.format.(sorted_dates, "yyyy-mm-dd")

selected_date = select_date_from_list(dates_array)
println("Selected date: ", selected_date === nothing ? "None" : selected_date)

# Match the selection against the same strings that were offered in the list,
# so the lookup does not depend on how string() happens to render sorted_dates
index = findall(==(selected_date), dates_array);

# Nothing selected (or no match): stop with a clear message instead of failing
# with a BoundsError on index[1]
isempty(index) && error("No .RDT file selected")

# Build the path of the selected .RDT file (the first match is used)
infil = directory_path * "\\" * sorted_rdt_files[index[1]]

# Record constants
REC_LENGTH = 2304       # Number of WSE's in a Mk4 30-minute record
SAMPLE_FREQUENCY = 1.28 # Mk4 sample frequency in Hertz
SAMPLE_LENGTH = 1800    # Record length in seconds
SAMPLE_RATE = Float64(1 / SAMPLE_FREQUENCY) # Sample spacing in seconds

X_data, X_date, GPS_errors = decode_rdt_data(infil)

# Identify columns of X_data that are zero across every row and every slice.
# They are only located here; the removal below is currently disabled.
zero_columns = findall(i -> all(==(0), X_data[:, i, :]), 1:size(X_data, 2))
##X_data = X_data[:, setdiff(1:size(X_data, 2), zero_columns), :]
##X_date = X_date[setdiff(1:length(X_date), zero_columns)]

# Locate GPS errors flagged by Datawell
column_sums = sum(GPS_errors[:, :, 1], dims=1)  # Sum along the rows (dimension 1)

column_sums_vector = vec(column_sums)  # Flattens the 1×N result into an N-element vector
records_with_errors = findall(x -> x > 0, column_sums_vector)

println("\n",string(length(records_with_errors))," records with GPS errors: ")

### Run the hybrid model against data in the selected .RDT file

In [None]:
#==
Calls:

    detect_outliers()

==#

@time begin
    
    # Identify possible outliers in Heave, North, and West data
    # (slices 1, 2, and 3 along the third dimension of X_data respectively)
    outlier_heave, uncertain_heave, outlier_dates_heave, uncertain_dates_heave, good_thresh_heave, bad_thresh_heave = 
        detect_outliers(X_data[:,:,1], X_date, training_data_good, training_data_bad, hybrid_model)
    outlier_north, uncertain_north, outlier_dates_north, uncertain_dates_north, good_thresh_north, bad_thresh_north = 
        detect_outliers(X_data[:,:,2], X_date, training_data_good, training_data_bad, hybrid_model)
    outlier_west, uncertain_west, outlier_dates_west, uncertain_dates_west, good_thresh_west, bad_thresh_west = 
        detect_outliers(X_data[:,:,3], X_date, training_data_good, training_data_bad, hybrid_model)
    
    # Combine and deduplicate the flagged records across components
    all_outlier = unique(vcat(outlier_heave, outlier_north, outlier_west))
    all_uncertain = unique(vcat(uncertain_heave, uncertain_north, uncertain_west))
    
    # Combine and deduplicate the matching dates across components
    all_outlier_dates = unique(vcat(outlier_dates_heave, outlier_dates_north, outlier_dates_west))
    all_uncertain_dates = unique(vcat(uncertain_dates_heave, uncertain_dates_north, uncertain_dates_west));

end

# Output results.
# The combined vectors are ordered heave first, then north, then west, so they are
# sorted here for display only; the variables themselves keep their original order.
println("\nFor ",infil,"\n")
if !isempty(all_outlier_dates)
    println(string(length(all_outlier_dates)), " records contain suspected outliers at the following dates:\n")
    for date in sort(all_outlier_dates)
        println("    ", Dates.format(date, "yyyy-mm-dd HH:MM"))
    end
    print("\n")
else
    println("No suspected outliers detected.\n")
end

if !isempty(all_uncertain_dates)
    println(string(length(all_uncertain_dates)), " records contain uncertain data points at the following dates:\n")
    for date in sort(all_uncertain_dates)
        println("    ", Dates.format(date, "yyyy-mm-dd HH:MM"))
    end
else
    println("No uncertain data points detected.")
end

println("\nNow run the plot routine to view the suspect records")

### Plot records with suspect data (as identified by the model)

In [None]:
#==
Calls:

    do_heave_north_west_plots()

==#

# Records to plot: those flagged by the model plus those with GPS errors,
# with duplicates removed
combined_outlier = unique(vcat(all_outlier, records_with_errors))

# Plot each suspect record in ascending record order. The start time of a
# record is looked up in X_date using the record's index.
# (The former counter `jj` was never read, and incrementing a global inside a
# top-level loop fails when this code is run as a script, so it is gone.)
for ii ∈ sort(combined_outlier)

    do_heave_north_west_plots(ii, X_date[ii], X_data, GPS_errors)

end