# In [None]:  (Jupyter cell marker left over from the notebook export)
import os
import random
import re
import logging
import numpy as np
import librosa
import soundfile as sf
from concurrent.futures import ThreadPoolExecutor
from tensorflow.keras import models, layers as tf_layers
import mlflow
import tempfile
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
import pandas as pd


# Configure the root logger at INFO level so the logging.info(...) calls in
# this file (e.g. the shape reports in load_data) are emitted.
logging.basicConfig(level=logging.INFO)

def load_audio_file(file_path, sample_rate, duration):
    """Load an audio file through ``librosa.load``.

    Args:
        file_path: Path of the audio file to read.
        sample_rate: Forwarded to ``librosa.load`` as ``sr``.
        duration: Forwarded to ``librosa.load`` as ``duration``.

    Returns:
        Whatever ``librosa.load`` returns for these arguments.
    """
    load_kwargs = {"sr": sample_rate, "duration": duration}
    return librosa.load(file_path, **load_kwargs)

def extract_label_from_filename(filename):
    """Parse the numeric label encoded at the start of a file name.

    The label is the text before the first ``'.'`` in *filename*, converted
    to ``float`` (so ``"30.wav"`` yields ``30.0``).

    Raises:
        ValueError: If that leading text is not a valid float literal.
    """
    leading_text, _, _ = filename.partition('.')
    return float(leading_text)

def extract_max_flow_from_filename(filename):
    """Extract the known maximum flow encoded in a file name.

    The file name is searched for the (case-sensitive) pattern
    ``Max <digits>L``; the digits are returned as an ``int``.

    Args:
        filename: File name (not a full path) to inspect.

    Returns:
        The integer found in the pattern, or ``1`` when the pattern is
        absent. The fallback is reported through ``logging`` (like the rest
        of this file) rather than printed.
    """
    match = re.search(r'Max (\d+)L', filename)
    if match is None:
        # Best-effort fallback kept on purpose: callers divide by this value.
        logging.warning("RETURN max flow of 1 for filename %s", filename)
        return 1
    return int(match.group(1))

def extract_features(audio, sample_rate, frame_length, feature_type, num_mfcc_features):
    """Split *audio* into fixed-size frames and compute one feature array per frame.

    Each frame holds ``int(sample_rate * frame_length)`` consecutive samples.
    Samples left over after the last complete frame are dropped.

    Args:
        audio: 1-D array of samples (must support slicing and ``reshape``).
        sample_rate: Sampling rate used to size the frames and passed to
            ``librosa.feature.mfcc``.
        frame_length: Frame length; multiplied by *sample_rate* to get the
            number of samples per frame.
        feature_type: ``'mfcc'``, ``'spectrogram'`` or ``'raw'``.
        num_mfcc_features: ``n_mfcc`` for ``librosa.feature.mfcc``; only used
            when *feature_type* is ``'mfcc'``.

    Returns:
        A list with one array per complete frame: the transposed MFCC matrix,
        the transposed STFT magnitude, or the raw frame reshaped to
        ``(samples_per_frame, 1)``.

    Raises:
        ValueError: If *feature_type* is not supported, or if a frame would
            contain less than one sample.
    """
    # Validate up front so a bad feature type is reported even when the audio
    # is too short to yield a single frame.
    if feature_type not in ('mfcc', 'spectrogram', 'raw'):
        raise ValueError("Unsupported feature type")

    samples_per_frame = int(sample_rate * frame_length)
    if samples_per_frame <= 0:
        raise ValueError("sample_rate * frame_length must cover at least one sample")

    total_frames = len(audio) // samples_per_frame
    features = []

    for start_idx in range(0, total_frames * samples_per_frame, samples_per_frame):
        frame = audio[start_idx:start_idx + samples_per_frame]

        if feature_type == 'mfcc':
            feature = librosa.feature.mfcc(y=frame, sr=sample_rate, n_mfcc=num_mfcc_features).T
        elif feature_type == 'spectrogram':
            feature = np.abs(librosa.stft(frame)).T
        else:
            # 'raw': keep the samples, adding a trailing feature axis.
            feature = frame.reshape(-1, 1)

        features.append(feature)

    return features


def build_model(model_config):
    """Build and compile a Keras ``Sequential`` model from a config dict.

    Args:
        model_config: Mapping with the keys read here:
            ``"layers"`` (list of dicts, each with a ``"type"`` of
            ``"Conv1D"``, ``"MaxPooling1D"``, ``"Flatten"`` or ``"Dense"``
            plus that layer's keyword arguments), ``"optimizer"``,
            ``"loss"`` and ``"metrics"``. Any other key (for example
            ``"optimizer_learning_rate"`` or ``"input_shape"``) is not
            read by this function.

    Returns:
        The compiled model.

    Raises:
        KeyError: If a layer dict has no ``"type"`` entry.
        ValueError: If a layer type is not supported.
    """
    model = models.Sequential()

    for layer_config in model_config["layers"]:
        # Pop "type" from a copy: popping from the caller's dict would strip
        # it from the config and make a second build_model call fail.
        layer_kwargs = dict(layer_config)
        layer_type = layer_kwargs.pop("type")

        if layer_type == "Flatten":
            # Flatten is always created without arguments.
            model.add(tf_layers.Flatten())
        elif layer_type in ("Conv1D", "MaxPooling1D", "Dense"):
            layer_cls = getattr(tf_layers, layer_type)
            model.add(layer_cls(**layer_kwargs))
        else:
            raise ValueError(f"Unsupported layer type: {layer_type}")

    model.compile(
        optimizer=model_config["optimizer"],
        loss=model_config["loss"],
        metrics=model_config["metrics"],
    )

    return model

def plot_waveform_and_prediction_overlay(file_path, model, sample_rate=44100, frame_length=0.5, feature_type='mfcc', mfcc_num_features=13, overestimation_threshold=0.1):
    """Plot a waveform with per-frame flow predictions and score them.

    The audio is loaded with librosa, framed and featurised with
    ``extract_features`` (the same routine ``load_data`` uses, so the model
    sees the same input layout here as in training), and passed to
    ``model.predict``. Predictions are drawn on a second y-axis over the
    waveform. The figure is neither shown nor closed by this function.

    Args:
        file_path: Audio file; its base name is parsed with
            ``extract_max_flow_from_filename`` for the reference max flow.
        model: Object exposing ``predict(features)``.
        sample_rate: Rate passed to ``librosa.load`` and used for framing.
        frame_length: Frame length, multiplied by *sample_rate* to size frames.
        feature_type: Feature type understood by ``extract_features``.
        mfcc_num_features: Number of MFCCs when *feature_type* is ``'mfcc'``.
        overestimation_threshold: Relative margin above the reference max
            flow beyond which a prediction counts as an overestimation.

    Returns:
        Tuple ``(max_flow_error, max_flow_percentage_error,
        median_flow_error, median_flow_percentage_error, overestimations)``.
        The "median" figures use the median of the predictions at or above
        their 90th percentile.

    Raises:
        ValueError: If the audio is shorter than one frame, or the feature
            type is rejected by ``extract_features``.
    """
    # Load the raw audio file
    audio, _ = librosa.load(file_path, sr=sample_rate)
    time = np.arange(0, len(audio)) / sample_rate

    # Frame and featurise exactly as load_data does. Every frame is complete,
    # so no padding is needed, and 'raw' frames already carry their trailing
    # feature axis -- adding another one would not match the training shape.
    frame_features = extract_features(audio, sample_rate, frame_length, feature_type, mfcc_num_features)
    if not frame_features:
        raise ValueError(f"Audio in {file_path} is shorter than one frame")
    features = np.array(frame_features)

    predicted_flow_rates = model.predict(features).flatten()

    known_max_flow = extract_max_flow_from_filename(os.path.basename(file_path))
    predicted_max_flow = np.max(predicted_flow_rates)
    top_predictions = np.percentile(predicted_flow_rates, 90)
    adjusted_median = np.median(predicted_flow_rates[predicted_flow_rates >= top_predictions])

    # Plot the waveform
    fig, ax1 = plt.subplots(figsize=(10, 4))
    ax1.plot(time, audio, label='Waveform', color='b')
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Amplitude', color='b')
    ax1.tick_params(axis='y', labelcolor='b')

    # Create a second y-axis for the predicted flow rate
    ax2 = ax1.twinx()
    ax2.plot(np.linspace(0, time[-1], len(predicted_flow_rates)), predicted_flow_rates, label='Predicted Flow Rate', color='r', alpha=0.7)
    ax2.set_ylabel('Flow Rate', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    # Title and grid
    plt.title(os.path.basename(file_path))
    ax1.grid(True)

    # Absolute and relative errors against the max flow named in the file.
    max_flow_error = abs(known_max_flow - predicted_max_flow)
    max_flow_percentage_error = (max_flow_error / known_max_flow) * 100
    median_flow_error = abs(known_max_flow - adjusted_median)
    median_flow_percentage_error = (median_flow_error / known_max_flow) * 100

    # Count the predictions that exceed the reference by more than the margin.
    threshold = known_max_flow * (1 + overestimation_threshold)
    overestimations = np.sum(predicted_flow_rates > threshold)

    return max_flow_error, max_flow_percentage_error, median_flow_error, median_flow_percentage_error, overestimations

def load_data(folder, sample_rate=44100, duration=10, feature_type='mfcc', frame_length=0.5, num_mfcc_features=13, load_labels=True):
    """Load every ``.wav`` file in *folder* and return per-frame features and labels.

    Files are processed in sorted name order (``os.listdir`` gives no
    ordering guarantee), so the row order of the result is reproducible.
    Each file is loaded with librosa, framed with ``extract_features``, and
    every frame inherits the label parsed from the file name.

    Args:
        folder: Directory to scan (non-recursive); only names ending in
            ``.wav`` are used.
        sample_rate: Passed to ``librosa.load`` as ``sr`` and to the framing.
        duration: Passed to ``librosa.load`` as ``duration``.
        feature_type: Feature type understood by ``extract_features``.
        frame_length: Frame length, multiplied by *sample_rate* to size frames.
        num_mfcc_features: Number of MFCCs when *feature_type* is ``'mfcc'``.
        load_labels: When true, labels come from
            ``extract_label_from_filename``; otherwise no labels are built.

    Returns:
        ``(features, labels)``: *features* is an array with one entry per
        frame across all files (for ``'raw'`` it is passed through
        ``pad_sequences`` as ``float32``); *labels* is an array with one
        label per frame, or ``None`` when *load_labels* is false.
    """
    wav_files = sorted(name for name in os.listdir(folder) if name.endswith('.wav'))
    if not wav_files:
        logging.warning("No .wav files found in %s", folder)

    def process_file(filename):
        # Runs in a worker thread: load one file and featurise its frames.
        file_path = os.path.join(folder, filename)
        audio, _ = librosa.load(file_path, sr=sample_rate, duration=duration)
        label = extract_label_from_filename(filename) if load_labels else None
        feature = extract_features(audio, sample_rate, frame_length, feature_type, num_mfcc_features)
        return feature, label

    # executor.map yields results in input order, so sorting above is enough
    # to make the output order deterministic.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_file, wav_files))

    features, labels = [], []
    for feature, label in results:
        features.extend(feature)
        if load_labels:
            # One label per frame of the file.
            labels.extend([label] * len(feature))

    features = np.array(features)
    labels = np.array(labels) if load_labels else None

    # Log shape of features and labels
    logging.info(f"Features shape: {features.shape}")
    if load_labels:
        logging.info(f"Labels shape: {labels.shape}")

    if feature_type == 'raw':
        features = pad_sequences(features, maxlen=int(sample_rate * frame_length), padding='post', truncating='post', dtype='float32')

    return features, labels

def run_experiment(config):
    """Train, evaluate and log one model inside a fresh MLflow run.

    Sequence, all within ``mlflow.start_run()``:
      1. Load the train, validation and test splits with ``load_data``,
         using the shared settings under ``config["data"]``.
      2. Store the training feature shape (minus the sample axis) in
         ``config["model"]["input_shape"]`` and log ``config["model"]`` as
         MLflow params.
      3. Build the model with ``build_model`` and fit it with early stopping
         on ``val_loss`` (patience 3, best weights restored).
      4. Evaluate on the test split, log test and final-epoch metrics, and
         log the model.
      5. Save the loss curves as a PNG and the test predictions as a CSV in
         a temporary directory, logging both as artifacts.

    Args:
        config: Dict with ``"data"`` and ``"model"`` sections. Note that
            ``config["model"]`` is modified in place (``"input_shape"``).
            ``model.evaluate`` is unpacked into exactly two values, so the
            model is expected to report the loss plus one metric.
    """
    with mlflow.start_run():
        data_cfg = config["data"]

        def load_split(folder_key):
            # The three splits differ only in the folder they read from.
            return load_data(
                folder=data_cfg[folder_key],
                sample_rate=data_cfg["sample_rate"],
                duration=data_cfg["duration"],
                feature_type=data_cfg["feature_type"],
                frame_length=data_cfg["frame_length"],
                num_mfcc_features=data_cfg["num_mfcc_features"],
                load_labels=data_cfg["load_labels"],
            )

        # Load training, validation, and test data (in that order)
        train_features, train_labels = load_split("train_folder")
        val_features, val_labels = load_split("val_folder")
        test_features, test_labels = load_split("test_folder")

        model_cfg = config["model"]
        model_cfg["input_shape"] = train_features.shape[1:]
        mlflow.log_params(model_cfg)

        model = build_model(model_cfg)

        stop_early = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        history = model.fit(
            train_features,
            train_labels,
            validation_data=(val_features, val_labels),
            epochs=model_cfg["epochs"],
            callbacks=[stop_early],
        )
        model.summary()

        # Evaluate the model on the test set
        test_loss, test_mae = model.evaluate(test_features, test_labels)
        mlflow.log_metrics({"test_loss": test_loss, "test_mae": test_mae})

        # Losses of the last epoch actually run.
        mlflow.log_metrics({
            "final_loss": history.history["loss"][-1],
            "final_val_loss": history.history["val_loss"][-1],
        })

        mlflow.keras.log_model(model, "model")

        # Loss curves over the epochs
        plt.figure(figsize=(10, 6))
        for curve_key, curve_label in (('loss', 'Training loss'), ('val_loss', 'Validation loss')):
            plt.plot(history.history[curve_key], label=curve_label)
        plt.title('Training and Validation Loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend()
        plt.grid(True)

        with tempfile.TemporaryDirectory() as tempdir:
            plot_path = os.path.join(tempdir, "training_validation_loss.png")
            plt.savefig(plot_path)
            mlflow.log_artifact(plot_path, "plots")
            plt.close()

            # Log test predictions next to their true values
            test_predictions = model.predict(test_features).flatten()
            columns = {
                "True Value": test_labels,
                "Predicted Value": test_predictions,
            }
            test_results = pd.DataFrame(columns)

            test_results_path = os.path.join(tempdir, "test_results.csv")
            test_results.to_csv(test_results_path, index=False)
            mlflow.log_artifact(test_results_path, "test_predictions")

# Experiment configuration consumed by run_experiment(): the "data" section
# is forwarded to load_data(), the "model" section to build_model(),
# mlflow.log_params() and model.fit().
config = {
    "data": {
        "sample_rate": 44100,                   # Passed to librosa.load as sr and used to size frames
        "duration": 10,                         # Passed to librosa.load as duration
        "feature_type": "raw",                  # One of 'mfcc', 'spectrogram', 'raw' (see extract_features)
        "frame_length": 0.05,                   # Frame size is int(sample_rate * frame_length) samples, i.e. 2205 here
        "train_on_label_0": True,               # NOTE(review): not read by any code visible in this file
        "num_mfcc_features": 13,                # Only used when feature_type is 'mfcc'
        "load_labels": True,                    # Labels are parsed from the file names when True
        "train_folder": r"D:\file_train",       # Folder containing training files (alternative: r"F:\test\Sonohaler\train")
        "val_folder": r"D:\file_val",       # Folder containing validation files
        "test_folder": r"D:\file_test"          # Folder containing test files (alternative: r"F:\test\Sonohaler\test")
    },
    "model": {
        # Each entry: "type" selects the Keras layer class in build_model();
        # the remaining keys are passed to that layer as keyword arguments.
        "layers": [
            {"type": "Conv1D", "filters": 32, "kernel_size": 5, "activation": "relu"},
            {"type": "MaxPooling1D", "pool_size": 2},
            {"type": "Conv1D", "filters": 64, "kernel_size": 5, "activation": "relu"},
            {"type": "MaxPooling1D", "pool_size": 2},
            {"type": "Flatten"},
            {"type": "Dense", "units": 128, "activation": "relu"},
            {"type": "Dense", "units": 1, "activation": "linear"}
        ],
        "optimizer": "Adam",
        # NOTE(review): build_model() passes only "optimizer" to compile();
        # this learning rate is logged as a param but not applied by the
        # code visible here.
        "optimizer_learning_rate": 0.001,
        "loss": "mean_squared_error",
        "metrics": ["mae"],                     # run_experiment unpacks evaluate() as (loss, mae)
        "epochs": 100,
        "validation_split": 0.1,  # NOTE(review): not read by visible code; run_experiment passes explicit validation_data
        "input_shape": (2205, 1)  # Overwritten in run_experiment with train_features.shape[1:]
    }
}

# Run the experiment as soon as this file is executed (there is no
# __main__ guard).
run_experiment(config)