### Library Imports


In [None]:
%load_ext autoreload
%autoreload 2
import os
import nltk
import cv2
import numpy as np
import librosa
import pandas as pd
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import librosa.display
from matplotlib import rcParams
from matplotlib.patches import Circle
from matplotlib.patches import Rectangle
from matplotlib.patches import FancyArrow


## Background Graphs


### Audio as WaveForm


- Waveform is a visual representation of the audio signal, where the x-axis represents time and the y-axis represents amplitude.


In [None]:
def save_fig_to_pdf(fig, save_name):
    """
    Saves the given matplotlib figure to disk and then displays it.

    The helper is shared by every plot in this notebook (waveform, DFT,
    spectrograms, filter bank), so the confirmation message is deliberately
    generic rather than naming one kind of plot.

    :param fig: matplotlib figure object to be saved.
    :param save_name: Path of the file to save the figure as.
    """
    # Write the figure to disk at a fixed resolution
    fig.savefig(save_name, dpi=150)
    print(f"Figure saved as PDF at: {save_name}")
    # Display after saving so the file is written regardless of the backend
    plt.show()


def plot_waveform(input_file):
    """
    Plots the waveform for the given audio file.

    :param input_file: Path to the input audio file.
    :return: Tuple ``(array, sampling_rate, fig)``: the samples and sampling
        rate returned by ``librosa.load`` and the matplotlib figure holding
        the waveform plot.
    """
    # Load the audio file (librosa.load is used with its default arguments)
    array, sampling_rate = librosa.load(input_file)

    # Create the figure and an explicit Axes so that all drawing below targets
    # this figure rather than whatever pyplot currently considers "current"
    fig = plt.figure(figsize=(12, 4), dpi=150)  # width x height in inches
    ax = fig.add_subplot(111)
    librosa.display.waveshow(array, sr=sampling_rate, color="#800080", ax=ax)

    # Beautify the plot
    ax.set_title("Audio Waveform", fontsize=16)
    ax.set_xlabel("Time (seconds)", fontsize=14)
    ax.set_ylabel("Amplitude", fontsize=14)
    ax.grid(True, linestyle="-", alpha=0.5)
    fig.tight_layout()  # Adjust layout to avoid clipping

    return array, sampling_rate, fig


# Example usage: plot the waveform of the sample recording and export it.
# `input_file`, `array`, `sr` and `fig` become notebook-level variables;
# the DFT cell further down reuses `input_file`.
input_file = "../example/chant.mp3"
array, sr, fig = plot_waveform(input_file)  # Generate the plot
save_fig_to_pdf(fig, "../example/01_waveform.pdf")  # Save the plot as a PDF

### Audio as Frequency Spectrum


- The frequency spectrum plots the strength of the various frequency components that are present in this audio segment. The frequency values are on the x-axis, usually plotted on a logarithmic scale, while their amplitudes are on the y-axis.


In [None]:
def plot_dft(input_file, num_samples=500000):
    """
    Plots the amplitude spectrum (in dB) of the start of an audio file.

    The first ``num_samples`` samples are Hann-windowed, transformed with a
    real FFT and drawn against a logarithmic frequency axis.

    :param input_file: Path to the input audio file.
    :param num_samples: Number of leading samples to include in the DFT.
    :return: plt.figure object.
    """
    signal, sample_rate = librosa.load(input_file)

    # Restrict the analysis to the leading segment of the recording
    segment = signal[:num_samples]
    n_points = len(segment)

    # Window the segment, transform it and express the magnitude in decibels
    # relative to the strongest component
    spectrum = np.fft.rfft(segment * np.hanning(n_points))
    magnitude_db = librosa.amplitude_to_db(np.abs(spectrum), ref=np.max)

    # Frequency (Hz) of every rFFT bin for a transform of this length
    freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_points)

    fig = plt.figure(figsize=(12, 4), dpi=150)  # width x height in inches
    plt.plot(freqs, magnitude_db)
    plt.xscale("log")
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Amplitude (dB)")
    plt.title("DFT of Audio Signal", fontsize=16)
    plt.tight_layout()  # Adjust layout to avoid clipping

    return fig


# Example usage: `input_file` is not set in this cell; it is whatever an
# earlier cell assigned (the waveform cell sets "../example/chant.mp3").
fig = plot_dft(input_file)
save_fig_to_pdf(fig, "../example/02_dft.pdf")  # Save the plot as a PDF

### Audio as Spectrogram


- A spectrum only shows a frozen snapshot of the frequencies at a given instant.
- The solution is to take multiple DFTs, each covering only a small slice of time, and stack the resulting spectra together into a spectrogram.
- A spectrogram plots the frequency content of an audio signal as it changes over time. It allows you to see time, frequency, and amplitude all on one graph. The algorithm that performs this computation is the STFT or Short Time Fourier Transform.
- The spectrogram is one of the most informative audio tools available to you.


In [None]:
def plot_stft(input_file):
    """
    Plots the Short-Time Fourier Transform (STFT) of the given audio file.

    :param input_file: Path to the input audio file.
    :return: plt.figure object.
    """
    # Load the audio file
    array, sr = librosa.load(input_file)

    # Compute the Short-Time Fourier Transform (STFT)
    D = librosa.stft(array)

    # Convert the amplitude to decibels, relative to the loudest bin
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

    # Create the plot
    fig = plt.figure(figsize=(12, 4), dpi=150)  # width x height in inches
    # Pass the sampling rate explicitly so the time and frequency axes are
    # derived from the audio that was actually loaded instead of relying on
    # specshow's own default rate happening to match librosa.load's default.
    librosa.display.specshow(S_db, sr=sr, x_axis="time", y_axis="hz")
    plt.colorbar(format="%+2.0f dB")
    plt.title("STFT Magnitude (in dB)", fontsize=16)
    plt.tight_layout()  # Adjust layout to avoid clipping

    return fig


# Example usage: from here on `input_file` is rebound to librosa's bundled
# trumpet example instead of the chant recording used above.
input_file = librosa.ex("trumpet")  # Example trumpet sound from librosa
fig = plot_stft(input_file)  # Generate the STFT plot
save_fig_to_pdf(fig, "../example/03_stft.pdf")  # Save the plot as a PDF

### Mel Spectrograms


- A mel spectrogram is a variation of the spectrogram that is commonly used in speech processing and machine learning tasks. It is similar to a spectrogram in that it shows the frequency content of an audio signal over time, but on a different frequency axis.
- In a standard spectrogram, the frequency axis is linear and is measured in hertz (Hz). However, the human auditory system is more sensitive to changes in lower frequencies than higher frequencies, and this sensitivity decreases logarithmically as frequency increases. The mel scale is a perceptual scale that approximates the non-linear frequency response of the human ear.


In [None]:
def plot_melspectrogram(input_file, n_mels=128, fmax=8000):
    """
    Plots the Mel spectrogram (in dB) of the given audio file.

    :param input_file: Path to the input audio file.
    :param n_mels: Number of Mel bands (default: 128).
    :param fmax: Upper frequency limit passed to the Mel computation and to
        the plot (default: 8000 Hz).
    :return: plt.figure object.
    """
    samples, rate = librosa.load(input_file)

    # Mel-scaled power spectrogram, then decibels relative to its peak
    mel_power = librosa.feature.melspectrogram(
        y=samples, sr=rate, n_mels=n_mels, fmax=fmax
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    fig = plt.figure(figsize=(12, 4), dpi=150)  # width x height in inches
    librosa.display.specshow(
        mel_db, sr=rate, fmax=fmax, x_axis="time", y_axis="mel"
    )
    plt.colorbar(format="%+2.0f dB")
    heading = f"Mel Spectrogram (n_mels={n_mels}, fmax={fmax} Hz)"
    plt.title(heading, fontsize=16)
    plt.tight_layout()  # Adjust layout to avoid clipping

    return fig


# Example usage
input_file = librosa.ex("trumpet")  # Example trumpet sound from librosa
fig = plot_melspectrogram(input_file)  # Generate the Mel spectrogram plot
save_fig_to_pdf(fig, "../example/04_melspectrogram.pdf")  # Save the plot as a PDF

In [None]:
def create_mel_filter_bank(
    num_filters=10, num_fft_bins=128, mel_min=0, mel_max=1100
):
    """
    Builds a bank of triangular filters and plots one curve per filter.

    The filter edges are spaced evenly between ``mel_min`` and ``mel_max``
    and mapped proportionally (``value / mel_max``) onto FFT bin indices;
    no Mel-to-Hz conversion is applied, so the plot is a schematic.

    Parameters:
        num_filters (int): Number of triangular filters.
        num_fft_bins (int): Number of FFT bins (width of each filter row).
        mel_min (float): Lowest edge value on the Mel axis.
        mel_max (float): Highest edge value on the Mel axis.

    Returns:
        matplotlib.figure.Figure: Figure object with the Mel filter bank plot.
    """
    # num_filters triangles need num_filters + 2 edges (left, peak, right)
    mel_edges = np.linspace(mel_min, mel_max, num_filters + 2)
    scaled_edges = mel_edges / mel_max * (num_fft_bins - 1)
    bin_edges = np.floor(scaled_edges).astype(int)

    bank = np.zeros((num_filters, num_fft_bins))

    # Each filter rises from its left edge to its peak and falls to the
    # right edge; neighbouring filters share edges
    triples = zip(bin_edges[:-2], bin_edges[1:-1], bin_edges[2:])
    for row, (lo, peak, hi) in enumerate(triples):
        bank[row, lo : peak + 1] = np.linspace(0, 1, peak - lo + 1)
        bank[row, peak : hi + 1] = np.linspace(1, 0, hi - peak + 1)

    fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
    for number, response in enumerate(bank, start=1):
        ax.plot(response, label=f"Filter {number}")
    ax.set_title("Mel Filter Bank")
    ax.set_xlabel("FFT Bin Index")
    ax.set_ylabel("Amplitude")
    ax.grid(True)
    ax.legend(loc="upper right")
    return fig


# Generate the Mel filter bank figure with the default parameters and
# export it through the shared save helper
mel_filter_fig = create_mel_filter_bank()
save_fig_to_pdf(mel_filter_fig, "../example/05_mel_filter_bank.pdf")

### NN OverFitting and UnderFitting Graph


In [None]:
def plot_fitting_examples(output_filename):
    """
    Draws three side-by-side panels (underfitting, good fit, overfitting)
    for polynomial fits to noisy samples of sin(x) and writes them to a PDF.

    Every panel shows the noisy data, the fitted polynomial and the true
    function. The figure is closed after saving; nothing is returned.

    Parameters:
        output_filename (str): The name of the output PDF file (e.g., 'fitting_examples.pdf').
    """
    # Reproducible noisy observations of the true function
    np.random.seed(42)
    x = np.linspace(0, 10, 100)
    y_true = np.sin(x)
    y = y_true + 0.2 * np.random.normal(size=x.shape)

    # (polynomial degree, panel title, legend label, line colour)
    panels = [
        (1, "Underfitting", "Underfitting (Linear)", "red"),
        (5, "Good Fit", "Good Fit (Degree 5)", "green"),
        (15, "Overfitting", "Overfitting (Degree 15)", "orange"),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True, dpi=150)

    for ax, (degree, panel_title, fit_label, fit_color) in zip(axes, panels):
        fitted = np.polyval(np.polyfit(x, y, degree), x)
        ax.scatter(x, y, label="Data", color="blue", s=10)
        ax.plot(x, fitted, label=fit_label, color=fit_color)
        ax.plot(
            x,
            y_true,
            label="True Function (sin(x))",
            color="black",
            linestyle="--",
        )
        ax.set_title(panel_title)
        ax.legend()

    # Shared axis labels for the whole figure
    fig.supylabel("Output")
    fig.supxlabel("Input")

    plt.tight_layout()
    fig.savefig(output_filename, format="pdf")
    plt.close(fig)


# Example usage: writes the three-panel fitting illustration to disk
plot_fitting_examples("../example/06_fitting_concept.pdf")

### AutoEncoder Diagram


In [None]:
def generate_autoencoder_diagram(output_pdf="autoencoder_diagram.pdf"):
    """
    Draws a schematic autoencoder (input, encoder, latent, decoder, output)
    as circles joined by arrows and saves it as a PDF.

    The figure is closed after saving; nothing is returned.

    Parameters:
    - output_pdf (str): The filename for the output PDF file.
    """
    fig, ax = plt.subplots(figsize=(10, 6), dpi=150)

    # One entry per layer, left to right:
    # (x position, y positions of the nodes, heading, fill colour)
    columns = [
        (-3, [-2, 0, 2], "Input Layer", "lightcoral"),
        (-1.5, [-1, 0, 1], "Encoder", "lightblue"),
        (0, [0], "Latent Space", "gold"),
        (1.5, [-1, 0, 1], "Decoder", "lightgreen"),
        (3, [-2, 0, 2], "Output Layer", "lightcoral"),
    ]

    # Nodes (radius 0.3) with a bold heading above the topmost node
    for col_x, node_ys, heading, fill in columns:
        for node_y in node_ys:
            ax.add_patch(
                Circle((col_x, node_y), 0.3, color=fill, ec="black", lw=1.5)
            )
        ax.text(
            col_x,
            max(node_ys) + 0.8,
            heading,
            ha="center",
            fontsize=12,
            fontweight="bold",
        )

    # Arrow colour/opacity for each pair of neighbouring layers
    arrow_styles = [("gray", 0.7), ("green", 0.8), ("blue", 0.8), ("gray", 0.7)]

    # Fully connect neighbouring layers; arrows start at the right rim of the
    # source node and are shortened by two radii horizontally
    for source, target, (arrow_color, arrow_alpha) in zip(
        columns, columns[1:], arrow_styles
    ):
        src_x, src_ys = source[0], source[1]
        dst_x, dst_ys = target[0], target[1]
        for y_from in src_ys:
            for y_to in dst_ys:
                ax.add_patch(
                    FancyArrow(
                        src_x + 0.3,
                        y_from,
                        dst_x - src_x - 0.6,
                        y_to - y_from,
                        width=0.01,
                        color=arrow_color,
                        alpha=arrow_alpha,
                        head_width=0.1,
                        length_includes_head=True,
                    )
                )

    # Captions underneath the first and last layer
    ax.text(-3, -3, "Input Data", ha="center", fontsize=10)
    ax.text(3, -3, "Reconstructed Output", ha="center", fontsize=10)

    # Fixed canvas without axis decorations
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
    ax.axis("off")

    fig.savefig(output_pdf, format="pdf", bbox_inches="tight", dpi=300)
    print(f"Diagram saved as {output_pdf}")

    plt.close(fig)


# Call the function: writes the autoencoder schematic into the example folder
generate_autoencoder_diagram("../example/14_autoencoder_architecture.pdf")

### Variational Autoencoder Diagram


In [None]:
def generate_vae_diagram(output_pdf="vae_diagram.pdf"):
    """
    Draws a schematic Variational Autoencoder (VAE) and saves it as a PDF.

    The drawing has five layers of circles joined by arrows, annotations
    for the mean, the variance and the sampled latent vector, and a dashed
    rectangle labelled "Latent Distribution". The figure is closed after
    saving; nothing is returned.

    Parameters:
    - output_pdf (str): The filename for the output PDF file.
    """
    fig, ax = plt.subplots(figsize=(12, 7), dpi=150)

    # One entry per layer, left to right:
    # (x position, y positions of the nodes, heading, fill colour)
    columns = [
        (-4, [-2, 0, 2], "Input Layer", "lightcoral"),
        (-2, [-1, 0, 1], "Encoder", "lightblue"),
        (0, [0], "Latent Space", "gold"),
        (2, [-1, 0, 1], "Decoder", "lightgreen"),
        (4, [-2, 0, 2], "Output Layer", "lightcoral"),
    ]

    # Nodes (radius 0.3) with a bold heading above the topmost node
    for col_x, node_ys, heading, fill in columns:
        for node_y in node_ys:
            ax.add_patch(
                Circle((col_x, node_y), 0.3, color=fill, ec="black", lw=1.5)
            )
        ax.text(
            col_x,
            max(node_ys) + 0.8,
            heading,
            ha="center",
            fontsize=12,
            fontweight="bold",
        )

    # Arrow colour/opacity for each pair of neighbouring layers
    arrow_styles = [("gray", 0.7), ("green", 0.8), ("blue", 0.8), ("gray", 0.7)]

    # Fully connect neighbouring layers
    for source, target, (arrow_color, arrow_alpha) in zip(
        columns, columns[1:], arrow_styles
    ):
        src_x, src_ys = source[0], source[1]
        dst_x, dst_ys = target[0], target[1]
        for y_from in src_ys:
            for y_to in dst_ys:
                ax.add_patch(
                    FancyArrow(
                        src_x + 0.3,
                        y_from,
                        dst_x - src_x - 0.6,
                        y_to - y_from,
                        width=0.01,
                        color=arrow_color,
                        alpha=arrow_alpha,
                        head_width=0.1,
                        length_includes_head=True,
                    )
                )

    # VAE-specific annotations: (x, y, text, text properties)
    bold_math = dict(fontsize=12, fontweight="bold", color="black", ha="center")
    notes = [
        (-1, -1.5, r"$\mu$", bold_math),
        (-1, 1.5, r"$\sigma^2$", bold_math),
        (
            -0.3,
            0.3,
            r"$\mathbf{z} \sim \mathcal{N}(\mu, \sigma^2)$",
            dict(fontsize=12, fontstyle="italic", color="black", ha="center"),
        ),
        (
            2,
            -1.8,
            "Sampled Latent Vector",
            dict(fontsize=10, ha="center", fontstyle="italic"),
        ),
        (4, -3, "Reconstructed Output", dict(ha="center", fontsize=10)),
    ]
    for note_x, note_y, content, text_props in notes:
        ax.text(note_x, note_y, content, **text_props)

    # Dashed outline marking the latent distribution, with its caption
    outline = Rectangle(
        (-1.8, -2),
        1.6,
        4,
        edgecolor="black",
        facecolor="none",
        linestyle="--",
        linewidth=1.5,
    )
    ax.add_patch(outline)
    ax.text(-1.8, 2.3, "Latent Distribution", fontsize=10, ha="left")

    # Fixed canvas without axis decorations
    ax.set_xlim(-5, 5)
    ax.set_ylim(-4, 4)
    ax.axis("off")

    fig.savefig(output_pdf, format="pdf", bbox_inches="tight", dpi=300)
    print(f"Diagram saved as {output_pdf}")

    plt.close(fig)


# Call the function: writes the VAE schematic into the example folder
generate_vae_diagram("../example/15_vae_architecture.pdf")

## Experimental Setup Graphs


### POS & Word per caption distribution


In [None]:
# Download the tokenizer and POS-tagger resources needed by word_tokenize
# and pos_tag. Recent NLTK releases look these up under the names
# "punkt_tab" and "averaged_perceptron_tagger_eng", while older releases use
# "punkt" and "averaged_perceptron_tagger", so both sets are requested.
# NOTE(review): confirm against the NLTK version pinned for this project.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)

# Maps the tags returned by pos_tag to the coarse categories that are
# plotted; tags that are not listed here are ignored by process_text.
pos_map = {
    "NN": "Nouns",
    "NNS": "Nouns",
    "NNP": "Nouns",
    "NNPS": "Nouns",
    "VB": "Verbs",
    "VBD": "Verbs",
    "VBG": "Verbs",
    "VBN": "Verbs",
    "VBP": "Verbs",
    "VBZ": "Verbs",
    "JJ": "Adjectives",
    "JJR": "Adjectives",
    "JJS": "Adjectives",
    "RB": "Adverbs",
    "RBR": "Adverbs",
    "RBS": "Adverbs",
    "PRP": "Pronouns",
    "PRP$": "Pronouns",
    "WP": "Pronouns",
    "WP$": "Pronouns",
}


def process_text(file_path):
    """
    Collects vocabulary and length statistics from a captions CSV.

    Reads the ``raw_text`` column (missing values skipped), lower-cases and
    tokenizes every caption, and groups the distinct tokens by the coarse
    part-of-speech category that ``pos_map`` assigns to their tag.

    :param file_path: Path to a CSV file with a ``raw_text`` column.
    :return: Tuple ``(pos_vocab_sizes, word_counts)``: a dict mapping each
        category name to its number of distinct tokens, and a list with the
        token count of every caption.
    """
    captions = pd.read_csv(file_path)["raw_text"].dropna()

    categories = ("Nouns", "Verbs", "Adjectives", "Adverbs", "Pronouns")
    vocabulary = {name: set() for name in categories}
    lengths = []

    for caption in captions:
        tokens = word_tokenize(caption.lower())
        lengths.append(len(tokens))
        for token, tag in pos_tag(tokens):
            bucket = pos_map.get(tag)
            # Tags outside pos_map do not belong to any plotted category
            if bucket:
                vocabulary[bucket].add(token)

    sizes = {name: len(tokens) for name, tokens in vocabulary.items()}
    return sizes, lengths


def plot_vocab_size(pos_vocab_sizes, save_path):
    """
    Draws a bar chart of vocabulary size per POS category and saves it as PDF.

    The split name shown in the title is the second underscore-separated
    token of the output file name (e.g. ``21_Development_vocab_size.pdf``
    gives ``Development``).

    :param pos_vocab_sizes: Dict mapping category name to vocabulary size.
    :param save_path: Path of the PDF file to write.
    """
    # Parse only the file name (without directory and extension) so that
    # underscores in a directory name cannot shift the selected token; fall
    # back to the whole stem when the name contains no underscore.
    name_parts = os.path.splitext(os.path.basename(save_path))[0].split("_")
    split_name = name_parts[1] if len(name_parts) > 1 else name_parts[0]

    plt.figure(figsize=(8, 5), dpi=150)
    bars = plt.bar(
        list(pos_vocab_sizes.keys()),
        list(pos_vocab_sizes.values()),
        color=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"],
    )
    plt.xlabel("POS Category")
    plt.ylabel("Vocabulary Size")
    plt.title(f"{split_name} Split Vocabulary Size per POS Category")
    plt.grid(axis="y", linestyle="-", alpha=0.3)

    # Write each bar's value just above it
    for bar in bars:
        yval = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            yval,
            int(yval),
            ha="center",
            va="bottom",
            fontsize=10,
        )
    plt.savefig(save_path, format="pdf")
    plt.close()


def plot_word_distribution(word_counts, save_path):
    """
    Draws a histogram of caption lengths (as percentages) and saves it as PDF.

    Bins are one word wide. The lowest bin edge is 8 unless a shorter
    caption is present, in which case the range is extended downwards so
    that no caption is silently left out of the histogram.

    :param word_counts: Sequence with the number of words of every caption.
    :param save_path: Path of the PDF file to write.
    :raises ValueError: If ``word_counts`` is empty.
    """
    total = len(word_counts)
    if total == 0:
        raise ValueError("word_counts must contain at least one value")

    lowest_edge = min(8, min(word_counts))

    plt.figure(figsize=(8, 5), dpi=150)
    plt.hist(
        word_counts,
        bins=range(lowest_edge, max(word_counts) + 2),
        # Every caption contributes 100/total so bar heights are percentages
        weights=[100.0 / total] * total,
        edgecolor="black",
        alpha=0.7,
        color="#1f77b4",
    )
    plt.xlabel("Number of Words per Caption")
    plt.ylabel("Percentage of Captions")
    plt.title("Distribution of Number of Words per Caption")
    plt.grid(axis="y", linestyle="-", alpha=0.3)
    plt.savefig(save_path, format="pdf")
    plt.close()


def caption_plots(input_file, vocab_save_path, word_dist_save_path):
    """
    Produces both caption plots for one captions CSV.

    :param input_file: Path to the captions CSV handed to ``process_text``.
    :param vocab_save_path: Output PDF path for the vocabulary-size chart.
    :param word_dist_save_path: Output PDF path for the word-count histogram.
    """
    stats = process_text(input_file)
    plot_vocab_size(stats[0], vocab_save_path)
    plot_word_distribution(stats[1], word_dist_save_path)


# Generate the vocabulary and word-count plots for each of the three caption
# CSVs. The second "_"-separated token of the vocabulary file name
# ("Development", "Validation", "Evaluation") is what plot_vocab_size shows
# in the plot title.
caption_plots(
    "../curated_clotho_captions/development_captions.csv",
    "../example/21_Development_vocab_size.pdf",
    "../example/22_Development_word_distribution.pdf",
)
caption_plots(
    "../curated_clotho_captions/validation_captions.csv",
    "../example/23_Validation_vocab_size.pdf",
    "../example/24_Validation_word_distribution.pdf",
)
caption_plots(
    "../curated_clotho_captions/evaluation_captions.csv",
    "../example/25_Evaluation_vocab_size.pdf",
    "../example/26_Evaluation_word_distribution.pdf",
)

### Audio Length Distribution


In [None]:
from tqdm import tqdm


def get_audio_durations(audio_folder):
    """
    Returns the duration of every ``.wav`` / ``.mp3`` file in a folder.

    Each file is loaded at its native sampling rate (``sr=None``) and its
    duration is computed from the loaded samples. Only the folder itself is
    listed; the order of the result follows ``os.listdir``.

    :param audio_folder: Directory containing the audio files.
    :return: List of durations as returned by ``librosa.get_duration``.
    """
    audio_names = [
        name
        for name in os.listdir(audio_folder)
        if name.endswith((".wav", ".mp3"))
    ]

    lengths = []
    for name in tqdm(audio_names, desc="Processing audio files"):
        samples, rate = librosa.load(os.path.join(audio_folder, name), sr=None)
        lengths.append(librosa.get_duration(y=samples, sr=rate))
    return lengths


def plot_audio_durations(durations, save_path):
    """
    Draws a histogram of audio durations with count labels and saves it as PDF.

    Bins are 2.5 seconds wide and cover 15 to 35 seconds inclusive. The
    split name shown in the title is the second underscore-separated token
    of the output file name (e.g. ``27_Development_durations.pdf`` gives
    ``Development``).

    :param durations: Sequence of audio durations in seconds.
    :param save_path: Path of the PDF file to write.
    """
    # Parse only the file name (without directory and extension) so that
    # underscores in a directory name cannot shift the selected token; fall
    # back to the whole stem when the name contains no underscore.
    name_parts = os.path.splitext(os.path.basename(save_path))[0].split("_")
    split_name = name_parts[1] if len(name_parts) > 1 else name_parts[0]

    plt.figure(figsize=(8, 5), dpi=150)

    bin_width = 2.5
    # np.arange excludes its stop value, so the stop is pushed one bin width
    # past 35 to make 35 s the last bin edge (15, 17.5, ..., 35).
    bins = np.arange(15, 35 + bin_width, bin_width)
    counts, _, patches = plt.hist(
        durations, bins=bins, edgecolor="black", alpha=0.7, color="orange"
    )
    plt.xlabel("Audio Duration (seconds)")
    plt.ylabel("Audio Count")
    plt.title(f"{split_name} Split Audio Duration Distribution")
    plt.grid(axis="y", linestyle="-", alpha=0.3)

    # Add count labels on top of the bars
    for count, patch in zip(counts, patches):
        plt.text(
            patch.get_x() + patch.get_width() / 2,
            patch.get_height(),
            f"{int(count)}",
            ha="center",
            va="bottom",
            fontsize=10,
        )

    plt.savefig(save_path, format="pdf")
    plt.close()


# Measure and plot the clip durations of each split in turn; `durations` is
# overwritten for every split. The second "_"-separated token of each output
# name is what plot_audio_durations shows in the title.
durations = get_audio_durations("../data/Clotho/development")
plot_audio_durations(durations, "../example/27_Development_durations.pdf")
durations = get_audio_durations("../data/Clotho/validation")
plot_audio_durations(durations, "../example/28_Validation_durations.pdf")
durations = get_audio_durations("../data/Clotho/evaluation")
plot_audio_durations(durations, "../example/29_Evaluation_durations.pdf")

### Combining all caption plots


In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Function to convert transparent images to white background
# Function to convert transparent images to white background
def add_white_bg(image):
    """
    Return a 3-channel copy of a 4-channel image with fully transparent
    pixels (alpha == 0) painted white.

    Images that do not have exactly four channels (including 2-D arrays)
    are returned unchanged. The input array is never modified.

    :param image: Image array, e.g. as returned by cv2.imread.
    :return: Array with the colour channels only, or the input itself.
    :raises ValueError: If ``image`` is None.
    """
    if image is None:
        raise ValueError("image is None (the image file could not be read)")
    # A 2-D array has no channel axis, so there is no alpha to handle
    if image.ndim != 3 or image.shape[2] != 4:
        return image
    # Copy the colour channels: slicing alone yields a view, and writing
    # into it would silently alter the caller's array
    bgr = image[:, :, :3].copy()
    bgr[image[:, :, 3] == 0] = [255, 255, 255]
    return bgr

# Function to create a figure with 2 subplots (side by side) and save as PDF
# Function to create a figure with 2 subplots (side by side) and save as PDF
def create_figure(images, titles, output_path):
    """
    Show the first two images side by side with bold titles and save the
    result as a PDF. The figure is left open after saving.

    :param images: Sequence whose first two items are the images to show.
    :param titles: Sequence whose first two items are the panel titles.
    :param output_path: Path of the PDF file to write.
    """
    fig = plt.figure(figsize=(12, 6))  # Wide canvas for two panels

    # One row, two columns
    panels = [fig.add_subplot(1, 2, column) for column in (1, 2)]

    for slot, panel in enumerate(panels):
        panel.imshow(images[slot])
        panel.set_title(titles[slot], fontsize=14, fontweight="bold")
        panel.axis("off")  # Hide axes

    # Let the panels fill the canvas with no gaps between them
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)

    plt.savefig(
        output_path,
        dpi=300,
        bbox_inches="tight",
        facecolor="white",
        pad_inches=0,
        format='pdf',
    )
    print(f"Figure saved as {output_path}")

# Load images with alpha channel (transparency); IMREAD_UNCHANGED keeps the
# files' channels as stored.
# NOTE(review): cv2.imread returns None when a file cannot be read, which
# would make the add_white_bg calls below fail - confirm the files exist.
img_a = cv2.imread("../example/30_raw_data.png", cv2.IMREAD_UNCHANGED)
img_b = cv2.imread("../example/31_processed_data.png", cv2.IMREAD_UNCHANGED)
img_c = cv2.imread("../example/32_agen_mid_data.png", cv2.IMREAD_UNCHANGED)
img_d = cv2.imread("../example/33_agen_data.png", cv2.IMREAD_UNCHANGED)

# Convert transparent portions to white background
img_a = add_white_bg(img_a)
img_b = add_white_bg(img_b)
img_c = add_white_bg(img_c)
img_d = add_white_bg(img_d)

# Convert images to RGB (Matplotlib uses RGB, OpenCV uses BGR)
img_a = cv2.cvtColor(img_a, cv2.COLOR_BGR2RGB)
img_b = cv2.cvtColor(img_b, cv2.COLOR_BGR2RGB)
img_c = cv2.cvtColor(img_c, cv2.COLOR_BGR2RGB)
img_d = cv2.cvtColor(img_d, cv2.COLOR_BGR2RGB)

# Titles for subfigures
titles1 = ["Fig. a", "Fig. b"]
titles2 = ["Fig. c", "Fig. d"]

# Create and save the first figure (a and b)
create_figure([img_a, img_b], titles1, "../example/34_figure_ab.pdf")

# Create and save the second figure (c and d)
create_figure([img_c, img_d], titles2, "../example/34_figure_cd.pdf")

In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
import math

def add_white_bg(image):
    """
    Convert transparent images to have a white background.

    Only fully transparent pixels (alpha == 0) are painted white. Images
    that do not have exactly four channels (including 2-D arrays) are
    returned unchanged. The input array is never modified.

    @param np.ndarray image: Input image with potential alpha channel.
    :return np.ndarray: Image with transparency replaced by white.
    :raises ValueError: If image is None.
    """
    if image is None:
        raise ValueError("image is None (the image file could not be read)")
    has_alpha = image.ndim == 3 and image.shape[2] == 4
    if not has_alpha:
        return image
    # Work on a copy of the colour channels; a plain slice is a view and
    # assigning into it would change the caller's array as a side effect
    bgr = image[:, :, :3].copy()
    alpha = image[:, :, 3]
    bgr[alpha == 0] = [255, 255, 255]  # Replace transparent with white
    return bgr

def create_figure(images, titles, output_path):
    """
    Arrange the images in a two-column grid and save the grid as a PDF.

    The number of rows is derived from the number of images; a trailing
    unused cell is left blank. The figure is left open after saving.

    @param list images: List of images to be displayed.
    @param list titles: Corresponding titles for each image.
    @param str output_path: File path to save the output PDF.
    """
    count = len(images)
    rows = math.ceil(count / 2)  # Two images per row
    fig, grid = plt.subplots(rows, 2, figsize=(12, 6 * rows))
    cells = np.array(grid).reshape(-1)  # 1-D view of the grid, row by row

    # Fill the leading cells with the images
    for idx in range(count):
        cells[idx].imshow(images[idx])
        cells[idx].set_title(titles[idx], fontsize=14, fontweight="bold")

    # Hide the axes of every cell, including an unused trailing one
    for cell in cells:
        cell.axis("off")

    plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
    plt.savefig(
        output_path,
        dpi=300,
        bbox_inches="tight",
        facecolor="white",
        pad_inches=0,
        format='pdf',
    )
    print(f"Figure saved as {output_path}")

# Load image paths
image_paths = [
    "../example/30_raw_data.png",
    "../example/31_processed_data.png",
    "../example/32_agen_mid_data.png",
    "../example/33_agen_data.png",
    "../example/34_extra_data.png"  # Example extra image
]

# Load and preprocess images
images = []
titles = []
for i, path in enumerate(image_paths):
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # stop with the offending path instead of failing later on img.shape
        raise FileNotFoundError(f"Could not read image: {path}")
    img = add_white_bg(img)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV is BGR, Matplotlib RGB
    images.append(img)
    titles.append(f"Fig. {chr(97 + i)}")  # Generate titles (a, b, c...)

# Save the figure
create_figure(images, titles, "../example/combined_figure.pdf")


In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
import math
import os

def add_white_bg(image):
    """
    Convert transparent images to have a white background.

    Only fully transparent pixels (alpha == 0) are painted white. Images
    that do not have exactly four channels (including 2-D arrays) are
    returned unchanged. The input array is never modified.

    @param np.ndarray image: Input image with potential alpha channel.
    :return np.ndarray: Image with transparency replaced by white.
    :raises ValueError: If image is None.
    """
    if image is None:
        raise ValueError("image is None (the image file could not be read)")
    if image.ndim == 3 and image.shape[2] == 4:
        # Copy first: the slice is a view, so painting it directly would
        # also overwrite pixels in the array owned by the caller
        bgr = image[:, :, :3].copy()
        transparent = image[:, :, 3] == 0
        bgr[transparent] = [255, 255, 255]  # Replace transparent with white
        return bgr
    return image

def create_figure(images, titles, output_path):
    """
    Arrange the images in a two-column grid and save the grid as a PDF
    at 600 dpi.

    The number of rows is derived from the number of images; a trailing
    unused cell is left blank. The figure is left open after saving.

    @param list images: List of images to be displayed.
    @param list titles: Corresponding titles for each image.
    @param str output_path: File path to save the output PDF.
    """
    count = len(images)
    rows = math.ceil(count / 2)  # Two images per row
    fig, grid = plt.subplots(rows, 2, figsize=(12, 6 * rows))
    cells = np.array(grid).reshape(-1)  # 1-D view of the grid, row by row

    # Fill the leading cells with the images
    for idx in range(count):
        cells[idx].imshow(images[idx])
        cells[idx].set_title(titles[idx], fontsize=14, fontweight="bold")

    # Hide the axes of every cell, including an unused trailing one
    for cell in cells:
        cell.axis("off")

    plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
    plt.savefig(
        output_path,
        dpi=600,
        bbox_inches="tight",
        facecolor="white",
        pad_inches=0,
        format='pdf',
    )
    print(f"Figure saved as {output_path}")

def get_image_paths(folder_path):
    """
    List the image files of a folder in sorted name order.

    An entry counts as an image when its extension, compared
    case-insensitively, is .png, .jpg or .jpeg. Sub-folders are not searched.

    @param str folder_path: Path to the folder containing images.
    :return list: List of image file paths (folder_path joined with the name).
    """
    image_suffixes = (".png", ".jpg", ".jpeg")
    selected = []
    for entry in sorted(os.listdir(folder_path)):
        _, suffix = os.path.splitext(entry)
        if suffix.lower() in image_suffixes:
            selected.append(os.path.join(folder_path, entry))
    return selected

# Define folder path
folder_path = "../temp/cmp_embedd_figs"

# Get image file paths
image_paths = get_image_paths(folder_path)

# Load and preprocess images
images = []
titles = []
for i, path in enumerate(image_paths):
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if img is None:
        # cv2.imread reports an unreadable file by returning None; stop with
        # the offending path instead of failing later on img.shape
        raise FileNotFoundError(f"Could not read image: {path}")
    img = add_white_bg(img)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV is BGR, Matplotlib RGB
    images.append(img)
    titles.append(f"Fig. {chr(97 + i)}")  # Generate titles (a, b, c...)

# Save the figure
create_figure(images, titles, "../temp/ez_embedd_figure.pdf")
