### 0. Splitting the captions 


In [4]:
import pandas as pd

# Directory that receives the generated chunk files.
output_folder = "../agen_clotho_captions/random_seed_experiment/"  # Replace with your desired output folder
# Captions CSV that is going to be split into chunks.
input_file = "../curated_clotho_captions/clotho_captions_development.csv"  # Replace with your actual file path

def split_csv(input_file, output_folder, num_chunks=10):
    """
    Split a CSV file into ``num_chunks`` consecutive chunks of near-equal size.

    Rows keep their original order. When the row count is not divisible by
    ``num_chunks``, the first ``len(df) % num_chunks`` chunks receive one extra
    row each. Chunks are written as ``chunk_1.csv`` ... ``chunk_<num_chunks>.csv``
    inside ``output_folder``; if there are fewer rows than chunks, the trailing
    files contain only the header.

    Parameters:
        input_file (str): Path to the input CSV file.
        output_folder (str): Directory where the output CSV files will be saved.
            It is created if missing and may be given with or without a
            trailing path separator.
        num_chunks (int): Number of chunks to split the CSV into (default is 10).

    Returns:
        None

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    import os  # local import: the surrounding cell only imports pandas

    if num_chunks < 1:
        raise ValueError(
            f"num_chunks must be a positive integer, got {num_chunks}"
        )

    # Read the CSV into a DataFrame
    df = pd.read_csv(input_file)

    # Make sure the destination exists (an empty string means the current directory)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Base size of every chunk and the number of chunks that get one extra row
    chunk_size, remainder = divmod(len(df), num_chunks)

    start = 0
    for i in range(num_chunks):
        # The first `remainder` chunks absorb the rows left over by the division
        end = start + chunk_size + (1 if i < remainder else 0)

        # os.path.join works whether or not output_folder ends with a separator
        output_file = os.path.join(output_folder, f"chunk_{i + 1}.csv")
        df.iloc[start:end].to_csv(output_file, index=False)

        # The next chunk starts where this one ended
        start = end

# Split the configured captions file into ten chunk files.
split_csv(input_file=input_file, output_folder=output_folder, num_chunks=10)


In [None]:
# Expand each chunk CSV and store the expanded table next to it.
for i in range(1, 11):
    chunk_csv_fp = f"../agen_clotho_captions/random_seed_experiment/chunk_{i}.csv"
    expanded_csv_fp = f"../agen_clotho_captions/random_seed_experiment/expanded_captions_chunk_{i}.csv"
    expanded_df = expand_and_fill(chunk_csv_fp)
    expanded_df.to_csv(expanded_csv_fp, index=False)

In [9]:
# Convert every expanded chunk into its row-wise (one caption per row) form.
for i in range(1, 11):
    input_csv_fp, output_csv_fp = (
        f"../agen_clotho_captions/random_seed_experiment/expanded_captions_chunk_{i}.csv",
        f"../agen_clotho_captions/random_seed_experiment/agen_captions_chunk_{i}.csv",
    )
    transform_csv(input_csv_fp, output_csv_fp)

### 1. Expand original caption csv


In [5]:
import pandas as pd
def expand_and_fill(csv_file, num_captions=5):
    """
    Expand a captions CSV with one extra row per caption of every file.

    Each input row is kept unchanged and followed by ``num_captions`` copies.
    Copy ``i`` gets the file name ``<stem>_cap_<i>.wav`` (``<stem>`` being the
    original file name without its extension) and keeps only ``caption_<i>``;
    the other caption columns are set to the placeholder string "N/A".

    Args:
        csv_file (str): Path to the CSV file. It must contain a ``file_name``
            column and the columns ``caption_1`` ... ``caption_<num_captions>``.
        num_captions (int): Number of caption columns per file (default 5).

    Returns:
        pd.DataFrame: DataFrame with expanded entries and filled captions. The
        columns of the input are preserved, also when the input has no rows.

    Raises:
        KeyError: If a required column is missing from a non-empty input.
    """
    import os  # local import: the surrounding cell only imports pandas

    df = pd.read_csv(csv_file)
    caption_columns = [f"caption_{j}" for j in range(1, num_captions + 1)]
    expanded_data = []

    for _, row in df.iterrows():
        original = row.to_dict()
        expanded_data.append(original)  # Keep the original row first

        # splitext drops the real extension, whatever its length
        # (a fixed [:-4] slice is only right for 3-letter extensions).
        stem, _ = os.path.splitext(row["file_name"])

        for i, kept_column in enumerate(caption_columns, start=1):
            new_row = dict(original)  # Copy the entire row
            new_row["file_name"] = f"{stem}_cap_{i}.wav"

            # Blank out every caption except the one this copy stands for
            for column in caption_columns:
                new_row[column] = row[column] if column == kept_column else "N/A"

            expanded_data.append(new_row)

    # Passing the columns keeps the header even when there are no rows
    return pd.DataFrame(expanded_data, columns=df.columns)

In [1]:
# Expand the curated captions of every Clotho split.
for split in ("development", "validation", "evaluation"):
    curated_csv_fp = f"./curated_clotho_captions/clotho_captions_{split}.csv"
    expanded_csv_fp = f"./agen_clotho_captions/clotho_expanded_captions_{split}.csv"
    expanded_df = expand_and_fill(curated_csv_fp)
    expanded_df.to_csv(expanded_csv_fp, index=False)

### 2. Convert the CSVs generated above into row-wise audio / raw_text / text pairs


In [8]:
import pandas as pd
import string


def parse_text(text):
    """Return ``text`` lower-cased, without ASCII punctuation and with whitespace runs collapsed to single spaces."""
    strip_punctuation = str.maketrans("", "", string.punctuation)
    words = text.translate(strip_punctuation).lower().split()
    return " ".join(words)


def transform_csv(input_csv_fp, output_csv_fp):
    """
    Convert a wide captions CSV into a row-wise CSV with one caption per row.

    The input has a ``file_name`` column and the columns ``caption_1`` ...
    ``caption_5``. Every non-missing caption becomes one output row with the
    columns ``fname`` (the file name), ``raw_text`` (the caption as read) and
    ``text`` (the caption passed through ``parse_text``). Rows are emitted in
    input order, captions of one file in column order.

    Args:
        input_csv_fp (str): Path to the wide input CSV file.
        output_csv_fp (str): Path the row-wise CSV is written to.

    Returns:
        None
    """
    df = pd.read_csv(input_csv_fp)
    caption_columns = [f"caption_{i}" for i in range(1, 6)]

    # Collect plain dicts and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows on
    # every iteration.
    records = []
    for _, row in df.iterrows():
        file_name = row["file_name"]
        for column in caption_columns:
            caption = row[column]
            if pd.notna(caption):  # Missing captions produce no output row
                records.append(
                    {
                        "fname": file_name,
                        "raw_text": caption,
                        "text": parse_text(caption),
                    }
                )

    # Explicit columns keep the header (and its order) even without records
    transformed_df = pd.DataFrame(
        records, columns=["fname", "raw_text", "text"]
    )
    transformed_df.to_csv(output_csv_fp, index=False)



In [None]:

# Turn the expanded captions of every split into their row-wise form.
for split in ("development", "validation", "evaluation"):
    input_csv_fp, output_csv_fp = (
        f"./agen_clotho_captions/clotho_expanded_captions_{split}.csv",
        f"./agen_clotho_captions/agen_captions_{split}.csv",
    )
    transform_csv(input_csv_fp, output_csv_fp)

### 3. From the CSVs generated by the cell above, extract the rows for each caption column and create individual CSVs


In [None]:
import pandas as pd
import os


def create_caption_csv_files(input_csv_fp, split, output_dir="./conf_yamls"):
    """
    Create 5 CSV files, one per caption index, each combining the base rows
    with the rows of that caption index.

    A row counts as a base row when its ``fname`` does not contain ``_cap_``;
    it belongs to caption index ``k`` when its ``fname`` ends with
    ``_cap_<k>.wav``. For every ``k`` in 1..5 the file
    ``<output_dir>/Clotho_caption_<k>/<split>_captions.csv`` is written with
    the base rows first, followed by the rows of caption index ``k``.

    Args:
        input_csv_fp (str): Path to the input CSV file; it must contain an
            ``fname`` column.
        split (str): Identifier for the data split (e.g., 'evaluation',
            'train'); used as prefix of the output file name.
        output_dir (str, optional): Directory under which the per-caption
            sub-directories are created. Defaults to './conf_yamls'.
    """

    df = pd.read_csv(input_csv_fp)
    fnames = df["fname"]

    # The base rows are the same for every caption index, so select them once.
    # regex=False: "_cap_" is meant as a literal substring.
    base_rows = df[~fnames.str.contains("_cap_", regex=False)]

    for caption_index in range(1, 6):
        output_subdir = os.path.join(
            output_dir, f"Clotho_caption_{caption_index}"
        )
        os.makedirs(output_subdir, exist_ok=True)

        # Rows that belong to the current caption index
        cap_rows = df[fnames.str.endswith(f"_cap_{caption_index}.wav")]

        # Base rows first, caption rows after them, with a fresh 0..n-1 index
        combined_df = pd.concat([base_rows, cap_rows], ignore_index=True)

        output_csv_fp = os.path.join(output_subdir, f"{split}_captions.csv")
        combined_df.to_csv(output_csv_fp, index=False)


# Build the per-caption CSV files; only the development split is processed here.
for split in ("development",):
    input_csv_file = f"./agen_clotho_captions/agen_captions_{split}.csv"
    create_caption_csv_files(
        input_csv_fp=input_csv_file, split=split, output_dir="./data"
    )

### 4. From the expanded csv, generate a single csv file filled with all captions


In [None]:
import pandas as pd
import string

# Function to normalize the text by converting to lowercase and removing extra punctuation


def normalize_text(text):
    """Strip ASCII punctuation from ``text``, lower-case it and squeeze whitespace runs into single spaces."""
    without_punctuation = text.translate(
        str.maketrans("", "", string.punctuation)
    )
    lowered = without_punctuation.lower()
    # split() without arguments also drops leading and trailing whitespace
    return " ".join(lowered.split())


# Load the expanded captions CSV (wide format: one column per caption)
df = pd.read_csv(
    "agen_clotho_captions/clotho_expanded_captions_development.csv"
)

# Wide -> long: every (file_name, caption_k) cell becomes its own row, with
# the caption text in 'raw_text' and the source column in 'caption_column'
df_melted = df.melt(
    id_vars=["file_name"],
    value_vars=[
        "caption_1",
        "caption_2",
        "caption_3",
        "caption_4",
        "caption_5",
    ],
    var_name="caption_column",
    value_name="raw_text",
)

# Drop rows whose 'raw_text' is missing or the literal placeholder 'N/A'.
# The explicit .copy() makes df_filtered an independent frame, so the column
# assignments below do not write into a slice of df_melted
# (which would raise pandas' SettingWithCopyWarning).
df_filtered = df_melted.dropna(subset=["raw_text"])
df_filtered = df_filtered[df_filtered["raw_text"] != "N/A"].copy()

# Add a new column with normalized text
df_filtered["text"] = df_filtered["raw_text"].apply(normalize_text)

# Helper columns for sorting: 'sort_key' is the file name with any "_cap_<n>"
# part removed, 'is_cap' tells whether the file name has such a part
df_filtered["sort_key"] = df_filtered["file_name"].str.replace(
    r"_cap_\d+", "", regex=True
)
df_filtered["is_cap"] = df_filtered["file_name"].str.contains(
    r"_cap_\d+"
)

# Sort by sort_key first, then originals (is_cap False) before their
# "_cap_<n>" variants, then by file name; the helper columns are dropped again
df_sorted = df_filtered.sort_values(
    by=["sort_key", "is_cap", "file_name"]
).drop(columns=["caption_column", "sort_key", "is_cap"])

# Select only the required columns, in their final order
df_final = df_sorted[["file_name", "raw_text", "text"]]

In [None]:
# Write the long-format captions table to disk, without the index column.
transformed_csv_fp = "agen_clotho_captions/transformed_development.csv"
df_final.to_csv(transformed_csv_fp, index=False)

### Rough


In [None]:
import os
from typing import List
from typing import Dict


def count_files_with_extensions(
    folder_path: str, extensions: List[str]
) -> Dict[str, int]:
    """
    Tally, per extension, the files below a folder whose name ends with it.

    The folder is walked recursively. A file is counted once for every entry
    of ``extensions`` that its name ends with (case-sensitive).

    :param folder_path: Root folder of the search.
    :param extensions: File-name suffixes to count (e.g., ['.txt', '.jpg']).
    :return: Mapping from each extension to the number of matching files;
        extensions without a match map to 0.
    """
    tally = dict.fromkeys(extensions, 0)

    for _, _, filenames in os.walk(folder_path):
        # Every (file, extension) pair that matches contributes one count
        matching_suffixes = (
            suffix
            for name in filenames
            for suffix in extensions
            if name.endswith(suffix)
        )
        for suffix in matching_suffixes:
            tally[suffix] += 1

    return tally


# Count the .wav and .csv files below one of the generated caption folders.
folder_path, extensions = "data/Clotho_caption_2/development", [".wav", ".csv"]
file_counts = count_files_with_extensions(
    folder_path=folder_path, extensions=extensions
)
print(f"File counts: {file_counts}")

In [None]:
import pandas as pd


# Check how often each Clotho file name occurs in the row-wise captions CSV
def count_fnames(captions_fp, clotho_captions_fp):
    """
    Print every file_name of clotho_captions_{split}.csv that does not occur exactly 5 times as fname in {split}_captions.csv.

    Nothing is printed for file names that occur exactly 5 times; the function returns None.
    """
    row_wise_captions = pd.read_csv(captions_fp)  # First file
    clotho_captions = pd.read_csv(clotho_captions_fp)  # Second file

    # Number of rows per fname in the first file
    occurrences = row_wise_captions["fname"].value_counts()

    for name in clotho_captions["file_name"]:
        seen = occurrences.get(name, 0)  # 0 when the name is absent
        if seen == 5:
            continue
        print(
            f"{name} does NOT occur exactly 5 times in the first file. It occurs {seen} times."
        )


# Run the occurrence check for the development, validation and evaluation splits.
for captions_fp, clotho_captions_fp in (
    (
        "./data/Clotho/development_captions.csv",
        "./data/clotho_captions_development.csv",
    ),
    (
        "./data/Clotho/validation_captions.csv",
        "./data/clotho_captions_validation.csv",
    ),
    (
        "./data/Clotho/evaluation_captions.csv",
        "./data/clotho_captions_evaluation.csv",
    ),
):
    count_fnames(
        captions_fp=captions_fp, clotho_captions_fp=clotho_captions_fp
    )

In [None]:
import pandas as pd


def check_captions(file_name, df1, df2):
    """
    Report the captions of ``file_name`` in ``df2`` that are absent from ``df1``.

    Args:
        file_name (str): File name to check.
        df1 (pd.DataFrame): Row-wise captions table with the columns ``fname``
            and ``raw_text``.
        df2 (pd.DataFrame): Wide captions table with a ``file_name`` column.
            Every column after the first one is treated as a caption column,
            and only the first row matching ``file_name`` is used.

    Returns:
        None. One line listing the missing captions is printed when at least
        one caption is missing; nothing is printed otherwise.

    Raises:
        IndexError: If ``file_name`` does not occur in ``df2``.
    """
    # All raw captions stored for this file in the row-wise table
    captions_in_df1 = set(df1.loc[df1["fname"] == file_name, "raw_text"])

    # Caption cells of the first matching row in the wide table
    captions_in_df2 = df2[df2["file_name"] == file_name].iloc[0, 1:].tolist()

    # Empty (NaN) cells are not captions, so they cannot be "missing";
    # without this filter they would always be reported.
    missing_captions = [
        caption
        for caption in captions_in_df2
        if pd.notna(caption) and caption not in captions_in_df1
    ]

    if missing_captions:
        print(f"Missing captions for '{file_name}': {missing_captions}")


def main_check_function(captions_fp, clotho_captions_fp):
    """
    Run check_captions for every file_name of clotho_captions_{split}.csv against the rows of {split}_captions.csv.

    Both CSV files are read once; the per-file check is delegated to check_captions.
    """
    row_wise_captions = pd.read_csv(captions_fp)
    clotho_captions = pd.read_csv(clotho_captions_fp)

    # One check per row of the Clotho captions file, in file order
    for name in clotho_captions["file_name"]:
        check_captions(name, row_wise_captions, clotho_captions)


# Run the caption check for the development, validation and evaluation splits.
for captions_fp, clotho_captions_fp in (
    (
        "./data/Clotho/development_captions.csv",
        "./curated_clotho_captions/clotho_captions_development.csv",
    ),
    (
        "./data/Clotho/validation_captions.csv",
        "./curated_clotho_captions/clotho_captions_validation.csv",
    ),
    (
        "./data/Clotho/evaluation_captions.csv",
        "./curated_clotho_captions/clotho_captions_evaluation.csv",
    ),
):
    main_check_function(
        captions_fp=captions_fp, clotho_captions_fp=clotho_captions_fp
    )

In [None]:
import pandas as pd


def compare_dataframes(df1, df2):
    """
    Collect the rows that differ between two equally shaped DataFrames.

    Rows are compared by position using ``Series.equals``.

    Args:
        df1 (pd.DataFrame): First DataFrame.
        df2 (pd.DataFrame): Second DataFrame; must have the same shape as ``df1``.

    Returns:
        pd.DataFrame: One row per differing position, with the columns
        ``col_1`` (the row of ``df1`` as a dict) and ``col_2`` (the row of
        ``df2`` as a dict). Empty, but with both columns, when nothing differs.

    Raises:
        ValueError: If the shapes of ``df1`` and ``df2`` differ.
    """
    # Ensure both DataFrames have the same shape
    if df1.shape != df2.shape:
        raise ValueError("DataFrames must have the same shape to compare.")

    # Gather the differing rows in plain lists and build the result once;
    # concatenating onto a DataFrame per difference re-copies it every time.
    rows_from_df1 = []
    rows_from_df2 = []

    for idx in range(df1.shape[0]):
        row1 = df1.iloc[idx]
        row2 = df2.iloc[idx]

        if not row1.equals(row2):
            rows_from_df1.append(row1.to_dict())
            rows_from_df2.append(row2.to_dict())

    return pd.DataFrame(
        {"col_1": rows_from_df1, "col_2": rows_from_df2},
        columns=["col_1", "col_2"],
    )


# Load the two captions tables that are expected to match row by row.
df1, df2 = (
    pd.read_csv("data/Clotho_caption_1/transformed_captions.csv"),
    pd.read_csv("data/Clotho/development_captions.csv"),
)

# Collect the rows that differ between them.
diff_df = compare_dataframes(df1, df2)

# Keep the differences on disk for later inspection.
diff_df.to_csv("./data/differences.csv", index=False)