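# Normalize

Min-max normalize the per-sample audio, text, and visual `.npy` embeddings listed in the E-DAIC metadata CSV, then write a new CSV (`norm_avt.csv`) whose path columns point at the normalized files.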

In [None]:
import glob
import os

import numpy as np
import pandas as pd

def _min_max_over_files(paths):
    """Return the global (min, max) over every array stored at ``paths``."""
    per_file_min = []
    per_file_max = []
    for path in paths:
        # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
        # objects; only use this on files you produced yourself.
        data = np.load(path, allow_pickle=True)
        per_file_min.append(data.min())
        per_file_max.append(data.max())
    # Reducing the per-file extrema (instead of stacking all arrays) keeps
    # memory bounded to one file and works when files have different shapes.
    return np.min(per_file_min), np.max(per_file_max)


def normalize_based_on_split(df, output_dir, modalities, split_column='split', split="train", apply_to_splits=None):
    """
    Min-max normalize the numpy arrays of each modality and save them to the output directory.

    For every modality, a single global minimum and maximum are computed over
    all arrays belonging to the rows where ``df[split_column] == split``. Each
    target array is then rescaled as ``(data - min) / (max - min)`` and saved
    under ``output_dir`` using the original file's base name. If the minimum
    and maximum are equal, the arrays are saved unchanged.

    By default the target rows are the same rows the statistics were computed
    from, so calling this with ``split="dev"`` scales the dev files with dev
    statistics. Pass ``apply_to_splits`` to scale other splits with the
    statistics of ``split`` (values outside ``[0, 1]`` are then possible).

    Args:
        df (pandas.DataFrame): DataFrame containing the metadata (paths and labels).
        output_dir (str): Directory where normalized numpy arrays will be saved.
            Created if it does not exist. Files sharing a base name overwrite
            each other.
        modalities (list): List of column names, each holding ``.npy`` file paths.
        split_column (str): Column name that indicates the train/test split.
        split (str): Split whose rows are used to compute the min/max statistics.
        apply_to_splits (str | list | None): Split name(s) whose files are
            normalized and saved. ``None`` (default) means only ``split``.

    Raises:
        ValueError: If no row of ``df`` belongs to ``split``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    # Rows that define the normalization statistics
    stats_df = df[df[split_column] == split]
    if stats_df.empty:
        raise ValueError(
            f"No rows where {split_column!r} == {split!r}; cannot compute normalization statistics."
        )

    # Rows whose files are actually rewritten
    if apply_to_splits is None:
        target_df = stats_df
    else:
        if isinstance(apply_to_splits, str):
            # A bare string would otherwise be iterated character by character
            apply_to_splits = [apply_to_splits]
        target_df = df[df[split_column].isin(list(apply_to_splits))]

    # Normalize each modality separately with its own global min/max
    for modality in modalities:
        min_val, max_val = _min_max_over_files(stats_df[modality].values)
        value_range = max_val - min_val

        for path in target_df[modality].values:
            # NOTE(security): see _min_max_over_files regarding allow_pickle=True
            data = np.load(path, allow_pickle=True)
            # A constant modality has zero range; leave the data untouched
            normalized_data = (data - min_val) / value_range if max_val > min_val else data
            output_path = os.path.join(output_dir, os.path.basename(path))
            np.save(output_path, normalized_data)
            print(f"Normalized and saved: {output_path}")

In [None]:
# Load the metadata table; its columns include the split, labels, gender and
# the per-modality file paths (audio, text, visual).
metadata_csv = "/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/audio_text_visual_paths.csv"
df = pd.read_csv(metadata_csv)
df.head()

Unnamed: 0,split,label,label.1,gender,audio,text,visual
0,train,22.0,0,female,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...
1,test,23.0,0,male,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...
2,train,19.0,0,male,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...
3,train,67.0,1,female,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...
4,dev,39.0,0,male,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...,/home/hice1/mbibars3/scratch/vlm-debiasing/dat...


In [None]:
# Normalize the text embeddings of every split listed in the metadata, so a
# fresh "Run All" writes one normalized file per row without hand-editing the
# split argument and re-running the cell.
# NOTE(review): normalize_based_on_split computes min/max from the same split
# whose files it rewrites, so each split is scaled with its own statistics —
# confirm that this (rather than train-only statistics) is intended.
output_directory = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings'  # Path to save the normalized numpy arrays
modalities = ["text"]
for split_name in df["split"].dropna().unique():
    normalize_based_on_split(df, output_directory, modalities, split=split_name)


Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/313_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/312_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/468_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/682_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/628_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/483_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/426_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings/438_TEXT.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing

In [None]:
# Normalize the audio embeddings of every split listed in the metadata, so a
# fresh "Run All" writes one normalized file per row without hand-editing the
# split argument and re-running the cell.
# NOTE(review): normalize_based_on_split computes min/max from the same split
# whose files it rewrites, so each split is scaled with its own statistics —
# confirm that this (rather than train-only statistics) is intended.
output_directory = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings'  # Path to save the normalized numpy arrays
modalities = ["audio"]
for split_name in df["split"].dropna().unique():
    normalize_based_on_split(df, output_directory, modalities, split=split_name)


Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/479_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/713_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/350_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/437_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/371_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/347_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/393_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/388_AUDIO.npy
Normalized and saved: /home/hice1/mbibars3/scrat

In [None]:
# Normalize the visual embeddings of every split listed in the metadata, so a
# fresh "Run All" writes one normalized file per row without hand-editing the
# split argument and re-running the cell.
# NOTE(review): normalize_based_on_split computes min/max from the same split
# whose files it rewrites, so each split is scaled with its own statistics —
# confirm that this (rather than train-only statistics) is intended.
output_directory = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings'  # Path to save the normalized numpy arrays
modalities = ["visual"]
for split_name in df["split"].dropna().unique():
    normalize_based_on_split(df, output_directory, modalities, split=split_name)


Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/626_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/415_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/601_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/381_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/600_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/427_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/306_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings/609_vis.npy
Normalized and saved: /home/hice1/mbibars3/scratch/vlm-d

In [None]:
# Point the "visual" column at the normalized copy of each row's own file.
# The new path is built from the row's existing file name, so every row keeps
# its own sample; a directory listing comes back in arbitrary order and must
# not be assigned to the column positionally.
visual_dir = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_visual_embeddings'
vis_npy_files = glob.glob(f"{visual_dir}/*.npy")
print(len(vis_npy_files))
visual_names = df["visual"].map(os.path.basename)
missing_visual = sorted(set(visual_names) - {os.path.basename(f) for f in vis_npy_files})
assert not missing_visual, f"No normalized visual file for: {missing_visual[:5]}"
df["visual"] = [os.path.join(visual_dir, name) for name in visual_names]

275


In [None]:
# Point the "audio" column at the normalized copy of each row's own file.
# The new path is built from the row's existing file name, so every row keeps
# its own sample; a directory listing comes back in arbitrary order and must
# not be assigned to the column positionally.
audio_dir = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings'
audio_npy_files = glob.glob(f"{audio_dir}/*.npy")
print(len(audio_npy_files))
audio_names = df["audio"].map(os.path.basename)
missing_audio = sorted(set(audio_names) - {os.path.basename(f) for f in audio_npy_files})
assert not missing_audio, f"No normalized audio file for: {missing_audio[:5]}"
df["audio"] = [os.path.join(audio_dir, name) for name in audio_names]

275


In [None]:
# Point the "text" column at the normalized copy of each row's own file.
# The new path is built from the row's existing file name, so every row keeps
# its own sample; a directory listing comes back in arbitrary order and must
# not be assigned to the column positionally.
text_dir = '/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_TEXT_embeddings'
text_npy_files = glob.glob(f"{text_dir}/*.npy")
print(len(text_npy_files))
text_names = df["text"].map(os.path.basename)
missing_text = sorted(set(text_names) - {os.path.basename(f) for f in text_npy_files})
assert not missing_text, f"No normalized text file for: {missing_text[:5]}"
df["text"] = [os.path.join(text_dir, name) for name in text_names]

275


In [None]:
# Spot-check: audio path stored for the row labelled 0
df.loc[0, "audio"]

'/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/normalized_audio_embeddings/432_AUDIO.npy'

In [None]:
# Write the updated metadata table to disk, leaving out the DataFrame index
normalized_csv = "/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/norm_avt.csv"
df.to_csv(normalized_csv, index=False)