# Functions to Save and Read Data in JSON Format Using Python

In [1]:
import json

In [2]:
def save_to_json(data, file_path):
    """
    Save data to a JSON file.

    The object is serialized in memory before the target file is opened, so a
    serialization failure (e.g. a value that is not JSON-serializable) does not
    truncate or partially overwrite a file that already exists at file_path.

    Args:
    - data: Python object (e.g., dict or list) to save.
    - file_path: Path to the JSON file.

    Returns:
    - None. Success or failure is reported with print(); errors are not re-raised.
    """
    try:
        # Serialize first: open(..., "w") truncates the file immediately, so
        # dumping straight into it would destroy the old content on failure.
        serialized = json.dumps(data, indent=4)  # Pretty formatting
        with open(file_path, "w", encoding="utf-8") as json_file:
            json_file.write(serialized)
        print(f"Data successfully saved to {file_path}")
    except Exception as e:
        # Deliberately best-effort: report the problem and keep the notebook running.
        print(f"Error saving data to JSON: {e}")

In [3]:
def read_from_json(file_path):
    """
    Read data from a JSON file.

    The file is decoded as UTF-8 regardless of the platform's default locale,
    and a leading UTF-8 byte-order mark (if present) is skipped.

    Args:
    - file_path: Path to the JSON file.

    Returns:
    - The Python object (e.g., dict or list) loaded from the JSON file,
      or None if the file could not be opened or parsed (the error is printed).
    """
    try:
        # "utf-8-sig" reads plain UTF-8 and also strips a BOM, which json.load
        # would otherwise reject.
        with open(file_path, "r", encoding="utf-8-sig") as json_file:
            data = json.load(json_file)
        print(f"Data successfully loaded from {file_path}")
        return data
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print(f"Error reading data from JSON: {e}")
        return None

---

# Read MELD Data in JSON Format

In [4]:
if __name__ == "__main__":
    # Path of the video manifest JSON inside the Kaggle input dataset
    MELD_video_data_file_path = "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Video_Data.json"

    # Load the manifest; read_from_json prints a status line and returns None on failure
    MELD_video_data = read_from_json(MELD_video_data_file_path)
    # NOTE(review): this prints the whole manifest in one line; a per-split
    # count or a few sample entries would keep the cell output readable.
    print("Loaded Data:", MELD_video_data)

Data successfully loaded from /kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Video_Data.json
Loaded Data: {'train': [{'0_0': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt0.mp4', 'y': 0, 'label': 'neutral'}, {'0_1': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt1.mp4', 'y': 0, 'label': 'neutral'}, {'0_2': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt2.mp4', 'y': 0, 'label': 'neutral'}, {'0_3': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt3.mp4', 'y': 0, 'label': 'neutral'}, {'0_4': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt4.mp4', 'y': 1, 'label': 'surprise'}, {'0_5': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt5.mp4', 'y': 0, 'label': 'neutral'}, {'0_6': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/

---

# Read MELD Textual Data in JSON Format

In [5]:
# Load the MELD textual-data manifest into `MELD_textual_data`
if __name__ == "__main__":
    # Path of the textual manifest JSON inside the Kaggle input dataset
    MELD_textual_data_file_path = "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Textual_Data.json"

    # Load the manifest; read_from_json prints a status line and returns None on failure
    MELD_textual_data = read_from_json(MELD_textual_data_file_path)
    # NOTE(review): this prints the whole loaded object; consider showing
    # only a summary to keep the cell output readable.
    print("Loaded Data:", MELD_textual_data)

Data successfully loaded from /kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Textual_Data.json


---

# Read MELD Video Data in JSON Format

In [6]:
# Load the MELD video manifest into `MELD_video_data`.
# NOTE(review): an earlier cell already loads this same file into the same
# variable, so this cell repeats that work.
if __name__ == "__main__":
    # Path of the video manifest JSON inside the Kaggle input dataset
    MELD_video_data_file_path = "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Video_Data.json"

    # Load the manifest; read_from_json prints a status line and returns None on failure
    MELD_video_data = read_from_json(MELD_video_data_file_path)
    # NOTE(review): this prints the whole manifest in one line; a per-split
    # count or a few sample entries would keep the cell output readable.
    print("Loaded Data:", MELD_video_data)

Data successfully loaded from /kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Video_Data.json
Loaded Data: {'train': [{'0_0': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt0.mp4', 'y': 0, 'label': 'neutral'}, {'0_1': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt1.mp4', 'y': 0, 'label': 'neutral'}, {'0_2': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt2.mp4', 'y': 0, 'label': 'neutral'}, {'0_3': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt3.mp4', 'y': 0, 'label': 'neutral'}, {'0_4': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt4.mp4', 'y': 1, 'label': 'surprise'}, {'0_5': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia0_utt5.mp4', 'y': 0, 'label': 'neutral'}, {'0_6': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/

---

# Read MELD Audio Data in JSON Format

In [7]:
# Load the MELD audio manifest into `MELD_audio_data`
if __name__ == "__main__":
    # Path of the audio manifest JSON inside the Kaggle `meld-audio` input dataset
    MELD_audio_data_file_path = "/kaggle/input/meld-audio/MELD_audio_data.json"

    # Load the manifest; read_from_json prints a status line and returns None on failure
    MELD_audio_data = read_from_json(MELD_audio_data_file_path)
    # NOTE(review): this prints the whole manifest in one line; a per-split
    # count or a few sample entries would keep the cell output readable.
    print("Loaded Data:", MELD_audio_data)

Data successfully loaded from /kaggle/input/meld-audio/MELD_audio_data.json
Loaded Data: {'train': [{'0_0': '/kaggle/working/audio_train/dia0_utt0.wav', 'y': 0, 'label': 'neutral'}, {'0_1': '/kaggle/working/audio_train/dia0_utt1.wav', 'y': 0, 'label': 'neutral'}, {'0_2': '/kaggle/working/audio_train/dia0_utt2.wav', 'y': 0, 'label': 'neutral'}, {'0_3': '/kaggle/working/audio_train/dia0_utt3.wav', 'y': 0, 'label': 'neutral'}, {'0_4': '/kaggle/working/audio_train/dia0_utt4.wav', 'y': 1, 'label': 'surprise'}, {'0_5': '/kaggle/working/audio_train/dia0_utt5.wav', 'y': 0, 'label': 'neutral'}, {'0_6': '/kaggle/working/audio_train/dia0_utt6.wav', 'y': 0, 'label': 'neutral'}, {'0_7': '/kaggle/working/audio_train/dia0_utt7.wav', 'y': 0, 'label': 'neutral'}, {'0_8': '/kaggle/working/audio_train/dia0_utt8.wav', 'y': 0, 'label': 'neutral'}, {'0_9': '/kaggle/working/audio_train/dia0_utt9.wav', 'y': 0, 'label': 'neutral'}, {'0_10': '/kaggle/working/audio_train/dia0_utt10.wav', 'y': 2, 'label': 'fear'}

---

# Read CSV files 

In [8]:
import pandas as pd

In [9]:
# Load the per-split MELD annotation tables (train / dev / test) as DataFrames.
# Note the train CSV sits in a `train/` subfolder, while dev and test are directly under MELD.Raw.
train_sent_emo = pd.read_csv('/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_sent_emo.csv')
dev_sent_emo = pd.read_csv('/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/dev_sent_emo.csv')
test_sent_emo = pd.read_csv('/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/test_sent_emo.csv')

---

# Check Video Integrity in MELD Video Dataset

In [10]:
# Error extracting audio from /kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia125_utt3.mp4
# Error extracting audio from /kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/dev/dev_splits_complete/dia110_utt7.mp4

In [11]:
import cv2
import json

In [12]:
# Function to check video integrity
def check_video_integrity(video_path):
    """
    Report whether a video file opens and advertises a positive frame count and FPS.

    NOTE: a later cell defines `check_video_integrity` again with the same
    logic; after a full top-to-bottom run that later definition is the one in use.

    Args:
        video_path (str): Path to the video file.

    Returns:
        bool: True when the file opens and both frame count and FPS are > 0.
    """
    capture = cv2.VideoCapture(video_path)
    opened = capture.isOpened()
    if opened:
        # Only container metadata is queried; no frame is decoded here.
        n_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        rate = capture.get(cv2.CAP_PROP_FPS)
    capture.release()
    if not opened:
        return False
    return n_frames > 0 and rate > 0

In [13]:
# Function to check video integrity
def check_video_integrity(video_path):
    """
    Checks the integrity of a video file.

    The check is metadata-based: the file must open and report a positive
    frame count and a positive FPS. No frame is decoded, so a file whose
    header is intact but whose stream is damaged would still pass.

    Args:
        video_path (str): Path to the video file.

    Returns:
        bool: True if the video file is valid (non-corrupted), False otherwise.
    """
    # Open the video file
    video = cv2.VideoCapture(video_path)
    try:
        # Check if the video file could be opened successfully
        if not video.isOpened():
            return False

        # Get the total number of frames in the video
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        # Get the frames per second (FPS) of the video
        fps = video.get(cv2.CAP_PROP_FPS)

        # A valid video should have a frame count > 0 and FPS > 0
        return frame_count > 0 and fps > 0
    finally:
        # Always release the capture handle, even if a property query raises,
        # so a long scan over many files cannot leak handles.
        video.release()

In [14]:
if __name__ == "__main__":
    # Scans the `MELD_video_data` manifest loaded by an earlier cell.
    # Per that cell's printed output, it maps a split name to a list of entries,
    # each holding one "<dialogue>_<utterance>" key with the video path plus
    # the "y" and "label" fields.

    # Dictionary to store corrupted video entries by dataset split
    corrupted_videos = {
        "train": [],
        "dev": [],
        "test": []
    }

    # Iterate over each split (train, dev, test) in the MELD dataset
    for split in MELD_video_data:
        # Iterate over each video entry in the current split
        for video_entry in MELD_video_data[split]:
            # Iterate over each key-value pair in the video entry dictionary
            for key, video_path in video_entry.items():
                # Skip keys that are not video paths (e.g., "y" or "label")
                if key not in {"y", "label"}:
                    # Check the integrity of the video file
                    if not check_video_integrity(video_path):
                        # setdefault keeps the scan from failing with KeyError
                        # if the manifest contains a split other than the
                        # three pre-seeded above.
                        corrupted_videos.setdefault(split, []).append(video_entry)

    # Output the list of corrupted videos, if any
    print("Corrupted videos:", corrupted_videos)


Corrupted videos: {'train': [{'125_3': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia125_utt3.mp4', 'y': 0, 'label': 'neutral'}], 'dev': [{'110_7': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/dev/dev_splits_complete/dia110_utt7.mp4', 'y': 0, 'label': 'neutral'}], 'test': []}


---

# Save Corrupted Video Data in JSON Format

In [15]:
# Persist the corrupted-video report produced by the scan cell
if __name__ == "__main__":
    # Output path in the Kaggle working directory
    MELD_corrupted_video_data_file_path = "/kaggle/working/MELD_corrupted_video_data.json"

    # Write `corrupted_videos` as JSON; save_to_json prints the outcome instead of raising
    save_to_json(corrupted_videos, MELD_corrupted_video_data_file_path)

Data successfully saved to /kaggle/working/MELD_corrupted_video_data.json


---

# Python Script to Update Paths in `MELD_audio_data.json`

In [16]:
# Changing '0_0': '/kaggle/working/audio_train/dia0_utt0.wav' to '0_0': '/kaggle/input/meld-audio/audio_train/dia0_utt0.wav'

In [17]:
import json

# Load the audio manifest whose paths still point at /kaggle/working.
# NOTE(review): an earlier cell already loads this same file with
# read_from_json into `MELD_audio_data`; `data` is a second, independent copy.
with open('/kaggle/input/meld-audio/MELD_audio_data.json', 'r') as file:
    data = json.load(file)

In [18]:
# Function to update the paths
def update_paths(data, old_path, new_path):
    """
    Rewrite path strings inside a manifest, in place.

    Args:
        data (dict): Mapping whose values are lists of dict records.
        old_path (str): Substring to look for in each string value.
        new_path (str): Replacement for every occurrence of old_path.

    Returns:
        dict: The same `data` object that was passed in. Its records are
        modified in place; non-string values are left untouched.
    """
    for records in data.values():
        for record in records:
            # Collect the affected keys first, then rewrite them.
            stale_keys = [
                field
                for field, content in record.items()
                if isinstance(content, str) and old_path in content
            ]
            for field in stale_keys:
                # str.replace swaps every occurrence, not just a leading prefix.
                record[field] = record[field].replace(old_path, new_path)
    return data

In [19]:
# Point every audio path at the read-only input dataset instead of the working directory.
# update_paths modifies `data` in place and returns it, so `MELD_audio_data_updated`
# and `data` refer to the same object afterwards.
old_path = "/kaggle/working"
new_path = "/kaggle/input/meld-audio"
MELD_audio_data_updated = update_paths(data, old_path, new_path)

In [20]:
# Save the updated JSON file.
# NOTE(review): the path is relative to the current working directory, while a
# later cell reads the file back from /kaggle/working/MELD_audio_data_updated.json,
# so this relies on the notebook running with /kaggle/working as its cwd.
with open('MELD_audio_data_updated.json', 'w') as file:
    json.dump(MELD_audio_data_updated, file, indent=4)

print("Paths updated and saved to 'MELD_audio_data_updated.json'")

Paths updated and saved to 'MELD_audio_data_updated.json'


---

# Read MELD_audio_data_updated in JSON Format 

In [21]:
# Reload the path-corrected audio manifest from disk into `MELD_audio_data_updated`
if __name__ == "__main__":
    # Absolute path of the file written by the previous section
    MELD_audio_data_updated_file_path = "/kaggle/working/MELD_audio_data_updated.json"

    # Load the manifest; read_from_json prints a status line and returns None on failure
    MELD_audio_data_updated = read_from_json(MELD_audio_data_updated_file_path)
    # NOTE(review): this prints the whole manifest in one line; a per-split
    # count or a few sample entries would keep the cell output readable.
    print("Loaded Data:", MELD_audio_data_updated)

Data successfully loaded from /kaggle/working/MELD_audio_data_updated.json
Loaded Data: {'train': [{'0_0': '/kaggle/input/meld-audio/audio_train/dia0_utt0.wav', 'y': 0, 'label': 'neutral'}, {'0_1': '/kaggle/input/meld-audio/audio_train/dia0_utt1.wav', 'y': 0, 'label': 'neutral'}, {'0_2': '/kaggle/input/meld-audio/audio_train/dia0_utt2.wav', 'y': 0, 'label': 'neutral'}, {'0_3': '/kaggle/input/meld-audio/audio_train/dia0_utt3.wav', 'y': 0, 'label': 'neutral'}, {'0_4': '/kaggle/input/meld-audio/audio_train/dia0_utt4.wav', 'y': 1, 'label': 'surprise'}, {'0_5': '/kaggle/input/meld-audio/audio_train/dia0_utt5.wav', 'y': 0, 'label': 'neutral'}, {'0_6': '/kaggle/input/meld-audio/audio_train/dia0_utt6.wav', 'y': 0, 'label': 'neutral'}, {'0_7': '/kaggle/input/meld-audio/audio_train/dia0_utt7.wav', 'y': 0, 'label': 'neutral'}, {'0_8': '/kaggle/input/meld-audio/audio_train/dia0_utt8.wav', 'y': 0, 'label': 'neutral'}, {'0_9': '/kaggle/input/meld-audio/audio_train/dia0_utt9.wav', 'y': 0, 'label': 'n

---

# Check Audio Integrity of the Files Listed in `MELD_audio_data_updated.json`

In [22]:
import wave
import os

In [23]:
# Function to check audio integrity
def check_audio_integrity(audio_path):
    """
    Checks the integrity of a .wav audio file.

    Only the WAV header is inspected (frame count, channel count, frame rate);
    the audio frames themselves are not read.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        bool: True if the audio file is valid (non-corrupted), False otherwise.
        Files that are missing, unreadable, not WAV, or truncated count as invalid.
    """
    try:
        # Open the audio file in read mode
        with wave.open(audio_path, 'rb') as audio:
            params = audio.getparams()  # Retrieve audio parameters
            n_frames = audio.getnframes()  # Total number of frames
            # Ensure the audio file has valid parameters
            return n_frames > 0 and params.nchannels > 0 and params.framerate > 0
    except (wave.Error, OSError, EOFError):
        # OSError (not just FileNotFoundError) so that a directory path, a
        # permission problem, or another I/O failure marks the file as invalid
        # instead of aborting the whole scan.
        return False

In [24]:
if __name__ == "__main__":
    # Scan the path-corrected manifest. This rebinds `MELD_audio_data`, so from
    # here on the name refers to the updated manifest rather than the one
    # loaded from the input dataset in an earlier cell.
    MELD_audio_data = MELD_audio_data_updated

    # Dictionary to store corrupted audio entries by dataset split
    corrupted_audios = {
        "train": [],
        "dev": [],
        "test": []
    }

    # Iterate over each split (train, dev, test) in the MELD dataset
    for split in MELD_audio_data:
        # Iterate over each audio entry in the current split
        for audio_entry in MELD_audio_data[split]:
            # Iterate over each key-value pair in the audio entry dictionary
            for key, audio_path in audio_entry.items():
                # Skip keys that are not audio paths (e.g., "y" or "label")
                if key not in {"y", "label"}:
                    # Check the integrity of the audio file
                    if not check_audio_integrity(audio_path):
                        # setdefault keeps the scan from failing with KeyError
                        # if the manifest contains a split other than the
                        # three pre-seeded above.
                        corrupted_audios.setdefault(split, []).append(audio_entry)

    # Output the list of corrupted audio files, if any
    print("Corrupted audios:", corrupted_audios)

Corrupted audios: {'train': [{'125_3': '/kaggle/input/meld-audio/audio_train/dia125_utt3.wav', 'y': 0, 'label': 'neutral'}], 'dev': [{'110_7': '/kaggle/input/meld-audio/audio_dev/dia110_utt7.wav', 'y': 0, 'label': 'neutral'}], 'test': []}


In [25]:
# '/kaggle/input/meld-audio/audio_train/dia125_utt3.wav'
# '/kaggle/input/meld-audio/audio_dev/dia110_utt7.wav'

---

# Save Corrupted Audio Data in JSON Format

In [26]:
# Persist the corrupted-audio report produced by the scan cell
if __name__ == "__main__":
    # Output path in the Kaggle working directory
    MELD_corrupted_audio_data_file_path = "/kaggle/working/MELD_corrupted_audio_data.json"

    # Write `corrupted_audios` as JSON; save_to_json prints the outcome instead of raising
    save_to_json(corrupted_audios, MELD_corrupted_audio_data_file_path)

Data successfully saved to /kaggle/working/MELD_corrupted_audio_data.json


---

# Removing Corrupted Records from MELD Dataset Based on Corrupted Video Data

In [27]:
def remove_corrupted_records(json_file, train_df, dev_df, test_df):
    """
    Drop annotation rows that correspond to corrupted media files.

    Args:
        json_file (str): Path to a JSON file mapping split names ("train",
            "dev", "test") to lists of corrupted entries. Entry keys of the
            form "<dialogue>_<utterance>" (e.g. "125_3") identify the affected
            rows; every other key (e.g. "y", "label") is ignored.
        train_df, dev_df, test_df (pd.DataFrame): Annotation tables with
            'Dialogue_ID' and 'Utterance_ID' columns. They are not modified.

    Returns:
        tuple: (train_filtered, dev_filtered, test_filtered,
                train_removed, dev_removed, test_removed).
        In every returned frame the two ID columns are cast to int.

    Raises:
        OSError / json.JSONDecodeError: if json_file cannot be read or parsed.
        KeyError: if a DataFrame lacks one of the ID columns.
        Errors from `astype(int)` propagate if an ID value cannot be cast.
    """
    # Load the corrupted-entries report
    with open(json_file, 'r', encoding='utf-8') as file:
        corrupted_data = json.load(file)

    def parse_key(key):
        """Return (dialogue_id, utterance_id) for keys like '125_3', else None."""
        if not isinstance(key, str):
            return None
        parts = key.split('_')
        if len(parts) != 2:
            return None
        try:
            return int(parts[0]), int(parts[1])
        except ValueError:
            # An underscore key that is not numeric is metadata, not a row id.
            return None

    def filter_dataframe(df, corrupted_entries):
        """Split df into (rows kept, rows removed) for one dataset split."""
        # assign() returns a new frame, so the caller's DataFrame keeps its
        # original dtypes; the ID columns are cast for comparison with the keys.
        df = df.assign(
            Dialogue_ID=df['Dialogue_ID'].astype(int),
            Utterance_ID=df['Utterance_ID'].astype(int),
        )

        # Set of (dialogue_id, utterance_id) pairs for fast lookup
        corrupted_keys = set()
        for entry in corrupted_entries:
            for corrupted_key in entry:
                pair = parse_key(corrupted_key)
                if pair is not None:
                    corrupted_keys.add(pair)

        # Build the row mask once and reuse it for both outputs
        is_corrupted = pd.Series(
            [pair in corrupted_keys for pair in zip(df['Dialogue_ID'], df['Utterance_ID'])],
            index=df.index,
            dtype=bool,
        )

        return df[~is_corrupted], df[is_corrupted]

    # Process train, dev, and test DataFrames
    train_df_filtered, train_removed = filter_dataframe(train_df, corrupted_data.get('train', []))
    dev_df_filtered, dev_removed = filter_dataframe(dev_df, corrupted_data.get('dev', []))
    test_df_filtered, test_removed = filter_dataframe(test_df, corrupted_data.get('test', []))

    return train_df_filtered, dev_df_filtered, test_df_filtered, train_removed, dev_removed, test_removed

In [28]:
if __name__ == "__main__":
    # Re-read the three MELD annotation tables from the input dataset
    train_sent_emo = pd.read_csv(
        '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_sent_emo.csv'
    )
    dev_sent_emo = pd.read_csv(
        '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/dev_sent_emo.csv'
    )
    test_sent_emo = pd.read_csv(
        '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/test_sent_emo.csv'
    )

    # Despite its name, this variable holds the *path* of the corrupted-video report
    MELD_corrupted_video_data = "/kaggle/working/MELD_corrupted_video_data.json"

    # Split each table into the rows to keep and the rows dropped as corrupted
    (
        train_sent_emo_cleaned,
        dev_sent_emo_cleaned,
        test_sent_emo_cleaned,
        train_sent_emo_removed,
        dev_sent_emo_removed,
        test_sent_emo_removed,
    ) = remove_corrupted_records(
        MELD_corrupted_video_data,
        train_sent_emo,
        dev_sent_emo,
        test_sent_emo,
    )

    # Cleaned tables -> CSV (written relative to the current working directory)
    train_sent_emo_cleaned.to_csv("train_sent_emo_cleaned.csv", index=False)
    dev_sent_emo_cleaned.to_csv("dev_sent_emo_cleaned.csv", index=False)
    test_sent_emo_cleaned.to_csv("test_sent_emo_cleaned.csv", index=False)

    # Removed rows -> CSV, kept for auditing what was dropped
    train_sent_emo_removed.to_csv("train_sent_emo_removed.csv", index=False)
    dev_sent_emo_removed.to_csv("dev_sent_emo_removed.csv", index=False)
    test_sent_emo_removed.to_csv("test_sent_emo_removed.csv", index=False)

    print("Corrupted records removed, cleaned DataFrames saved, and removed records saved.")

Corrupted records removed, cleaned DataFrames saved, and removed records saved.
