In [1]:
import json
from collections import defaultdict

# Merge per-utterance text (RoBERTa), audio (OpenSMILE) and video (SlowFast)
# feature files into a single JSON file. Only utterances that appear in all
# three modalities are kept.


def find_base_key(item, reserved_keys, feature_suffix):
    """Return the utterance key (e.g. '0_0') of one feature record.

    The utterance key is the first key, in the record's own key order, that
    is neither listed in ``reserved_keys`` nor ends with ``feature_suffix``.

    Args:
        item: One record (dict) from a feature file.
        reserved_keys: Metadata keys that are never the utterance key.
        feature_suffix: Suffix marking the record's feature-vector key.

    Returns:
        The first qualifying key.

    Raises:
        ValueError: If every key is reserved or a feature key. (A bare
            ``next(...)`` would surface this as an opaque StopIteration.)
    """
    for key in item:
        if key not in reserved_keys and not key.endswith(feature_suffix):
            return key
    raise ValueError(f"No utterance key found in record with keys: {list(item)}")


def build_audio_lookup(audio_items):
    """Index audio records by utterance key.

    Returns:
        dict mapping utterance key -> {"path": ..., "features": ...}.
        A later record with the same key overwrites an earlier one.
    """
    lookup = {}
    for item in audio_items:
        key = find_base_key(item, ("y", "label"), "_opensmile_features")
        lookup[key] = {
            "path": item[key],
            "features": item[f"{key}_opensmile_features"],
        }
    return lookup


def build_video_lookup(video_items):
    """Index video records by utterance key.

    Returns:
        dict mapping utterance key ->
        {"video_path": ..., "features": ..., "frames_dir": ..., "mask_info": ...}.
        A later record with the same key overwrites an earlier one.
    """
    lookup = {}
    for item in video_items:
        key = find_base_key(
            item, ("y", "label", "frames_dir", "mask_info"), "_slowfast_features"
        )
        lookup[key] = {
            "video_path": item[key],
            # The feature key is read with a DOUBLE underscore; the
            # single-underscore suffix used for filtering above still matches it.
            "features": item[f"{key}__slowfast_features"],
            "frames_dir": item["frames_dir"],
            "mask_info": item["mask_info"],
        }
    return lookup


def combine_split(text_items, audio_lookup, video_lookup):
    """Join text records with their audio and video counterparts.

    Args:
        text_items: Text records of one split.
        audio_lookup: Result of ``build_audio_lookup`` for the same split.
        video_lookup: Result of ``build_video_lookup`` for the same split.

    Returns:
        (entries, dropped) where ``entries`` is the list of combined records
        and ``dropped`` lists the utterance keys that were skipped because
        audio or video features were missing.
    """
    entries = []
    dropped = []
    for item in text_items:
        text_key = find_base_key(item, ("y", "label"), "_RoBERTa")

        if text_key not in audio_lookup or text_key not in video_lookup:
            dropped.append(text_key)
            continue

        audio_info = audio_lookup[text_key]
        video_info = video_lookup[text_key]

        # Plain list concatenation, in the order text + audio + video.
        concatenated = (
            item[f"{text_key}_RoBERTa"]
            + audio_info["features"]
            + video_info["features"]
        )

        # The utterance key is deliberately inserted first so it stays the
        # first key of the serialized record.
        entries.append({
            text_key: item[text_key],  # Original text
            f"{text_key}_audio_path": audio_info["path"],
            f"{text_key}_video_path": video_info["video_path"],
            "frames_dir": video_info["frames_dir"],
            "mask_info": video_info["mask_info"],
            f"{text_key}_Concatenated": concatenated,
            "y": item["y"],
            "label": item["label"],
        })
    return entries, dropped


# In a notebook cell __name__ is "__main__", so this section runs exactly as a
# flat script would; the guard only allows the helpers above to be imported
# without touching the Kaggle input files.
if __name__ == "__main__":
    # Load all feature files (JSON is UTF-8 by specification).
    with open('/kaggle/input/extracted-features/Textual_Features_RoBERTa.json',
              encoding="utf-8") as f:
        text_data = json.load(f)
    with open('/kaggle/input/extracted-features/Audio_Features_OpenSMILE.json',
              encoding="utf-8") as f:
        audio_data = json.load(f)
    with open('/kaggle/input/extracted-features/Video_Features_SlowFast.json',
              encoding="utf-8") as f:
        video_data = json.load(f)

    # Initialize the final combined data structure
    combined_data = {"train": [], "dev": [], "test": []}

    # Process each split (train, dev, test)
    for split in ["train", "dev", "test"]:
        audio_lookup = build_audio_lookup(audio_data[split])
        video_lookup = build_video_lookup(video_data[split])

        entries, dropped = combine_split(text_data[split], audio_lookup, video_lookup)
        combined_data[split].extend(entries)

        # Surface data loss instead of discarding unmatched utterances silently.
        if dropped:
            print(
                f"Warning: dropped {len(dropped)} of {len(text_data[split])} "
                f"{split} utterances without matching audio/video features"
            )

    # Save the combined data to a new JSON file
    with open("Multimodal_Combined_Features.json", "w") as f:
        json.dump(combined_data, f, indent=4)

    print("Multimodal combined JSON file created successfully!")

Multimodal combined JSON file created successfully!


In [2]:
import json

# Attach MELD speaker names to every combined (text + audio + video) record
# and write the result to a new JSON file.

# Every non-utterance key a combined record can carry. 'speaker' is included
# so a record that already has a speaker attached is still handled correctly.
RECORD_METADATA_KEYS = frozenset({"y", "label", "frames_dir", "mask_info", "speaker"})
RECORD_METADATA_SUFFIXES = ("_audio_path", "_video_path", "_Concatenated")


def find_utterance_key(entry):
    """Return the utterance key (e.g. '0_0') of a combined record, or None.

    All known metadata keys and suffixed keys are skipped, so the result does
    not depend on the utterance key happening to be the first key.
    """
    for key in entry:
        if key in RECORD_METADATA_KEYS or key.endswith(RECORD_METADATA_SUFFIXES):
            continue
        return key
    return None


def build_speaker_mapping(meld_items):
    """Map (split, '<dialog>_<utterance>') -> speaker for each MELD item.

    A later item with the same (split, dialog, utterance) overwrites an
    earlier one.
    """
    return {
        (item['split'], f"{item['dialog']}_{item['utterance']}"): item['speaker']
        for item in meld_items
    }


def add_speakers(combined, speaker_mapping, splits=("train", "dev", "test")):
    """Add a 'speaker' field to every record of ``combined`` in place.

    Records whose utterance key has no entry in ``speaker_mapping`` get the
    speaker "Unknown" and a warning is printed. Records with no identifiable
    utterance key are left untouched (also with a warning).

    Args:
        combined: dict mapping split name -> list of combined records.
        speaker_mapping: Result of ``build_speaker_mapping``.
        splits: Split names to process; each must be a key of ``combined``.
    """
    for split in splits:
        for entry in combined[split]:
            base_key = find_utterance_key(entry)

            if base_key is None:
                # Print only the keys: the record holds a full concatenated
                # feature vector that would flood the cell output.
                print(f"Warning: Couldn't find base key in entry with keys: {list(entry)}")
                continue

            map_key = (split, base_key)
            if map_key in speaker_mapping:
                entry['speaker'] = speaker_mapping[map_key]
            else:
                print(f"Warning: No speaker found for {base_key} in {split} split")
                entry['speaker'] = "Unknown"


# In a notebook cell __name__ is "__main__", so this section runs exactly as a
# flat script would; the guard only allows the helpers above to be imported
# without touching the Kaggle files.
if __name__ == "__main__":
    # Load the existing text+audio+video combined features
    with open('/kaggle/working/Multimodal_Combined_Features.json', encoding="utf-8") as f:
        combined_features = json.load(f)

    # Load the MELD cleaned dataset
    with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Final Format/MELD_Data_Cleaned_Processed.json',
              encoding="utf-8") as f:
        meld_data = json.load(f)

    # Structure: {(split, dia_utt): speaker}
    speaker_mapping = build_speaker_mapping(meld_data['data'])

    # Add speaker information to the combined features
    add_speakers(combined_features, speaker_mapping)

    # Save the enhanced combined features
    with open("Enhanced_Combined_Features.json", "w") as f:
        json.dump(combined_features, f, indent=4)

    print("Enhanced JSON file created with speaker information!")

Enhanced JSON file created with speaker information!
