This script is designed to process multi-unit activity (MUA) data from macaque monkey experiments. The goals are to:
1. Extract and structure neural responses (MUA) from .mat HDF5 files.
2. Label each electrode response by its cortical region (V1, V4, IT).
3. Associate each stimulus image with memorability scores and object categories.
4. Save the final enriched dataset as a CSV file for further analysis.

In [5]:
import h5py
import pandas as pd
import numpy as np
import os
import re
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Set correct base path
base_path = "/content/drive/MyDrive/Uni/msc/Machine_Learning_Project/Data/TVSD"
monkeys = {
    "MonkeyN": {"V1": (1, 512), "V4": (513, 768), "IT": (769, 1024)},
    "MonkeyF": {"V1": (1, 512), "IT": (513, 832), "V4": (833, 1024)}
}

# Function to determine electrode region
def get_region(electrode, region_map):
    for region, (start, end) in region_map.items():
        if start <= electrode <= end:
            return region
    return "Unknown"

# **Fixed Function to Resolve HDF5 References**
def extract_string(f, dataset):
    """Extracts string values from HDF5 datasets, resolving references if needed."""
    if isinstance(dataset, h5py.Reference):
        dataset = f[dataset]  # Dereference first level

    if isinstance(dataset, h5py.Dataset):
        dataset = dataset[()]  # Get actual data

    # **If dataset contains an array with a single reference, resolve it**
    if isinstance(dataset, np.ndarray) and dataset.dtype == object:
        dataset = dataset[0]  # Extract the first reference
        if isinstance(dataset, h5py.Reference):
            dataset = f[dataset][()]  # Follow the reference again

    # **If dataset is an ASCII character array, convert to string**
    if isinstance(dataset, np.ndarray) and dataset.dtype == np.uint16:
        try:
            resolved_str = ''.join(chr(int(c)) for c in dataset.flatten() if int(c) > 0)  # Ignore null characters
            return resolved_str
        except Exception as e:
            print(f"ERROR: Failed to decode dataset {dataset} - {e}")
            return "UNKNOWN"

    if isinstance(dataset, bytes):
        return dataset.decode('utf-8')  # Decode bytes
    elif isinstance(dataset, np.ndarray):
        if dataset.dtype.kind in {'S', 'U'}:
            return dataset.astype(str)[0]

    return str(dataset)  # Convert to string if nothing else works


# **Function to extract object name from Stimulus_Name**
def extract_object_name(stimulus_name):
    """Extracts the object name (e.g. 'aardvark') from the full file path"""
    match = re.search(r'([^/]+)(?:_\d+[a-z]?)?\.jpg$', stimulus_name)
    return match.group(1) if match else stimulus_name  # Extract object or return original


# Function to load memorability data
def load_memorability_data():
    filename = "/content/THINGS_Memorability_Scores.csv"
    print(f"Loading memorability data from {filename}...")
    memo_df = pd.read_csv(filename)
    memo_df["image_name"] = memo_df["image_name"].str.lower().str.strip()
    print(f"Loaded memorability data: {len(memo_df)} rows.")
    return memo_df

# Function to load category linking data
def load_category_data():
    filename = "/content/Concept_to_category_linking.csv"
    print(f"Loading category data from {filename}...")
    category_df = pd.read_csv(filename)
    category_df["concept"] = category_df["concept"].str.lower().str.strip()
    print(f"Loaded category data: {len(category_df)} rows.")
    return category_df

# Function to process each monkey
def process_monkey(monkey, regions, memo_df, category_df):
    print(f"\nProcessing {monkey}...")

    things_imgs_path = os.path.join(base_path, monkey, "things_imgs.mat")
    norm_mua_path = os.path.join(base_path, monkey, "THINGS_normMUA.mat")

    if not os.path.exists(things_imgs_path) or not os.path.exists(norm_mua_path):
        print(f"Error: One or both files for {monkey} are missing.")
        return

    print(f"Opening {things_imgs_path} and {norm_mua_path}...")

    with h5py.File(things_imgs_path, 'r') as f_things, h5py.File(norm_mua_path, 'r') as f_mua:
        print(f"Available keys in things_imgs.mat → {list(f_things.keys())}")

        train_paths = [extract_string(f_things, f_things['train_imgs']['things_path'][i]) for i in range(len(f_things['train_imgs']['things_path']))]
        train_classes = [extract_string(f_things, f_things['train_imgs']['class'][i]) for i in range(len(f_things['train_imgs']['class']))]

        test_paths = [extract_string(f_things, f_things['test_imgs']['things_path'][i]) for i in range(len(f_things['test_imgs']['things_path']))]
        test_classes = [extract_string(f_things, f_things['test_imgs']['class'][i]) for i in range(len(f_things['test_imgs']['class']))]

        print("Extracting MUA responses...")
        train_MUA = f_mua['train_MUA'][()].T  # Transpose to (1024, 22248)
        test_MUA = f_mua['test_MUA'][()].T  # Transpose to (1024, 100)

    num_electrodes, num_train_stimuli = train_MUA.shape
    num_electrodes_test, num_test_stimuli = test_MUA.shape

    assert num_electrodes == num_electrodes_test == 1024, "Mismatch in electrode count!"

    print(f"Train: {num_train_stimuli} stimuli, {num_electrodes} electrodes")
    print(f"Test: {num_test_stimuli} stimuli, {num_electrodes_test} electrodes")

    combined_data = []

    def process_data(paths, classes, mua_data, trial_type):
        """Helper function to process train/test data."""
        for i in range(mua_data.shape[1]):
            stim_name = paths[i].lower().strip().replace("\\", "/")  # Standardize paths
            stim_class = classes[i].lower().strip()

            row = {'Stimulus_Name': stim_name, 'Class': stim_class, 'Trial_Type': trial_type}

            # Add electrode responses and regions
            for j in range(num_electrodes):
                region = get_region(j + 1, regions)
                row[f'Electrode_{j+1}_Region'] = region
                row[f'Electrode_{j+1}'] = mua_data[j, i]

            combined_data.append(row)

    print("Processing training data...")
    process_data(train_paths, train_classes, train_MUA, "train")

    print("Processing test data (averaging repetitions)...")
    # Create a DataFrame for easier grouping and averaging
    test_df = pd.DataFrame({
        'Stimulus_Name': [p.lower().strip().replace("\\", "/") for p in test_paths],
        'Class': [c.lower().strip() for c in test_classes]
    })

    # Repeat electrode data to match
    for j in range(num_electrodes):
        test_df[f'Electrode_{j+1}'] = test_MUA[j, :]

    # Group by stimulus and class (each stimulus shown 30 times), and average
    grouped_test_df = test_df.groupby(['Stimulus_Name', 'Class']).mean().reset_index()

    # Assign regions and flatten into desired format
    for _, row in grouped_test_df.iterrows():
        out_row = {
            'Stimulus_Name': row['Stimulus_Name'],
            'Class': row['Class'],
            'Trial_Type': 'test'
        }
        for j in range(num_electrodes):
            region = get_region(j + 1, regions)
            out_row[f'Electrode_{j+1}_Region'] = region
            out_row[f'Electrode_{j+1}'] = row[f'Electrode_{j+1}']
        combined_data.append(out_row)


    print("Converting to DataFrame...")
    combined_df = pd.DataFrame(combined_data)

    # **Standardize File Paths for Matching**
    print("Standardizing File Paths for Matching...")

    # Extract only the filename to match memorability dataset
    combined_df['Image_File'] = combined_df['Stimulus_Name'].apply(lambda x: x.split("/")[-1].lower().strip())
    memo_df['image_name'] = memo_df['image_name'].str.lower().str.strip()

    # Extract only the object name for category matching
    combined_df['Base_Object'] = combined_df['Image_File'].apply(lambda x: re.sub(r'_\d+[a-z]?\.jpg$', '', x))

    # **Debugging Output: First 5 Processed Entries**
    print("\nFirst 5 Standardized Stimulus Names (after processing):")
    print(combined_df[['Stimulus_Name', 'Image_File', 'Base_Object']].head())

    print("\nFirst 5 Entries in Memorability Dataset:")
    print(memo_df[['image_name', 'cr']].head())

    print("\nFirst 5 Entries in Category Dataset:")
    print(category_df[['concept', 'category_label']].head())

    # **Check for Unmatched Entries Before Merging**
    unmatched_memo = combined_df[~combined_df['Image_File'].isin(memo_df['image_name'])]
    unmatched_category = combined_df[~combined_df['Base_Object'].isin(category_df['concept'])]

    print(f"\nUnmatched Memorability Stimuli: {len(unmatched_memo)}")
    print(f"Unmatched Category Labels: {len(unmatched_category)}")

    # **Debugging Output: First 5 Unmatched Entries**
    if len(unmatched_memo) > 0:
        print("\nFirst 5 Unmatched Memorability Stimuli:")
        print(unmatched_memo[['Stimulus_Name', 'Image_File']].head())

    if len(unmatched_category) > 0:
        print("\nFirst 5 Unmatched Category Labels:")
        print(unmatched_category[['Stimulus_Name', 'Base_Object']].head())

    # **Merge with Memorability Scores**
    print("Merging with memorability scores...")
    merged_df = pd.merge(combined_df, memo_df[['image_name', 'cr']], left_on='Image_File', right_on='image_name', how='left')
    merged_df = merged_df.drop(columns=['image_name', 'Image_File'])

    # **Merge with Category Labels Using Extracted Object Name**
    print("Merging with category labels...")
    merged_df = pd.merge(merged_df, category_df[['concept', 'category_label']], left_on='Base_Object', right_on='concept', how='left')
    merged_df = merged_df.drop(columns=['concept', 'Base_Object'])

    # **Save CSV File**
    csv_path = os.path.join(base_path, f"{monkey}_MUA_responses.csv")
    merged_df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path} | Total Rows: {len(merged_df)}")

    return csv_path



# Load memorability and category data
memorability_df = load_memorability_data()
category_df = load_category_data()

# Process both monkeys
for monkey, region_map in monkeys.items():
    process_monkey(monkey, region_map, memorability_df, category_df)

print("Processing complete!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading memorability data from /content/THINGS_Memorability_Scores.csv...
Loaded memorability data: 26107 rows.
Loading category data from /content/Concept_to_category_linking.csv...
Loaded category data: 1854 rows.

Processing MonkeyN...
Opening /content/drive/MyDrive/Uni/msc/Machine_Learning_Project/Data/TVSD/MonkeyN/things_imgs.mat and /content/drive/MyDrive/Uni/msc/Machine_Learning_Project/Data/TVSD/MonkeyN/THINGS_normMUA.mat...
Available keys in things_imgs.mat → ['#refs#', 'test_imgs', 'train_imgs']
Extracting MUA responses...
Train: 22248 stimuli, 1024 electrodes
Test: 100 stimuli, 1024 electrodes
Processing training data...
Processing test data (averaging repetitions)...


  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'E

Converting to DataFrame...
Standardizing File Paths for Matching...

First 5 Standardized Stimulus Names (after processing):
               Stimulus_Name        Image_File Base_Object
0  aardvark/aardvark_01b.jpg  aardvark_01b.jpg    aardvark
1  aardvark/aardvark_02s.jpg  aardvark_02s.jpg    aardvark
2  aardvark/aardvark_03s.jpg  aardvark_03s.jpg    aardvark
3  aardvark/aardvark_04s.jpg  aardvark_04s.jpg    aardvark
4  aardvark/aardvark_05s.jpg  aardvark_05s.jpg    aardvark

First 5 Entries in Memorability Dataset:
         image_name        cr
0  aardvark_01b.jpg  0.825000
1  aardvark_02s.jpg  0.800000
2  aardvark_03s.jpg  0.878049
3  aardvark_04s.jpg  0.731707
4  aardvark_05s.jpg  0.825000

First 5 Entries in Category Dataset:
           concept      category_label
0         aardvark              animal
1           abacus          home decor
2        accordion  musical instrument
3            acorn                 NaN
4  air_conditioner   electronic device

Unmatched Memorability Sti

  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'Electrode_{j+1}'] = test_MUA[j, :]
  test_df[f'E

Converting to DataFrame...
Standardizing File Paths for Matching...

First 5 Standardized Stimulus Names (after processing):
               Stimulus_Name        Image_File Base_Object
0  aardvark/aardvark_01b.jpg  aardvark_01b.jpg    aardvark
1  aardvark/aardvark_02s.jpg  aardvark_02s.jpg    aardvark
2  aardvark/aardvark_03s.jpg  aardvark_03s.jpg    aardvark
3  aardvark/aardvark_04s.jpg  aardvark_04s.jpg    aardvark
4  aardvark/aardvark_05s.jpg  aardvark_05s.jpg    aardvark

First 5 Entries in Memorability Dataset:
         image_name        cr
0  aardvark_01b.jpg  0.825000
1  aardvark_02s.jpg  0.800000
2  aardvark_03s.jpg  0.878049
3  aardvark_04s.jpg  0.731707
4  aardvark_05s.jpg  0.825000

First 5 Entries in Category Dataset:
           concept      category_label
0         aardvark              animal
1           abacus          home decor
2        accordion  musical instrument
3            acorn                 NaN
4  air_conditioner   electronic device

Unmatched Memorability Sti