# Produce npy dataset required for fine-tuning

In [1]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Load the pkl file
pkl_file = 'pt_decoding_data_S62.pkl'

with open(pkl_file, 'rb') as f:
    data = pickle.load(f)

print(f"Loaded: {pkl_file}")
print(f"Type: {type(data)}")

  data = pickle.load(f)


Loaded: pt_decoding_data_S62.pkl
Type: <class 'dict'>


In [3]:
data.keys()

dict_keys(['S14', 'S26', 'S23', 'S33', 'S22', 'S39', 'S58', 'S62'])

In [4]:
data['S14'].keys()

dict_keys(['ID', 'X1', 'X1_map', 'y1', 'X2', 'X2_map', 'y2', 'X3', 'X3_map', 'y3', 'y_full_phon', 'X_collapsed', 'y_phon_collapsed', 'y_artic_collapsed', 'pre_pts'])

In [20]:
X = []
y = []

for ID in data:
    for item in [data[ID]['X1'],data[ID]['X2'],data[ID]['X3']]:
        X_item = data[ID]['X1']
        X_item = np.transpose(X_item, (0, 2, 1)) # trials, channels, timepoints
        X.append(X_item)
    for item in [data[ID]['y1'],data[ID]['y2'],data[ID]['y3']]:
        y_item = data[ID]['y1']
        y.append(y_item)

X = np.array(X)
X = X.reshape(X.shape[0] * X.shape[1], X.shape[2], X.shape[3])

y = np.array(y)
y = y.reshape(y.shape[0] * y.shape[1])

# Convert labels from 1-9 to 0-8
y = y - 1

print(X.shape) # total trials (X1-3, all patients), channels, timesteps
print(y.shape) # total trials
print(f"Label range: {y.min()} to {y.max()}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (24,) + inhomogeneous part.

In [18]:
np.save('X.npy', X)
np.save('y.npy', y)

In [17]:
# Load the saved npy files to verify
X_loaded = np.load('X.npy')
y_loaded = np.load('y.npy')

print("Loaded X.npy and y.npy")
print(f"X shape: {X_loaded.shape}")
print(f"y shape: {y_loaded.shape}")
print(f"\nLabel verification:")
print(f"Min label: {y_loaded.min()}")
print(f"Max label: {y_loaded.max()}")
print(f"Unique labels: {np.unique(y_loaded)}")
print(f"\nLabel distribution:")
unique, counts = np.unique(y_loaded, return_counts=True)
for label, count in zip(unique, counts):
    print(f"  Label {label}: {count} samples")

Loaded X.npy and y.npy
X shape: (3456, 111, 200)
y shape: (3456,)

Label verification:
Min label: 0
Max label: 8
Unique labels: [0 1 2 3 4 5 6 7 8]

Label distribution:
  Label 0: 408 samples
  Label 1: 408 samples
  Label 2: 408 samples
  Label 3: 504 samples
  Label 4: 240 samples
  Label 5: 288 samples
  Label 6: 360 samples
  Label 7: 576 samples
  Label 8: 264 samples


## Create per-patient datasets

In [19]:
# Create folder for per-patient data
import os

output_dir = 'mydata_patients'
os.makedirs(output_dir, exist_ok=True)

print(f"Created directory: {output_dir}\n")

# Process each patient
for patient_id in data.keys():
    print(f"Processing {patient_id}...")
    
    # Collect X data from X1, X2, X3
    X_patient = []
    for x_key in ['X1', 'X2', 'X3']:
        X_item = data[patient_id][x_key]
        X_item = np.transpose(X_item, (0, 2, 1))  # trials, channels, timepoints
        X_patient.append(X_item)
    
    # Concatenate all trials
    X_patient = np.concatenate(X_patient, axis=0)
    
    # Collect y data from y1, y2, y3
    y_patient = []
    for y_key in ['y1', 'y2', 'y3']:
        y_item = data[patient_id][y_key]
        y_patient.append(y_item)
    
    # Concatenate all labels
    y_patient = np.concatenate(y_patient, axis=0)
    
    # Convert labels from 1-9 to 0-8
    y_patient = y_patient - 1
    
    # Save to files
    np.save(f'{output_dir}/X_{patient_id}.npy', X_patient)
    np.save(f'{output_dir}/y_{patient_id}.npy', y_patient)
    
    print(f"  X_{patient_id}.npy: shape {X_patient.shape}")
    print(f"  y_{patient_id}.npy: shape {y_patient.shape}, labels {y_patient.min()}-{y_patient.max()}\n")

print(f"All patient data saved to {output_dir}/")

Created directory: mydata_patients

Processing S14...
  X_S14.npy: shape (432, 111, 200)
  y_S14.npy: shape (432,), labels 0-8

Processing S26...
  X_S26.npy: shape (444, 111, 200)
  y_S26.npy: shape (444,), labels 0-8

Processing S23...
  X_S23.npy: shape (453, 63, 200)
  y_S23.npy: shape (453,), labels 0-8

Processing S33...
  X_S33.npy: shape (138, 149, 200)
  y_S33.npy: shape (138,), labels 0-8

Processing S22...
  X_S22.npy: shape (453, 74, 200)
  y_S22.npy: shape (453,), labels 0-8

Processing S39...
  X_S39.npy: shape (411, 144, 200)
  y_S39.npy: shape (411,), labels 0-8

Processing S58...
  X_S58.npy: shape (423, 171, 200)
  y_S58.npy: shape (423,), labels 0-8

Processing S62...
  X_S62.npy: shape (534, 201, 200)
  y_S62.npy: shape (534,), labels 0-8

All patient data saved to mydata_patients/
