In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import json
import glob
import random
from btbench_config import *
from braintreebank_subject import Subject
from btbench_datasets import BrainTreebankSubjectTrialBenchmarkDataset

# Trial ids listed per subject id; expanded below into (subject_id, trial_id) pairs.
_trial_ids_by_subject = {
    1: (0, 1, 2),
    2: (0, 1, 2, 3, 4, 5, 6),
    3: (0, 1, 2),
    4: (0, 1, 2),
    5: (0,),
    6: (0, 1, 4),
    7: (0, 1),
    8: (0,),
    9: (0,),
    10: (0, 1),
}

# Every (subject_id, trial_id) pair, ordered by subject and then by trial.
all_subject_trials = [
    (sid, tid)
    for sid, tids in _trial_ids_by_subject.items()
    for tid in tids
]

# Evaluation names currently supported.
all_evaluations = [
    "pitch",
    "rms",
    "onset",
    "speech",
]

# (subject_id, trial_id) pairs used for evaluation, chosen to match the
# Population Transformer (PopT) paper.
all_eval_subject_trials = [
    (1, 2),
    (2, 6),
    (3, 0),
    (4, 2),
    (5, 0),
    (6, 1),
    (10, 0),
]

In [2]:
# Recording session to work with in this notebook.
subject_id = 3
trial_id = 0

# cache=True loads this trial's neural data into RAM; only use it if you have enough memory.
subject = Subject(subject_id, cache=True)

# eval_name can be "pitch", "rms", "onset", or "speech".
dataset = BrainTreebankSubjectTrialBenchmarkDataset(
    subject,
    trial_id,
    dtype=torch.float32,
    eval_name="rms",
)

In [None]:
# Quick sanity check: dataset size plus the shape and label of the first item.
# The first item is fetched once and reused, instead of indexing dataset[0] separately
# for the features and for the label.
first_features, first_label = dataset[0]
print("Items in the dataset:", len(dataset))
print("Shape of the first item: features.shape =", first_features.shape, "label =", first_label)

Items in the dataset: 5379
Shape of the first item: features.shape = torch.Size([124, 5120]) label = tensor(0)


In [4]:
# Sample indices bounding the window that is kept from each item's second (last) axis.
word_onset_idx = int(START_NEURAL_DATA_BEFORE_WORD_ONSET * SAMPLING_RATE) # 1024
word_onset_idx_end = int((START_NEURAL_DATA_BEFORE_WORD_ONSET + END_NEURAL_DATA_AFTER_WORD_ONSET) * SAMPLING_RATE) # 3072

print("Loading dataset...")
# Convert PyTorch dataset to numpy arrays for scikit-learn.
# X is allocated once, as soon as the first item reveals the flattened row width, and is
# then filled row by row. This avoids holding every window twice (a Python list of arrays
# plus the stacked copy that np.array() would build from it).
n_items = len(dataset)
X = np.array([])  # only stays like this when the dataset has no items
y = []
for i in range(n_items):
    features, label = dataset[i]
    window = features.numpy()[:, word_onset_idx:word_onset_idx_end] # take 1 second of raw voltage starting with the word onset
    if i == 0:
        X = np.empty((n_items, window.size), dtype=window.dtype)
    X[i] = window.reshape(-1)  # row-major flatten, same element order as .flatten()
    y.append(label)
y = np.array(y)
print("Dataset loaded")

Loading dataset...
Dataset loaded


In [5]:
# Logistic-regression baseline on the flattened feature matrix X with labels y.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
train_test_split_sequential = True
test_size = 0.2

# Hold out a test set.
if not train_test_split_sequential:
    # Shuffled split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
else:
    # Sequential split: the first (1 - test_size) fraction of items (by index) is used
    # for training and the remainder for testing.
    split_idx = int((1 - test_size) * len(X))
    X_train, y_train = X[:split_idx], y[:split_idx]
    X_test, y_test = X[split_idx:], y[split_idx:]

# Fit the classifier on the training portion.
print("Training logistic regression model...")
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)
print("Logistic regression model trained")

# Score on both portions (clf.score reports mean accuracy for classifiers).
print("Evaluating model...")
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("Model evaluated")

print(f"Training accuracy: {train_score:.3f}")
print(f"Test accuracy: {test_score:.3f}")

Training logistic regression model...
Logistic regression model trained
Evaluating model...
Model evaluated
Training accuracy: 1.000
Test accuracy: 0.625
