## Part 6: Full Protocol
#### With GPU-accelerated Ridge Regression Using the Himalaya Library

This notebook tutorial walks through the full crossmodal fMRI prediction process using the BridgeTower model. We will walk through extracting features from natural stimuli using BridgeTower layers, building voxelwise encoding models to predict fMRI data from stimuli features, and finally predicting language fMRI data using the vision encoding model and predicting visual fMRI data using the language encoding model.

In [1]:
# Analysis parameters shared by the cells below.
subject = 'S1'  # subject ID (S1-S5); used to build the fMRI data paths
modality = 'vision'  # 'vision' or 'language'
layer = 8  # key into the layer table built in setup_model (1-13)

## 1 Feature Extraction
We'll begin by putting our natural stimuli through the BridgeTower model and extracting feature representations from the layer specified above

### 1.1 Load Stimuli

#### 1.1.1 Movie Stimuli
Our movie data are stored in HDF format so we need a helper function to load them

In [2]:
def load_hdf5_array(file_name, key=None, slice=slice(0, None)):
    """Read one dataset, or every dataset, from an HDF5 file.

    Parameters
    ----------
    file_name: string
        Path of the hdf5 file, opened read-only.
    key: string
        Name of the dataset to read. When omitted, every top-level key of
        the file is read.
    slice: slice, or tuple of slices
        Index applied to each dataset that is read (`dataset[slice]`).
        Pass a tuple of slices to index several dimensions at once.
        The default selects everything.

    Returns
    -------
    result : array or dictionary
        The indexed dataset when `key` is given, otherwise a dictionary
        mapping each key of the file to its indexed dataset.
    """
    with h5py.File(file_name, mode='r') as hf:
        # Single-dataset request: index it and return straight away.
        if key is not None:
            return hf[key][slice]

        # No key given: collect every dataset in the file.
        return {name: hf[name][slice] for name in hf.keys()}

In [None]:
# Load the 'stimuli' dataset of the test clip and of the twelve training
# clips (train_00 ... train_11), in that order.
(test,
 train_00, train_01, train_02, train_03, train_04, train_05,
 train_06, train_07, train_08, train_09, train_10, train_11) = [
    load_hdf5_array('data/raw_stimuli/shortclips/stimuli/' + clip + '.hdf',
                    key='stimuli')
    for clip in ['test'] + [f'train_{number:02d}' for number in range(12)]
]

#### 1.1.2 Story Stimuli
Our story transcripts are stored in TextGrid format, so we'll create a helper function that loads each transcript into an array of words.

In [3]:
def _quoted_text(line):
    """Return the text between the first pair of double quotes in `line`."""
    match = re.search(r'"([^"]*)"', line.strip())
    if match is None:
        raise ValueError(f"Expected a quoted word but found: {line!r}")
    return match.group(1)


def _legacy_words(data):
    """Collect words from the layout used by the `legacy` transcript."""
    # Important info starts at line 5. Every line starting with '2' is
    # followed by the line holding the quoted word. Positions are tracked
    # directly so repeated line texts cannot point at the wrong word.
    return [_quoted_text(data[position + 1])
            for position in range(5, len(data) - 1)
            if data[position].startswith('2')]


def _life_words(data):
    """Collect words from the layout used by the `life` transcript."""
    # Only the FIRST line containing "word" marks the word section; later
    # transcript entries such as "words" must not move the starting point.
    for position, line in enumerate(data):
        if "word" in line:
            break
    else:
        raise ValueError('No line containing "word" found in TextGrid.')

    # The first word sits 6 lines after the marker, then every third line.
    return [_quoted_text(entry) for entry in data[position + 6::3]]


def _tiered_words(data):
    """Collect words from the `item [2]` tier of a long-format TextGrid."""
    # Important info starts at line 8. We only want item [2] info because
    # those are the words instead of phonemes. As before, the last
    # matching line wins.
    index = None
    for position in range(8, len(data)):
        if "item [2]" in data[position]:
            index = position
    if index is None:
        raise ValueError('No "item [2]" tier found in TextGrid.')

    summary_info = [line.strip() for line in data[index + 1:index + 6]]
    print(summary_info)

    # Each "intervals" line is followed, three lines later, by its text.
    word_script = data[index + 6:]
    return [_quoted_text(word_script[ind + 3])
            for ind, line in enumerate(word_script)
            if "intervals" in line]


def textgrid_to_array(textgrid):
    """Load the transcript of a TextGrid file into an array of words.

    The `legacy` and `life` transcripts are recognised by their exact
    path and parsed with dedicated layouts; every other file is parsed as
    a tiered TextGrid whose `item [2]` tier holds the words.

    Parameters
    ----------
    textgrid: string
        TextGrid file name.

    Returns
    -------
    full_transcript : Array
        Array with each word in the story, in file order.

    Raises
    ------
    ValueError
        If the expected section marker or a quoted word is missing.
    """
    with open(textgrid, 'r') as file:
        data = file.readlines()

    if textgrid == 'data/raw_stimuli/textgrids/stimuli/legacy.TextGrid':
        full_transcript = _legacy_words(data)
    elif textgrid == 'data/raw_stimuli/textgrids/stimuli/life.TextGrid':
        full_transcript = _life_words(data)
    else:
        full_transcript = _tiered_words(data)

    return np.array(full_transcript)

In [None]:
# Parse the word transcript of each of the ten stories, in this order.
(alternateithicatom, avatar, howtodraw, legacy, life,
 myfirstdaywiththeyankees, naked, odetostepfather, souls,
 undertheinfluence) = [
    textgrid_to_array("data/raw_stimuli/textgrids/stimuli/" + story
                      + ".TextGrid")
    for story in ("alternateithicatom", "avatar", "howtodraw", "legacy",
                  "life", "myfirstdaywiththeyankees", "naked",
                  "odetostepfather", "souls", "undertheinfluence")
]

### 1.2 Run Stimuli through Feature Extraction

#### 1.2.1 Helper Functions
We need three functions: one to set up the BridgeTower model, one to run the movie stimuli, and one to run the story stimuli.

**Note**: These functions use GPU acceleration through PyTorch. They will take significantly longer using CPUs only.

In [4]:
def setup_model(layer):
    """Load BridgeTower and attach a forward hook to one of its modules.

    Parameters
    ----------
    layer: int
        Key (1-13) selecting which BridgeTower module receives the
        forward hook. Any other value raises a KeyError.

    Returns
    -------
    device : torch device, cuda when available and cpu otherwise.
    model: BridgeTower model, moved to `device`.
    processor: BridgeTower processor.
    features: Dictionary
        Filled by the hook on every forward pass: the key is
        `layer_<layer>` and the value is the detached last element of the
        hooked module's output.
    layer_selected: Handle of the registered hook; call `.remove()` on it
        to detach the hook.
    """
    # Pick the device first, then load the pretrained weights onto it.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BridgeTowerModel.from_pretrained(
        "BridgeTower/bridgetower-base").to(device)

    # Lookup table from layer number to the module that gets hooked.
    hookable_modules = {
        1: model.cross_modal_text_transform,
        2: model.cross_modal_image_transform,
        3: model.token_type_embeddings,
        4: model.vision_model.visual.ln_post,
        5: model.text_model.encoder.layer[-1].output.LayerNorm,
        6: model.cross_modal_image_layers[-1].output,
        7: model.cross_modal_text_layers[-1].output,
        8: model.cross_modal_image_pooler,
        9: model.cross_modal_text_pooler,
        10: model.cross_modal_text_layernorm,
        11: model.cross_modal_image_layernorm,
        12: model.cross_modal_text_link_tower[-1],
        13: model.cross_modal_image_link_tower[-1],
    }

    # The hook writes into this dictionary on each forward pass.
    features = {}
    feature_key = f'layer_{layer}'

    def store_last_output(_module, _inputs, output):
        # Keep only the last element of the output, cut from the graph.
        features[feature_key] = output[-1].detach()

    # Attach the hook to the requested module.
    target_module = hookable_modules[layer]
    layer_selected = target_module.register_forward_hook(store_last_output)

    processor = BridgeTowerProcessor.from_pretrained(
        "BridgeTower/bridgetower-base")

    return device, model, processor, features, layer_selected

In [5]:
def _average_window(tensors):
    """Average a list of same-rank tensors, zero-padding smaller ones."""
    first_shape = tensors[0].shape
    if all(tensor.shape == first_shape for tensor in tensors):
        return torch.stack(tensors).mean(dim=0)

    # Shapes differ: pad every tensor (at the end of each dimension) up to
    # the largest size seen in that dimension.
    target_shape = [max(sizes)
                    for sizes in zip(*(tensor.shape for tensor in tensors))]
    padded_tensors = []
    for tensor in tensors:
        # torch.nn.functional.pad expects (left, right) pairs starting
        # from the LAST dimension.
        pad_spec = []
        for dim in reversed(range(tensor.dim())):
            pad_spec.extend([0, target_shape[dim] - tensor.size(dim)])
        padded_tensors.append(
            torch.nn.functional.pad(tensor, tuple(pad_spec)))
    return torch.stack(padded_tensors).mean(dim=0)


def get_movie_features(movie_data, layer, n=30):
    """Extract BridgeTower features for a movie, averaged every n frames.

    Each frame is passed through the model together with an empty text
    input, and the hooked layer's output is collected. Every `n`
    consecutive frames are averaged into one feature vector. Frames left
    over after the last complete window are dropped.

    Parameters
    ----------
    movie_data: Array or string
        The movie frames (iterated along the first axis), or the name of
        an hdf file whose 'stimuli' dataset holds them.
    layer: int
        Layer key passed to `setup_model`.
    n (optional): int
        Number of frames to average over. Set at 30 to mimic an MRI
        TR = 2 with a 15 fps movie.

    Returns
    -------
    data : Array
        Stacked window averages for the selected layer; the first axis
        has one entry per complete window of `n` frames.

    Raises
    ------
    ValueError
        If `movie_data` holds fewer than `n` frames.
    """
    # Callers may pass frames that are already loaded; only hit the disk
    # when a file name was given.
    if isinstance(movie_data, (str, bytes)) or hasattr(movie_data,
                                                       '__fspath__'):
        print("loading HDF array")
        movie_data = load_hdf5_array(movie_data, key='stimuli')

    print("Running movie through model")

    # Define Model
    device, model, processor, features, layer_selected = setup_model(layer)
    layer_name = f'layer_{layer}'

    window = []    # features of the frames in the current window
    averaged = []  # one averaged feature tensor per complete window

    try:
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for image in movie_data:
                model_input = processor(image, "", return_tensors="pt")
                model_input = {key: value.to(device)
                               for key, value in model_input.items()}

                model(**model_input)
                window.append(features[layer_name])

                # A full window of n frames is ready to be averaged.
                if len(window) == n:
                    averaged.append(_average_window(window))
                    window = []
    finally:
        # Always detach the hook, even if the forward pass failed.
        layer_selected.remove()

    if not averaged:
        raise ValueError(
            f"movie_data must contain at least n={n} frames.")

    # NOTE(review): stacking assumes every window average has the same
    # shape - confirm for layers with variable-size outputs.
    return torch.stack(averaged).cpu().numpy()

In [6]:
def get_story_features(story_data, layer, n=20):
    """Extract BridgeTower features for each word of a story.

    Every word is presented to the model with its surrounding context
    (up to `n` words before it, and the word itself plus up to `n - 1`
    words after it) alongside a uniform gray image.

    Parameters
    ----------
    story_data: Array or string
        The words of the story in order, or the name of a TextGrid file
        to load them from.
    layer: int
        Layer key passed to `setup_model`.
    n (optional): int
        Size of the context on each side of the target word.

    Returns
    -------
    data : Array
        Stacked features of the selected layer; the first axis has one
        entry per word.

    Raises
    ------
    ValueError
        If `story_data` contains no words.
    """
    # Callers may pass an already parsed transcript; only parse a
    # TextGrid when a file name was given.
    if isinstance(story_data, (str, bytes)) or hasattr(story_data,
                                                       '__fspath__'):
        print("loading textgrid")
        story_data = textgrid_to_array(story_data)

    print("Running story through model")
    # Define Model
    device, model, processor, features, layer_selected = setup_model(layer)
    layer_name = f'layer_{layer}'

    # Uniform gray image (value 128) that acts as the neutral image input.
    gray_value = 128
    image_array = np.full((512, 512, 3), gray_value, dtype=np.uint8)

    word_features = []

    try:
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for i in range(len(story_data)):
                # Slicing clamps at the end of the story, so only the
                # start needs an explicit lower bound.
                context = story_data[max(0, i - n):i + n]
                word_with_context = ' '.join(context)

                model_input = processor(image_array, word_with_context,
                                        return_tensors="pt")
                model_input = {key: value.to(device)
                               for key, value in model_input.items()}

                model(**model_input)
                word_features.append(features[layer_name])
    finally:
        # Always detach the hook, even if the forward pass failed.
        layer_selected.remove()

    if not word_features:
        raise ValueError("story_data must contain at least one word.")

    # NOTE(review): stacking assumes the hooked output has the same shape
    # for every word - confirm for layers whose output depends on the
    # number of text tokens.
    return torch.stack(word_features).cpu().numpy()

#### 1.2.2 Run Stimuli

In [None]:
# Extract features of the selected layer for the test clip and for each
# of the twelve training clips, in that order.
(test_features,
 train00_features, train01_features, train02_features, train03_features,
 train04_features, train05_features, train06_features, train07_features,
 train08_features, train09_features, train10_features,
 train11_features) = [
    get_movie_features(clip, layer)
    for clip in (test, train_00, train_01, train_02, train_03, train_04,
                 train_05, train_06, train_07, train_08, train_09,
                 train_10, train_11)
]

In [None]:
# Extract features of the selected layer for each of the ten stories.
(ai_features, avatar_features, howtodraw_features, legacy_features,
 life_features, yankees_features, naked_features, ode_features,
 souls_features, under_features) = [
    get_story_features(story, layer)
    for story in (alternateithicatom, avatar, howtodraw, legacy, life,
                  myfirstdaywiththeyankees, naked, odetostepfather, souls,
                  undertheinfluence)
]

## 2 Voxelwise Encoding Models
Now that we have our stimuli features we can build linear models to map the relationship to the fMRI data

### 2.1 Vision Encoding Model

#### 2.1.1 Load fMRI data

In [None]:
# Load the subject's movie-watching fMRI data (train and test files).
train_fmri, test_fmri = (
    np.load("data/fmri_data/moviedata/" + subject + split_file)
    for split_file in ("/train.npy", "/test.npy")
)

#### 2.1.2 Prep the data
We need a few helper functions to help us prep the data. One is to remove NaNs from the fMRI data (those are out of range values) and one is to generate the leave one out protocol for the Ridge Regression model.

In [7]:
def remove_nan(data):
    """Drop NaN entries and return a 2D (first-axis, remaining) array.

    Parameters
    ----------
    data: Array
        Array that may contain NaNs. The reshape requires that the same
        number of non-NaN values remains for every entry of the first
        axis.

    Returns
    -------
    data_reshaped : Array
        The non-NaN values with shape (data.shape[0], -1). The shape is
        also printed.
    """
    # Boolean indexing flattens the array and keeps the non-NaN values.
    kept_values = data[np.logical_not(np.isnan(data))]

    # Restore the first axis; everything else collapses into one axis.
    n_rows = data.shape[0]
    data_reshaped = kept_values.reshape(n_rows, -1)

    print("fMRI shape:", data_reshaped.shape)
    return data_reshaped

In [8]:
def generate_leave_one_run_out(n_samples, run_onsets, random_state=None,
                               n_runs_out=1):
    """Yield leave-one-run-out cross-validation splits.

    One split is produced per run.

    Parameters
    ----------
    n_samples : int
        Total number of samples in the training set.
    run_onsets : array of int of shape (n_runs, )
        Indices of the run onsets.
    random_state : None | int | instance of RandomState
        Random state for the shuffling operation.
    n_runs_out : int
        Number of runs to leave out in the validation set. Default to one.

    Yields
    ------
    train : array of int of shape (n_samples_train, )
        Training set indices.
    val : array of int of shape (n_samples_val, )
        Validation set indices.

    Raises
    ------
    ValueError
        If splitting the samples at the onsets produces an empty run.
    """
    rng = check_random_state(random_state)
    n_runs = len(run_onsets)

    # One permutation of the runs per left-out slot. Each run is therefore
    # used for validation, but with n_runs_out > 1 the same run can appear
    # twice in a single split.
    val_run_table = np.array(
        [rng.permutation(n_runs) for _ in range(n_runs_out)])

    # Cut the sample indices into runs at every onset after the first.
    runs = np.split(np.arange(n_samples), run_onsets[1:])
    for run in runs:
        if len(run) == 0:
            raise ValueError("Some runs have no samples. Check that run_onsets "
                             "does not include any repeated index, nor the last "
                             "index.")

    # Each column of the table lists the validation runs of one split.
    for val_runs in val_run_table.T:
        train_parts = []
        val_parts = []
        for jj in range(n_runs):
            if jj in val_runs:
                val_parts.append(runs[jj])
            else:
                train_parts.append(runs[jj])
        train = np.hstack(train_parts)
        val = np.hstack(val_parts)
        yield train, val

In [9]:
# Strip the NaN entries from the movie fMRI data (train, then test).
train_fmri_clean, test_fmri_clean = (
    remove_nan(recording) for recording in (train_fmri, test_fmri)
)

NameError: name 'train_fmri' is not defined

In [None]:
# Specify feature data and fMRI data.
# Both lists are stacked row-wise in the next cell: the fMRI list is
# ordered train then test, the feature list train00..train11 then test.
vision_fmri_arrays = [train_fmri_clean, test_fmri_clean]
vision_feature_arrays = [train00_features, train01_features, train02_features,
                  train03_features, train04_features, train05_features,
                  train06_features, train07_features, train08_features,
                  train09_features, train10_features, train11_features,
                  test_features]

Since we're creating voxelwise encoding models that will be used to predict fMRI data for the opposite modality, we don't need to hold out any data from training

In [None]:
# Combine data: stack all runs row-wise into a single response matrix (Y)
# and a single feature matrix (X).
vision_Y_train = np.vstack(vision_fmri_arrays)
vision_X_train = np.vstack(vision_feature_arrays)

#### 2.1.3 Set-Up Ridge Regression

We also don't want to split a single run in half during cross-validation, so we will split the data at the onset of each run (each feature array) in the X_train data.

In [None]:
def run_onsets(feature_arrays):
    """Compute where each run starts once the arrays are stacked row-wise.

    Parameters
    ----------
    feature_arrays: List
        List of feature arrays, one per run, in stacking order.

    Returns
    -------
    run_onsets : List of int
        Starting row of each run, one entry per array. The first onset is
        0. The total number of rows is deliberately NOT included:
        `generate_leave_one_run_out` splits the samples at
        `run_onsets[1:]` and rejects the empty run that a trailing end
        index would create.
    """
    onsets = []
    next_onset = 0
    for array in feature_arrays:
        onsets.append(next_onset)
        next_onset += array.shape[0]

    return onsets

In [10]:
# Define cross-validation: compute the run onsets of the vision features
# from the row count of each feature array, so folds follow run boundaries.
vision_run_onsets = run_onsets(vision_feature_arrays)

NameError: name 'feature_arrays' is not defined

In [None]:
def setup_ridge(train_data, run_onsets):
    """Build the ridge regression pipeline used for the encoding models.

    The pipeline centers the features, adds delayed copies of them
    (delays 1-4) and fits a cross-validated ridge regression with
    leave-one-run-out splits.

    Parameters
    ----------
    train_data: Array
        Training feature matrix. Only its number of rows is used here, to
        generate the cross-validation splits; the function does not
        modify or cast it (cast to float32 before fitting if desired).
    run_onsets: List or array of int
        Starting row of each run in `train_data`.

    Returns
    -------
    pipeline : sklearn Pipeline
        Unfitted pipeline (scaler, delayer, RidgeCV).
    """
    n_samples_train = train_data.shape[0]
    cv = generate_leave_one_run_out(n_samples_train, run_onsets)
    cv = check_cv(cv)  # cross-validation splitter into a reusable list

    # Center the features only (no scaling to unit variance).
    scaler = StandardScaler(with_mean=True, with_std=False)

    delayer = Delayer(delays=[1, 2, 3, 4])

    # Request the GPU backend; on_error="warn" warns instead of raising
    # when it cannot be set.
    backend = set_backend("torch_cuda", on_error="warn")
    print(backend)

    alphas = np.logspace(1, 20, 20)

    ridge_cv = RidgeCV(
        alphas=alphas, cv=cv,
        solver_params=dict(n_targets_batch=500, n_alphas_batch=5,
                           n_targets_batch_refit=100))

    pipeline = make_pipeline(
        scaler,
        delayer,
        ridge_cv,
    )

    return pipeline

In [None]:
# Build the (unfitted) ridge pipeline for the vision encoding model.
vision_pipeline = setup_ridge(vision_X_train, vision_run_onsets)

#### 2.1.4 Run Ridge Regression

In [None]:
# Fit the model: movie features (X) -> movie fMRI responses (Y).
_ = vision_pipeline.fit(vision_X_train, vision_Y_train)

In [None]:
# Get coefficients
# The backend object created inside setup_ridge is local to that function,
# so fetch it here before converting the coefficients to numpy.
backend = set_backend("torch_cuda", on_error="warn")
vision_coef = vision_pipeline[-1].coef_
vision_coef = backend.to_numpy(vision_coef)
print("(n_delays * n_features, n_voxels) =", vision_coef.shape)

# Normalize coefficients: unit L2 norm per voxel (column)
vision_coef /= np.linalg.norm(vision_coef, axis=0)[None]
# coef *= np.sqrt(np.maximum(0, scores))[None]

# split the ridge coefficients per delays
delayer = vision_pipeline.named_steps['delayer']
vision_coef_per_delay = delayer.reshape_by_delays(vision_coef, axis=0)
print("(n_delays, n_features, n_voxels) =", vision_coef_per_delay.shape)
del vision_coef

# average over delays
vision_average_coef = np.mean(vision_coef_per_delay, axis=0)
print("(n_features, n_voxels) =", vision_average_coef.shape)
del vision_coef_per_delay

### 2.2 Language Encoding Model

#### 2.2.1 Load fMRI data

In [None]:
# Load the subject's story-listening fMRI data, one file per story.
# The directory prefix is written once so every path gets the same
# "storydata/<subject>/" layout.
(ai_fmri, avatar_fmri, howtodraw_fmri, legacy_fmri, life_fmri,
 yankees_fmri, naked_fmri, ode_fmri, souls_fmri, under_fmri) = [
    np.load("data/fmri_data/storydata/" + subject + "/" + story + ".npy")
    for story in ("alternateithicatom", "avatar", "howtodraw", "legacy",
                  "life", "myfirstdaywiththeyankees", "naked",
                  "odetostepfather", "souls", "undertheinfluence")
]

#### 2.2.2 Prep the data
We need a few additional helper functions to prep this data. A resampler function will help us resample the feature data to fMRI acquisition time (the movie features were already set to this). And a prep_data function will remove NaNs and do resampling together.

In [1]:
def resample_to_acq(feature_data, fmri_data):
    """Resample features along time to match the number of fMRI samples.

    Parameters
    ----------
    feature_data: Array
        Feature matrix with time along the first axis.
    fmri_data: Array
        fMRI data; only its first-axis length is used, as the target
        number of samples.

    Returns
    -------
    Array of shape (fmri_data.shape[0], n_features) holding each feature
    column resampled with a Kaiser window (beta 14). The shape is also
    printed.
    """
    n_acquisitions = fmri_data.shape[0]

    # Work feature by feature: each row of the transpose is one feature's
    # time course.
    per_feature = feature_data.T
    n_features = per_feature.shape[0]
    resampled = np.empty((n_features, n_acquisitions))

    for row in range(n_features):
        time_course = per_feature[row, :]
        resampled[row, :] = resample(time_course, n_acquisitions,
                                     window=('kaiser', 14))

    # Back to (time, features).
    result = resampled.T
    print("Shape after resampling:", result.shape)
    return result

In [2]:
def prep_data(fmri_data, feature_data):
    """Clean one run's fMRI data and align its features to it.

    Parameters
    ----------
    fmri_data: Array
        Raw fMRI data, possibly containing NaNs.
    feature_data: Array
        Stimulus features for the same run.

    Returns
    -------
    fmri_reshaped : Array
        fMRI data with NaNs removed (see `remove_nan`).
    feature_resampled : Array
        Features resampled to the number of rows of `fmri_reshaped`
        (see `resample_to_acq`).
    """
    # The NaN-free fMRI data defines how many samples the features need.
    fmri_reshaped = remove_nan(fmri_data)
    return fmri_reshaped, resample_to_acq(feature_data, fmri_reshaped)

In [None]:
# Resample feature data and remove nans from fmri data.
# Each call returns (cleaned fMRI, features resampled to the fMRI length).
ai_fmri_clean, ai_feat_resamp = prep_data(ai_fmri, ai_features)
# NOTE(review): unlike the other stories, the cleaned avatar data is
# stored back into `avatar_fmri` (no `_clean` suffix); the list built in
# the next cell relies on that name.
avatar_fmri, avatar_feat_resamp = prep_data(avatar_fmri, avatar_features)
howtodraw_fmri_clean, howtodraw_feat_resamp = prep_data(howtodraw_fmri, howtodraw_features)
legacy_fmri_clean, legacy_feat_resamp = prep_data(legacy_fmri, legacy_features)
life_fmri_clean, life_feat_resamp = prep_data(life_fmri, life_features)
yankees_fmri_clean, yankees_feat_resamp = prep_data(yankees_fmri, yankees_features)
naked_fmri_clean, naked_feat_resamp = prep_data(naked_fmri, naked_features)
ode_fmri_clean, ode_feat_resamp = prep_data(ode_fmri, ode_features)
souls_fmri_clean, souls_feat_resamp = prep_data(souls_fmri, souls_features)
under_fmri_clean, under_feat_resamp = prep_data(under_fmri, under_features)

In [None]:
# Specify feature data and fMRI data.
# Both lists use the same story order so that rows line up once stacked.
language_fmri_arrays = [ai_fmri_clean, avatar_fmri, howtodraw_fmri_clean,
                        legacy_fmri_clean, life_fmri_clean, yankees_fmri_clean,
                        naked_fmri_clean, ode_fmri_clean, souls_fmri_clean,
                        under_fmri_clean]
language_feature_arrays = [ai_feat_resamp, avatar_feat_resamp, howtodraw_feat_resamp,
                           legacy_feat_resamp, life_feat_resamp, yankees_feat_resamp,
                           naked_feat_resamp, ode_feat_resamp, souls_feat_resamp,
                           under_feat_resamp]

Again, since we're creating voxelwise encoding models that will be used to predict fMRI data for the opposite modality, we don't need to hold out any data from training

In [None]:
# Combine data: stack all stories row-wise into a single response matrix
# (Y) and a single feature matrix (X).
language_Y_train = np.vstack(language_fmri_arrays)
language_X_train = np.vstack(language_feature_arrays)

#### 2.2.3 Set up Ridge Regression

In [None]:
# Define cross-validation: compute the run onsets of the story features
# from the row count of each feature array, so folds follow story
# boundaries.
language_run_onsets = run_onsets(language_feature_arrays)

In [None]:
# Build the (unfitted) ridge pipeline for the language encoding model.
language_pipeline = setup_ridge(language_X_train, language_run_onsets)

#### 2.2.4 Run Ridge Regression

In [None]:
# Fit the model: story features (X) -> story fMRI responses (Y).
_ = language_pipeline.fit(language_X_train, language_Y_train)

In [None]:
# Get coefficients
# The backend object created inside setup_ridge is local to that function,
# so fetch it here before converting the coefficients to numpy.
backend = set_backend("torch_cuda", on_error="warn")
language_coef = language_pipeline[-1].coef_
language_coef = backend.to_numpy(language_coef)
print("(n_delays * n_features, n_voxels) =", language_coef.shape)

# Normalize coefficients: unit L2 norm per voxel (column)
language_coef /= np.linalg.norm(language_coef, axis=0)[None]
# coef *= np.sqrt(np.maximum(0, scores))[None]

# split the ridge coefficients per delays
delayer = language_pipeline.named_steps['delayer']
language_coef_per_delay = delayer.reshape_by_delays(language_coef, axis=0)
print("(n_delays, n_features, n_voxels) =", language_coef_per_delay.shape)
del language_coef

# average over delays
language_average_coef = np.mean(language_coef_per_delay, axis=0)
print("(n_features, n_voxels) =", language_average_coef.shape)
del language_coef_per_delay

## 3 Crossmodal Prediction

### 3.1 Project features into alternate modality
Before we predict fMRI data using BridgeTower features, we want to transform our features into the crossmodal space. To do this, we will use the Flickr dataset, which consists of caption-image pairs, to build two Ridge Regression models: one that maps BridgeTower visual features from images to BridgeTower language features from captions, and one that maps caption features to image features. This way we will have an image-to-caption matrix to project visual features into the language space and a caption-to-image matrix to project language features into the visual space.

#### 3.1.1 Extract Flickr features from BridgeTower

In [None]:
# Load the Flickr30k caption-image pairs and keep the 'test' split.
flickr_dataset = load_dataset("nlphuji/flickr30k")
flickr_test = flickr_dataset['test']

In [None]:
# Define Model
device, model, processor, features, layer_selected = setup_model(layer)
layer_key = f'layer_{layer}'

# Uniform gray image (value 128) used as the neutral image input when only
# the caption is of interest. Built once instead of on every iteration.
gray_value = 128
gray_image_array = np.full((512, 512, 3), gray_value, dtype=np.uint8)

# One [image_vector, caption_vector] pair per Flickr item.
flickr_features = []

try:
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for example in flickr_test:
            image_array = np.array(example['image'])
            # All captions of the item are joined into a single text.
            caption = " ".join(example['caption'])

            # Run image (with an empty text input)
            image_input = processor(image_array, "", return_tensors="pt")
            image_input = {key: value.to(device)
                           for key, value in image_input.items()}
            _ = model(**image_input)
            # Move to the CPU so the collected vectors do not pile up in
            # GPU memory.
            image_vector = features[layer_key].cpu()

            # Run caption (with the gray image)
            caption_input = processor(gray_image_array, caption,
                                      return_tensors="pt")
            caption_input = {key: value.to(device)
                             for key, value in caption_input.items()}
            _ = model(**caption_input)
            caption_vector = features[layer_key].cpu()

            flickr_features.append([image_vector, caption_vector])
finally:
    # Detach the forward hook once extraction is over (or has failed).
    layer_selected.remove()

In [None]:
# Run encoding model
backend = set_backend("torch_cuda", on_error="warn")
print(backend)

# Variables
# Turn the list of [image_vector, caption_vector] pairs into one array
# with the pair index on axis 1. Entries may be torch tensors (possibly
# on the GPU) or arrays.
flickr_array = np.stack([
    np.stack([torch.as_tensor(vector).cpu().numpy() for vector in pair])
    for pair in flickr_features
])
captions = flickr_array[:, 1, :]
images = flickr_array[:, 0, :]

alphas = np.logspace(1, 20, 20)
scaler = StandardScaler(with_mean=True, with_std=False)

ridge_cv = RidgeCV(
    alphas=alphas, cv=5,
    solver_params=dict(n_targets_batch=500, n_alphas_batch=5,
                       n_targets_batch_refit=100))

pipeline = make_pipeline(
    scaler,
    ridge_cv
)

# The same pipeline is fitted twice; each coefficient matrix is copied to
# numpy before the next fit replaces it.
_ = pipeline.fit(images, captions)
coef_images_to_captions = backend.to_numpy(pipeline[-1].coef_)

_ = pipeline.fit(captions, images)
coef_captions_to_images = backend.to_numpy(pipeline[-1].coef_)

#### 3.1.2 Project movie features to language space

In [None]:
# Project the movie (image) features into the caption space.
# The ridge coefficients have shape (n_input_features, n_targets), the
# same layout the encoding-model cells print, so the projection is
# `features @ coef` without a transpose.
(test_transformed,
 train00_transformed, train01_transformed, train02_transformed,
 train03_transformed, train04_transformed, train05_transformed,
 train06_transformed, train07_transformed, train08_transformed,
 train09_transformed, train10_transformed, train11_transformed) = [
    np.dot(movie_features, coef_images_to_captions)
    for movie_features in (
        test_features,
        train00_features, train01_features, train02_features,
        train03_features, train04_features, train05_features,
        train06_features, train07_features, train08_features,
        train09_features, train10_features, train11_features)
]

In [None]:
def create_flatmap(subject, layer, correlations, modality):
    """Plot per-voxel correlations on left/right flatmaps and save a png.

    The correlations are placed back at their voxel positions (using the
    non-NaN mask of the subject's `alternateithicatom` story data),
    mapped to each hemisphere with the subject's mapper matrices, and
    drawn as two scatter plots sharing one colorbar.

    Parameters
    ----------
    subject: string
        A reference to the subject for analysis. Used to load fmri data
        and mappers, and in the output path.
    layer: int
        A layer reference for the BridgeTower model. Only used to name
        the output file.
    correlations: array
        One correlation value per valid (non-NaN) voxel, in the order of
        the flattened mask. If the length differs from the number of
        valid voxels, only the common leading part is used.
    modality: string
        Which modality was used for the base encoding model: 'vision' or
        'language'. Selects the title and the output folder; for any
        other value the figure is shown but not saved.

    Returns
    -------
    None
        The flatmap is saved as a png under `results/` and displayed.
    """
    import os

    # Reverse flattening and masking
    fmri_alternateithicatom = np.load("data/fmri_data/storydata/" + subject +
                                      "/alternateithicatom.npy")

    mask = ~np.isnan(fmri_alternateithicatom[0])  # reference for the mask

    # Start from an all-NaN flattened volume and write the correlations at
    # the positions of the valid voxels.
    flattened_correlations = np.full(mask.size, np.nan)
    valid_indices = np.flatnonzero(mask)
    correlations = np.asarray(correlations)
    n_assigned = min(len(valid_indices), len(correlations))
    flattened_correlations[valid_indices[:n_assigned]] = \
        correlations[:n_assigned]

    # Load mappers
    lh_mapping_matrix = load_npz("data/mappers/" + subject +
                                 "_listening_forVL_lh.npz")
    lh_vertex_correlation_data = lh_mapping_matrix.dot(flattened_correlations)
    lh_vertex_coords = np.load("data/mappers/" + subject +
                               "_vertex_coords_lh.npy")

    rh_mapping_matrix = load_npz("data/mappers/" + subject +
                                 "_listening_forVL_rh.npz")
    rh_vertex_correlation_data = rh_mapping_matrix.dot(flattened_correlations)
    rh_vertex_coords = np.load("data/mappers/" + subject +
                               "_vertex_coords_rh.npy")

    vmin, vmax = -0.1, 0.1
    fig, axs = plt.subplots(1, 2, figsize=(7, 4))

    # Plot the first flatmap
    sc1 = axs[0].scatter(lh_vertex_coords[:, 0], lh_vertex_coords[:, 1],
                         c=lh_vertex_correlation_data, cmap='RdBu_r',
                         vmin=vmin, vmax=vmax, s=.005)
    axs[0].set_aspect('equal', adjustable='box')  # Ensure equal scaling
    axs[0].set_frame_on(False)
    axs[0].set_xticks([])  # Remove x-axis ticks
    axs[0].set_yticks([])  # Remove y-axis ticks

    # Plot the second flatmap
    _ = axs[1].scatter(rh_vertex_coords[:, 0], rh_vertex_coords[:, 1],
                       c=rh_vertex_correlation_data, cmap='RdBu_r',
                       vmin=vmin, vmax=vmax, s=.005)
    axs[1].set_aspect('equal', adjustable='box')  # Ensure equal scaling
    axs[1].set_frame_on(False)
    axs[1].set_xticks([])  # Remove x-axis ticks
    axs[1].set_yticks([])  # Remove y-axis ticks

    # Adjust layout to make space for the top colorbar
    plt.subplots_adjust(top=0.85, wspace=0)

    # Add a single horizontal colorbar at the top
    cbar_ax = fig.add_axes([0.25, 0.9, 0.5, 0.03])
    cbar = fig.colorbar(sc1, cax=cbar_ax, orientation='horizontal')

    # Set the color bar to only display min and max values
    cbar.set_ticks([vmin, vmax])
    cbar.set_ticklabels([f'{vmin}', f'{vmax}'])

    # Remove the color bar box
    cbar.outline.set_visible(False)

    # Title and output folder depend on the direction of the prediction.
    if modality == 'vision':
        latex = r"$r_{\mathit{movie \rightarrow story}}"
        output_dir = 'results/movie_to_story/' + subject
    elif modality == 'language':
        latex = r"$r_{\mathit{story \rightarrow movie}}"
        output_dir = 'results/story_to_movie/' + subject
    else:
        latex = None
        output_dir = None

    if output_dir is not None:
        plt.title(f'{subject}\n{latex}$')
        os.makedirs(output_dir, exist_ok=True)
        # `layer` is an int, so format it instead of concatenating.
        plt.savefig(f'{output_dir}/layer{layer}_visual.png', format='png')
    plt.show()