# Notebook to analyze CC2017 data and create train/test splits

CC2017 can be found at the following paths in the NFS:
- Path to betas: `/data/vision/oliva/scratch/datasets/CC2017/video_fmri_dataset/TSTrialEstimates/subjectX/estimates-prepared/prepared_allvoxel_pkl/`

- Path to stimuli: `/data/vision/oliva/scratch/datasets/CC2017/video_fmri_dataset/stimuli/clipped2s/mp4/`

Use the download_datasets.py scripts to download them if you have a CSAIL id.

Inside the betas folder, there are 181 pkl files corresponding to 181 ROIs, defined with the CIFTI format, for subject X. Each pkl contains the following keys:
- 'train_data_allvoxel': the normalized fMRI data of size (numvideos, numvoxels). Each repetition of a video is stored as its own row along the first axis; the stimulus shown for each row is given by the corresponding entry of the train_stim_order list.
- 'test_data_allvoxel', 
- 'train_stim_order', 
- 'test_stim_order', 
- 'test_noiseceiling_allvoxel', 
- 'roi_indices_hcp'

In [3]:
import pickle as pkl
import numpy as np
import os

In [6]:

# Load one ROI pickle for sub01 and print a quick overview of its contents.
filename_cc2017 = '../data/betas_cifti_cc2017/sub01/prepared_allvoxel_pkl/Group41_estimates-TSTrialEstimates_z=1.pkl'

with open(filename_cc2017, 'rb') as f:
    data = pkl.load(f)

# On-disk size of the pickle, converted from bytes to megabytes.
_bytes_per_mb = 1024 * 1024
print(os.path.getsize(filename_cc2017) / _bytes_per_mb)

# Keys available in the pickle, followed by the shape of each response matrix.
print(data.keys())
for _key in ('train_data_allvoxel', 'test_data_allvoxel'):
    print(data[_key].shape)

# Stimulus orders: number of train entries, both full lists, and the number
# of distinct train stimuli.
_train_order = data['train_stim_order']
print(len(_train_order))
print(_train_order)
print(data['test_stim_order'])
print(len(set(_train_order)))

# Value range (max, then min) of the train responses, then of the test
# responses; the blank line between the two pairs is kept from the original.
for _key in ('train_data_allvoxel', 'test_data_allvoxel'):
    _responses = data[_key]
    print(_responses.max())
    print(_responses.min())

2070.406481742859
dict_keys(['train_data_allvoxel', 'test_data_allvoxel', 'train_stim_order', 'test_stim_order', 'test_noiseceiling_allvoxel', 'roi_indices_hcp'])
(8604, 13156)
(11990, 13156)
8604
['seg1_begin-2_end-4', 'seg1_begin-4_end-6', 'seg1_begin-6_end-8', 'seg1_begin-8_end-10', 'seg1_begin-10_end-12', 'seg1_begin-12_end-14', 'seg1_begin-14_end-16', 'seg1_begin-16_end-18', 'seg1_begin-18_end-20', 'seg1_begin-20_end-22', 'seg1_begin-22_end-24', 'seg1_begin-24_end-26', 'seg1_begin-26_end-28', 'seg1_begin-28_end-30', 'seg1_begin-30_end-32', 'seg1_begin-32_end-34', 'seg1_begin-34_end-36', 'seg1_begin-36_end-38', 'seg1_begin-38_end-40', 'seg1_begin-40_end-42', 'seg1_begin-42_end-44', 'seg1_begin-44_end-46', 'seg1_begin-46_end-48', 'seg1_begin-48_end-50', 'seg1_begin-50_end-52', 'seg1_begin-52_end-54', 'seg1_begin-54_end-56', 'seg1_begin-56_end-58', 'seg1_begin-58_end-60', 'seg1_begin-60_end-62', 'seg1_begin-62_end-64', 'seg1_begin-64_end-66', 'seg1_begin-66_end-68', 'seg1_begin-68_en

In [12]:
# Remove all files that contain 'z=0' in their name from the target folder.
# WARNING: this permanently deletes files; double-check target_path before
# running the cell.

target_path = '../data/betas_cifti_cc2017/sub03/prepared_allvoxel_pkl/'
for filename in os.listdir(target_path):
    if 'z=0' in filename:
        # os.path.join adds the separator only when it is missing, so the
        # deletion still works if target_path is edited to drop the final '/'.
        os.remove(os.path.join(target_path, filename))

# Define Train and Test sets

For this dataset, train and test sets have already been identified by previous authors.

The train and test stimuli names are contained in the pickles, under keys `train_stim_order` and `test_stim_order`

In [18]:
# Check whether two ROI pickles of the same subject share the same stimulus
# orders: the train orders are printed for visual comparison, the test orders
# are compared with an assert.

f1 = '../data/betas_cifti_cc2017/sub01/prepared_allvoxel_pkl/Group41_estimates-TSTrialEstimates_z=1.pkl'
f2 = '../data/betas_cifti_cc2017/sub01/prepared_allvoxel_pkl/1_estimates-TSTrialEstimates_z=1.pkl'

# First pickle: train stimulus order and shape of the train response matrix.
with open(f1, 'rb') as f:
    data1 = pkl.load(f)

_train_order_1 = data1['train_stim_order']
print(_train_order_1)
print(data1['train_data_allvoxel'].shape)

# Second pickle: same two printouts.
with open(f2, 'rb') as f:
    data2 = pkl.load(f)

print(data2['train_stim_order'])
print(data2['train_data_allvoxel'].shape)

# The test stimulus order must be identical in both files.
_test_order_1 = data1['test_stim_order']
assert _test_order_1 == data2['test_stim_order']

# Number of distinct train / test stimuli, taken from the first pickle.
unique_train_vids = set(_train_order_1)
print(len(unique_train_vids))

unique_test_vids = set(_test_order_1)
print(len(unique_test_vids))

['seg1_begin-2_end-4', 'seg1_begin-4_end-6', 'seg1_begin-6_end-8', 'seg1_begin-8_end-10', 'seg1_begin-10_end-12', 'seg1_begin-12_end-14', 'seg1_begin-14_end-16', 'seg1_begin-16_end-18', 'seg1_begin-18_end-20', 'seg1_begin-20_end-22', 'seg1_begin-22_end-24', 'seg1_begin-24_end-26', 'seg1_begin-26_end-28', 'seg1_begin-28_end-30', 'seg1_begin-30_end-32', 'seg1_begin-32_end-34', 'seg1_begin-34_end-36', 'seg1_begin-36_end-38', 'seg1_begin-38_end-40', 'seg1_begin-40_end-42', 'seg1_begin-42_end-44', 'seg1_begin-44_end-46', 'seg1_begin-46_end-48', 'seg1_begin-48_end-50', 'seg1_begin-50_end-52', 'seg1_begin-52_end-54', 'seg1_begin-54_end-56', 'seg1_begin-56_end-58', 'seg1_begin-58_end-60', 'seg1_begin-60_end-62', 'seg1_begin-62_end-64', 'seg1_begin-64_end-66', 'seg1_begin-66_end-68', 'seg1_begin-68_end-70', 'seg1_begin-70_end-72', 'seg1_begin-72_end-74', 'seg1_begin-74_end-76', 'seg1_begin-76_end-78', 'seg1_begin-78_end-80', 'seg1_begin-80_end-82', 'seg1_begin-82_end-84', 'seg1_begin-84_end-86'

In [None]:
# Saving train_stim_order and test_stim_order as json lists, to follow HAD format
# NOTE(review): within the lines visible here only train_dataset is written to
# disk; the matching write for test_dataset is presumably further down — confirm.

import json

# Read the stimulus orders from one sub01 pickle.
with open('../data/betas_cifti_cc2017/sub01/prepared_allvoxel_pkl/1_estimates-TSTrialEstimates_z=1.pkl', 'rb') as f:
    data = pkl.load(f)

train_dataset = data['train_stim_order']
test_dataset = data['test_stim_order']

# Write the train stimulus order as JSON; opening with 'w' overwrites any
# existing train_set.json.
with open('../data/metadata_cc2017/train_set.json', 'w') as f:
    json.dump(train_dataset, f)