# Script for creating sentence-meaning manipulation datasets from the Pereira2018 fMRI stimuli

In [1]:
import re
from pathlib import Path
from os.path import abspath
import os
import numpy as np
import random
import pickle
import csv
import subprocess
import collections
from random import shuffle

In [2]:
# Show the directory the notebook kernel starts in
print(Path.cwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation


In [3]:
# Switch the working directory to the repository root, two levels above the
# notebook's folder (presumably so the `neural_nlp` imports below resolve).
#
# abspath('../..') is evaluated relative to the *current* working directory, so
# it is resolved only once: recomputing it after the chdir below would climb two
# further levels every time this cell is re-run.
if 'importpath' not in globals():
    importpath = abspath('../..')
os.chdir(importpath)
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp


In [7]:
# Settings
save = False  # if storing pkl dataframes

# Fix the seed of both random number generators for reproducibility
random.seed(42)
np.random.seed(42)

# Load base stimulus dataframe (Pereira 2018)

In [8]:
# NOTE(review): LazyLoad and PereiraEncoding are not imported by name anywhere in
# this cell, so they presumably come from the star import below - confirm before
# replacing it with explicit imports.
from neural_nlp.benchmarks.neural import *
import neural_nlp
from neural_nlp.stimuli import StimulusSet
import xarray as xr

# Map each benchmark identifier to a LazyLoad wrapper around its constructor.
# `identifier` and `ctr` are bound as lambda default arguments so that every
# lambda keeps its own pair instead of the last value of the loop variables.
benchmark_pool = {
    identifier: LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier))
    for identifier, ctr in [
        # primary benchmarks
        ('Pereira2018-encoding', PereiraEncoding),
    ]
}

# Pull the stimulus set out of the benchmark's target assembly attributes
benchmark = benchmark_pool['Pereira2018-encoding']
stimuli_df = benchmark._target_assembly.attrs['stimulus_set']
stimuli_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [9]:
# benchmark content
benchmark._target_assembly.values.shape
benchmark._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

In [10]:
# Build both paths from the repository root stored in `importpath` rather than
# from the current working directory. This cell changes the working directory,
# so paths derived from os.getcwd() / a relative '..' would point somewhere else
# (and fail) when the cell is run a second time.
stimuli_path = os.path.join(importpath, 'ressources', 'stimuli_creation')
os.chdir(stimuli_path)
print(os.getcwd())

# Output folder for the generated dataframes, a sibling of the stimuli_creation folder
savedir = abspath(os.path.join(stimuli_path, '..', 'scrambled_stimuli_dfs'))
print(savedir)
os.makedirs(savedir, exist_ok=True)

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation
/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/scrambled_stimuli_dfs


## Check how many topics there are

In [17]:
# Re-display the stimulus dataframe as a reference for the passage/topic checks below
stimuli_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


Check that the number of passages is correct:

In [15]:
# Count the distinct passages of the 243-sentence experiment and fail loudly if
# the count deviates from the expected 72, instead of relying on eyeballing the output.
n_passages_243 = stimuli_df.loc[stimuli_df.experiment == '243sentences', 'passage_index'].nunique()
assert n_passages_243 == 72, "expected 72 passages in '243sentences', found {}".format(n_passages_243)
n_passages_243

72

In [24]:
# Passage label of every sentence in the 243-sentence experiment, in dataframe row order
stimuli_df.loc[stimuli_df['experiment'] == '243sentences', 'passage_label'].values

array(['beekeeping', 'beekeeping', 'beekeeping', 'beekeeping',
       'beekeeping', 'beekeeping', 'beekeeping', 'beekeeping',
       'beekeeping', 'beekeeping', 'beekeeping', 'dreams', 'dreams',
       'dreams', 'dreams', 'dreams', 'dreams', 'dreams', 'dreams',
       'dreams', 'dreams', 'gambling', 'gambling', 'gambling', 'gambling',
       'gambling', 'gambling', 'gambling', 'gambling', 'gambling',
       'gambling', 'hurricane', 'hurricane', 'hurricane', 'hurricane',
       'hurricane', 'hurricane', 'hurricane', 'hurricane', 'hurricane',
       'hurricane', 'ice_cream', 'ice_cream', 'ice_cream', 'ice_cream',
       'ice_cream', 'ice_cream', 'ice_cream', 'ice_cream', 'ice_cream',
       'ice_cream', 'lawn_mower', 'lawn_mower', 'lawn_mower',
       'lawn_mower', 'lawn_mower', 'lawn_mower', 'lawn_mower',
       'lawn_mower', 'lawn_mower', 'lawn_mower', 'astronaut', 'astronaut',
       'astronaut', 'astronaut', 'astronaut', 'astronaut', 'astronaut',
       'astronaut', 'astronaut', 'ast

In [16]:
# Count the distinct passages of the 384-sentence experiment and fail loudly if
# the count deviates from the expected 96, instead of relying on eyeballing the output.
n_passages_384 = stimuli_df.loc[stimuli_df.experiment == '384sentences', 'passage_index'].nunique()
assert n_passages_384 == 96, "expected 96 passages in '384sentences', found {}".format(n_passages_384)
n_passages_384

96

In [25]:
# Passage label of every sentence in the 384-sentence experiment, in dataframe row order
stimuli_df.loc[stimuli_df['experiment'] == '384sentences', 'passage_label'].values

array(['Accordion', 'Accordion', 'Accordion', 'Accordion', 'Apartment',
       'Apartment', 'Apartment', 'Apartment', 'Apple', 'Apple', 'Apple',
       'Apple', 'Arson', 'Arson', 'Arson', 'Arson', 'Automobile',
       'Automobile', 'Automobile', 'Automobile', 'Axe', 'Axe', 'Axe',
       'Axe', 'Banana', 'Banana', 'Banana', 'Banana', 'Bed', 'Bed', 'Bed',
       'Bed', 'Bee', 'Bee', 'Bee', 'Bee', 'Bicycle', 'Bicycle', 'Bicycle',
       'Bicycle', 'Blacksmith', 'Blacksmith', 'Blacksmith', 'Blacksmith',
       'Blender', 'Blender', 'Blender', 'Blender', 'Boy', 'Boy', 'Boy',
       'Boy', 'Broccoli', 'Broccoli', 'Broccoli', 'Broccoli', 'Butterfly',
       'Butterfly', 'Butterfly', 'Butterfly', 'Carpenter', 'Carpenter',
       'Carpenter', 'Carpenter', 'Cat', 'Cat', 'Cat', 'Cat', 'Chair',
       'Chair', 'Chair', 'Chair', 'Clarinet', 'Clarinet', 'Clarinet',
       'Clarinet', 'Cod', 'Cod', 'Cod', 'Cod', 'Coffee', 'Coffee',
       'Coffee', 'Coffee', 'Desert', 'Desert', 'Desert', 'Desert', 'D

Check topics (passage category)

In [19]:
# Number of distinct topics (passage categories) in the 243-sentence experiment
np.unique(stimuli_df.loc[stimuli_df['experiment'] == '243sentences', 'passage_category']).size

24

In [32]:
# Sorted list of the distinct topics in the 243-sentence experiment
np.unique(stimuli_df.loc[stimuli_df['experiment'] == '243sentences', 'passage_category'])

array(['astronaut', 'beekeeping', 'blindness', 'bone_fracture', 'castle',
       'computer_graphics', 'dreams', 'gambling', 'hurricane',
       'ice_cream', 'infection', 'law_school', 'lawn_mower', 'opera',
       'owl', 'painter', 'pharmacist', 'polar_bear', 'pyramid',
       'rock_climbing', 'skiing', 'stress', 'taste', 'tuxedo'],
      dtype=object)

In [34]:
# All sentences belonging to the 'astronaut' topic, as an example of one passage category
stimuli_df.loc[stimuli_df['passage_category'] == 'astronaut', 'sentence'].values

array(['Astronauts train a long time for their spacewalks.',
       'Much of their training is conducted underwater.',
       'They may spend 8 to 10 hours in the pool for every hour they will spend floating in space.',
       'Astronauts practice to be able to perform construction and repair work on the outside of the space station.',
       'The commanders of shuttle flights are always pilots, and many have backgrounds as military test pilots.',
       'Other astronauts are trained as doctors, engineers, and scientists, who can run experiments in space.',
       'Early crews were all young men, but astronauts now are much more diverse.',
       'The team of astronauts floated out together to the exterior of the space shuttle.',
       'They carried tools needed to repair the broken part on the huge telescope.',
       'One astronaut loosened the bolts on the pipe, while the other fitted the replacement part into place.'],
      dtype=object)

In [20]:
# Number of distinct topics (passage categories) in the 384-sentence experiment
np.unique(stimuli_df.loc[stimuli_df['experiment'] == '384sentences', 'passage_category']).size

24

In [26]:
# Sorted list of the distinct topics in the 384-sentence experiment
np.unique(stimuli_df.loc[stimuli_df['experiment'] == '384sentences', 'passage_category'])

array(['animal', 'appliance', 'bird', 'body_part', 'building_part',
       'clothing', 'crime', 'disaster', 'drink_non_alcoholic', 'dwelling',
       'fish', 'fruit', 'furniture', 'human', 'insect', 'kitchen_utensil',
       'landscape', 'music', 'place', 'profession', 'tool', 'vegetable',
       'vehicles_transport', 'weapon'], dtype=object)

Get examples of passage labels (384-sentence experiment)

In [29]:
# Distinct passage labels that fall under the 'animal' category
np.unique(stimuli_df.loc[stimuli_df['passage_category'] == 'animal', 'passage_label'])

array(['Cat', 'Elephant', 'Horse', 'Whale'], dtype=object)

In [31]:
# Distinct passage labels that fall under the 'clothing' category
np.unique(stimuli_df.loc[stimuli_df['passage_category'] == 'clothing', 'passage_label'])

array(['Dress', 'Glove', 'Shoe', 'Sweater'], dtype=object)