# Script for creating sentence-meaning manipulation datasets from the Pereira2018 fMRI stimuli

In [1]:
import re
from pathlib import Path
from os.path import abspath
import os
import numpy as np
import random
import pickle
import csv
import subprocess
from random import shuffle

In [2]:
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation


In [3]:
importpath = abspath('../..')
os.chdir(importpath)
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp


In [4]:
#add seeds for reproducability
np.random.seed(42)
random.seed(42)

# settings
save = True # if storing pkl dataframes

# Load base stimulus dataframe (Pereira 2018)

In [5]:
from neural_nlp.benchmarks.neural import *
import neural_nlp
from neural_nlp.stimuli import StimulusSet
import xarray as xr

benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
]
benchmark_pool = {identifier: LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier))
                  for identifier, ctr in benchmark_pool}

# fetch stimulus set
benchmark = benchmark_pool['Pereira2018-encoding']
stimuli_df = benchmark._target_assembly.attrs['stimulus_set']
stimuli_df

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)


Loading lookup from /opt/anaconda3/lib/python3.8/site-packages/brainio_collection/lookup.csv
/Users/gt/Documents/GitHub/perturbed-neural-nlp/neural_nlp/../ressources/stimuli


 We're running in the NEW version of the implementations.py script.




  xr_data.set_index(append=True, inplace=True, **coords_d)
  elif isinstance(data, pd.Panel):


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [6]:
# benchmark content
benchmark._target_assembly.values.shape
benchmark._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

In [7]:
stimuli_path = os.path.join(os.getcwd(),'ressources/stimuli_creation')
os.chdir(stimuli_path)
print(os.getcwd())

savedir = abspath('../scrambled_stimuli_dfs')
print(savedir)
os.makedirs(savedir, exist_ok=True)

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation
/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/scrambled_stimuli_dfs


# Create different perturbed versions of the benchmark

`get original dataset in correct formatting. NOTE: "stim_243sentences_scrambled.txt" and "stim_384sentences_scrambled.txt" are created via running "get_original_sentenceset.ipynb"`

In [8]:
def get_original_sentenceset(filename):
    with open(os.path.join(stimuli_path,filename),"r") as f:
        reader = csv.reader(f, delimiter="\t")
        sentences = list(reader)
    Original = [sentence[1] + '.' for sentence in sentences if int(sentence[0]) == 0]
    
    return Original

In [9]:
Original_243 = get_original_sentenceset("stim_243sentences_scrambled.txt")
Original_384 = get_original_sentenceset("stim_384sentences_scrambled.txt")

Original = Original_243 + Original_384

In [10]:
#This is the list of stimuli we create the perturbations from
original_stimuli = Original
print(original_stimuli[:5])

['beekeeping encourages the conservation of local habitats.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'artisanal beekeepers go to extremes for their craft but their product is worth the effort.']


## Create sentence shuffling conditions

### 1. random shuffling of sentences across the dataset, not respecting passages, sentence length, etc.

In [307]:
shuffled_sentences = original_stimuli.copy()
shuffle(shuffled_sentences)

#quick test that all sentences have been shuffled
for ind in range(len(original_stimuli)):
    if original_stimuli[ind] != shuffled_sentences[ind]:
        continue
    else:
        print(ind)
print("Done")

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-random.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

100
106
168
372
Done
['gloves to protect against cold are made of wool or lined waterproof material.', 'the driver steers the car on roads other passengers just sit.', 'a knife can be used to attack by slashing stabbing or throwing.', 'the computer graphics specialist works with doctors to visualize medical conditions and surgical procedures.', 'i finally came to a stop at a flat part of the slope.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,gloves to protect against cold are made of woo...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,the driver steers the car on roads other passe...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,a knife can be used to attack by slashing stab...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,the computer graphics specialist works with do...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,i finally came to a stop at a flat part of the...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,the adults set up climbing routes appropriate ...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,the building can have a garage a laundry facil...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,the wheels have rubber tires with an inner tub...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,good data on the social and economic effects o...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [310]:
# check that all are unique
assert(len(np.unique(shuffled_sentences)) == 627)

### 1b. random shuffling of sentences across the dataset, making sure that sentences from the same topic are not placed in the original slot

Info from Pereira et al., 2018: Experiment 2 used 96
passages, each consisting of 4 sentences about a particular concept, spanning a
broad range of content areas from 24 broad topics (e.g., professions, clothing, birds,
musical instruments, natural disasters, crimes, etc.), with 4 passages per topic (e.g.,
clarinet, accordion, piano, and violin for musical instruments; Supplementary
Figure 1). All passages were Wikipedia-style texts that provided basic information
about the relevant concept. Experiment 3 used 72 passages, each consisting of 3 or
4 sentences about a particular concept. As in experiment 2, the passages spanned a
broad range of content areas from 24 broad topics, unrelated to the topics in
experiment 2 (e.g., skiing, dreams, opera, bone fractures, etc.), with 3 passages per
topic

In [278]:
print(len(np.unique(stimuli_df.passage_label)))
print(len(np.unique(stimuli_df.passage_index)))
print(len(np.unique(stimuli_df.passage_category)))

120
96
48


In [311]:
def shuffle_topic_criteria(stimuli_df):
    '''
    Shuffle sentences across the entire set, but do not allow sentences from the same topic (passage category)
    to land in the same spot.
    '''
    np.random.seed(42)
    random.seed(42)
    
    avail_sents = stimuli_df.copy(deep=True) # for storing(=popping) which sentences have already been used
    stimuli_df_copy = stimuli_df.copy(deep=True)
    new_sents = []
    new_topics = [] # for asserting that topics did not repeat
    for i, sent in enumerate(stimuli_df.itertuples()):
        # find a sentence that is from a different topic & that has NOT been used before
        q_str = f'`passage_category` != "{sent.passage_category}"'
        sents_to_pick_from = avail_sents.query(q_str)

        # pick a random sentence 
        rand_idx = np.random.choice(sents_to_pick_from.index.values)
        picked_sent = stimuli_df_copy.iloc[rand_idx]

        # store the picked sentence topic
        new_topics.append(picked_sent.passage_category)

        # remove from avail_sents
        avail_sents = avail_sents.drop(index=rand_idx)
        assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

        # append to the new sents list
        new_sents.append(picked_sent.sentence)

    return new_sents, new_topics

In [312]:
# obtain new sents 
new_sents, new_topics = shuffle_topic_criteria(stimuli_df)

In [313]:
# show that topics are now different!
for idx, topic in enumerate(stimuli_df.passage_category.values):
    print(topic, new_topics[idx])
    assert(topic != new_topics[idx])

beekeeping tuxedo
beekeeping dwelling
beekeeping vehicles_transport
beekeeping tuxedo
beekeeping law_school
beekeeping hurricane
beekeeping taste
beekeeping bird
beekeeping rock_climbing
beekeeping disaster
beekeeping crime
dreams pharmacist
dreams kitchen_utensil
dreams tuxedo
dreams bone_fracture
dreams landscape
dreams body_part
dreams fish
dreams vegetable
dreams tool
dreams castle
gambling vegetable
gambling painter
gambling music
gambling building_part
gambling hurricane
gambling furniture
gambling disaster
gambling appliance
gambling astronaut
gambling profession
hurricane weapon
hurricane castle
hurricane bird
hurricane building_part
hurricane weapon
hurricane clothing
hurricane bone_fracture
hurricane profession
hurricane dreams
hurricane owl
ice_cream profession
ice_cream dwelling
ice_cream disaster
ice_cream dreams
ice_cream fruit
ice_cream body_part
ice_cream building_part
ice_cream disaster
ice_cream stress
ice_cream tool
lawn_mower bird
lawn_mower crime
lawn_mower furnitu

In [314]:
# check that all are unique
assert(len(np.unique(new_sents)) == 627)

In [315]:
#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-topic-criteria.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(new_sents[:5])
perturbed_df

['Clearly he had it tailored, as it fit the man perfectly.', 'A log cabin is a small house built from round logs.', 'People use bicycles for transportation, recreation and racing.', 'A tuxedo jacket is a tailless dinner jacket with black silk lapels.', 'He wanted to change things for the better.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,clearly he had it tailored as it fit the man p...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,a log cabin is a small house built from round ...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,people use bicycles for transportation recreat...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,a tuxedo jacket is a tailless dinner jacket wi...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,he wanted to change things for the better.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,the elephant flaps its large ears to cool the ...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,the adults set up climbing routes appropriate ...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,as a painter i learned to focus less on the ac...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,some people collect butterflies because of the...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


### 1c. random shuffling of sentences across the dataset, making sure that sentences from the same topic are not placed in the original slot and match for length

In [316]:
def shuffle_topic_and_length_criteria(stimuli_df):
    '''
    Shuffle sentences across the entire set, but do not allow sentences from the same topic (passage category)
    to land in the same spot. Match for length as well.
    '''
    np.random.seed(42)
    random.seed(42)

    stimuli_df_copy = stimuli_df.copy(deep=True)
    stimuli_df_copy['sent_len'] = [len(x.split(' ')) for x in stimuli_df_copy.sentence]
    avail_sents = stimuli_df_copy.copy(deep=True) # for storing(=popping) which sentences have already been used


    new_sents = []
    new_topics = [] # for asserting that topics did not repeat
    new_lens = [] # for asserting that lens did not repeat
    len_threshold = 0 # start by 1
    stored_len_thresholds = [0] # start at 1
    stored_idx_threshold = [0] # for storing at which idx the threshold changed. start at 0

    for i, sent in enumerate(stimuli_df_copy.itertuples()):
    #     print(i)
        if i <= 617: # still possible to find reasonable length matches
            # find a sentence that is from a different topic & that has NOT been used before
            q_str = f'`passage_category` != "{sent.passage_category}"'
            avail_topic_criteria = avail_sents.query(q_str)

            # add length requirement
            q_str_len = f'sent_len >= {sent.sent_len - len_threshold} & sent_len <= {sent.sent_len + len_threshold}'
            print(f'Current sent length is {sent.sent_len}')
            avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)
            print(f'Unique lengths available: {np.unique(avail_topic_AND_len_criteria.sent_len.values)}')
            print(f'{len(avail_topic_AND_len_criteria)} sentences available!')

            if len(avail_topic_AND_len_criteria) == 0: # if no sentences available
                len_threshold += 1
                stored_len_thresholds.append(len_threshold)
                stored_idx_threshold.append(i)
                print(f'________________ Changing length threshold to {len_threshold} ___________________')
                # rerun for the current index, so all sentences will be paired
                q_str_len = f'sent_len >= {sent.sent_len - len_threshold} & sent_len <= {sent.sent_len + len_threshold}'
                avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)
                print(f'{len(avail_topic_AND_len_criteria)} sentences available!')

            # pick a random sentence 
            rand_idx = np.random.choice(avail_topic_AND_len_criteria.index.values)
            picked_sent = stimuli_df_copy.iloc[rand_idx]

            # store the picked sentence topic and len
            new_topics.append(picked_sent.passage_category)
            new_lens.append(picked_sent.sent_len)

            # remove from avail_sents
            avail_sents = avail_sents.drop(index=rand_idx)
            assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

            # append to the new sents list
            new_sents.append(picked_sent.sentence)

        else: # if i larger than 621. get rid of length requirement, only topic
            print(i)

            # find a sentence that is from a different topic & that has NOT been used before
            q_str = f'`passage_category` != "{sent.passage_category}"'
            avail_topic_criteria = avail_sents.query(q_str)

            # pick a random sentence 
            rand_idx = np.random.choice(avail_topic_criteria.index.values)
            picked_sent = stimuli_df_copy.iloc[rand_idx]

            # store the picked sentence topic and len
            new_topics.append(picked_sent.passage_category)
            new_lens.append(picked_sent.sent_len)

            # remove from avail_sents
            avail_sents = avail_sents.drop(index=rand_idx)
            assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

            # append to the new sents list
            new_sents.append(picked_sent.sentence)
            
    print('Length increments performed at:')
    print(stored_idx_threshold)
    print(stored_len_thresholds)
            
    return new_sents, new_topics, new_lens

In [317]:
new_sents, new_topics, new_lens = shuffle_topic_and_length_criteria(stimuli_df)

Current sent length is 7
Unique lengths available: [7]
11 sentences available!
Current sent length is 13
Unique lengths available: [13]
94 sentences available!
Current sent length is 20
Unique lengths available: [20]
2 sentences available!
Current sent length is 15
Unique lengths available: [15]
51 sentences available!
Current sent length is 15
Unique lengths available: [15]
50 sentences available!
Current sent length is 10
Unique lengths available: [10]
67 sentences available!
Current sent length is 11
Unique lengths available: [11]
95 sentences available!
Current sent length is 16
Unique lengths available: [16]
27 sentences available!
Current sent length is 15
Unique lengths available: [15]
49 sentences available!
Current sent length is 11
Unique lengths available: [11]
94 sentences available!
Current sent length is 15
Unique lengths available: [15]
48 sentences available!
Current sent length is 14
Unique lengths available: [14]
58 sentences available!
Current sent length is 19
Uniqu

Unique lengths available: [13]
69 sentences available!
Current sent length is 15
Unique lengths available: [15]
37 sentences available!
Current sent length is 15
Unique lengths available: [15]
36 sentences available!
Current sent length is 15
Unique lengths available: [15]
35 sentences available!
Current sent length is 11
Unique lengths available: [11]
82 sentences available!
Current sent length is 13
Unique lengths available: [13]
68 sentences available!
Current sent length is 15
Unique lengths available: [15]
36 sentences available!
Current sent length is 15
Unique lengths available: [15]
35 sentences available!
Current sent length is 12
Unique lengths available: [12]
98 sentences available!
Current sent length is 16
Unique lengths available: [16]
16 sentences available!
Current sent length is 20
Unique lengths available: []
0 sentences available!
________________ Changing length threshold to 1 ___________________
6 sentences available!
Current sent length is 17
Unique lengths availa

Current sent length is 12
Unique lengths available: [10 11 12 13 14]
276 sentences available!
Current sent length is 16
Unique lengths available: [14 15 16 17 18]
77 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
169 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
246 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
268 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
268 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
159 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
216 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
268 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
239 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 1

Current sent length is 14
Unique lengths available: [12 13 14 15 16]
135 sentences available!
Current sent length is 17
Unique lengths available: [15 16 17 18 19]
33 sentences available!
Current sent length is 16
Unique lengths available: [14 15 16 17 18]
54 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
179 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
153 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
173 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
152 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
171 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
154 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
142 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 

Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
95 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
71 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
93 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
105 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
71 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
91 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
88 sentences available!
Current sent length is 14
Unique lengths available: [12 13 14 15 16]
76 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
100 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
99 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
1

Unique lengths available: [ 9 10 11 12 13]
14 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
15 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
13 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
14 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
13 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
12 sentences available!
Current sent length is 14
Unique lengths available: [12 13 14 15 16]
12 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
9 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
8 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
8 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
9 sentences available!
Current 

In [319]:
# check that all are unique
assert(len(np.unique(new_sents)) == 627)

### Checks

In [320]:
# show that topics are now different!
for idx, topic in enumerate(stimuli_df_copy.passage_category.values):
    print(topic, new_topics[idx])
    assert(topic != new_topics[idx])

beekeeping castle
beekeeping crime
beekeeping ice_cream
beekeeping taste
beekeeping place
beekeeping landscape
beekeeping rock_climbing
beekeeping computer_graphics
beekeeping opera
beekeeping place
beekeeping tuxedo
dreams law_school
dreams pyramid
dreams profession
dreams profession
dreams ice_cream
dreams painter
dreams skiing
dreams beekeeping
dreams weapon
dreams music
gambling painter
gambling taste
gambling beekeeping
gambling tool
gambling disaster
gambling polar_bear
gambling insect
gambling human
gambling music
gambling tool
hurricane drink_non_alcoholic
hurricane bird
hurricane insect
hurricane owl
hurricane clothing
hurricane music
hurricane furniture
hurricane crime
hurricane tool
hurricane beekeeping
ice_cream dwelling
ice_cream fish
ice_cream computer_graphics
ice_cream insect
ice_cream beekeeping
ice_cream crime
ice_cream skiing
ice_cream astronaut
ice_cream place
ice_cream blindness
lawn_mower music
lawn_mower astronaut
lawn_mower kitchen_utensil
lawn_mower gambling
la

In [321]:
# show that lengths are now different!
diff_in_len = []
for idx, length in enumerate(stimuli_df_copy.sent_len.values):
    print(length, new_lens[idx])
    diff_in_len.append(np.abs(length - new_lens[idx]))

7 7
13 13
20 20
15 15
15 15
10 10
11 11
16 16
15 15
11 11
15 15
14 14
19 19
9 9
10 10
17 17
15 15
19 19
15 15
12 12
14 14
13 13
17 17
11 11
17 17
16 16
13 13
11 11
13 13
11 11
8 8
14 14
16 16
11 11
16 16
13 13
10 10
16 16
11 11
15 15
15 15
14 14
10 10
14 14
10 10
20 20
13 13
14 14
17 17
13 13
11 11
14 14
13 13
13 13
13 13
16 16
14 14
13 13
13 13
11 11
11 11
8 8
7 7
19 19
18 18
16 16
15 15
13 13
14 14
13 13
17 17
13 13
16 16
13 13
14 14
14 14
13 13
17 17
9 9
15 15
15 15
13 13
8 8
17 17
12 12
14 14
13 13
14 14
11 11
10 10
16 16
7 7
12 12
11 11
15 15
15 15
10 10
14 14
16 16
13 13
17 17
12 12
14 14
9 9
13 13
9 9
14 14
16 16
11 11
14 14
12 12
10 10
11 11
11 11
15 15
15 15
12 12
13 13
12 12
12 12
18 18
7 7
13 13
12 12
14 14
13 13
15 15
15 15
15 15
11 11
13 13
15 15
15 15
12 12
16 16
20 19
17 16
17 18
18 19
15 14
9 9
15 14
10 9
13 12
13 14
13 12
7 8
11 10
9 9
14 14
17 16
11 12
11 12
12 12
13 12
10 9
16 17
12 11
15 15
13 13
12 12
10 11
12 11
12 12
12 12
18 18
15 15
13 13
14 13
7 8
9 10
12 12
1

In [322]:
diff_in_len

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 0,
 1,
 2,
 1,
 1,
 0,
 0,
 1,
 2,


In [323]:
print(f'Sentences matched with a mean difference in length: {np.mean(diff_in_len)} and median difference: {np.median(diff_in_len)} and std {np.std(diff_in_len)}')

Sentences matched with a mean difference in length: 0.9059011164274322 and median difference: 1.0 and std 1.1334047490416954


### Store

In [324]:
#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-topic-length-criteria.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(new_sents[:5])
perturbed_df

['Its purpose was to dominate its surroundings.', 'Drunk driving of any vehicle is a crime in most of the world.', 'We poured the cream mixture into a frozen tub, then start turning the crank to expose it to the cold.', 'No one really knows exactly how a taste gets from your mouth to your brain.', 'A library is a place where a collection of books, documents and media is kept.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,its purpose was to dominate its surroundings.,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,drunk driving of any vehicle is a crime in mos...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,we poured the cream mixture into a frozen tub ...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,no one really knows exactly how a taste gets f...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,a library is a place where a collection of boo...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,a bed is made of a mattress and a box spring p...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,seeing the internal structures in an animation...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,flood can be caused by heavy rain or rapid sno...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,my sense of taste isn't very good so i tend to...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


### 2. random shuffling of sentences within a passage

In [48]:
new_sents = []
stimuli_df_copy = stimuli_df.copy(deep=True)

for exp in list(np.unique(stimuli_df_copy["experiment"])):
    for ind in np.unique(stimuli_df_copy.loc[stimuli_df_copy["experiment"] == exp].passage_index.values): # only take the indices from that exp df
        print(ind)
        curr_df = stimuli_df_copy.loc[(stimuli_df["experiment"] == exp) & (stimuli_df["passage_index"] == ind)]
        shuffled_df = curr_df.copy(deep=True)

        while True:
            shuffled_df = shuffled_df.sample(frac=1) 
    #         print('permuting')
            if not any(curr_df.index.values[i] == shuffled_df.index.values[i] for i in range(len(curr_df.index.values))):
                break

        new_sents.append(list(shuffled_df.sentence))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96


In [47]:
np.unique(stimuli_df_copy.loc[stimuli_df_copy["experiment"] == exp].passage_index.values)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72])

In [343]:
#index by experiment and passage_label/passage_index >> shuffle within those
def get_shuffled_within_passage(stimuli_df):
    np.random.seed(42)
    random.seed(42)

    shuffled_sentences = []
    for exp in list(np.unique(stimuli_df["experiment"])):
        for ind in list(np.unique(stimuli_df["passage_index"])):
            if exp == "243sentences" and ind > 72: #hot fix, 243sentences only has 72 passages
                continue
            else:
                curr_df = stimuli_df.loc[(stimuli_df["experiment"] == exp) & (stimuli_df["passage_index"] == ind)]
                curr_sent = list(curr_df["sentence"])
                while True:
                    shuffle(curr_sent)
                    if curr_sent != list(curr_df["sentence"]):
                        break
                shuffled_sentences += curr_sent
    return shuffled_sentences
new_sents = get_shuffled_within_passage(stimuli_df)

#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-withinpassage.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

['as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'beekeeping encourages the conservation of local habitats.', 'they scout the fields know when nectar flows and select the best ways to extract honey.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,as a passive form of agriculture it does not r...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,it is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,beekeepers also discourage the use of pesticid...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,beekeeping encourages the conservation of loca...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,they scout the fields know when nectar flows a...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a woman can become pregnant and bear children.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a woman is a female human adult.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,a woman is stereotypically seen as a caregiver.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [349]:
new_sents

['As a passive form of agriculture, it does not require that native vegetation be cleared to make way for crops.',
 "It is in every beekeeper's interest to conserve local plants that produce pollen.",
 'Beekeepers also discourage the use of pesticides on crops, because they could kill the honeybees.',
 'Beekeeping encourages the conservation of local habitats.',
 'They scout the fields, know when nectar flows, and select the best ways to extract honey.',
 'To produce the finest honey, beekeepers become micromanagers of their honeybees.',
 'Artisanal beekeepers go to extremes for their craft, but their product is worth the effort.',
 'Artisanal honey-making emphasizes quality and character over quantity and consistency.',
 'The beekeeper checks honey stores, pollen supplies, and the bee nursery.',
 'As the beekeeper opens the hive, the deep hum of 40,000 bees fills the air.',
 "Bees crawl across his bare arms and hands, but they don't sting, because they're gentle.",
 'I know many other

In [350]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(stimuli_df.sentence.values):
#     print(sent, new_sents[idx])
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])


It is in every beekeeper's interest to conserve local plants that produce pollen. It is in every beekeeper's interest to conserve local plants that produce pollen.
Bees crawl across his bare arms and hands, but they don't sting, because they're gentle. Bees crawl across his bare arms and hands, but they don't sting, because they're gentle.
In my dream, it's the day of my final exam and I suddenly realize I've never gone to class. In my dream, it's the day of my final exam and I suddenly realize I've never gone to class.
I've never even done any of the reading assignments. I've never even done any of the reading assignments.
My friends were sick of watching me gamble my savings away. My friends were sick of watching me gamble my savings away.
Gambling may also be associated with personal bankruptcies and marriage problems. Gambling may also be associated with personal bankruptcies and marriage problems.
Most states have instituted lotteries, and many have casinos as well. Most states ha

In [326]:
# check that all are unique
assert(len(np.unique(shuffled_sentences)) == 627)

### 3. random shuffling of sentences within a topic (=category)

In [30]:
new_sents = []
stimuli_df_copy = stimuli_df.copy(deep=True)

for topic in list(np.unique(stimuli_df_copy["passage_category"])): # all topics are unique
#     print(f'Topic: {topic}')
    curr_df = stimuli_df_copy.loc[(stimuli_df_copy["passage_category"] == topic)]
    shuffled_df = curr_df.copy(deep=True)
    
    while True:
        shuffled_df = shuffled_df.sample(frac=1) 
#         print('permuting')
        if not any(curr_df.index.values[i] == shuffled_df.index.values[i] for i in range(len(curr_df.index.values))):
            break
    
    new_sents.append(list(shuffled_df.sentence))

In [32]:
# for topic in list(np.unique(stimuli_df_copy["passage_category"])): # all topics are unique
#     print(f'Topic: {topic}')
#     curr_df = stimuli_df_copy.loc[(stimuli_df_copy["passage_category"] == topic)]
#     shuffled_df = curr_df.copy(deep=True)
#     shuffled_df = shuffled_df.sample(frac=1) 


In [35]:
shuffled_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
553,Spears were used for stabbing or throwing when...,310,384sentences.310,384sentences,384sentences.weapon,78,Spear,weapon
265,"Historically, axe heads were made of stone and...",22,384sentences.22,384sentences,384sentences.weapon,6,Axe,weapon
513,Rifles are used in hunting and shooting sports...,270,384sentences.270,384sentences,384sentences.weapon,68,Rifle,weapon
551,Spears are weapons that consist of a pole and ...,308,384sentences.308,384sentences,384sentences.weapon,78,Spear,weapon
512,The grooves make the bullets spin around the r...,269,384sentences.269,384sentences,384sentences.weapon,68,Rifle,weapon
411,A hand grenade is an explosive used as a weapo...,168,384sentences.168,384sentences,384sentences.weapon,43,Hand_grenade,weapon
263,The axe is a tool used to split and cut wood.,20,384sentences.20,384sentences,384sentences.weapon,6,Axe,weapon
511,A rifle is a firearm that rests on the shoulde...,268,384sentences.268,384sentences,384sentences.weapon,68,Rifle,weapon
414,Tear gas grenades are thrown into the middle o...,171,384sentences.171,384sentences,384sentences.weapon,43,Hand_grenade,weapon
552,Spears were the most common personal weapon be...,309,384sentences.309,384sentences,384sentences.weapon,78,Spear,weapon


In [36]:
new_sents = [item for sublist in new_sents for item in sublist]


In [37]:
new_sents

['Whales have been hunted for meat, whale oil and ambergris.',
 'The elephant has a pair of ivory tusks, for moving objects or digging into trees.',
 'A whale is a very large mammal that lives in the ocean.',
 'Cats can hunt mice or birds, but are often fed by their owners.',
 'An elephant has a long nose called a trunk, which can grab things or food.',
 'A human can ride a horse while it walks, trots or gallops.',
 'Elephants are the largest kind of land mammal, weighing several tons.',
 'The claws of cats are retractable so that they keep sharp.',
 'Whales breathe through blowholes on their heads when surfacing.',
 'The blubber in a whale serves as an energy reservoir and as insulation.',
 'A horse has a tail and a mane on its neck, and is usually gray or brown.',
 'Cats like to groom themselves by licking their fur.',
 'Horses have been used for draft work, travel and entertainment.',
 'A horse is a large hoofed mammal with four long, muscular legs.',
 'The elephant flaps its large 

In [41]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(stimuli_df.sentence.values):
#     print(sent, new_sents[idx])
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])
    assert(sent != new_sents[idx])

In [360]:
np.unique(stimuli_df_copy["passage_category"])

array(['animal', 'appliance', 'astronaut', 'beekeeping', 'bird',
       'blindness', 'body_part', 'bone_fracture', 'building_part',
       'castle', 'clothing', 'computer_graphics', 'crime', 'disaster',
       'dreams', 'drink_non_alcoholic', 'dwelling', 'fish', 'fruit',
       'furniture', 'gambling', 'human', 'hurricane', 'ice_cream',
       'infection', 'insect', 'kitchen_utensil', 'landscape',
       'law_school', 'lawn_mower', 'music', 'opera', 'owl', 'painter',
       'pharmacist', 'place', 'polar_bear', 'profession', 'pyramid',
       'rock_climbing', 'skiing', 'stress', 'taste', 'tool', 'tuxedo',
       'vegetable', 'vehicles_transport', 'weapon'], dtype=object)

In [340]:
shuffled_sentences

['An elephant has a long nose called a trunk, which can grab things or food.',
 'A human can ride a horse while it walks, trots or gallops.',
 'A horse has a tail and a mane on its neck, and is usually gray or brown.',
 'Whales have been hunted for meat, whale oil and ambergris.',
 'Elephants are the largest kind of land mammal, weighing several tons.',
 'The blubber in a whale serves as an energy reservoir and as insulation.',
 'A horse is a large hoofed mammal with four long, muscular legs.',
 'Whales breathe through blowholes on their heads when surfacing.',
 'Cats like to groom themselves by licking their fur.',
 'Cats are small furry animals with four legs and a tail.',
 'Horses have been used for draft work, travel and entertainment.',
 'Cats can hunt mice or birds, but are often fed by their owners.',
 'The claws of cats are retractable so that they keep sharp.',
 'The elephant has a pair of ivory tusks, for moving objects or digging into trees.',
 'The elephant flaps its large 

In [333]:
stimuli_df_copy.loc[(stimuli_df_copy["experiment"] == exp) & (stimuli_df_copy["passage_index"] == topic)]

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category,sent_len


In [335]:
len(np.unique(stimuli_df_copy["passage_category"]))

48