# Script for creating sentence-meaning manipulation datasets from the Pereira2018 fMRI stimuli

In [3]:
import re
from pathlib import Path
from os.path import abspath
import os
import numpy as np
import random
import pickle
import csv
import subprocess
import collections
from random import shuffle

In [4]:
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation


In [5]:
importpath = abspath('../..')
os.chdir(importpath)
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp


In [6]:
#add seeds for reproducability
np.random.seed(42)
random.seed(42)

# settings
save = True # if storing pkl dataframes

# Load base stimulus dataframe (Pereira 2018)

In [7]:
from neural_nlp.benchmarks.neural import *
import neural_nlp
from neural_nlp.stimuli import StimulusSet
import xarray as xr

benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
]
benchmark_pool = {identifier: LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier))
                  for identifier, ctr in benchmark_pool}

# fetch stimulus set
benchmark = benchmark_pool['Pereira2018-encoding']
stimuli_df = benchmark._target_assembly.attrs['stimulus_set']
stimuli_df

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)


Loading lookup from /opt/anaconda3/lib/python3.8/site-packages/brainio_collection/lookup.csv
/Users/gt/Documents/GitHub/perturbed-neural-nlp/neural_nlp/../ressources/stimuli


 We're running in the NEW version of the implementations.py script.




  xr_data.set_index(append=True, inplace=True, **coords_d)
  elif isinstance(data, pd.Panel):


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [8]:
# benchmark content
benchmark._target_assembly.values.shape
benchmark._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

In [9]:
stimuli_path = os.path.join(os.getcwd(),'ressources/stimuli_creation')
os.chdir(stimuli_path)
print(os.getcwd())

savedir = abspath('../scrambled_stimuli_dfs')
print(savedir)
os.makedirs(savedir, exist_ok=True)

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation
/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/scrambled_stimuli_dfs


# Create different perturbed versions of the benchmark

`get original dataset in correct formatting. NOTE: "stim_243sentences_scrambled.txt" and "stim_384sentences_scrambled.txt" are created via running "get_original_sentenceset.ipynb"`

In [10]:
def get_original_sentenceset(filename):
    with open(os.path.join(stimuli_path,filename),"r") as f:
        reader = csv.reader(f, delimiter="\t")
        sentences = list(reader)
    Original = [sentence[1] + '.' for sentence in sentences if int(sentence[0]) == 0]
    
    return Original

In [11]:
Original_243 = get_original_sentenceset("stim_243sentences_scrambled.txt")
Original_384 = get_original_sentenceset("stim_384sentences_scrambled.txt")

Original = Original_243 + Original_384

In [13]:
#This is the list of stimuli we create the perturbations from
# GT: I use the stimuli_df to adhere as closely as possible to the original benchmark? and then assimilate sentences re formatting after
original_stimuli = Original
print(original_stimuli)

['beekeeping encourages the conservation of local habitats.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'artisanal beekeepers go to extremes for their craft but their product is worth the effort.', 'artisanal honey-making emphasizes quality and character over quantity and consistency.', 'to produce the finest honey beekeepers become micromanagers of their honeybees.', 'they scout the fields know when nectar flows and select the best ways to extract honey.', 'as the beekeeper opens the hive the deep hum of 40000 bees fills the air.', 'the beekeeper checks honey stores pollen supplies and the bee nursery.', "bees crawl across his bare arms and hands but they don't sting because they're gentle.", "i have a recurring dream about exams eve

In [12]:
print(stimuli_df.sentence.values)

['Beekeeping encourages the conservation of local habitats.'
 "It is in every beekeeper's interest to conserve local plants that produce pollen."
 'As a passive form of agriculture, it does not require that native vegetation be cleared to make way for crops.'
 'Beekeepers also discourage the use of pesticides on crops, because they could kill the honeybees.'
 'Artisanal beekeepers go to extremes for their craft, but their product is worth the effort.'
 'Artisanal honey-making emphasizes quality and character over quantity and consistency.'
 'To produce the finest honey, beekeepers become micromanagers of their honeybees.'
 'They scout the fields, know when nectar flows, and select the best ways to extract honey.'
 'As the beekeeper opens the hive, the deep hum of 40,000 bees fills the air.'
 'The beekeeper checks honey stores, pollen supplies, and the bee nursery.'
 "Bees crawl across his bare arms and hands, but they don't sting, because they're gentle."
 "I have a recurring dream abo

## Create sentence shuffling conditions

### 1. random shuffling of sentences across the dataset, not respecting passages, sentence length, etc. Making sure a sentence does not land in its own, original spot.

In [138]:
def random_sentence_shuffle(stimuli_df):
    '''
    Randomly shuffle sentences, make sure a sentence does not land in its own, original spot.
    '''
    np.random.seed(42)
    random.seed(42)
    
    stimuli_df_copy = stimuli_df.copy(deep=True)
    new_sents = list(stimuli_df_copy.sentence.values)
    
    while True:
        shuffle(new_sents) 
        if not any(list(stimuli_df_copy.sentence.values)[i] == new_sents[i] for i in range(len(new_sents))):
            break
            
    return new_sents

In [139]:
new_sents = random_sentence_shuffle(stimuli_df)

In [140]:
#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-random.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

['and for society hip fractures involve substantial expense and time in the hospital.', 'mosquitos are thin small flying insects that emit a highpitched sound.', 'elephants are the largest kind of land mammal weighing several tons.', 'a raven is a large black bird that thrives in many climates.', 'a mug is a sturdy cup with a handle for drinking hot liquids.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,and for society hip fractures involve substant...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,mosquitos are thin small flying insects that e...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,elephants are the largest kind of land mammal ...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,a raven is a large black bird that thrives in ...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,a mug is a sturdy cup with a handle for drinki...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,drunk drivers are caught by field sobriety and...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,fortunately most skin infections can easily be...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,some computer science conferences have art exh...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,hospitals are staffed by professional physicia...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


### Checks

In [141]:
# check that all are unique
assert(len(np.unique(shuffled_sentences)) == 627)

In [142]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(list(stimuli_df_copy.sentence.values)):
#     print(sent, new_sents[idx])
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])
    assert(sent != new_sents[idx])

### 1b. random shuffling of sentences across the dataset, making sure that sentences from the same topic are not placed in the original slot. Making sure a sentence does not land in its own, original spot.

Info from Pereira et al., 2018: Experiment 2 used 96
passages, each consisting of 4 sentences about a particular concept, spanning a
broad range of content areas from 24 broad topics (e.g., professions, clothing, birds,
musical instruments, natural disasters, crimes, etc.), with 4 passages per topic (e.g.,
clarinet, accordion, piano, and violin for musical instruments; Supplementary
Figure 1). All passages were Wikipedia-style texts that provided basic information
about the relevant concept. Experiment 3 used 72 passages, each consisting of 3 or
4 sentences about a particular concept. As in experiment 2, the passages spanned a
broad range of content areas from 24 broad topics, unrelated to the topics in
experiment 2 (e.g., skiing, dreams, opera, bone fractures, etc.), with 3 passages per
topic

In [143]:
print(len(np.unique(stimuli_df.passage_label)))
print(len(np.unique(stimuli_df.passage_index)))
print(len(np.unique(stimuli_df.passage_category)))

120
96
48


In [144]:
def shuffle_topic_criteria(stimuli_df):
    '''
    Shuffle sentences across the entire set, but do not allow sentences from the same topic (passage category)
    to land in the same spot.
    '''
    np.random.seed(42)
    random.seed(42)
    
    avail_sents = stimuli_df.copy(deep=True) # for storing(=popping) which sentences have already been used
    stimuli_df_copy = stimuli_df.copy(deep=True)
    new_sents = []
    new_topics = [] # for asserting that topics did not repeat
    for i, sent in enumerate(stimuli_df.itertuples()):
        # find a sentence that is from a different topic & that has NOT been used before
        q_str = f'`passage_category` != "{sent.passage_category}"'
        sents_to_pick_from = avail_sents.query(q_str)

        # pick a random sentence 
        rand_idx = np.random.choice(sents_to_pick_from.index.values)
        picked_sent = stimuli_df_copy.iloc[rand_idx]

        # store the picked sentence topic
        new_topics.append(picked_sent.passage_category)

        # remove from avail_sents
        avail_sents = avail_sents.drop(index=rand_idx)
        assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

        # append to the new sents list
        new_sents.append(picked_sent.sentence)

    return new_sents, new_topics

In [145]:
# obtain new sents 
new_sents, new_topics = shuffle_topic_criteria(stimuli_df)

In [146]:
# show that topics are now different!
for idx, topic in enumerate(stimuli_df.passage_category.values):
    print(topic, new_topics[idx])
    assert(topic != new_topics[idx])

beekeeping tuxedo
beekeeping dwelling
beekeeping vehicles_transport
beekeeping tuxedo
beekeeping law_school
beekeeping hurricane
beekeeping taste
beekeeping bird
beekeeping rock_climbing
beekeeping disaster
beekeeping crime
dreams pharmacist
dreams kitchen_utensil
dreams tuxedo
dreams bone_fracture
dreams landscape
dreams body_part
dreams fish
dreams vegetable
dreams tool
dreams castle
gambling vegetable
gambling painter
gambling music
gambling building_part
gambling hurricane
gambling furniture
gambling disaster
gambling appliance
gambling astronaut
gambling profession
hurricane weapon
hurricane castle
hurricane bird
hurricane building_part
hurricane weapon
hurricane clothing
hurricane bone_fracture
hurricane profession
hurricane dreams
hurricane owl
ice_cream profession
ice_cream dwelling
ice_cream disaster
ice_cream dreams
ice_cream fruit
ice_cream body_part
ice_cream building_part
ice_cream disaster
ice_cream stress
ice_cream tool
lawn_mower bird
lawn_mower crime
lawn_mower furnitu

In [147]:
# check that all are unique
assert(len(np.unique(new_sents)) == 627)

In [148]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(list(stimuli_df_copy.sentence.values)):
#     print(sent, new_sents[idx])
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])
    assert(sent != new_sents[idx])

In [149]:
#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-topic-criteria.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(new_sents[:5])
perturbed_df

['Clearly he had it tailored, as it fit the man perfectly.', 'A log cabin is a small house built from round logs.', 'People use bicycles for transportation, recreation and racing.', 'A tuxedo jacket is a tailless dinner jacket with black silk lapels.', 'He wanted to change things for the better.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,clearly he had it tailored as it fit the man p...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,a log cabin is a small house built from round ...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,people use bicycles for transportation recreat...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,a tuxedo jacket is a tailless dinner jacket wi...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,he wanted to change things for the better.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,the elephant flaps its large ears to cool the ...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,the adults set up climbing routes appropriate ...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,as a painter i learned to focus less on the ac...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,some people collect butterflies because of the...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


### 1c. random shuffling of sentences across the dataset, making sure that sentences from the same topic are not placed in the original slot and match for length

In [150]:
def shuffle_topic_and_length_criteria(stimuli_df):
    '''
    Shuffle sentences across the entire set, but do not allow sentences from the same topic (passage category)
    to land in the same spot. Match for length as well.
    '''
    np.random.seed(42)
    random.seed(42)

    stimuli_df_copy = stimuli_df.copy(deep=True)
    stimuli_df_copy['sent_len'] = [len(x.split(' ')) for x in stimuli_df_copy.sentence]
    avail_sents = stimuli_df_copy.copy(deep=True) # for storing(=popping) which sentences have already been used


    new_sents = []
    new_topics = [] # for asserting that topics did not repeat
    new_lens = [] # for asserting that lens did not repeat
    len_threshold = 0 # start by 1
    stored_len_thresholds = [0] # start at 1
    stored_idx_threshold = [0] # for storing at which idx the threshold changed. start at 0

    for i, sent in enumerate(stimuli_df_copy.itertuples()):
    #     print(i)
        if i <= 617: # still possible to find reasonable length matches
            # find a sentence that is from a different topic & that has NOT been used before
            q_str = f'`passage_category` != "{sent.passage_category}"'
            avail_topic_criteria = avail_sents.query(q_str)

            # add length requirement
            q_str_len = f'sent_len >= {sent.sent_len - len_threshold} & sent_len <= {sent.sent_len + len_threshold}'
            print(f'Current sent length is {sent.sent_len}')
            avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)
            print(f'Unique lengths available: {np.unique(avail_topic_AND_len_criteria.sent_len.values)}')
            print(f'{len(avail_topic_AND_len_criteria)} sentences available!')

            if len(avail_topic_AND_len_criteria) == 0: # if no sentences available
                len_threshold += 1
                stored_len_thresholds.append(len_threshold)
                stored_idx_threshold.append(i)
                print(f'________________ Changing length threshold to {len_threshold} ___________________')
                # rerun for the current index, so all sentences will be paired
                q_str_len = f'sent_len >= {sent.sent_len - len_threshold} & sent_len <= {sent.sent_len + len_threshold}'
                avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)
                print(f'{len(avail_topic_AND_len_criteria)} sentences available!')

            # pick a random sentence 
            rand_idx = np.random.choice(avail_topic_AND_len_criteria.index.values)
            picked_sent = stimuli_df_copy.iloc[rand_idx]

            # store the picked sentence topic and len
            new_topics.append(picked_sent.passage_category)
            new_lens.append(picked_sent.sent_len)

            # remove from avail_sents
            avail_sents = avail_sents.drop(index=rand_idx)
            assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

            # append to the new sents list
            new_sents.append(picked_sent.sentence)

        else: # if i larger than 621. get rid of length requirement, only topic
            print(i)

            # find a sentence that is from a different topic & that has NOT been used before
            q_str = f'`passage_category` != "{sent.passage_category}"'
            avail_topic_criteria = avail_sents.query(q_str)

            # pick a random sentence 
            rand_idx = np.random.choice(avail_topic_criteria.index.values)
            picked_sent = stimuli_df_copy.iloc[rand_idx]

            # store the picked sentence topic and len
            new_topics.append(picked_sent.passage_category)
            new_lens.append(picked_sent.sent_len)

            # remove from avail_sents
            avail_sents = avail_sents.drop(index=rand_idx)
            assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

            # append to the new sents list
            new_sents.append(picked_sent.sentence)
            
    print('Length increments performed at:')
    print(stored_idx_threshold)
    print(stored_len_thresholds)
            
    return new_sents, new_topics, new_lens, stimuli_df_copy

In [151]:
new_sents, new_topics, new_lens, stimuli_df_copy = shuffle_topic_and_length_criteria(stimuli_df)

Current sent length is 7
Unique lengths available: [7]
11 sentences available!
Current sent length is 13
Unique lengths available: [13]
94 sentences available!
Current sent length is 20
Unique lengths available: [20]
2 sentences available!
Current sent length is 15
Unique lengths available: [15]
51 sentences available!
Current sent length is 15
Unique lengths available: [15]
50 sentences available!
Current sent length is 10
Unique lengths available: [10]
67 sentences available!
Current sent length is 11
Unique lengths available: [11]
95 sentences available!
Current sent length is 16
Unique lengths available: [16]
27 sentences available!
Current sent length is 15
Unique lengths available: [15]
49 sentences available!
Current sent length is 11
Unique lengths available: [11]
94 sentences available!
Current sent length is 15
Unique lengths available: [15]
48 sentences available!
Current sent length is 14
Unique lengths available: [14]
58 sentences available!
Current sent length is 19
Uniqu

Unique lengths available: [15]
36 sentences available!
Current sent length is 15
Unique lengths available: [15]
35 sentences available!
Current sent length is 12
Unique lengths available: [12]
98 sentences available!
Current sent length is 16
Unique lengths available: [16]
16 sentences available!
Current sent length is 20
Unique lengths available: []
0 sentences available!
________________ Changing length threshold to 1 ___________________
6 sentences available!
Current sent length is 17
Unique lengths available: [16 17 18]
31 sentences available!
Current sent length is 17
Unique lengths available: [16 17 18]
30 sentences available!
Current sent length is 18
Unique lengths available: [17 18 19]
20 sentences available!
Current sent length is 15
Unique lengths available: [14 15 16]
90 sentences available!
Current sent length is 9
Unique lengths available: [ 8  9 10]
114 sentences available!
Current sent length is 15
Unique lengths available: [14 15 16]
91 sentences available!
Current sen

Current sent length is 12
Unique lengths available: [10 11 12 13 14]
265 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
266 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
265 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
264 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
263 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
260 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
263 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
262 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
211 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
154 sentences available!
Current sent length is 15
Unique lengths available: [13 14 15

Current sent length is 12
Unique lengths available: [10 11 12 13 14]
171 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
154 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
142 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
173 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
103 sentences available!
Current sent length is 14
Unique lengths available: [12 13 14 15 16]
126 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
169 sentences available!
Current sent length is 15
Unique lengths available: [13 14 15 16 17]
88 sentences available!
Current sent length is 9
Unique lengths available: [ 7  8  9 10 11]
99 sentences available!
Current sent length is 15
Unique lengths available: [13 14 15 16 17]
84 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 

Unique lengths available: [10 11 12 13 14]
78 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
69 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
68 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
81 sentences available!
Current sent length is 12
Unique lengths available: [10 11 12 13 14]
75 sentences available!
Current sent length is 10
Unique lengths available: [ 8  9 10 11 12]
66 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
65 sentences available!
Current sent length is 13
Unique lengths available: [11 12 13 14 15]
64 sentences available!
Current sent length is 17
Unique lengths available: [15 16 17 19]
19 sentences available!
Current sent length is 16
Unique lengths available: [14 15 16 17]
26 sentences available!
Current sent length is 11
Unique lengths available: [ 9 10 11 12 13]
76 sentences available!
Current s

### Checks

In [152]:
# check that all are unique
assert(len(np.unique(new_sents)) == 627)

In [153]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(list(stimuli_df_copy.sentence.values)):
#     print(sent, new_sents[idx])
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])
    assert(sent != new_sents[idx])

In [154]:
# show that topics are now different!
for idx, topic in enumerate(stimuli_df_copy.passage_category.values):
    print(topic, new_topics[idx])
    assert(topic != new_topics[idx])

beekeeping castle
beekeeping crime
beekeeping ice_cream
beekeeping taste
beekeeping place
beekeeping landscape
beekeeping rock_climbing
beekeeping computer_graphics
beekeeping opera
beekeeping place
beekeeping tuxedo
dreams law_school
dreams pyramid
dreams profession
dreams profession
dreams ice_cream
dreams painter
dreams skiing
dreams beekeeping
dreams weapon
dreams music
gambling painter
gambling taste
gambling beekeeping
gambling tool
gambling disaster
gambling polar_bear
gambling insect
gambling human
gambling music
gambling tool
hurricane drink_non_alcoholic
hurricane bird
hurricane insect
hurricane owl
hurricane clothing
hurricane music
hurricane furniture
hurricane crime
hurricane tool
hurricane beekeeping
ice_cream dwelling
ice_cream fish
ice_cream computer_graphics
ice_cream insect
ice_cream beekeeping
ice_cream crime
ice_cream skiing
ice_cream astronaut
ice_cream place
ice_cream blindness
lawn_mower music
lawn_mower astronaut
lawn_mower kitchen_utensil
lawn_mower gambling
la

In [155]:
# show that lengths are now different!
diff_in_len = []
for idx, length in enumerate(stimuli_df_copy.sent_len.values):
    print(length, new_lens[idx])
    diff_in_len.append(np.abs(length - new_lens[idx]))

7 7
13 13
20 20
15 15
15 15
10 10
11 11
16 16
15 15
11 11
15 15
14 14
19 19
9 9
10 10
17 17
15 15
19 19
15 15
12 12
14 14
13 13
17 17
11 11
17 17
16 16
13 13
11 11
13 13
11 11
8 8
14 14
16 16
11 11
16 16
13 13
10 10
16 16
11 11
15 15
15 15
14 14
10 10
14 14
10 10
20 20
13 13
14 14
17 17
13 13
11 11
14 14
13 13
13 13
13 13
16 16
14 14
13 13
13 13
11 11
11 11
8 8
7 7
19 19
18 18
16 16
15 15
13 13
14 14
13 13
17 17
13 13
16 16
13 13
14 14
14 14
13 13
17 17
9 9
15 15
15 15
13 13
8 8
17 17
12 12
14 14
13 13
14 14
11 11
10 10
16 16
7 7
12 12
11 11
15 15
15 15
10 10
14 14
16 16
13 13
17 17
12 12
14 14
9 9
13 13
9 9
14 14
16 16
11 11
14 14
12 12
10 10
11 11
11 11
15 15
15 15
12 12
13 13
12 12
12 12
18 18
7 7
13 13
12 12
14 14
13 13
15 15
15 15
15 15
11 11
13 13
15 15
15 15
12 12
16 16
20 19
17 16
17 18
18 19
15 14
9 9
15 14
10 9
13 12
13 14
13 12
7 8
11 10
9 9
14 14
17 16
11 12
11 12
12 12
13 12
10 9
16 17
12 11
15 15
13 13
12 12
10 11
12 11
12 12
12 12
18 18
15 15
13 13
14 13
7 8
9 10
12 12
1

In [156]:
diff_in_len

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 0,
 1,
 2,
 1,
 1,
 0,
 0,
 1,
 2,


In [157]:
print(f'Sentences matched with a mean difference in length: {np.mean(diff_in_len)} and median difference: {np.median(diff_in_len)} and std {np.std(diff_in_len)}')

Sentences matched with a mean difference in length: 0.9059011164274322 and median difference: 1.0 and std 1.1334047490416954


### Store

In [158]:
#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-topic-length-criteria.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(new_sents[:5])
perturbed_df

['Its purpose was to dominate its surroundings.', 'Drunk driving of any vehicle is a crime in most of the world.', 'We poured the cream mixture into a frozen tub, then start turning the crank to expose it to the cold.', 'No one really knows exactly how a taste gets from your mouth to your brain.', 'A library is a place where a collection of books, documents and media is kept.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,its purpose was to dominate its surroundings.,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,drunk driving of any vehicle is a crime in mos...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,we poured the cream mixture into a frozen tub ...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,no one really knows exactly how a taste gets f...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,a library is a place where a collection of boo...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,a bed is made of a mattress and a box spring p...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,seeing the internal structures in an animation...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,flood can be caused by heavy rain or rapid sno...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,my sense of taste isn't very good so i tend to...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


### 2. random shuffling of sentences within a passage. Make sure a sentence does not land in its original spot.

In [39]:
def passage_shuffle(stimuli_df):
    '''
    Shuffle sentences randomly within their passage. Make sure a sentence does not land in its original spot.
    '''
    
    np.random.seed(42)
    random.seed(42)
    
    new_sents = []
    new_passidx = [] # for checking that the passage indices indeed are correct and NOT changed
    stimuli_df_copy = stimuli_df.copy(deep=True)

    for exp in list(np.unique(stimuli_df_copy["experiment"])):
        for ind in np.unique(stimuli_df_copy.loc[stimuli_df_copy["experiment"] == exp].passage_index.values): # only take the indices from that exp df
    #         print(ind)
            curr_df = stimuli_df_copy.loc[(stimuli_df["experiment"] == exp) & (stimuli_df["passage_index"] == ind)]
            shuffled_df = curr_df.copy(deep=True)

            while True:
                shuffled_df = shuffled_df.sample(frac=1) 
                if not any(curr_df.index.values[i] == shuffled_df.index.values[i] for i in range(len(curr_df.index.values))):
                    new_sents.append(list(shuffled_df.sentence))
                    new_passidx.append(list(shuffled_df.passage_index))
                    break

    new_sents = [item for sublist in new_sents for item in sublist]
    new_passidx = [item for sublist in new_passidx for item in sublist]
    
    return new_sents, new_passidx

In [40]:
new_sents, new_passidx = passage_shuffle(stimuli_df)

#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-withinpassage.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

["it is in every beekeeper's interest to conserve local plants that produce pollen.", 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'beekeeping encourages the conservation of local habitats.', 'as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', 'artisanal honeymaking emphasizes quality and character over quantity and consistency.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,it is in every beekeeper's interest to conserv...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,beekeepers also discourage the use of pesticid...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,beekeeping encourages the conservation of loca...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,as a passive form of agriculture it does not r...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,artisanal honeymaking emphasizes quality and c...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,when open a window will also let air and sound...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a woman can become pregnant and bear children.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a woman has different reproductive organs than...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,a woman is a female human adult.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [41]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(stimuli_df.sentence.values):
#     print(sent, new_sents[idx])
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])
    assert(sent != new_sents[idx])

In [42]:
# check that passage idx are placed in the same spot
for idx, t in enumerate(stimuli_df.passage_index.values):
    print(t, new_passidx[idx])
    if (t != new_passidx[idx]):
        print(t, new_passidx[idx])
    assert(t == new_passidx[idx])

1 1
1 1
1 1
1 1
2 2
2 2
2 2
2 2
3 3
3 3
3 3
4 4
4 4
4 4
4 4
5 5
5 5
5 5
6 6
6 6
6 6
7 7
7 7
7 7
7 7
8 8
8 8
8 8
9 9
9 9
9 9
10 10
10 10
10 10
10 10
11 11
11 11
11 11
12 12
12 12
12 12
13 13
13 13
13 13
14 14
14 14
14 14
14 14
15 15
15 15
15 15
16 16
16 16
16 16
16 16
17 17
17 17
17 17
18 18
18 18
18 18
19 19
19 19
19 19
19 19
20 20
20 20
20 20
21 21
21 21
21 21
22 22
22 22
22 22
22 22
23 23
23 23
23 23
24 24
24 24
24 24
25 25
25 25
25 25
25 25
26 26
26 26
26 26
27 27
27 27
27 27
28 28
28 28
28 28
28 28
29 29
29 29
29 29
30 30
30 30
30 30
31 31
31 31
31 31
31 31
32 32
32 32
32 32
33 33
33 33
33 33
34 34
34 34
34 34
34 34
35 35
35 35
35 35
36 36
36 36
36 36
37 37
37 37
37 37
37 37
38 38
38 38
38 38
39 39
39 39
39 39
40 40
40 40
40 40
41 41
41 41
41 41
42 42
42 42
42 42
43 43
43 43
43 43
43 43
44 44
44 44
44 44
44 44
45 45
45 45
45 45
46 46
46 46
46 46
46 46
47 47
47 47
47 47
48 48
48 48
48 48
49 49
49 49
49 49
49 49
50 50
50 50
50 50
51 51
51 51
51 51
51 51
52 52
52 52
52 52
52 52
53 53


In [43]:
# check that all are unique
assert(len(np.unique(new_sents)) == 627)

### 3. random shuffling of sentences within a topic (=category), no length criteria

In [44]:
def topic_shuffle(stimuli_df):
    '''
    Shuffle sentences within a topic, making sure a sentence does not land in its own spot.
    OBS: it is a problem that passages are interleaved within topic for expt 3. Taken care of in this function.
    OBS: it is problematic to iterate over topics alphabetically. taken care of.
    '''
    np.random.seed(42)
    random.seed(42)
        
    d = {}
    new_sents_243 = []
    new_sents_384 = []
    new_topics_243 = [] # for checking that the topics indeed are correct and NOT changed
    new_topics_384 = [] # for checking that the topics indeed are correct and NOT changed
    stimuli_df_copy = stimuli_df.copy(deep=True)
    seen = set()

    for exp in list(np.unique(stimuli_df_copy["experiment"])):

        # run topics in the order they are seen in the stimulusset (default is alphabetical, which is wrong)
        for topic in (stimuli_df_copy.loc[stimuli_df_copy["experiment"] == exp].passage_category.values): # only take the indices from that exp df    

            if topic not in seen:
                seen.add(topic)

                if exp == '243sentences': # sorted nicely
                    print(f'Running expt {exp} and topic: {topic}')
                    curr_df = stimuli_df_copy.loc[(stimuli_df_copy["passage_category"] == topic)]
                    shuffled_df = curr_df.copy(deep=True)

                    while True:
                        shuffled_df = shuffled_df.sample(frac=1)
                        if not any(curr_df.sentence.values[i] == shuffled_df.sentence.values[i] for i in range(len(curr_df.index.values))):
                            new_sents_243.append(list(shuffled_df.sentence.values))
                            new_topics_243.append(list(shuffled_df.passage_category.values))
                            break

                if exp == '384sentences': # passages interleaved within a topic, make sure indices are somehow retained
                    print(f'Running expt {exp} and topic: {topic}')
                    curr_df = stimuli_df_copy.loc[(stimuli_df_copy["passage_category"] == topic)]
                    shuffled_df = curr_df.copy(deep=True)

                    while True:
                        shuffled_df = shuffled_df.sample(frac=1)
                        if not any(curr_df.sentence.values[i] == shuffled_df.sentence.values[i] for i in range(len(curr_df.index.values))):
                            for idx, orig_idx in enumerate(curr_df.index.values):
        #                         print(idx, orig_idx)
                                d[orig_idx] = list(shuffled_df.index.values)[idx]
                            break
    #         else:
    #             print('Already saw this topic')

    # Flatten
    new_sents_243 = [item for sublist in new_sents_243 for item in sublist]
    new_topics_243 = [item for sublist in new_topics_243 for item in sublist]

    # now, put the sentences into the new df according to the shuffled indices
    stimuli_df_384 = stimuli_df_copy.loc[stimuli_df_copy["experiment"] == '384sentences']

    d_sorted = collections.OrderedDict(sorted(d.items())) # stores the old versus new idx mapping
    for k,v  in d_sorted.items():
        # get the new, shuffled sentence based on the shuffled idx
        sent = (stimuli_df_384.loc[stimuli_df_384.index == v].sentence.values)[0] # get str, not lst
        topic = (stimuli_df_384.loc[stimuli_df_384.index == v].passage_category.values)[0] # get str, not lst
        new_sents_384.append(sent)
        new_topics_384.append(topic)

    # now merge new sents for 243 and 384
    new_sents = new_sents_243 + new_sents_384
    new_topics = new_topics_243 + new_topics_384
    
    return new_sents, new_topics

In [45]:
new_sents, new_topics = topic_shuffle(stimuli_df)

#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in new_sents]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-withintopic.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

Running expt 243sentences and topic: beekeeping
Running expt 243sentences and topic: dreams
Running expt 243sentences and topic: gambling
Running expt 243sentences and topic: hurricane
Running expt 243sentences and topic: ice_cream
Running expt 243sentences and topic: lawn_mower
Running expt 243sentences and topic: astronaut
Running expt 243sentences and topic: computer_graphics
Running expt 243sentences and topic: law_school
Running expt 243sentences and topic: pharmacist
Running expt 243sentences and topic: stress
Running expt 243sentences and topic: tuxedo
Running expt 243sentences and topic: blindness
Running expt 243sentences and topic: taste
Running expt 243sentences and topic: bone_fracture
Running expt 243sentences and topic: infection
Running expt 243sentences and topic: opera
Running expt 243sentences and topic: painter
Running expt 243sentences and topic: owl
Running expt 243sentences and topic: polar_bear
Running expt 243sentences and topic: castle
Running expt 243sentences

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,artisanal honeymaking emphasizes quality and c...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,beekeeping encourages the conservation of loca...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,the beekeeper checks honey stores pollen suppl...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,bees crawl across his bare arms and hands but ...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,as a passive form of agriculture it does not r...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,a window is a transparent opening in a wall to...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a woman can become pregnant and bear children.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a boy becomes a man after passing through pube...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,at puberty the body of a girl develops into th...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [47]:
# check that no sentences are placed in the same spot
for idx, sent in enumerate(stimuli_df.sentence.values):
#     print(sent, new_sents[idx])
#     print('\n')
    if (sent == new_sents[idx]):
        print(sent, new_sents[idx])
    assert(sent != new_sents[idx])

In [48]:
# check that topics are placed in the same spot
for idx, t in enumerate(stimuli_df.passage_category.values):
    print(t, new_topics[idx])
    if (t != new_topics[idx]):
        print(t, new_topics[idx])
    assert(t == new_topics[idx])

beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
beekeeping beekeeping
dreams dreams
dreams dreams
dreams dreams
dreams dreams
dreams dreams
dreams dreams
dreams dreams
dreams dreams
dreams dreams
dreams dreams
gambling gambling
gambling gambling
gambling gambling
gambling gambling
gambling gambling
gambling gambling
gambling gambling
gambling gambling
gambling gambling
gambling gambling
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
hurricane hurricane
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
ice_cream ice_cream
lawn_mower lawn_mower
lawn_mower lawn_

### 3a. random shuffling of sentences within a topic (=category), with length criteria