# Script for creating sentence-meaning manipulation datasets from the Pereira2018 fMRI stimuli

In [121]:
import re
from pathlib import Path
from os.path import abspath
import os
import numpy as np
import random
import pickle
import csv
import subprocess
from random import shuffle

In [122]:
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation


In [123]:
importpath = abspath('../..')
os.chdir(importpath)
print(os.getcwd())

/Users/gt/Documents/GitHub/perturbed-neural-nlp


In [125]:
#add seeds for reproducability
np.random.seed(42)
random.seed(42)

# settings
save = True # if storing pkl dataframes

# Load base stimulus dataframe (Pereira 2018)

In [5]:
from neural_nlp.benchmarks.neural import *
import neural_nlp
from neural_nlp.stimuli import StimulusSet
import xarray as xr

benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
]
benchmark_pool = {identifier: LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier))
                  for identifier, ctr in benchmark_pool}

# fetch stimulus set
benchmark = benchmark_pool['Pereira2018-encoding']
stimuli_df = benchmark._target_assembly.attrs['stimulus_set']
stimuli_df

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)


Loading lookup from /opt/anaconda3/lib/python3.8/site-packages/brainio_collection/lookup.csv
/Users/gt/Documents/GitHub/perturbed-neural-nlp/neural_nlp/../ressources/stimuli


 We're running in the NEW version of the implementations.py script.




  xr_data.set_index(append=True, inplace=True, **coords_d)
  elif isinstance(data, pd.Panel):


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [6]:
# benchmark content
benchmark._target_assembly.values.shape
benchmark._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

In [126]:
stimuli_path = os.path.join(os.getcwd(),'ressources/stimuli_creation')
os.chdir(stimuli_path)
print(os.getcwd())

savedir = abspath('../scrambled_stimuli_dfs')
print(savedir)
os.makedirs(savedir, exist_ok=True)

/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/stimuli_creation
/Users/gt/Documents/GitHub/perturbed-neural-nlp/ressources/scrambled_stimuli_dfs


# Create different perturbed versions of the benchmark

`get original dataset in correct formatting. NOTE: "stim_243sentences_scrambled.txt" and "stim_384sentences_scrambled.txt" are created via running "get_original_sentenceset.ipynb"`

In [8]:
def get_original_sentenceset(filename):
    with open(os.path.join(stimuli_path,filename),"r") as f:
        reader = csv.reader(f, delimiter="\t")
        sentences = list(reader)
    Original = [sentence[1] + '.' for sentence in sentences if int(sentence[0]) == 0]
    
    return Original

In [9]:
Original_243 = get_original_sentenceset("stim_243sentences_scrambled.txt")
Original_384 = get_original_sentenceset("stim_384sentences_scrambled.txt")

Original = Original_243 + Original_384

In [10]:
#This is the list of stimuli we create the perturbations from
original_stimuli = Original
print(original_stimuli[:5])

['beekeeping encourages the conservation of local habitats.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'artisanal beekeepers go to extremes for their craft but their product is worth the effort.']


## Create sentence shuffling conditions

### 1. random shuffling of sentences across the dataset, not respecting passages, sentence length, etc.

In [13]:
shuffled_sentences = original_stimuli.copy()
shuffle(shuffled_sentences)

#quick test that all sentences have been shuffled
for ind in range(len(original_stimuli)):
    if original_stimuli[ind] != shuffled_sentences[ind]:
        continue
    else:
        print(ind)
print("Done")

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-random.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

100
106
168
372
Done
['gloves to protect against cold are made of wool or lined waterproof material.', 'the driver steers the car on roads other passengers just sit.', 'a knife can be used to attack by slashing stabbing or throwing.', 'the computer graphics specialist works with doctors to visualize medical conditions and surgical procedures.', 'i finally came to a stop at a flat part of the slope.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,gloves to protect against cold are made of woo...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,the driver steers the car on roads other passe...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,a knife can be used to attack by slashing stab...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,the computer graphics specialist works with do...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,i finally came to a stop at a flat part of the...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,the adults set up climbing routes appropriate ...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,the building can have a garage a laundry facil...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,the wheels have rubber tires with an inner tub...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,good data on the social and economic effects o...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


### 1b. random shuffling of sentences across the dataset, making sure that sentences from the same topic are not placed in the original slot

Info from Pereira et al., 2018: Experiment 2 used 96
passages, each consisting of 4 sentences about a particular concept, spanning a
broad range of content areas from 24 broad topics (e.g., professions, clothing, birds,
musical instruments, natural disasters, crimes, etc.), with 4 passages per topic (e.g.,
clarinet, accordion, piano, and violin for musical instruments; Supplementary
Figure 1). All passages were Wikipedia-style texts that provided basic information
about the relevant concept. Experiment 3 used 72 passages, each consisting of 3 or
4 sentences about a particular concept. As in experiment 2, the passages spanned a
broad range of content areas from 24 broad topics, unrelated to the topics in
experiment 2 (e.g., skiing, dreams, opera, bone fractures, etc.), with 3 passages per
topic

In [106]:
print(len(np.unique(stimuli_df.passage_label)))
print(len(np.unique(stimuli_df.passage_index)))
print(len(np.unique(stimuli_df.passage_category)))

120
96
48


In [127]:
def shuffle_topic_criteria(stimuli_df):
    '''
    Shuffle sentences across the entire set, but do not allow sentences from the same topic (passage category)
    to land in the same spot.
    '''
    np.random.seed(42)
    random.seed(42)
    
    avail_sents = stimuli_df.copy(deep=True) # for storing(=popping) which sentences have already been used
    stimuli_df_copy = stimuli_df.copy(deep=True)
    new_sents = []
    new_topics = [] # for asserting that topics did not repeat
    for i, sent in enumerate(stimuli_df.itertuples()):
        # find a sentence that is from a different topic & that has NOT been used before
        q_str = f'`passage_category` != "{sent.passage_category}"'
        sents_to_pick_from = avail_sents.query(q_str)

        # pick a random sentence 
        rand_idx = np.random.choice(sents_to_pick_from.index.values)
        picked_sent = stimuli_df_copy.iloc[rand_idx]

        # store the picked sentence topic
        new_topics.append(picked_sent.passage_category)

        # remove from avail_sents
        avail_sents = avail_sents.drop(index=rand_idx)
        assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

        # append to the new sents list
        new_sents.append(picked_sent.sentence)

    return new_sents, new_topics

In [128]:
# obtain new sents 
new_sents, new_topics = shuffle_topic_criteria(stimuli_df)

In [129]:
# show that topics are now different!
for idx, topic in enumerate(stimuli_df.passage_category.values):
    print(topic, new_topics[idx])
    assert(topic != new_topics[idx])

beekeeping tuxedo
beekeeping dwelling
beekeeping vehicles_transport
beekeeping tuxedo
beekeeping law_school
beekeeping hurricane
beekeeping taste
beekeeping bird
beekeeping rock_climbing
beekeeping disaster
beekeeping crime
dreams pharmacist
dreams kitchen_utensil
dreams tuxedo
dreams bone_fracture
dreams landscape
dreams body_part
dreams fish
dreams vegetable
dreams tool
dreams castle
gambling vegetable
gambling painter
gambling music
gambling building_part
gambling hurricane
gambling furniture
gambling disaster
gambling appliance
gambling astronaut
gambling profession
hurricane weapon
hurricane castle
hurricane bird
hurricane building_part
hurricane weapon
hurricane clothing
hurricane bone_fracture
hurricane profession
hurricane dreams
hurricane owl
ice_cream profession
ice_cream dwelling
ice_cream disaster
ice_cream dreams
ice_cream fruit
ice_cream body_part
ice_cream building_part
ice_cream disaster
ice_cream stress
ice_cream tool
lawn_mower bird
lawn_mower crime
lawn_mower furnitu

In [132]:
perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = new_sents

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-topic-criteria.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(new_sents[:5])
perturbed_df

['Clearly he had it tailored, as it fit the man perfectly.', 'A log cabin is a small house built from round logs.', 'People use bicycles for transportation, recreation and racing.', 'A tuxedo jacket is a tailless dinner jacket with black silk lapels.', 'He wanted to change things for the better.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,"Clearly he had it tailored, as it fit the man ...",0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,A log cabin is a small house built from round ...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"People use bicycles for transportation, recrea...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,A tuxedo jacket is a tailless dinner jacket wi...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,He wanted to change things for the better.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,The elephant flaps its large ears to cool the ...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,The adults set up climbing routes appropriate ...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,"As a painter, I learned to focus less on the a...",381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,Some people collect butterflies because of the...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [222]:
stimuli_df_copy = stimuli_df.copy(deep=True)
stimuli_df_copy['sent_len'] = [len(x.split(' ')) for x in stimuli_df_copy.sentence]
avail_sents = stimuli_df_copy.copy(deep=True) # for storing(=popping) which sentences have already been used


new_sents = []
new_topics = [] # for asserting that topics did not repeat
len_threshold = 1 # start by 1
stored_len_thresholds = [1] # start at 1
stored_idx_threshold = [0] # for storing at which idx the threshold changed. start at 0

for i, sent in enumerate(stimuli_df_copy.itertuples()):
#     print(i)
    # find a sentence that is from a different topic & that has NOT been used before
    q_str = f'`passage_category` != "{sent.passage_category}"'
    avail_topic_criteria = avail_sents.query(q_str)
    
    # add length requirement
    q_str_len = f'sent_len >= {sent.sent_len - len_threshold} & sent_len <= {sent.sent_len + len_threshold}'
    print(f'Current sent length is {sent.sent_len}')
    avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)
    print(f'{len(avail_topic_AND_len_criteria)} sentences available!')
    
    if len(avail_topic_AND_len_criteria) == 0: # if no sentences available
        len_threshold += 1
        stored_len_thresholds.append(len_threshold)
        stored_idx_threshold.append(i)
        print(f'________________ Changing length threshold to {len_threshold} ___________________')
        # rerun for the current index, so all sentences will be paired
        q_str_len = f'sent_len >= {sent.sent_len - len_threshold} & sent_len <= {sent.sent_len + len_threshold}'
        avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)
        print(f'{len(avail_topic_AND_len_criteria)} sentences available!')
        
        # it might happen again (in sentence number 624)
        if len(avail_topic_AND_len_criteria) == 0: # if still no sentences available
#             len_threshold += 1
            print(f'________________ LEN THRESHOLD CHANGE DID NOT HELP ___________________')
            print(f'{i + 1}/{len(stimuli_df_copy)+1} sentences have been assigned, omitting length threshold criteria')
            # rerun for the current index, so all sentences will be paired
            avail_topic_AND_len_criteria = avail_topic_criteria
            print(f'{len(avail_topic_AND_len_criteria)} sentences available after omitting length criteria!')
    
    # pick a random sentence 
    rand_idx = np.random.choice(avail_topic_AND_len_criteria.index.values)
    picked_sent = stimuli_df_copy.iloc[rand_idx]

    # store the picked sentence topic
    new_topics.append(picked_sent.passage_category)

    # remove from avail_sents
    avail_sents = avail_sents.drop(index=rand_idx)
    assert(len(avail_sents.loc[avail_sents.index == rand_idx].sentence.values) == 0) # assert that the chosen sentence was removed 

    # append to the new sents list
    new_sents.append(picked_sent.sentence)

Current sent length is 7
27 sentences available!
Current sent length is 13
262 sentences available!
Current sent length is 20
11 sentences available!
Current sent length is 15
138 sentences available!
Current sent length is 15
137 sentences available!
Current sent length is 10
210 sentences available!
Current sent length is 11
270 sentences available!
Current sent length is 16
96 sentences available!
Current sent length is 15
135 sentences available!
Current sent length is 11
269 sentences available!
Current sent length is 15
134 sentences available!
Current sent length is 14
203 sentences available!
Current sent length is 19
17 sentences available!
Current sent length is 9
126 sentences available!
Current sent length is 10
207 sentences available!
Current sent length is 17
51 sentences available!
Current sent length is 15
133 sentences available!
Current sent length is 19
15 sentences available!
Current sent length is 15
132 sentences available!
Current sent length is 12
298 sentences

Current sent length is 7
20 sentences available!
Current sent length is 9
101 sentences available!
Current sent length is 16
59 sentences available!
Current sent length is 12
227 sentences available!
Current sent length is 10
175 sentences available!
Current sent length is 10
174 sentences available!
Current sent length is 12
225 sentences available!
Current sent length is 13
174 sentences available!
Current sent length is 13
173 sentences available!
Current sent length is 13
172 sentences available!
Current sent length is 12
224 sentences available!
Current sent length is 16
54 sentences available!
Current sent length is 15
76 sentences available!
Current sent length is 17
24 sentences available!
Current sent length is 19
6 sentences available!
Current sent length is 13
172 sentences available!
Current sent length is 12
222 sentences available!
Current sent length is 11
211 sentences available!
Current sent length is 19
5 sentences available!
Current sent length is 16
51 sentences ava

Current sent length is 13
170 sentences available!
Current sent length is 17
38 sentences available!
Current sent length is 12
188 sentences available!
Current sent length is 9
120 sentences available!
Current sent length is 13
165 sentences available!
Current sent length is 12
186 sentences available!
Current sent length is 11
189 sentences available!
Current sent length is 15
90 sentences available!
Current sent length is 15
89 sentences available!
Current sent length is 15
88 sentences available!
Current sent length is 15
89 sentences available!
Current sent length is 12
184 sentences available!
Current sent length is 16
56 sentences available!
Current sent length is 13
160 sentences available!
Current sent length is 13
158 sentences available!
Current sent length is 13
157 sentences available!
Current sent length is 12
179 sentences available!
Current sent length is 13
155 sentences available!
Current sent length is 11
182 sentences available!
Current sent length is 10
156 sentence

Current sent length is 10
48 sentences available!
Current sent length is 12
48 sentences available!
Current sent length is 14
35 sentences available!
Current sent length is 13
40 sentences available!
Current sent length is 10
44 sentences available!
Current sent length is 15
32 sentences available!
Current sent length is 10
43 sentences available!
Current sent length is 13
37 sentences available!
Current sent length is 10
41 sentences available!
Current sent length is 14
33 sentences available!
Current sent length is 11
41 sentences available!
Current sent length is 13
34 sentences available!
Current sent length is 10
37 sentences available!
Current sent length is 14
30 sentences available!
Current sent length is 12
38 sentences available!
Current sent length is 11
40 sentences available!
Current sent length is 11
39 sentences available!
Current sent length is 9
32 sentences available!
Current sent length is 11
36 sentences available!
Current sent length is 9
31 sentences available!
Cu

In [223]:
stored_idx_threshold

[0, 226, 610, 615, 621, 622, 623, 624, 625, 626]

In [224]:
stored_len_thresholds

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [192]:
avail_sents = stimuli_df_copy.copy(deep=True) # for storing(=popping) which sentences have already been used
avail_sents

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category,sent_len
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,7
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,13
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,20
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,15
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping,15
...,...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part,9
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human,7
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human,8
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human,8


In [193]:
np.min([len(x.split(' ')) for x in stimuli_df_copy.sentence])

5

In [194]:
avail_topic_criteria = avail_sents.query(q_str)

In [205]:
q_str_len = f'sent_len >= {sent.sent_len - 1} & sent_len <= {sent.sent_len + 1}'

In [206]:
print(f'Current sent length is {sent.sent_len}')

Current sent length is 9


In [207]:
avail_topic_AND_len_criteria = avail_topic_criteria.query(q_str_len)

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category,sent_len
5,Artisanal honey-making emphasizes quality and ...,5,243sentences.5,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping,10
13,I've never even done any of the reading assign...,13,243sentences.13,243sentences,243sentences.dreams,4,dreams,dreams,9
14,I know many other people who have the same nig...,14,243sentences.14,243sentences,243sentences.dreams,4,dreams,dreams,10
30,Gambling has become very big but controversial...,30,243sentences.30,243sentences,243sentences.gambling,9,gambling,gambling,8
36,They dump heavy rains that can trigger floods ...,36,243sentences.36,243sentences,243sentences.hurricane,11,hurricane,hurricane,10
...,...,...,...,...,...,...,...,...,...
598,A tropical cyclone can move inland and serious...,355,384sentences.355,384sentences,384sentences.disaster,89,Tropical_cyclone,disaster,10
613,Water freezes at a low temperature and becomes...,370,384sentences.370,384sentences,384sentences.drink_non_alcoholic,93,Water,drink_non_alcoholic,10
616,Whales breathe through blowholes on their head...,373,384sentences.373,384sentences,384sentences.animal,94,Whale,animal,9
618,"Whales have been hunted for meat, whale oil an...",375,384sentences.375,384sentences,384sentences.animal,94,Whale,animal,10


In [188]:
avail_topic_criteria.sent_len >

SyntaxError: invalid syntax (<ipython-input-188-d19f59ad92d7>, line 1)

### 2. random shuffling of sentences within a passage

In [14]:
#index by experiment and passage_label/passage_index >> shuffle within those
def get_shuffled_within_passage(stimuli_df):
    np.random.seed(42)
    random.seed(42)

    shuffled_sentences = []
    for exp in list(np.unique(stimuli_df["experiment"])):
        for ind in list(np.unique(stimuli_df["passage_index"])):
            if exp == "243sentences" and ind > 72: #hot fix, 243sentences only has 72 passages
                continue
            else:
                curr_df = stimuli_df.loc[(stimuli_df["experiment"] == exp) & (stimuli_df["passage_index"] == ind)]
                curr_sent = list(curr_df["sentence"])
                while True:
                    shuffle(curr_sent)
                    if curr_sent != list(curr_df["sentence"]):
                        break
                shuffled_sentences += curr_sent
    return shuffled_sentences
shuffled_sentences = get_shuffled_within_passage(stimuli_df)

#assimilate sentences with Mollica-style (i.e., strip sentence-internal punctuation, lowercase)
shuffled_sentences = [re.sub(r'[^\w\d\s\']+', '', sent.lower()) + "." for sent in shuffled_sentences]

perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = shuffled_sentences

if save:
    fname = f"{savedir}/stimuli_sentenceshuffle-withinpassage.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    
print(shuffled_sentences[:5])
perturbed_df

['as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'beekeeping encourages the conservation of local habitats.', 'they scout the fields know when nectar flows and select the best ways to extract honey.']


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,as a passive form of agriculture it does not r...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,it is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,beekeepers also discourage the use of pesticid...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,beekeeping encourages the conservation of loca...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,they scout the fields know when nectar flows a...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a woman can become pregnant and bear children.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a woman is a female human adult.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,a woman is stereotypically seen as a caregiver.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human
