# Script for creating information-loss manipulation datasets from the Pereira2018 fMRI stimuli

In [1]:
save = True

In [2]:
import re
from pathlib import Path
from os.path import abspath
import os
import numpy as np
import random
import pickle
import csv
import subprocess

In [3]:
print(os.getcwd())

/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/ressources/stimuli_creation


In [4]:
importpath = abspath('../..')
os.chdir(importpath)
print(os.getcwd())

/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp


In [5]:
# set seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Load base stimulus dataframe (Pereira 2018)

In [6]:
from neural_nlp.benchmarks.neural import *
import neural_nlp
from neural_nlp.stimuli import StimulusSet
import xarray as xr

# (identifier, constructor) pairs of the benchmarks this notebook needs.
benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
]
# Re-key the pool by identifier. `identifier` and `ctr` are bound as lambda defaults so that
# every entry keeps its own values instead of the last ones seen by the comprehension.
# NOTE(review): LazyLoad presumably defers calling the constructor until the benchmark is
# first accessed -- confirm against its definition (it arrives via the star import above).
benchmark_pool = {identifier: LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier))
                  for identifier, ctr in benchmark_pool}

# fetch stimulus set
benchmark = benchmark_pool['Pereira2018-encoding']
# The stimulus dataframe is stored in the attrs of the benchmark's (private) target assembly.
stimuli_df = benchmark._target_assembly.attrs['stimulus_set']
stimuli_df

Loading lookup from /om2/user/ckauf/anaconda/envs/perturbedenv/lib/python3.6/site-packages/brainio_collection/lookup.csv
/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/neural_nlp/../ressources/stimuli


 We're running in the NEW version of the implementations.py script.




  xr_data.set_index(append=True, inplace=True, **coords_d)


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [7]:
# benchmark content
benchmark._target_assembly.values.shape
benchmark._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

In [8]:
# Switch into the stimuli-creation folder. The path is built from the current working
# directory, so this cell expects the cwd to be the repository root (set by the earlier
# chdir cell); re-running it on its own appends the sub-path a second time.
stimuli_path = os.path.join(os.getcwd(),'ressources/stimuli_creation')
os.chdir(stimuli_path)
print(os.getcwd())

# Folder that receives the pickled perturbed stimulus dataframes (resolved relative to the
# new working directory); it is created if it does not exist yet.
savedir = abspath('../scrambled_stimuli_dfs')
print(savedir)
os.makedirs(savedir, exist_ok=True)

/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/ressources/stimuli_creation
/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/ressources/scrambled_stimuli_dfs


# Create different perturbed versions of the benchmark

`NOTE: "Pereira2018_scrambled.txt" is created via running "get_original_sentenceset.ipynb"`

In [9]:
def get_original_sentenceset(filename, stimuli_dir=None):
    """Load the condition-0 sentences from a tab-separated stimulus file and normalise them.

    Every non-empty row must carry an integer id in its first column and the sentence in
    its second column; only rows whose id equals 0 are kept (presumably the unscrambled
    condition -- confirm against "get_original_sentenceset.ipynb").
    Each kept sentence is lower-cased, stripped of every character that is not a word
    character, whitespace, apostrophe, hyphen, dollar sign or percent sign, and finally
    terminated with a single period.

    Input:
    * filename: name of the stimulus file, resolved relative to `stimuli_dir`
    * stimuli_dir: directory that contains the file; defaults to the notebook-level
      `stimuli_path`, so existing single-argument calls behave as before
    Output:
    * list of normalised sentences, in file order
    """
    if stimuli_dir is None:
        stimuli_dir = stimuli_path

    # newline="" lets the csv module do its own line-ending handling, as its docs require.
    with open(os.path.join(stimuli_dir, filename), "r", newline="") as f:
        # csv.reader yields [] for blank lines; drop them instead of failing on row[0].
        rows = [row for row in csv.reader(f, delimiter="\t") if row]

    selected = [row[1] for row in rows if int(row[0]) == 0]
    return [re.sub(r'[^\w\d\s\'\-\$\%]+', '', sent.lower()) + "." for sent in selected]

In [10]:
#This is the list of stimuli we create the perturbations from
Original = get_original_sentenceset("Pereira2018_scrambled.txt")
print(Original[:5])

['beekeeping encourages the conservation of local habitats.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'artisanal beekeepers go to extremes for their craft but their product is worth the effort.']


In [11]:
# Continue with `Original` rather than the raw Pereira2018 sentence list; the comparison in the next cell shows why:

In [12]:
# Normalise the benchmark sentences the same way `Original` was built:
# lowercase, drop disallowed punctuation, then append the sentence-final period.
pereira_sents = [
    re.sub(r'[^\w\d\s\'\-\$\%]+', '', sent.lower()) + "."
    for sent in stimuli_df.sentence.values
]

# Report every position at which the two versions disagree before any clean-up.
for ind, sent in enumerate(pereira_sents):
    if Original[ind] != sent:
        print(f"{ind} | {pereira_sents[ind]} | {Original[ind]}")
print("*"*30)

print("Adjusting for differences: ")

# Name the whitespace artefact found in each sentence; only the first matching
# category is reported per sentence.
for ind, sent in enumerate(pereira_sents):
    if "  " in sent:
        print(f"double space for sent {ind}: {sent}")
    elif " ." in sent:
        print(f"space before period for sent {ind}: {sent}")
    elif ". " in sent:
        print(f"space after period for sent {ind}: {sent}")
print("*"*30)

# First collapse runs of spaces, then remove the space directly before/after a period.
pereira_sents = [
    re.sub(r'\. | \.', '.', re.sub(r' +', ' ', sent))
    for sent in pereira_sents
]

print("Changed!")
# Nothing should be printed here any more; the assert makes that a hard requirement.
for ind, sent in enumerate(pereira_sents):
    if Original[ind] != sent:
        print(f"{ind} | {pereira_sents[ind]} | {Original[ind]}")
assert pereira_sents == Original
print("Asserted, now they're the same")

209 | upon leaving the train station we saw our hotel looming above  a magnificent castle built on a cliff. | upon leaving the train station we saw our hotel looming above a magnificent castle built on a cliff.
383 | a foot is a body part on the end of a leg . | a foot is a body part on the end of a leg.
392 | forks are usually  made of metal or plastic if disposable. | forks are usually made of metal or plastic if disposable.
434 | a knife can be used to attack by slashing stabbing or throwing . | a knife can be used to attack by slashing stabbing or throwing.
504 | ravens feed on carrion insects berries or small animals . | ravens feed on carrion insects berries or small animals.
506 | in folklore ravens are birds of ill-omen and also tricksters . | in folklore ravens are birds of ill-omen and also tricksters.
562 | farmers often drain swamps to produce fertile arable land . | farmers often drain swamps to produce fertile arable land.
******************************
Adjusting for diff

## Ablation | O'Connor & Andreas (2021)

In [13]:
import nltk
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /home/ckauf/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [14]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [15]:
def pos_tag_sentences(list_of_sentences):
    """POS-tag each sentence after splitting it on whitespace.

    Input:
    * list_of_sentences: iterable of sentence strings
    Output:
    * list with one entry per sentence; each entry is whatever nltk.pos_tag returns for
      that sentence's whitespace-separated tokens (a list of (token, tag) tuples)
    """
    # Split on whitespace only: the NLTK word tokenizer would detach possessive 's
    # (and similar clitics) into separate tokens, which would need a work-around.
    whitespace = re.compile(r'\s+')

    tagged_sentences = []
    for sentence in list_of_sentences:
        tokens = whitespace.split(sentence)
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences
    
tagged = pos_tag_sentences(Original)
tagged[1]

[('it', 'PRP'),
 ('is', 'VBZ'),
 ('in', 'IN'),
 ('every', 'DT'),
 ("beekeeper's", 'NN'),
 ('interest', 'NN'),
 ('to', 'TO'),
 ('conserve', 'VB'),
 ('local', 'JJ'),
 ('plants', 'NNS'),
 ('that', 'WDT'),
 ('produce', 'VBP'),
 ('pollen.', 'NNS')]

In [16]:
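# Note: sentences are split on whitespace only, so dependent parts such as possessive markers ('s) stay attached to their word; the check below prints any sentence that still contains a token tagged POS: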

In [17]:
# Print every sentence once per token that carries the possessive-marker tag "POS",
# followed by a separator line.
for ind, sent in enumerate(tagged):
    for _token, curr_tag in sent:
        if curr_tag != "POS":
            continue
        print(ind, sent)
        print("*"* 30)

In [18]:
def get_perturbed_datasets(sentences,perturb_type):
    """Reduce every sentence to the words of the requested part-of-speech classes.

    Input:
    * sentences: original sentence list (already lower-cased and stripped of punctuation
      for the permute_sentences.py script)
    * perturb_type: what should stay in the stimuli?
        nouns: only nouns (incl. pronouns)
        verbs: only verbs
        nounsverbs: only nouns and verbs
        nounsverbsadj: only nouns, verbs and adjectives
        contentwords: nouns, verbs, adjectives and adverbs
        functionwords: every word that is NOT a content word
    Output:
    * list of perturbed sentences, one per input sentence; the kept words are joined by
      single spaces and followed by a period (a sentence with no kept word becomes ".")
    Raises:
    * ValueError if perturb_type is not one of the conditions listed above
    """
    # Tag prefixes per word class; matched from the start of the Penn Treebank tag.
    n = ['NN.*', 'PRP.*'] #similar to O'Connor & Andreas (2021)
    v = ['VB.*']
    a = ['JJ.*']
    adv = ['RB.*']

    pos_lists = {
        'nouns': n,
        'verbs': v,
        'nounsverbs': n + v,
        'nounsverbsadj': n + v + a,
        'contentwords': n + v + a + adv,
        'functionwords': n + v + a + adv, #excluded (not kept) in the filter below
    }

    # Fail fast and explicitly, before any tagging work is done.
    if perturb_type not in pos_lists:
        raise ValueError(
            f"Unknown condition {perturb_type!r}; expected one of {sorted(pos_lists)}"
        )

    # The pattern is the same for every token, so compile it once.
    tag_pattern = re.compile("|".join(pos_lists[perturb_type]))
    keep_matching = perturb_type != "functionwords"

    tagged = pos_tag_sentences(sentences)

    perturbed_sents = []
    for sent in tagged:
        kept_words = [
            word.rstrip(".")  # the sentence-final period is re-added after joining
            for word, tag in sent
            if (tag_pattern.match(tag) is not None) == keep_matching
        ]
        perturbed_sents.append(' '.join(kept_words) + ".")

    return perturbed_sents

In [19]:
## function test

In [20]:
noun_sentences = get_perturbed_datasets(Original,perturb_type='nouns')
noun_sentences[:20]

['conservation habitats.',
 "it beekeeper's interest plants pollen.",
 'form agriculture it vegetation way crops.',
 'beekeepers use pesticides crops they honeybees.',
 'beekeepers their craft their product effort.',
 'honey-making quality character quantity consistency.',
 'honey beekeepers micromanagers their honeybees.',
 'they fields nectar ways honey.',
 'beekeeper hum bees air.',
 'beekeeper honey stores supplies bee nursery.',
 "bees his bare arms hands they they're gentle.",
 'i dream exams college.',
 'my dream day my exam i.',
 "i've reading assignments.",
 'i people nightmare.',
 'morning participants study their dream experience night.',
 'they they dreams dream its intensity.',
 'participants dream category dream dream nightmare.',
 'night we our minds we.',
 'we night cycles ninety minutes.']

In [21]:
noun_sentences = get_perturbed_datasets(Original,perturb_type='functionwords')
noun_sentences[:10]

['the of.',
 'in every to that.',
 'as a of that to for.',
 'the of on because could the.',
 'to for but worth the.',
 'and over and.',
 'to the of.',
 'the when and the to.',
 'as the the the of 40000 the.',
 'the and the.']

In [22]:
def get_dataset(stimuli_df, Original, perturb_type):
    """Build the perturbed version of the benchmark dataframe for one condition.

    Input:
    * stimuli_df: original benchmark dataframe (left unmodified; a copy is changed)
    * Original: original stimuli (list of sentences, one per dataframe row)
    * perturb_type: perturbation type > what should stay in the stimuli?
    Output:
    * copy of the dataframe whose "sentence" column holds the perturbed sentences;
      if the notebook-level flag `save` is set, the copy is also pickled to
      "<savedir>/stimuli_<perturb_type>.pkl"
    """
    new_sentences = get_perturbed_datasets(Original, perturb_type)

    result_df = stimuli_df.copy()
    result_df["sentence"] = new_sentences

    if not save:
        return result_df

    fname = f"{savedir}/stimuli_{perturb_type}.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(result_df, fout)
    return result_df

## Create datasets (mostly O'Connor & Andreas (2021))

In [23]:
#loop over perturbation types to create O'Connor & Andreas (2021) datasets
perturb_types = ["contentwords", "nouns", "verbs", "nounsverbs", "nounsverbsadj", "functionwords"]

# get_dataset builds (and, when `save` is set, pickles) one dataframe per condition.
for perturb in perturb_types:
    perturb_df = get_dataset(stimuli_df, Original, perturb_type=perturb)
    print(f"Created dataset for perturbation type: {perturb}")
# Only the dataframe of the last condition in the list ("functionwords") is displayed.
perturb_df   


Created dataset for perturbation type: contentwords
Created dataset for perturbation type: nouns
Created dataset for perturbation type: verbs
Created dataset for perturbation type: nounsverbs
Created dataset for perturbation type: nounsverbsadj
Created dataset for perturbation type: functionwords


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,the of.,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,in every to that.,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,as a of that to for.,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,the of on because could the.,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,to for but worth the.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,some to.,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a a.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a as a.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,a can and.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


## Create random noun replacement condition

In [24]:
def get_random_noun_dataset(sentences):
    """Replace the nouns of every sentence by nouns drawn at random from the whole dataset.

    Input: list of original sentences
    Output: perturbed version of each sentence with only nouns (same number as in the
    original sentence), randomly drawn without replacement from the pool of all nouns
    (incl. pronouns) in the dataset, so every noun occurrence is used exactly once.

    Side effect: re-seeds the global `random` and `numpy.random` generators with 42.
    Raises AssertionError if a sentence happens to draw exactly its own set of nouns.
    """

    #set random seed for reproducibility
    np.random.seed(42)
    random.seed(42)

    tagged = pos_tag_sentences(sentences)

    # noun + pronoun tags, similar to O'Connor & Andreas (2021); compiled once for all tokens
    noun_pattern = re.compile("|".join(['NN.*', 'PRP.*']))

    #keep track of which nouns were in which sentence for checking later
    nouns_in_sentences = [
        [word.rstrip(".") for word, tag in sent if noun_pattern.match(tag)]
        for sent in tagged
    ]
    #pool of all nouns, in dataset order, from which the replacements are drawn
    all_nouns = [noun for sentence_nouns in nouns_in_sentences for noun in sentence_nouns]

    perturbed_sentences = []
    for original_nouns in nouns_in_sentences:
        random_nouns = random.sample(all_nouns, len(original_nouns))
        # A sentence without nouns trivially "draws" its own (empty) set, so only
        # check sentences that actually contain nouns.
        if original_nouns:
            assert set(random_nouns) != set(original_nouns), \
                "Random draw reproduced the original nouns of a sentence."
        perturbed_sentences.append(' '.join(random_nouns) + ".")
        #remove selected nouns from the pool (first occurrence each), i.e. sample without replacement
        for noun in random_nouns:
            all_nouns.remove(noun)

    assert len(all_nouns) == 0, f"Not all words from the dataset have been used. Length of word list is {len(all_nouns)}!"

    return perturbed_sentences

# Build the random-noun condition from the same sentence list as all other conditions.
random_nouns = get_random_noun_dataset(Original)

# Same dataframe layout as the other conditions: copy the benchmark frame and
# replace only its "sentence" column.
perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = random_nouns

# Pickle next to the other perturbed datasets when saving is enabled.
if save:
    fname = f"{savedir}/stimuli_randomnouns.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)

perturbed_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,its doctors.,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,images ligaments chamber documentary jacket.,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,factor compartments world spears density studies.,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,horror computer sea attack food collar.,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,dreams their glimpse water firearm forest.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,automobile he bear vapor.,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,pianist people.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,law freezes.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,driver villages.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [25]:
#check that the nouns and random-nouns datasets have the same number of words per sentence:
# `perturbed_df` is the random-noun dataframe created in the previous cell.
check_randomnouns_df = perturbed_df
# NOTE: get_dataset also re-writes stimuli_nouns.pkl here when `save` is set.
check_nouns_df = get_dataset(stimuli_df, Original, perturb_type="nouns")
    
# number of whitespace-separated words in each sentence of either dataset
sent_len_randomnouns = [len(elm.split()) for elm in list(check_randomnouns_df["sentence"])]
sent_len_nouns = [len(elm.split()) for elm in list(check_nouns_df["sentence"])]

assert sent_len_randomnouns == sent_len_nouns
print("Done")

Done
