# Script for creating information-loss manipulation datasets from the Pereira2018 fMRI stimuli

In [1]:
import re
from pathlib import Path
from os.path import abspath
import os
import numpy as np
import random
import pickle
import csv
import subprocess

In [2]:
# Show the directory the notebook was started from; the relative path used in the
# next cell (abspath('../..')) is resolved against it.
print(os.getcwd())

/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/ressources/stimuli_creation


In [3]:
# Step two directory levels up from the starting directory and make that the
# working directory (presumably so that the `neural_nlp` package is importable).
importpath = abspath(os.path.join(os.pardir, os.pardir))
os.chdir(importpath)
print(os.getcwd())

/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp


In [4]:
# Seed both random number generators (Python's `random` and NumPy's global one)
# for reproducibility.
random.seed(42)
np.random.seed(42)

# Load base stimulus dataframe (Pereira 2018)

In [5]:
from neural_nlp.benchmarks.neural import *
import neural_nlp
from neural_nlp.stimuli import StimulusSet
import xarray as xr

# (identifier, constructor) pairs of the benchmarks this notebook works with.
benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
]
# Re-bind the name to a dict keyed by identifier. `identifier` and `ctr` are passed as
# lambda default arguments so each entry keeps its own pair instead of the last loop value.
# NOTE(review): LazyLoad presumably defers calling the constructor until first use -- confirm.
benchmark_pool = {identifier: LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier))
                  for identifier, ctr in benchmark_pool}

# fetch stimulus set
benchmark = benchmark_pool['Pereira2018-encoding']
# The stimulus set is stored in the attrs of the benchmark's (private) target assembly.
stimuli_df = benchmark._target_assembly.attrs['stimulus_set']
# Last expression of the cell: displays the stimulus dataframe.
stimuli_df

Loading lookup from /om2/user/ckauf/anaconda/envs/perturbedenv/lib/python3.6/site-packages/brainio_collection/lookup.csv
/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/neural_nlp/../ressources/stimuli


 We're running in the NEW version of the implementations.py script.




  xr_data.set_index(append=True, inplace=True, **coords_d)


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [6]:
# benchmark content
# Only the last expression of a cell is displayed, so the shape has to be printed
# explicitly; as a bare expression its value was silently discarded.
print(benchmark._target_assembly.values.shape)
benchmark._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

In [10]:
# Build both directories from `importpath` (the directory resolved and chdir'ed into
# in the cell that defines it) instead of from the current working directory.
# This cell changes the working directory itself, so deriving the paths from
# os.getcwd() made a second execution point at a nested, non-existent folder.
stimuli_path = os.path.join(importpath, 'ressources/stimuli_creation')
os.chdir(stimuli_path)
print(os.getcwd())

# Output directory for the pickled, perturbed stimulus dataframes (sibling of stimuli_creation).
savedir = os.path.join(importpath, 'ressources/scrambled_stimuli_dfs')
print(savedir)
os.makedirs(savedir, exist_ok=True)

/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/ressources/stimuli_creation
/rdma/vast-rdma/vast/cpl/ckauf/perturbed-neural-nlp/ressources/scrambled_stimuli_dfs


# Create different perturbed versions of the benchmark

`Get the original dataset in the correct format. NOTE: "stim_243sentences_scrambled.txt" and "stim_384sentences_scrambled.txt" are created by running "get_original_sentenceset.ipynb".`

In [11]:
def get_original_sentenceset(filename):
    """
    Read a tab-separated stimulus file located in `stimuli_path` and return its sentences.

    Input:
    * filename: name of the file inside `stimuli_path`
    Output:
    * list with the second column of every row whose first column parses to the
      integer 0, each with a final period appended
      (presumably 0 marks the unscrambled condition -- TODO confirm against
      get_original_sentenceset.ipynb)
    """
    filepath = os.path.join(stimuli_path, filename)
    Original = []
    with open(filepath, "r") as f:
        for row in csv.reader(f, delimiter="\t"):
            if int(row[0]) == 0:
                Original.append(row[1] + '.')

    return Original

In [12]:
# Load the sentences of both stimulus files (243- and 384-sentence sets).
Original_243 = get_original_sentenceset("stim_243sentences_scrambled.txt")
Original_384 = get_original_sentenceset("stim_384sentences_scrambled.txt")

# Concatenate them: the 243-sentence set first, then the 384-sentence set.
Original = Original_243 + Original_384

## Ablation | O'Connor & Andreas (2021)

In [13]:
#This is the list of stimuli we create the perturbations from
# (an alias of `Original`, i.e. the 243- and 384-sentence sets concatenated above, not a copy)
original_stimuli = Original
# Preview the first five sentences.
print(original_stimuli[:5])

['beekeeping encourages the conservation of local habitats.', "it is in every beekeeper's interest to conserve local plants that produce pollen.", 'as a passive form of agriculture it does not require that native vegetation be cleared to make way for crops.', 'beekeepers also discourage the use of pesticides on crops because they could kill the honeybees.', 'artisanal beekeepers go to extremes for their craft but their product is worth the effort.']


In [14]:
#Source: https://stackoverflow.com/questions/49271730/how-to-parse-verbs-using-spacy
#!pip3 install spacy
import spacy.cli

# Load the English model; download it only when it is not installed yet, so that
# re-running the notebook does not hit the network every time.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:  # raised by spacy.load when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [20]:
# Tag the first sentence and show each token together with its coarse part-of-speech tag.
tokens = nlp(original_stimuli[0])
print([(token, token.pos_) for token in tokens])

[(beekeeping, 'VERB'), (encourages, 'VERB'), (the, 'DET'), (conservation, 'NOUN'), (of, 'ADP'), (local, 'ADJ'), (habitats, 'NOUN'), (., 'PUNCT')]


In [21]:
# Note: tokens do not include dependent parts such as possessive markers; e.g. the 's is split off as a separate PART token:

In [23]:
# List every sentence containing a PART token that starts with an apostrophe
# (e.g. the possessive 's), together with its full token/POS breakdown.
for ind, sent in enumerate(original_stimuli):
    tokens = nlp(sent)
    has_possessive_marker = any(
        tok.pos_ == "PART" and str(tok).startswith("'") for tok in tokens
    )
    if has_possessive_marker:
        print(ind, sent)
        print([(elm, elm.pos_) for elm in tokens])
        print("*"* 30)

1 it is in every beekeeper's interest to conserve local plants that produce pollen.
[(it, 'PRON'), (is, 'AUX'), (in, 'ADP'), (every, 'DET'), (beekeeper, 'NOUN'), ('s, 'PART'), (interest, 'NOUN'), (to, 'PART'), (conserve, 'VERB'), (local, 'ADJ'), (plants, 'NOUN'), (that, 'DET'), (produce, 'VERB'), (pollen, 'NOUN'), (., 'PUNCT')]
******************************
76 she uses digital animation to help doctors understand structures within a patient's body.
[(she, 'PRON'), (uses, 'VERB'), (digital, 'ADJ'), (animation, 'NOUN'), (to, 'PART'), (help, 'VERB'), (doctors, 'NOUN'), (understand, 'VERB'), (structures, 'NOUN'), (within, 'ADP'), (a, 'DET'), (patient, 'NOUN'), ('s, 'PART'), (body, 'NOUN'), (., 'PUNCT')]
******************************
115 while the tuxedo is still preferred today's grooms are making personal statements in wedding attire.
[(while, 'SCONJ'), (the, 'DET'), (tuxedo, 'NOUN'), (is, 'AUX'), (still, 'ADV'), (preferred, 'VERB'), (today, 'NOUN'), ('s, 'PART'), (grooms, 'NOUN'), (are

In [26]:
def get_perturbed_datasets(sentences,perturb_type):
    """
    Reduce every sentence to the words of the requested part-of-speech classes.

    Input:
    * sentences: original sentence list (already lower-cased and stripped from punctuation for permute_sentences.py script)
    * perturb_type = what should stay in the stimuli file?
        nouns: only nouns (tags NOUN, PROPN, PRON)
        verbs: only verbs
        nounsverbs: only nouns and verbs
        nounsverbsadj: only nouns, verbs and adjectives
        contentwords: nouns, verbs, adjectives and adverbs
        functionwords: every token that is NOT tagged as one of the content-word classes
    Output:
    * list of perturbed sentences (one per input sentence): lower-cased tokens joined by
      single spaces and terminated by a period
    Raises:
    * ValueError if perturb_type is none of the conditions above
    """

    n = ['NOUN', 'PROPN', 'PRON'] #same as in O'Connor & Andreas (2021)
    v = ['VERB']
    a = ['ADJ']
    adv = ['ADV']

    pos_lists = {
        'nouns': n,
        'verbs': v,
        'nounsverbs': n + v,
        'nounsverbsadj': n + v + a,
        'contentwords': n + v + a + adv,
        'functionwords': n + v + a + adv, #content-word tags; tokens carrying them are excluded below
    }
    # Fail early and explicitly: previously an unknown condition was only printed and the
    # function crashed later on the undefined POS list.
    if perturb_type not in pos_lists:
        raise ValueError(f"Unknown condition: {perturb_type!r}. Expected one of {sorted(pos_lists)}")
    pos_list = pos_lists[perturb_type]
    keep_listed_tags = perturb_type != "functionwords" #if some kind of content words

    perturbed_sentences = []
    for sent in sentences:
        tokens = nlp(sent)
        if keep_listed_tags:
            kept_words = [str(elm).lower() for elm in tokens if elm.pos_ in pos_list]
            perturbed_sentences.append(' '.join(kept_words) + ".")
        else:
            fn_sentence = ' '.join([str(elm).lower() for elm in tokens if elm.pos_ not in pos_list])
            fn_sentence = fn_sentence.rstrip(" .") + "." #add final period without space to avoid it being tokenized as a "new word"
            perturbed_sentences.append(fn_sentence)

    return perturbed_sentences

In [27]:
## function test

In [28]:
# Smoke test: run the 'nouns' condition on all stimuli and preview the first ten results.
noun_sentences = get_perturbed_datasets(original_stimuli,perturb_type='nouns')
noun_sentences[:10]

['conservation habitats.',
 'it beekeeper interest plants pollen.',
 'form agriculture it vegetation way crops.',
 'beekeepers use pesticides crops they honeybees.',
 'beekeepers extremes their craft their product effort.',
 'honey quality character quantity consistency.',
 'honey beekeepers micromanagers their honeybees.',
 'they fields nectar ways honey.',
 'beekeeper hive hum bees air.',
 'beekeeper honey stores pollen supplies bee nursery.']

In [31]:
def get_dataset(stimuli_df, original_stimuli, perturb_type):
    """
    Create the perturbed benchmark dataframe for one condition and pickle it.

    Input:
    * original benchmark dataframe
    * original stimuli (list of sentences)
    * perturbation type > what should stay in the stimuli?
    Output:
    * saves perturbed benchmark dataframe to save directory
      (as "stimuli_<perturb_type>.pkl" inside `savedir`)
    * returns the perturbed dataframe
    """
    perturbed_sentences = get_perturbed_datasets(original_stimuli,perturb_type)

    # Work on a copy so the original benchmark dataframe stays untouched.
    perturbed_df = stimuli_df.copy()
    perturbed_df["sentence"] = perturbed_sentences

    # A stray `else:` without a matching `if` used to precede this line (SyntaxError).
    fname = f"{savedir}/stimuli_{perturb_type}.pkl"
    with open(fname, 'wb') as fout:
        pickle.dump(perturbed_df, fout)
    return perturbed_df

## Create datasets (mostly O'Connor & Andreas (2021))

In [32]:
#loop over perturbation types to create O'Connor & Andreas (2021) datasets
# Every entry must be a condition name handled by get_perturbed_datasets.
perturb_types = ["contentwords", "nouns", "verbs", "nounsverbs", "nounsverbsadj", "functionwords"]

for perturb in perturb_types:
    # get_dataset pickles one dataframe per condition into `savedir` and returns it.
    perturb_df = get_dataset(stimuli_df, original_stimuli, perturb_type=perturb)
    print(f"Created dataset for perturbation type: {perturb}")
# Display the dataframe of the last condition in the list ("functionwords").
perturb_df

Created dataset for perturbation type: nouns_delete50percent
Created dataset for perturbation type: contentwords
Created dataset for perturbation type: nouns
Created dataset for perturbation type: verbs
Created dataset for perturbation type: nounsverbs
Created dataset for perturbation type: nounsverbsadj
Created dataset for perturbation type: functionwords


Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,the of.,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,is in every 's to that.,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,as a of does not that be to for.,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,the of on because could the.,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,to for but the.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,some to.,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a is a.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a is as a.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,a can and.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


## Create random noun replacement condition

In [32]:
def get_random_noun_dataset(sentences):
    """
    Input: list of original sentences
    Output: perturbed version of each sentence with only nouns (same number as in original sentence),
    but randomly drawn from dataset

    The nouns (tags NOUN, PROPN, PRON) of all sentences are pooled; every sentence then
    draws as many nouns from the pool as it originally contained, without replacement,
    so each noun occurrence of the dataset is used exactly once. Both random number
    generators are re-seeded with 42 on every call.

    Raises AssertionError if a sentence draws exactly its own set of nouns, or if the
    pool is not empty after all sentences have been served.
    """

    #set random seed for reproducibility
    np.random.seed(42)
    random.seed(42)

    pos_list = ['NOUN', 'PROPN', 'PRON'] #same as in O'Connor & Andreas (2021)

    #gather all nouns in a list
    all_nouns = []
    #keep track of which nouns were in which sentence (gives the number to draw and is used for checking later)
    nouns_in_sentences = []

    for sent in sentences:
        tokens = nlp(sent)
        curr_nouns = [str(elm).lower() for elm in tokens if elm.pos_ in pos_list]

        all_nouns += curr_nouns
        nouns_in_sentences.append(curr_nouns)

    perturbed_sentences = []
    for original_nouns in nouns_in_sentences:
        nr_nouns = len(original_nouns)
        random_nouns = random.sample(all_nouns, nr_nouns)
        #check that not the same nouns are selected; a sentence without nouns draws
        #nothing, which would trivially (and wrongly) trip this check
        if nr_nouns > 0:
            assert set(random_nouns) != set(original_nouns), \
                f"Sentence drew exactly its own nouns: {original_nouns}"
        perturbed_sentences.append(' '.join(random_nouns) + ".")
        #remove selected nouns from the pool (one occurrence per drawn noun)
        for noun in random_nouns:
            all_nouns.remove(noun)

    #every sentence drew exactly as many nouns as it contributed, so the pool must be empty now
    assert len(all_nouns) == 0, f"Not all words from the dataset have been used. Length of word list is {len(all_nouns)}!"

    return perturbed_sentences

# Target file for the random-noun (sampling without replacement) condition.
fname = f"{savedir}/stimuli_randomnouns_noreplacement.pkl"

random_nouns = get_random_noun_dataset(original_stimuli)

# Copy of the benchmark stimulus dataframe with the sentence column swapped for the random-noun strings.
perturbed_df = stimuli_df.copy()
perturbed_df["sentence"] = random_nouns

with open(fname, 'wb') as fout:
    pickle.dump(perturbed_df, fout)
perturbed_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,goods classes.,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,we i history owl stylist.,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,evening mammals pipe people its part.,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,i conditions horns bears coconuts firearms.,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,cycles infantry music bow lizards water night.,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,smell interpretation ingredients.,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,storm repair.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,clothing desk.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,cultures smell.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [33]:
#check that the nouns and random nouns datasets have the same number of words in every sentence:
with open(os.path.join(savedir,'stimuli_randomnouns_noreplacement.pkl'), "rb") as f:
    check_randomnouns_df = pickle.load(f)
with open(os.path.join(savedir,'stimuli_nouns.pkl'), "rb") as f:
    check_nouns_df = pickle.load(f)

# Whitespace-separated word count per sentence, in dataframe order.
sent_len_randomnouns = [len(sentence.split()) for sentence in check_randomnouns_df["sentence"]]
sent_len_nouns = [len(sentence.split()) for sentence in check_nouns_df["sentence"]]

assert sent_len_randomnouns == sent_len_nouns
print("Done")

Done
