In [1]:
import json
import os

## Causative alternation

In [2]:
# Build sentence pairs for the "causative_alternation" capability: every
# (object, verb) combination yields a transitive sentence and its
# intransitive counterpart, and the object is the target word in both.
verbs = ["caramelized", "fermented", "roasted", "toasted", "simmered"]
objects = ["onions", "tomatoes", "potatoes", "meat"]

# One transitive sentence per (object, verb) combination
# (len(objects) * len(verbs) sentences, i.e. 20 with the lists above)
transitive = [f'The chef {verb} the {obj}.' for obj in objects for verb in verbs]

# The same combinations, in the same order, rewritten by causative alternation
intransitive = [f'The {obj} {verb} last night.' for obj in objects for verb in verbs]

# Target word of each pair: the object, repeated once per verb so that it
# lines up index-by-index with the two sentence lists above
counting = [obj for obj in objects for _verb in verbs]

# The three lists are consumed in parallel; fail loudly if they ever diverge
assert len(transitive) == len(intransitive) == len(counting)

# Collect the key information of each pair of sentences
data = []
for sent, alternated, target_word in zip(transitive, intransitive, counting):
    example = {'capability': "causative_alternation",
               'test_type': "MFT",
               'test_case': (sent, alternated),
               'target': (target_word, target_word),
               'expected_label': "ARG1"}
    data.append(example)

# Save as JSON file for the MFT test
causative_alternation = {'data': data}
output_path = "data/causative_alternation.json"
# Create the output directory first so a fresh checkout does not fail in open()
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(causative_alternation, f, ensure_ascii=False, indent=4)

## Long Distance Dependencies

In [3]:
# Build sentence pairs for the "long_distance_dependencies" capability: the
# same base sentence with and without a relative clause inserted after the
# subject. The target word "drumstick" is identical in both sentences.
embedded_clauses = ["whose voice was like honey",
                    "who had just won a Grammy",
                    "who had traveled from another country",
                    "who was known for his improvisation skills",
                    "who had played with many famous musicians",
                    "who had just released a new album",
                    "who had studied music at Juilliard",
                    "who was known for his stage presence",
                    "whose music had inspired a generation",
                    "who had a loyal fan base"]

# Create sentences with embedded clauses
sentence_with_clause = [f'The musician {clause} tapped the drum with a drumstick.' for clause in embedded_clauses]

# Create original sentences: one copy per clause, so the two lists always
# stay the same length when clauses are added or removed
original_sentence = ['The musician tapped the drum with a drumstick.'] * len(sentence_with_clause)

# Collect the key information of each pair of sentences
data = []
for sent, extended in zip(original_sentence, sentence_with_clause):
    example = {'capability': "long_distance_dependencies",
               'test_type': "INV",
               'test_case': (sent, extended),
               'target': ("drumstick", "drumstick"),
               'expected_label': "ARG2"}
    data.append(example)

# Save as JSON file for the INV test
long_distance_dependencies = {'data': data}
output_path = "data/long_distance_dependencies.json"
# Create the output directory first so a fresh checkout does not fail in open()
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(long_distance_dependencies, f, ensure_ascii=False, indent=4)

## Location modifiers

In [4]:
# Build sentence pairs for the "location_modifiers" capability: the same
# frame "He works in ..." is filled once with a place (expected label
# ARGM-LOC) and once with an occupation field (expected label ARG2).
locations = ['Amsterdam', 'New York', 'London', 'Paris', 'Tokyo',
             'Sydney', 'Dubai', 'Hong Kong', 'Mumbai', 'Singapore']

occupations = ['finance', 'marketing', 'healthcare', 'education', 'retail',
               'government', 'advertising', 'manufacturing', 'consulting', 'real estate']

# Locations and occupations are paired by position, so the lists must match in length
assert len(locations) == len(occupations)

# Create one sentence with ARGM-LOC per location
argm_loc = [f'He works in {location}.' for location in locations]

# Change the sentences above so the filler is an ARG2 instead
arg_2 = [f'He works in {occupation}.' for occupation in occupations]

# Collect the key information of each pair of sentences
data = []
for sent, changed, location, occupation in zip(argm_loc, arg_2, locations, occupations):
    example = {'capability': "location_modifiers",
               'test_type': "DIR",
               'test_case': (sent, changed),
               'target': (location, occupation),
               'expected_label': ("ARGM-LOC", "ARG2")}
    data.append(example)

# Save as JSON file for the DIR test
location_modifiers = {'data': data}
output_path = "data/location_modifiers.json"
# Create the output directory first so a fresh checkout does not fail in open()
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(location_modifiers, f, ensure_ascii=False, indent=4)

## Voice

In [5]:
# Build sentence pairs for the "voice" capability: each active sentence is
# paired with its passive counterpart, and "box" is the target word in both.
subjects = ['Alice', 'Bob']
verbs = ["burned", "dropped", "opened", "closed", "found", "kicked", "painted", "moved", "flipped", "picked"]

# Create one active sentence per (subject, verb) combination
# (len(subjects) * len(verbs) sentences, i.e. 20 with the lists above)
active_sentence = [f'{subject} {verb} the box.' for subject in subjects for verb in verbs]

# Rewrite the same combinations, in the same order, as passive sentences
passive_sentence = [f'The box was {verb} by {subject}.' for subject in subjects for verb in verbs]

# Both lists are consumed in parallel; fail loudly if they ever diverge
assert len(active_sentence) == len(passive_sentence)

# Collect the key information of each pair of sentences
data = []
for sent, passive in zip(active_sentence, passive_sentence):
    example = {'capability': "voice",
               'test_type': "MFT",
               'test_case': (sent, passive),
               'target': ("box", "box"),
               'expected_label': "ARG1"}
    data.append(example)

# Save as JSON file for the MFT test
voice = {'data': data}
output_path = "data/voice.json"
# Create the output directory first so a fresh checkout does not fail in open()
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(voice, f, ensure_ascii=False, indent=4)

## Robustness

In [6]:
# Build sentence pairs for the "robustness" capability: the clean sentence is
# paired with every combination of the spelling variants below. The target is
# "boy" in the clean sentence and the variant used for it in the noisy one.
Boy = ["boi", "boyz", "bly", "bey", "boy"]
Played = ["pleyed", "playd", "plaied", "plyed"]
Friends = ["frends", "freinds", "frinds", "frands", "friends"]

# Replace some words in the original sentence with typos or misspellings:
# one sentence per (boy, played, friends) combination
typos = [f'The {boy} {played} with his {friends}.' for boy in Boy for played in Played for friends in Friends]

# Variant of "boy" used in each noisy sentence, in the same order as `typos`
counting = [boy for boy in Boy for _played in Played for _friends in Friends]

# Create original sentences: one clean copy per noisy sentence, so the count
# follows the variant lists instead of a hand-maintained constant
sentence = ['The boy played with his friends.'] * len(typos)

# The lists are consumed in parallel; fail loudly if they ever diverge
assert len(sentence) == len(typos) == len(counting)

# Collect the key information of each pair of sentences
data = []
for sent, noisy, variant in zip(sentence, typos, counting):
    example = {'capability': "robustness",
               'test_type': "INV",
               'test_case': (sent, noisy),
               'target': ("boy", variant),
               'expected_label': "ARG0"}
    data.append(example)

# Save as JSON file for the INV test
robustness = {'data': data}
output_path = "data/robustness.json"
# Create the output directory first so a fresh checkout does not fail in open()
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(robustness, f, ensure_ascii=False, indent=4)