In [1]:
import json
import os

from checklist.expect import Expect
from checklist.test_types import MFT, INV, DIR
from checklist.pred_wrapper import PredictorWrapper
from allennlp_models.pretrained import load_predictor

from utils import extract_test_case_and_target, format_srl, compare_spans, compare_spans_loc, compare_spans_voice, get_tag

In [2]:
# Load the two pretrained SRL predictors that every test cell below relies on.
# The identifiers are resolved in order: the first yields BiLSTM, the second BERT.
BiLSTM, BERT = map(
    load_predictor,
    ('structured-prediction-srl', 'structured-prediction-srl-bert'),
)

error loading _jsonnet (this is expected on Windows), treating C:\Users\13610\AppData\Roaming\Python\Python39\site-packages\allennlp_models\modelcards\coref-spanbert.json as plain json
error loading _jsonnet (this is expected on Windows), treating C:\Users\13610\AppData\Roaming\Python\Python39\site-packages\allennlp_models\modelcards\evaluate_rc-lerc.json as plain json
lerc is not a registered model.
error loading _jsonnet (this is expected on Windows), treating C:\Users\13610\AppData\Roaming\Python\Python39\site-packages\allennlp_models\modelcards\generation-bart.json as plain json
error loading _jsonnet (this is expected on Windows), treating C:\Users\13610\AppData\Roaming\Python\Python39\site-packages\allennlp_models\modelcards\glove-sst.json as plain json
error loading _jsonnet (this is expected on Windows), treating C:\Users\13610\AppData\Roaming\Python\Python39\site-packages\allennlp_models\modelcards\lm-masked-language-model.json as plain json
error loading _jsonnet (this is exp

## Test two SRL models (BiLSTM/BERT) on the Causative Alternation capability

In [3]:
# Load the causative-alternation challenge set. The file is read explicitly as
# UTF-8 (the same encoding used for the output files below) so the result does
# not depend on the platform's default encoding.
with open('data/causative_alternation.json', encoding='utf-8') as f:
    data = json.load(f)
    causative_alternation_list = data['data']
causative_alternation_dict = extract_test_case_and_target(causative_alternation_list)

# The expectation does not depend on the model, so it is built once.
expectation = Expect.pairwise(compare_spans)

# Run the same INV test against both SRL models and store each model's output.
for model_name, srl_predictor in {'BiLSTM': BiLSTM, 'BERT': BERT}.items():
    test = INV(**causative_alternation_dict, expect=expectation)

    def predict_srl(inputs):
        """Return one prediction of the current `srl_predictor` per input, in order."""
        # `srl_predictor` is looked up at call time; this function is only used
        # within the current loop iteration, so it always sees the current model.
        return [srl_predictor.predict(item) for item in inputs]

    predict_and_conf = PredictorWrapper.wrap_predict(predict_srl)
    test.run(predict_and_conf)
    print(f"===== Model: {model_name}, test: Causative alternation =====")
    test.summary(format_example_fn=format_srl)
    print()

    # Build the per-model output from *copies* of the loaded records, so the
    # source list is left untouched and the two models never share dict objects.
    # enumerate() supplies each record's position directly instead of searching
    # for it with list.index(), which is quadratic and compares whole dicts.
    # NOTE(review): the trailing get_tag arguments are kept exactly as they were;
    # their meaning is defined by utils.get_tag - confirm there.
    output = []
    for case_idx, example in enumerate(causative_alternation_list):
        record = dict(example)
        record['model'] = model_name
        record['prediction'] = (get_tag(test, case_idx, 0, 0, 2),
                                get_tag(test, case_idx, 1, 0, 1))
        output.append(record)

    # Write the system output to a JSON file, creating the target directory
    # first so the cell also works on a fresh checkout.
    system_output = {'output': output}
    filename = f"output/causative alternation/{model_name}.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(system_output, f, ensure_ascii=False, indent=4)

Predicting 40 examples
===== Model: BiLSTM, test: Causative alternation =====
Test cases:      20
Fails (rate):    15 (75.0%)

Example fails:
[ARG0: The chef] [V: caramelized] [ARG1: the onions] .
[ARG0: The onions] [V: caramelized] [ARGM-TMP: last night] .

----
[ARG0: The chef] [V: fermented] [ARG1: the meat] .
[ARG0: The meat] [V: fermented] [ARGM-TMP: last night] .

----
[ARG0: The chef] [V: fermented] [ARG1: the tomatoes] .
[ARG0: The tomatoes] [V: fermented] [ARGM-TMP: last night] .

----

Predicting 40 examples
===== Model: BERT, test: Causative alternation =====
Test cases:      20
Fails (rate):    4 (20.0%)

Example fails:
[ARG0: The chef] [V: toasted] [ARG1: the tomatoes] .
[ARG0: The tomatoes] [V: toasted] [ARGM-TMP: last night] .

----
[ARG0: The chef] [V: toasted] [ARG1: the onions] .
[ARG0: The onions] [V: toasted] [ARGM-TMP: last night] .

----
[ARG0: The chef] [V: toasted] [ARG1: the meat] .
[ARG0: The meat] [V: toasted] [ARGM-TMP: last night] .

----



## Test two SRL models (BiLSTM/BERT) on the Long Distance Dependencies capability

In [4]:
# Load the long-distance-dependencies challenge set. The file is read explicitly
# as UTF-8 (the same encoding used for the output files below) so the result
# does not depend on the platform's default encoding.
with open('data/long_distance_dependencies.json', encoding='utf-8') as f:
    data = json.load(f)
    long_distance_dependencies_list = data['data']
long_distance_dependencies_dict = extract_test_case_and_target(long_distance_dependencies_list)

# The expectation does not depend on the model, so it is built once.
expectation = Expect.pairwise(compare_spans)

# Run the same INV test against both SRL models and store each model's output.
for model_name, srl_predictor in {'BiLSTM': BiLSTM, 'BERT': BERT}.items():
    test = INV(**long_distance_dependencies_dict, expect=expectation)

    def predict_srl(inputs):
        """Return one prediction of the current `srl_predictor` per input, in order."""
        # `srl_predictor` is looked up at call time; this function is only used
        # within the current loop iteration, so it always sees the current model.
        return [srl_predictor.predict(item) for item in inputs]

    predict_and_conf = PredictorWrapper.wrap_predict(predict_srl)
    test.run(predict_and_conf)
    print(f"===== Model: {model_name}, test: Long Distance Dependencies =====")
    test.summary(format_example_fn=format_srl)
    print()

    # Build the per-model output from *copies* of the loaded records, so the
    # source list is left untouched and the two models never share dict objects.
    # enumerate() supplies each record's position directly instead of searching
    # for it with list.index(), which is quadratic and compares whole dicts.
    # NOTE(review): the trailing get_tag arguments are kept exactly as they were;
    # their meaning is defined by utils.get_tag - confirm there.
    output = []
    for case_idx, example in enumerate(long_distance_dependencies_list):
        record = dict(example)
        record['model'] = model_name
        record['prediction'] = (get_tag(test, case_idx, 0, 0, 7),
                                get_tag(test, case_idx, 1, -1, -1))
        output.append(record)

    # Write the system output to a JSON file, creating the target directory
    # first so the cell also works on a fresh checkout.
    system_output = {'output': output}
    filename = f"output/long distance dependencies/{model_name}.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(system_output, f, ensure_ascii=False, indent=4)

Predicting 20 examples
===== Model: BiLSTM, test: Long Distance Dependencies =====
Test cases:      10
Fails (rate):    10 (100.0%)

Example fails:
[ARG0: The musician] [V: tapped] [ARG1: the drum] [ARGM-MNR: with a drumstick] .
[ARG1: The musician] [R-ARG1: whose voice] [V: was] [ARG2: like honey tapped the drum with a drumstick] .

----
[ARG0: The musician] [V: tapped] [ARG1: the drum] [ARGM-MNR: with a drumstick] .
The musician whose music [V: had] inspired a generation tapped the drum with a drumstick .

----
[ARG0: The musician] [V: tapped] [ARG1: the drum] [ARGM-MNR: with a drumstick] .
[ARG0: The musician] [R-ARG0: who] [V: had] [ARG1: a loyal fan base] tapped the drum with a drumstick .

----

Predicting 20 examples
===== Model: BERT, test: Long Distance Dependencies =====
Test cases:      10
Fails (rate):    10 (100.0%)

Example fails:
[ARG0: The musician] [V: tapped] [ARG1: the drum] [ARG2: with a drumstick] .
The musician who [V: had] traveled from another country tapped the

## Test two SRL models (BiLSTM/BERT) on the Location Modifiers capability

In [5]:
# Load the location-modifiers challenge set. The file is read explicitly as
# UTF-8 (the same encoding used for the output files below) so the result does
# not depend on the platform's default encoding.
with open('data/location_modifiers.json', encoding='utf-8') as f:
    data = json.load(f)
    location_modifiers_list = data['data']
location_modifiers_dict = extract_test_case_and_target(location_modifiers_list)

# The expectation does not depend on the model, so it is built once.
expectation = Expect.pairwise(compare_spans_loc)

# Run the same DIR test against both SRL models and store each model's output.
for model_name, srl_predictor in {'BiLSTM': BiLSTM, 'BERT': BERT}.items():
    test = DIR(**location_modifiers_dict, expect=expectation)

    def predict_srl(inputs):
        """Return one prediction of the current `srl_predictor` per input, in order."""
        # `srl_predictor` is looked up at call time; this function is only used
        # within the current loop iteration, so it always sees the current model.
        return [srl_predictor.predict(item) for item in inputs]

    predict_and_conf = PredictorWrapper.wrap_predict(predict_srl)
    test.run(predict_and_conf)
    print(f"===== Model: {model_name}, test: Location modifiers =====")
    test.summary(format_example_fn=format_srl)
    print()

    # Build the per-model output from *copies* of the loaded records, so the
    # source list is left untouched and the two models never share dict objects.
    # enumerate() supplies each record's position directly instead of searching
    # for it with list.index(), which is quadratic and compares whole dicts.
    # NOTE(review): the trailing get_tag arguments are kept exactly as they were;
    # their meaning is defined by utils.get_tag - confirm there.
    output = []
    for case_idx, example in enumerate(location_modifiers_list):
        record = dict(example)
        record['model'] = model_name
        record['prediction'] = (get_tag(test, case_idx, 0, 0, -2),
                                get_tag(test, case_idx, 1, 0, -2))
        output.append(record)

    # Write the system output to a JSON file, creating the target directory
    # first so the cell also works on a fresh checkout.
    system_output = {'output': output}
    filename = f"output/location modifiers/{model_name}.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(system_output, f, ensure_ascii=False, indent=4)

Predicting 20 examples
===== Model: BiLSTM, test: Location modifiers =====
Test cases:      10
Fails (rate):    9 (90.0%)

Example fails:
[ARG0: He] [V: works] [ARGM-LOC: in Sydney] .
[ARG0: He] [V: works] [ARGM-LOC: in government] .

----
[ARG0: He] [V: works] [ARGM-LOC: in Hong Kong] .
[ARG0: He] [V: works] [ARGM-LOC: in manufacturing] .

----
[ARG0: He] [V: works] [ARGM-LOC: in Singapore] .
[ARG0: He] [V: works] [ARGM-LOC: in real estate] .

----

Predicting 20 examples
===== Model: BERT, test: Location modifiers =====
Test cases:      10
Fails (rate):    3 (30.0%)

Example fails:
[ARG0: He] [V: works] [ARGM-LOC: in Sydney] .
[ARG0: He] [V: works] [ARGM-LOC: in government] .

----
[ARG0: He] [V: works] [ARGM-LOC: in Tokyo] .
[ARG0: He] [V: works] [ARGM-LOC: in retail] .

----
[ARG0: He] [V: works] [ARGM-LOC: in London] .
[ARG0: He] [V: works] [ARGM-LOC: in healthcare] .

----



## Test two SRL models (BiLSTM/BERT) on the Voice capability

In [6]:
# Load the voice challenge set. The file is read explicitly as UTF-8 (the same
# encoding used for the output files below) so the result does not depend on
# the platform's default encoding.
with open('data/voice.json', encoding='utf-8') as f:
    data = json.load(f)
    voice_list = data['data']
voice_dict = extract_test_case_and_target(voice_list)

# The expectation does not depend on the model, so it is built once.
expectation = Expect.pairwise(compare_spans_voice)

# Run the same INV test against both SRL models and store each model's output.
for model_name, srl_predictor in {'BiLSTM': BiLSTM, 'BERT': BERT}.items():
    test = INV(**voice_dict, expect=expectation)

    def predict_srl(inputs):
        """Return one prediction of the current `srl_predictor` per input, in order."""
        # `srl_predictor` is looked up at call time; this function is only used
        # within the current loop iteration, so it always sees the current model.
        return [srl_predictor.predict(item) for item in inputs]

    predict_and_conf = PredictorWrapper.wrap_predict(predict_srl)
    test.run(predict_and_conf)
    print(f"===== Model: {model_name}, test: Voice =====")
    test.summary(format_example_fn=format_srl)
    print()

    # Build the per-model output from *copies* of the loaded records, so the
    # source list is left untouched and the two models never share dict objects.
    # enumerate() supplies each record's position directly instead of searching
    # for it with list.index(), which is quadratic and compares whole dicts.
    # NOTE(review): the trailing get_tag arguments are kept exactly as they were;
    # their meaning is defined by utils.get_tag - confirm there.
    output = []
    for case_idx, example in enumerate(voice_list):
        record = dict(example)
        record['model'] = model_name
        record['prediction'] = (get_tag(test, case_idx, 0, 0, 3),
                                get_tag(test, case_idx, 1, 1, 1))
        output.append(record)

    # Write the system output to a JSON file, creating the target directory
    # first so the cell also works on a fresh checkout.
    system_output = {'output': output}
    filename = f"output/voice/{model_name}.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(system_output, f, ensure_ascii=False, indent=4)

Predicting 40 examples
===== Model: BiLSTM, test: Voice =====
Test cases:      20
Fails (rate):    0 (0.0%)

Predicting 40 examples
===== Model: BERT, test: Voice =====
Test cases:      20
Fails (rate):    0 (0.0%)



## Test two SRL models (BiLSTM/BERT) on the Robustness capability

In [7]:
# Load the robustness challenge set. The file is read explicitly as UTF-8 (the
# same encoding used for the output files below) so the result does not depend
# on the platform's default encoding.
with open('data/robustness.json', encoding='utf-8') as f:
    data = json.load(f)
    robustness_list = data['data']
robustness_dict = extract_test_case_and_target(robustness_list)

# The expectation does not depend on the model, so it is built once.
expectation = Expect.pairwise(compare_spans)

# Run the same INV test against both SRL models and store each model's output.
for model_name, srl_predictor in {'BiLSTM': BiLSTM, 'BERT': BERT}.items():
    test = INV(**robustness_dict, expect=expectation)

    def predict_srl(inputs):
        """Return one prediction of the current `srl_predictor` per input, in order."""
        # `srl_predictor` is looked up at call time; this function is only used
        # within the current loop iteration, so it always sees the current model.
        return [srl_predictor.predict(item) for item in inputs]

    predict_and_conf = PredictorWrapper.wrap_predict(predict_srl)
    test.run(predict_and_conf)
    print(f"===== Model: {model_name}, test: Robustness =====")
    test.summary(format_example_fn=format_srl)
    print()

    # Build the per-model output from *copies* of the loaded records, so the
    # source list is left untouched and the two models never share dict objects.
    # enumerate() supplies each record's position directly instead of searching
    # for it with list.index(), which is quadratic and compares whole dicts.
    # NOTE(review): the trailing get_tag arguments are kept exactly as they were;
    # their meaning is defined by utils.get_tag - confirm there.
    output = []
    for case_idx, example in enumerate(robustness_list):
        record = dict(example)
        record['model'] = model_name
        record['prediction'] = (get_tag(test, case_idx, 0, 0, 1),
                                get_tag(test, case_idx, 1, 0, 1))
        output.append(record)

    # Write the system output to a JSON file, creating the target directory
    # first so the cell also works on a fresh checkout.
    system_output = {'output': output}
    filename = f"output/robustness/{model_name}.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(system_output, f, ensure_ascii=False, indent=4)

Predicting 200 examples
===== Model: BiLSTM, test: Robustness =====
Test cases:      100
Fails (rate):    25 (25.0%)

Example fails:
[ARG0: The boy] [V: played] [ARGM-MNR: with his friends] .
The boyz playd with his frands .

----
[ARG0: The boy] [V: played] [ARGM-MNR: with his friends] .
The bly playd with his frinds .

----
[ARG0: The boy] [V: played] [ARGM-MNR: with his friends] .
The boi playd with his frands .

----

Predicting 200 examples
===== Model: BERT, test: Robustness =====
Test cases:      100
Fails (rate):    32 (32.0%)

Example fails:
[ARG0: The boy] [V: played] [ARGM-COM: with his friends] .
The boy playd with his frinds .

----
[ARG0: The boy] [V: played] [ARGM-COM: with his friends] .
The boy playd with his friends .

----
[ARG0: The boy] [V: played] [ARGM-COM: with his friends] .
The bly playd with his frands .

----

