In [11]:
import json
import random

def label_to_string(label):
    """Return the class name for label 0, 1 or 2; any other value gives "unknown"."""
    names = dict(enumerate(("entailed", "neutral", "contradiction")))
    if label in names:
        return names[label]
    return "unknown"

def find_mismatches(file_path):
    """Collect the records whose gold label differs from the predicted label.

    Args:
        file_path: Path to a JSON Lines file (one JSON object per line).
            ``label`` and ``predicted_label`` are read from every record;
            ``premise``, ``hypothesis`` and ``predicted_scores`` are read only
            from records where the two labels differ.

    Returns:
        A list with one dict per mismatching record, in file order. Each dict
        has the keys ``label`` and ``predicted_label`` (converted with
        ``label_to_string``), plus ``premise``, ``hypothesis`` and
        ``predicted_scores`` copied from the record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys read from it.
    """
    mismatches = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue

            data = json.loads(line)

            # Compare the raw values; strings are only built for mismatches.
            if data['label'] == data['predicted_label']:
                continue

            mismatches.append({
                'label': label_to_string(data['label']),
                'predicted_label': label_to_string(data['predicted_label']),
                'premise': data['premise'],
                'hypothesis': data['hypothesis'],
                'predicted_scores': data['predicted_scores']
            })

    return mismatches

# Inputs for the error-analysis sample printed below.
EVAL_PREDICTIONS_PATH = 'eval_output/eval_predictions.jsonl'
SAMPLE_SIZE = 25
SAMPLE_SEED = 42

mismatches = find_mismatches(EVAL_PREDICTIONS_PATH)

# Fixed seed so the same mismatches are drawn on every run.
random.seed(SAMPLE_SEED)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of mismatches found.
sampled_mismatches = random.sample(mismatches, min(SAMPLE_SIZE, len(mismatches)))

for mismatch in sampled_mismatches:
    print(mismatch)

{'label': 'neutral', 'predicted_label': 'contradiction', 'premise': 'A woman walking, smiling and holding a shopping bag.', 'hypothesis': 'Some women go shopping, while men stay at home.', 'predicted_scores': [-4.604426383972168, -0.18738415837287903, 3.454765558242798]}
{'label': 'neutral', 'predicted_label': 'entailed', 'premise': 'A man is sitting outside wearing blue pants.', 'hypothesis': 'The person is on the ground.', 'predicted_scores': [3.0150976181030273, -0.592652440071106, -2.602357864379883]}
{'label': 'neutral', 'predicted_label': 'entailed', 'premise': "A baby in an indoor pool is using an inflatable tube on it's on", 'hypothesis': 'The baby is swimming', 'predicted_scores': [0.9003664255142212, 0.49155476689338684, -1.478476643562317]}
{'label': 'contradiction', 'predicted_label': 'neutral', 'premise': 'The person and the dog are sitting on the yellow canoe.', 'hypothesis': 'The man is using the dog as a paddle.', 'predicted_scores': [-0.9695155024528503, 1.525601983070

In [10]:
import json
from datasets import load_dataset
import nlpaug.augmenter.word as naw

# Load the validation split of the SNLI dataset; `dataset` is the source of the
# examples that are augmented further down in this cell.
dataset = load_dataset('snli', split='validation')

# Initialize the augmenter: word-level synonym replacement with WordNet as the
# synonym source (aug_src='wordnet').
# NOTE(review): presumably needs the NLTK WordNet corpus available locally - confirm.
aug = naw.SynonymAug(aug_src='wordnet')

def augment_text(text, aug, num_augments=1):
    """Call ``aug.augment(text)`` ``num_augments`` times and return the results as a list."""
    results = []
    for _ in range(num_augments):
        results.append(aug.augment(text))
    return results

# Number of leading validation examples to augment.
NUM_EXAMPLES = 1000
# Set to True to also write each unmodified source example before its
# augmented copies.
INCLUDE_ORIGINAL = False
# SNLI uses label -1 for examples that have no gold label; those carry no
# usable class and are skipped.
NO_GOLD_LABEL = -1


def _first_text(augmented, fallback):
    """Return a single string from one ``aug.augment`` result.

    The result may be a list of strings or a bare string depending on the
    nlpaug release (NOTE(review): confirm against the installed version).
    Indexing a bare string with ``[0]`` would keep only its first character,
    and an empty list has no first element, so both cases are handled here.
    ``fallback`` is returned when the result is empty.
    """
    if isinstance(augmented, str):
        return augmented
    return augmented[0] if augmented else fallback


# Write the augmented examples as JSON Lines.
with open('augmented_snli.jsonl', 'w', encoding='utf-8') as file:
    # Augment the first NUM_EXAMPLES examples (fewer if the split is smaller).
    for example in dataset.select(range(min(NUM_EXAMPLES, len(dataset)))):
        if example['label'] == NO_GOLD_LABEL:
            continue

        # Augment premise and hypothesis
        augmented_premises = augment_text(example['premise'], aug)
        augmented_hypotheses = augment_text(example['hypothesis'], aug)

        # Optionally write the original example
        if INCLUDE_ORIGINAL:
            original_example = {
                'premise': example['premise'],
                'hypothesis': example['hypothesis'],
                'label': example['label']
            }
            file.write(json.dumps(original_example) + '\n')

        # Write each augmented example, keeping the source example's label
        for aug_premise, aug_hypothesis in zip(augmented_premises, augmented_hypotheses):
            augmented_example = {
                'premise': _first_text(aug_premise, example['premise']),
                'hypothesis': _first_text(aug_hypothesis, example['hypothesis']),
                'label': example['label']
            }
            file.write(json.dumps(augmented_example) + '\n')
