In [None]:
# -*- coding: utf-8 -*-
"""WinoGrande.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1h4vxsLK2m-K62-qMLfKqF7jiZ3AlTYRZ
"""

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import torch.nn.functional as F
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "EleutherAI/gpt-neo-1.3B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16
).to(device)

model.eval()

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPTNeoBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          )
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj):

In [None]:
def forward_with_skipped_layers(model, input_ids, attention_mask, skip_layers):
    """
    Performs a forward pass through GPT-Neo while skipping the transformer
    layers specified in skip_layers.
    """
    # Embeddings
    hidden_states = model.transformer.wte(input_ids)
    position_ids = torch.arange(input_ids.shape[1], device=device).unsqueeze(0)
    hidden_states = hidden_states + model.transformer.wpe(position_ids)

    seq_len = attention_mask.shape[1]
    batch_size = input_ids.shape[0]

    causal_mask = torch.tril(torch.ones((seq_len, seq_len), device=device)).view(
        1, 1, seq_len, seq_len
    )
    attention_mask_4d = attention_mask.view(batch_size, 1, 1, seq_len)
    combined_mask = causal_mask * attention_mask_4d
    combined_mask = (1.0 - combined_mask) * torch.finfo(torch.float16).min

    # Transformer layers
    for idx, layer in enumerate(model.transformer.h):
        if idx in skip_layers:
            continue
        hidden_states = layer(hidden_states, attention_mask=combined_mask)[0]

    # Final layer norm + LM head
    hidden_states = model.transformer.ln_f(hidden_states)
    logits = model.lm_head(hidden_states)
    return logits

In [None]:
def winogrande_prompt(sentence, option):
    """
    Insert the option into the blank (_) in the Winogrande sentence.
    """
    return sentence.replace("_", option)


def evaluate_winogrande(model, tokenizer, skip_layers, sample_size=256):
    """
    Evaluates GPT-Neo on Winogrande using perplexity comparison.
    """
    dataset = load_dataset("winogrande", "winogrande_xl", split="validation")
    dataset = dataset.select(range(sample_size))

    correct = 0

    for example in dataset:
        sentence = example["sentence"]
        option1 = example["option1"]
        option2 = example["option2"]
        label = example["answer"]  # "1" or "2"

        # Make two filled-in sentences
        s1 = winogrande_prompt(sentence, option1)
        s2 = winogrande_prompt(sentence, option2)

        # Tokenize
        inputs1 = tokenizer(s1, return_tensors="pt").to(device)
        inputs2 = tokenizer(s2, return_tensors="pt").to(device)

        with torch.no_grad():
            logits1 = forward_with_skipped_layers(
                model, inputs1.input_ids, inputs1.attention_mask, skip_layers
            )
            logits2 = forward_with_skipped_layers(
                model, inputs2.input_ids, inputs2.attention_mask, skip_layers
            )

        # Compute sentence losses (negative log-likelihood)
        def compute_loss(logits, labels):
            shift_logits = logits[:, :-1].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                reduction="mean",
            )
            return loss.item()

        loss1 = compute_loss(logits1, inputs1.input_ids)
        loss2 = compute_loss(logits2, inputs2.input_ids)

        # Model chooses the option with *lower* loss
        pred = "1" if loss1 < loss2 else "2"
        # print()
        # print(f"Option 1: {s1}")
        # print(f"Option 2: {s2}")
        # print(f"Option {label} is correct. The model found option {pred} more likely.")
        if pred == label:
            correct += 1

    accuracy = correct / sample_size
    return accuracy

In [None]:
def hellaswag_prompt(context, ending):
    """
    Combine context + ending for HellaSwag.
    """
    # You can customize how you join context + ending; simplest:
    return context + " " + ending


def evaluate_hellaswag(model, tokenizer, skip_layers, sample_size=256):
    """
    Evaluates GPT-Neo on HellaSwag using perplexity comparison.
    """
    dataset = load_dataset("hellaswag", split="validation")
    dataset = dataset.select(range(sample_size))

    correct = 0

    for example in dataset:
        context = example["ctx"]
        endings = example["endings"]  # list of 4 endings
        label = int(example["label"])     # integer 0..3

        losses = []
        for ending in endings:
            sentence = hellaswag_prompt(context, ending)
            # print(sentence)
            inputs = tokenizer(sentence, return_tensors="pt").to(device)

            with torch.no_grad():
                logits = forward_with_skipped_layers(
                    model, inputs.input_ids, inputs.attention_mask, skip_layers
                )

            # Compute negative log-likelihood
            shift_logits = logits[:, :-1].contiguous()
            shift_labels = inputs.input_ids[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                reduction="mean",
            )
            losses.append(loss.item())

        # Model chooses the ending with the lowest loss
        pred = int(torch.argmin(torch.tensor(losses)))
        if pred == label:
            correct += 1

    accuracy = correct / sample_size
    return accuracy

In [None]:
def boolq_prompt(passage, question, answer_text):
    """
    Combine passage + question + answer candidate as a prompt.
    We'll score the model's likelihood of the answer_text being correct.
    """
    # Simple prompt template
    return f"Passage: {passage}\nQuestion: {question}\nAnswer: {answer_text}"

def evaluate_boolq(model, tokenizer, skip_layers=[], sample_size=256):
    """
    Evaluate GPT-Neo on BoolQ using likelihood comparison.
    """
    dataset = load_dataset("boolq", split="validation")
    dataset = dataset.select(range(sample_size))

    correct = 0

    for example in dataset:
        passage = example["passage"]
        question = example["question"]
        label = example["answer"]  # True/False

        # Candidate options
        candidates = ["True", "False"]

        losses = []
        for option in candidates:
            prompt = boolq_prompt(passage, question, option)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

            with torch.no_grad():
                logits = forward_with_skipped_layers(
                    model, inputs.input_ids, inputs.attention_mask, skip_layers
                )

            # Compute negative log-likelihood
            shift_logits = logits[:, :-1].contiguous()
            shift_labels = inputs.input_ids[:, 1:].contiguous()
            mask = shift_labels != tokenizer.pad_token_id
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1))[mask.view(-1)],
                shift_labels.view(-1)[mask.view(-1)],
                reduction="mean",
            )
            losses.append(loss.item())

        # Model predicts the option with the lowest loss
        pred_idx = int(torch.argmin(torch.tensor(losses)))
        pred = True if candidates[pred_idx] == "True" else False

        if pred == label:
            correct += 1

    accuracy = correct / sample_size
    return accuracy


In [None]:
import json
from tqdm import tqdm

def greedy_layer_pruning(
    model,
    tokenizer,
    evaluate_fn,        # function: (model, tokenizer, skip_layers, sample_size) -> accuracy
    max_layers_to_remove=6,
    num_samples=128,
    output_file=None
):
    """
    Greedy layer pruning: iteratively remove layers that hurt accuracy the least.

    Args:
        model: HuggingFace transformer model
        tokenizer: corresponding tokenizer
        evaluate_fn: evaluation function that returns accuracy
        num_layers: number of transformer layers; inferred from model if None
        max_layers_to_remove: max number of layers to prune
        num_samples: number of dataset samples to use in evaluation
        output_file: JSON filename to save full log
    """
    num_layers = len(model.transformer.h)

    removed_layers = set()
    removal_history = []
    full_test_log = []

    # Baseline accuracy
    baseline_acc = evaluate_fn(model, tokenizer, skip_layers=[], sample_size=num_samples)
    print(f"Baseline accuracy: {baseline_acc:.2%}")

    removal_history.append({
        'iteration': 0,
        'removed_layers': [],
        'accuracy': baseline_acc
    })

    for iteration in range(1, max_layers_to_remove + 1):
        print(f"\nIteration {iteration}: finding best layer to remove...")
        best_acc = -1.0
        best_layer = None

        candidate_layers = [l for l in range(num_layers) if l not in removed_layers]
        iteration_log = []

        for layer in tqdm(candidate_layers):
            test_skip_layers = removed_layers | {layer}
            acc = evaluate_fn(model, tokenizer, skip_layers=list(test_skip_layers), sample_size=num_samples)

            # Log every layer tested
            iteration_log.append({
                'tested_layer': layer,
                'skip_layers': sorted(list(test_skip_layers)),
                'accuracy': acc
            })

            # Pick the layer whose removal hurts accuracy the least
            if acc >= best_acc:
                best_acc = acc
                best_layer = layer

        # Permanently remove the best layer
        removed_layers.add(best_layer)
        removal_history.append({
            'iteration': iteration,
            'layer_removed': best_layer,
            'removed_layers': sorted(list(removed_layers)),
            'accuracy': best_acc
        })

        # Append iteration log to full log
        full_test_log.append({
            'iteration': iteration,
            'tested_candidates': iteration_log,
            'selected_layer': best_layer,
            'accuracy_after_removal': best_acc
        })

        print(f"Removed layer {best_layer}. New removed set: {sorted(list(removed_layers))}")
        print(f"Accuracy after removal: {best_acc:.2%}")

    # Print final summary
    print("\nGreedy Layer Pruning Summary:")
    for record in removal_history:
        if record['iteration'] == 0:
            print(f"Baseline: Accuracy={record['accuracy']:.2%}")
        else:
            print(f"After removing {record['iteration']} layer(s) ({record['removed_layers']}): Accuracy={record['accuracy']:.2%}")

    # Save full log if requested
    if output_file is not None:
        with open(output_file, "w") as f:
            json.dump(full_test_log, f, indent=4)

    return removal_history, full_test_log

In [None]:
from datasets import load_dataset
import torch
import torch.nn.functional as F

def copa_prompt(premise, choice, question_type):
    """
    Build COPA prompt.
    question_type is either 'cause' or 'effect'.
    """
    if question_type == "cause":
        return f"{premise} This happened because {choice}."
    else:  # effect
        return f"{premise} As a result, {choice}."


def evaluate_copa(model, tokenizer, skip_layers, sample_size=200):
    """
    Evaluates COPA using loss comparison between choice1 and choice2.
    """
    dataset = load_dataset("pkavumba/balanced-copa", split="train")
    dataset = dataset.select(range(min(sample_size, len(dataset))))

    correct = 0

    for example in dataset:
        premise = example["premise"]
        choice1 = example["choice1"]
        choice2 = example["choice2"]
        qtype   = example["question"]    # "cause" or "effect"
        label   = int(example["label"])  # 0 or 1

        s1 = copa_prompt(premise, choice1, qtype)
        s2 = copa_prompt(premise, choice2, qtype)

        # Tokenize
        inputs1 = tokenizer(s1, return_tensors="pt").to(model.device)
        inputs2 = tokenizer(s2, return_tensors="pt").to(model.device)

        with torch.no_grad():
            logits1 = forward_with_skipped_layers(
                model, inputs1.input_ids, inputs1.attention_mask, skip_layers
            )
            logits2 = forward_with_skipped_layers(
                model, inputs2.input_ids, inputs2.attention_mask, skip_layers
            )

        # NLL losses
        def loss_fn(logits, labels):
            shift_logits = logits[:, :-1].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            return F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                reduction="mean"
            ).item()

        loss1 = loss_fn(logits1, inputs1.input_ids)
        loss2 = loss_fn(logits2, inputs2.input_ids)

        pred = 0 if loss1 < loss2 else 1

        if pred == label:
            correct += 1

    return correct / len(dataset)

In [None]:
samples = 256

In [None]:
hellaswag_history, hellaswag_log = greedy_layer_pruning(
    model,
    tokenizer,
    evaluate_fn=evaluate_hellaswag,
    max_layers_to_remove=6,
    num_samples=samples,
    output_file="hellaswag_pruning_log.json"
)


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/6.11M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/6.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39905 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10042 [00:00<?, ? examples/s]

Baseline accuracy: 36.72%

Iteration 1: finding best layer to remove...


100%|██████████| 24/24 [11:57<00:00, 29.88s/it]


Removed layer 21. New removed set: [21]
Accuracy after removal: 38.67%

Iteration 2: finding best layer to remove...


100%|██████████| 23/23 [10:46<00:00, 28.12s/it]


Removed layer 12. New removed set: [12, 21]
Accuracy after removal: 39.45%

Iteration 3: finding best layer to remove...


100%|██████████| 22/22 [10:06<00:00, 27.57s/it]


Removed layer 19. New removed set: [12, 19, 21]
Accuracy after removal: 38.28%

Iteration 4: finding best layer to remove...


100%|██████████| 21/21 [09:11<00:00, 26.25s/it]


Removed layer 15. New removed set: [12, 15, 19, 21]
Accuracy after removal: 39.45%

Iteration 5: finding best layer to remove...


100%|██████████| 20/20 [08:23<00:00, 25.18s/it]


Removed layer 13. New removed set: [12, 13, 15, 19, 21]
Accuracy after removal: 36.72%

Iteration 6: finding best layer to remove...


100%|██████████| 19/19 [07:37<00:00, 24.09s/it]

Removed layer 20. New removed set: [12, 13, 15, 19, 20, 21]
Accuracy after removal: 35.94%

Greedy Layer Pruning Summary:
Baseline: Accuracy=36.72%
After removing 1 layer(s) ([21]): Accuracy=38.67%
After removing 2 layer(s) ([12, 21]): Accuracy=39.45%
After removing 3 layer(s) ([12, 19, 21]): Accuracy=38.28%
After removing 4 layer(s) ([12, 15, 19, 21]): Accuracy=39.45%
After removing 5 layer(s) ([12, 13, 15, 19, 21]): Accuracy=36.72%
After removing 6 layer(s) ([12, 13, 15, 19, 20, 21]): Accuracy=35.94%





In [None]:
winogrande_history, winogrande_log = greedy_layer_pruning(
    model,
    tokenizer,
    evaluate_fn=evaluate_winogrande,
    max_layers_to_remove=6,
    num_samples=samples,
    output_file="winogrande_pruning_log.json"
)

README.md: 0.00B [00:00, ?B/s]

winogrande_xl/train-00000-of-00001.parqu(…):   0%|          | 0.00/2.06M [00:00<?, ?B/s]

winogrande_xl/test-00000-of-00001.parque(…):   0%|          | 0.00/118k [00:00<?, ?B/s]

winogrande_xl/validation-00000-of-00001.(…):   0%|          | 0.00/85.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40398 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Baseline accuracy: 57.81%

Iteration 1: finding best layer to remove...


100%|██████████| 24/24 [06:10<00:00, 15.44s/it]


Removed layer 15. New removed set: [15]
Accuracy after removal: 57.81%

Iteration 2: finding best layer to remove...


100%|██████████| 23/23 [05:40<00:00, 14.79s/it]


Removed layer 21. New removed set: [15, 21]
Accuracy after removal: 57.42%

Iteration 3: finding best layer to remove...


100%|██████████| 22/22 [05:14<00:00, 14.29s/it]


Removed layer 12. New removed set: [12, 15, 21]
Accuracy after removal: 58.20%

Iteration 4: finding best layer to remove...


100%|██████████| 21/21 [04:46<00:00, 13.66s/it]


Removed layer 17. New removed set: [12, 15, 17, 21]
Accuracy after removal: 57.42%

Iteration 5: finding best layer to remove...


100%|██████████| 20/20 [04:21<00:00, 13.07s/it]


Removed layer 16. New removed set: [12, 15, 16, 17, 21]
Accuracy after removal: 57.81%

Iteration 6: finding best layer to remove...


100%|██████████| 19/19 [03:57<00:00, 12.50s/it]

Removed layer 4. New removed set: [4, 12, 15, 16, 17, 21]
Accuracy after removal: 57.03%

Greedy Layer Pruning Summary:
Baseline: Accuracy=57.81%
After removing 1 layer(s) ([15]): Accuracy=57.81%
After removing 2 layer(s) ([15, 21]): Accuracy=57.42%
After removing 3 layer(s) ([12, 15, 21]): Accuracy=58.20%
After removing 4 layer(s) ([12, 15, 17, 21]): Accuracy=57.42%
After removing 5 layer(s) ([12, 15, 16, 17, 21]): Accuracy=57.81%
After removing 6 layer(s) ([4, 12, 15, 16, 17, 21]): Accuracy=57.03%





In [None]:
boolq_history, boolq_log = greedy_layer_pruning(
    model,
    tokenizer,
    evaluate_fn=evaluate_boolq,
    max_layers_to_remove=6,
    num_samples=samples,
    output_file="boolq_pruning_log.json"
)

In [None]:
copa_history, copa_log = greedy_layer_pruning(
    model,
    tokenizer,
    evaluate_fn=evaluate_copa,
    max_layers_to_remove=6,
    num_samples=samples,
    output_file="copa_pruning_log.json"
)

README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Baseline accuracy: 59.38%

Iteration 1: finding best layer to remove...


100%|██████████| 24/24 [06:01<00:00, 15.06s/it]


Removed layer 14. New removed set: [14]
Accuracy after removal: 61.33%

Iteration 2: finding best layer to remove...


100%|██████████| 23/23 [05:31<00:00, 14.41s/it]


Removed layer 17. New removed set: [14, 17]
Accuracy after removal: 60.94%

Iteration 3: finding best layer to remove...


100%|██████████| 22/22 [05:03<00:00, 13.81s/it]


Removed layer 20. New removed set: [14, 17, 20]
Accuracy after removal: 62.50%

Iteration 4: finding best layer to remove...


100%|██████████| 21/21 [04:42<00:00, 13.47s/it]


Removed layer 21. New removed set: [14, 17, 20, 21]
Accuracy after removal: 61.33%

Iteration 5: finding best layer to remove...


 85%|████████▌ | 17/20 [03:39<00:39, 13.12s/it]

In [None]:
def hellaswag_prompt(context, ending):
    """
    Combine context + ending for HellaSwag.
    """
    # You can customize how you join context + ending; simplest:
    return context + " " + ending


def evaluate_hellaswag(model, tokenizer, skip_layers, sample_size=256):
    """
    Evaluates GPT-Neo on HellaSwag using perplexity comparison.
    """
    dataset = load_dataset("hellaswag", split="validation")
    dataset = dataset.select(range(sample_size))

    correct = 0

    for example in dataset:
        context = example["ctx"]
        endings = example["endings"]  # list of 4 endings
        label = int(example["label"])     # integer 0..3

        losses = []
        for ending in endings:
            sentence = hellaswag_prompt(context, ending)
            # print(sentence)
            inputs = tokenizer(sentence, return_tensors="pt").to(device)

            with torch.no_grad():
                logits = forward_with_skipped_layers(
                    model, inputs.input_ids, inputs.attention_mask, skip_layers
                )

            # Compute negative log-likelihood
            shift_logits = logits[:, :-1].contiguous()
            shift_labels = inputs.input_ids[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                reduction="mean",
            )
            losses.append(loss.item())

        # Model chooses the ending with the lowest loss
        pred = int(torch.argmin(torch.tensor(losses)))
        if pred == label:
            correct += 1

    accuracy = correct / sample_size
    return accuracy

In [15]:
from datasets import load_dataset

dataset = load_dataset("winogrande", "winogrande_xl", split="validation")

correct = 0

test_dataset = []

def winogrande_prompt(sentence, option):
    """
    Insert the option into the blank (_) in the Winogrande sentence.
    """
    return sentence.replace("_", option)

for example in dataset:
    sentence = example["sentence"]
    option1 = example["option1"]
    option2 = example["option2"]
    label = example["answer"]

    s1 = winogrande_prompt(sentence, option1)
    s2 = winogrande_prompt(sentence, option2)

    test_dataset.append({
        "options": [s1, s2],
        "answer": int(label) - 1
    })

import json

import json

# Write as a single JSON array with comma-separated objects
with open('winogrande_test.json', 'w') as f:
    json.dump(test_dataset, f, indent=2)

print(f"Saved {len(test_dataset)} examples to winogrande_test.json")

Saved 1267 examples to winogrande_test.json


In [26]:
from datasets import load_dataset
import torch
import torch.nn.functional as F

def copa_prompt(premise, choice, question_type):
    """
    Build COPA prompt.
    question_type is either 'cause' or 'effect'.
    """
    if question_type == "cause":
        return f"{premise} This happened because {choice}."
    else:  # effect
        return f"{premise} As a result, {choice}."


dataset = load_dataset("pkavumba/balanced-copa", split="test")
print(dataset)

correct = 0

for example in dataset:
    premise = example["premise"]
    choice1 = example["choice1"]
    choice2 = example["choice2"]
    qtype   = example["question"]    # "cause" or "effect"
    label   = int(example["label"])  # 0 or 1

    s1 = copa_prompt(premise, choice1, qtype)
    s2 = copa_prompt(premise, choice2, qtype)


    test_dataset.append({
        "options": [s1, s2],
        "answer": int(label)
    })

with open('copa.json', 'w') as f:
    json.dump(test_dataset, f, indent=2)

print(f"Saved {len(test_dataset)} examples to winogrande_test.json")


Dataset({
    features: ['label', 'id', 'premise', 'question', 'choice1', 'choice2', 'mirrored'],
    num_rows: 500
})
Saved 500 examples to winogrande_test.json
