In [1]:
"""
Imports
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.loss.loss_utils import ForCausalLMLoss # Cross-entropy loss that handles label shifting
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
from termcolor import colored
import importlib 

from utils.memory import check_memory, clear_all_cuda_memory
from utils.store_topk import convert_topk_to_df
from utils.store_outputs import convert_outputs_to_df
from utils.vocab import export_vocab_as_csv
from utils import pretrained_models

# Device that the model and every batch are placed on.
main_device = 'cuda:0'
# Global seed; applied below so any stochastic torch/numpy op in this notebook is repeatable.
seed = 1234
torch.manual_seed(seed)
np.random.seed(seed)
# Start from a clean GPU and print per-device memory usage before loading anything.
clear_all_cuda_memory()
check_memory()

All CUDA memory cleared on all devices.
Device 0: NVIDIA RTX 6000 Ada Generation
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 47.50 GB



## Load base model

In [2]:
"""
Load the base tokenizer/model

Architectures supported currently:
- OlMoE architecture, includes OLMoE-1B-7B-0125-Instruct (1B/7B)
- Qwen2MoE architecture, includes Qwen1.5-MoE-A2.7B-Chat (2.7B/14.3B), Qwen2-57B-A14B (14B/57B)
- Deepseek v2 architecture, includes Deepseek-v2-Lite (2.4B/15.7B), Deepseek-v2 (21B/236B)
- Deepseek v3 architecture, includes Deepseek-v3 (37B/671B), Deepseek-R1 (37B/671B), Moonlight-16B-A3B (3B/16B)
"""
# Index into the table inside get_model: 0 = OLMoE, 1 = Qwen1.5-MoE, 2 = DeepSeek-V2-Lite, 3 = Moonlight
selected_model_index = 1

def get_model(index):
    """
    Look up one of the supported MoE checkpoints by position.

    Params:
        @index: Position in the table of supported models below.

    Returns:
        A 3-tuple (model_id, model_prefix, model_architecture): the Hugging Face repo ID, the short
        prefix used in exported file names, and the architecture key used to pick the forward-pass helper.
    """
    supported_models = (
        ('allenai/OLMoE-1B-7B-0125-Instruct', 'olmoe', 'olmoe'),
        ('Qwen/Qwen1.5-MoE-A2.7B-Chat', 'qwen1.5moe', 'qwen2moe'),
        ('deepseek-ai/DeepSeek-V2-Lite', 'dsv2', 'dsv2'),
        ('moonshotai/Moonlight-16B-A3B', 'moonlight', 'dsv3'),
    )
    entry = supported_models[index]

    return entry[0], entry[1], entry[2]

model_id, model_prefix, model_architecture = get_model(selected_model_index)
# Left padding keeps the last real token of every sequence at position -1.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token = False, add_bos_token = False, padding_side = 'left', trust_remote_code = True)
# Place the model on the configured `main_device` (the same device batches are moved to later) instead of
# whatever the current default CUDA device happens to be.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16, trust_remote_code = True).to(main_device).eval()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 47.50 GiB of which 16.56 MiB is free. Process 2563311 has 45.89 GiB memory in use. Process 2828758 has 1.58 GiB memory in use. Of the allocated memory 1.06 GiB is allocated by PyTorch, and 107.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
"""
Load reverse-engineered forward pass functions that return topk expert IDs and weights
"""
# Import the helper module whose name matches the selected architecture key (e.g. utils.pretrained_models.olmoe)
model_module = importlib.import_module(f"utils.pretrained_models.{model_architecture}")
# Fetch its `run_<architecture>_return_topk` function; it is called below as fn(model, input_ids, attention_mask)
run_model_return_topk = getattr(model_module, f"run_{model_architecture}_return_topk")

@torch.no_grad()
def test_custom_forward_pass(model, pad_token_id):
    """
    Sanity-check the custom forward pass against the model's own forward pass.

    Runs both on two short prompts, asserts that the logits are exactly equal and that the number of
    returned top-k ID tensors matches the number of top-k weight tensors, then prints a few diagnostics
    and the LM loss. Uses the notebook-level `tokenizer` and `run_model_return_topk`.

    Params:
        @model: The loaded causal LM; inputs are moved to `model.device`.
        @pad_token_id: Token ID treated as padding; those positions are excluded from the loss.

    Raises:
        AssertionError: If the custom logits differ from the reference logits, or the top-k lists differ in length.
    """
    # Gradients are disabled via the decorator: this is inference only, and tracking autograd state for
    # the reference forward pass would needlessly retain activations.
    inputs = tokenizer(['Hi! I am a dog and I like to bark', 'Vegetables are good for'], return_tensors = 'pt', padding = 'max_length', truncation = True, max_length = 512).to(model.device)
    original_results = model(**inputs)
    custom_results = run_model_return_topk(model, inputs['input_ids'], inputs['attention_mask'])
    assert torch.equal(original_results.logits, custom_results['logits']), 'Error in custom forward'
    assert len(custom_results['all_topk_experts']) == len(custom_results['all_topk_weights']), 'Length of topk IDs and weights not equal'
    print(f"Length of topk: {len(custom_results['all_topk_experts'])}")
    print(f"Topk size: {custom_results['all_topk_experts'][0].shape}")
    # NOTE(review): the label says "first token" but row index 1 of the first layer's tensor is printed - confirm intent.
    print(f"First token topk IDs: {custom_results['all_topk_experts'][0][1,]}")
    print(f"First token topk weights: {custom_results['all_topk_weights'][0][1,]}")
    # -100 labels are ignored by the loss; masked_fill keeps everything on the input's device.
    label_ids = inputs['input_ids'].masked_fill(inputs['input_ids'] == pad_token_id, -100)
    loss = ForCausalLMLoss(custom_results['logits'], label_ids, model.config.vocab_size).detach().cpu().item()
    print(f"LM loss: {loss}")

test_custom_forward_pass(model, tokenizer.pad_token_id)

## Get dataset

In [None]:
"""
Load dataset (c4)
"""
# Stream the C4 validation split and shuffle it through a fixed-seed buffer
ds = load_dataset('allenai/c4', 'en', split = 'validation', streaming = True).shuffle(seed = 123, buffer_size = 1_000_000)
# ds = load_dataset('HuggingFaceFW/fineweb-edu', 'CC-MAIN-2024-51', split = 'train', streaming = True).shuffle(seed = 123, buffer_size = 1_000_000)
ds_iter = iter(ds)

# Keep the raw text of at most 25,000 documents; stops early if the stream runs out first
c4_raw = [sample['text'] for _, sample in zip(range(0, 25_000), ds_iter)]

In [None]:
""" 
Load dataset into a dataloader
"""
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """
    Map-style dataset over a tokenizer output.

    Each item is a dict holding the `input_ids` and `attention_mask` entries found at the same index.
    """

    def __init__(self, tokenizer_output):
        """
        Params:
            @tokenizer_output: Mapping with `input_ids` and `attention_mask` entries of equal length.
        """
        self.input_ids, self.attention_mask = tokenizer_output['input_ids'], tokenizer_output['attention_mask']

    def __len__(self):
        """Number of samples, taken from the length of `input_ids`."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict with keys `input_ids` and `attention_mask`."""
        return dict(
            input_ids = self.input_ids[idx],
            attention_mask = self.attention_mask[idx]
        )

# Tokenize every document to exactly 512 positions (padded or truncated), without adding special tokens
res = tokenizer(
    c4_raw,
    add_special_tokens = False,
    max_length = 512,
    padding = 'max_length',
    truncation = True,
    return_tensors = 'pt'
)
# Fixed order (no shuffling) so that batch indices map to the same documents on every run
c4_dl = DataLoader(
    TextDataset(res),
    batch_size = 8, # 64 for OlMoE
    shuffle = False
)

## Get expert selections + export

In [None]:
"""
Define model-specific forward pass functions

These functions must return a dict with keys:
- `logits`: The standard B x N x V LM output
- `all_topk_experts`: A list of length equal to the number of MoE layers, with each element a BN x topk tensor of expert IDs
- `all_topk_weights`: A list of length equal to the number of MoE layers, with each element a BN x topk tensor of expert weights
"""

@torch.no_grad()
def run_olmoe_return_topk(input_ids, attention_mask):
    """
    Layer-by-layer forward pass through the notebook-level `model` (OlMoE) that also records routing decisions.

    The MoE block of every decoder layer is executed manually (gate -> softmax -> top-k -> per-expert MLP)
    so that the selected expert IDs and their weights can be captured.

    Params:
        @input_ids: Token IDs; dimension 1 is used as the sequence length - assumes shape (B, N), TODO confirm.
        @attention_mask: Mask passed to `model.model._update_causal_mask` together with the embeddings.

    Returns:
        A dict with keys:
        - `logits`: Output of `model.lm_head` on the final normed hidden state.
        - `all_topk_experts`: One CPU tensor per layer, (B*N) x top_k, holding the selected expert IDs.
        - `all_topk_weights`: One CPU float32 tensor per layer, (B*N) x top_k, holding the matching routing weights
          (after having been cast to the hidden-state dtype).
    """
    input_embeds = model.model.embed_tokens(input_ids)
    
    # Positions 0..N-1, shared by every sequence in the batch
    cache_position = torch.arange(0, input_embeds.shape[1], device = input_embeds.device)
    position_ids = cache_position.unsqueeze(0)
    causal_mask = model.model._update_causal_mask(attention_mask, input_embeds, cache_position, None, None)

    hidden_state = input_embeds
    # Rotary embeddings are computed once and reused by every layer's attention
    position_embeddings = model.model.rotary_emb(hidden_state, position_ids)

    all_topk_experts = []
    all_topk_weights = []
    for layer in model.model.layers:
        # Self-attention sub-block with its residual connection
        residual = hidden_state
        hidden_state = layer.input_layernorm(hidden_state)
        hidden_state, _, _ = layer.self_attn(hidden_states = hidden_state, attention_mask = causal_mask, position_ids = position_ids, position_embeddings = position_embeddings)
        hidden_state = residual + hidden_state
        residual = hidden_state
        hidden_state = layer.post_attention_layernorm(hidden_state)

        ####### OlMoESparseMoeBlock - below code replaces hidden_state = layer.mlp(hidden_state)
        batch_size, sequence_length, hidden_dim = hidden_state.shape
        # Flatten batch and sequence so that routing is done per token
        moe_hidden_state = hidden_state.view(-1, hidden_dim)
        router_logits = layer.mlp.gate(moe_hidden_state)

        # Softmax over experts in float32, then keep the top_k largest probabilities per token
        # NOTE(review): the top-k weights are not renormalised here; if the upstream block renormalises
        # under a `norm_topk_prob`-style flag, this replica would diverge for such configs - confirm.
        routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, layer.mlp.top_k, dim=-1, sorted = True)
        routing_weights = routing_weights.to(moe_hidden_state.dtype)
        final_hidden_states = torch.zeros((batch_size * sequence_length, hidden_dim), dtype = hidden_state.dtype, device = hidden_state.device)
        # After the permute: expert_mask[e, k, t] is 1 when token t picked expert e as its k-th choice
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes = layer.mlp.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(layer.mlp.num_experts):
            expert_layer = layer.mlp.experts[expert_idx]
            # idx = top-k slot, top_x = token row, for every token routed to this expert
            idx, top_x = torch.where(expert_mask[expert_idx])

            current_state = moe_hidden_state[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            # Accumulate this expert's weighted output into the rows of the tokens it served
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(moe_hidden_state.dtype))

        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        #######

        hidden_state = final_hidden_states
        hidden_state = residual + hidden_state

        # Store routing results off-GPU so they do not accumulate in device memory across layers
        all_topk_experts.append(selected_experts.detach().cpu())
        all_topk_weights.append(routing_weights.detach().cpu().to(torch.float32))

    hidden_state = model.model.norm(hidden_state)
    logits = model.lm_head(hidden_state)
    return {'logits': logits, 'all_topk_experts': all_topk_experts, 'all_topk_weights': all_topk_weights}

@torch.no_grad()
def run_qwenv15_return_topk(input_ids, attention_mask):
    """
    Layer-by-layer forward pass through the notebook-level `model` (Qwen MoE) that also records routing decisions.

    The MoE block of every decoder layer is executed manually (gate -> softmax -> top-k -> per-expert MLP,
    plus the sigmoid-gated shared expert) so that the selected expert IDs and their weights can be captured.

    Params:
        @input_ids: Token IDs; dimension 1 is used as the sequence length - assumes shape (B, N), TODO confirm.
        @attention_mask: Mask passed to `model.model._update_causal_mask` together with the embeddings.

    Returns:
        A dict with keys:
        - `logits`: Output of `model.lm_head` on the final normed hidden state.
        - `all_topk_experts`: One CPU tensor per layer, (B*N) x top_k, holding the selected expert IDs.
        - `all_topk_weights`: One CPU float32 tensor per layer, (B*N) x top_k, holding the matching routing weights
          (after having been cast to the hidden-state dtype).
    """
    input_embeds = model.model.embed_tokens(input_ids)
    
    # Positions 0..N-1, shared by every sequence in the batch
    cache_position = torch.arange(0, input_embeds.shape[1], device = input_embeds.device)
    position_ids = cache_position.unsqueeze(0)
    causal_mask = model.model._update_causal_mask(attention_mask, input_embeds, cache_position, None, None)

    hidden_state = input_embeds
    # Rotary embeddings are computed once and reused by every layer's attention
    position_embeddings = model.model.rotary_emb(hidden_state, position_ids)

    all_topk_experts = []
    all_topk_weights = []
    for layer in model.model.layers:
        # Self-attention sub-block with its residual connection
        residual = hidden_state
        hidden_state = layer.input_layernorm(hidden_state)
        hidden_state, _, _ = layer.self_attn(hidden_states = hidden_state, attention_mask = causal_mask, position_ids = position_ids, position_embeddings = position_embeddings)
        hidden_state = residual + hidden_state
        residual = hidden_state
        hidden_state = layer.post_attention_layernorm(hidden_state)

        ####### Qwen2MoeSparseMoeBlock - below code replaces hidden_state = layer.mlp(hidden_state)
        batch_size, sequence_length, hidden_dim = hidden_state.shape
        # Flatten batch and sequence so that routing is done per token
        moe_hidden_state = hidden_state.view(-1, hidden_dim)
        router_logits = layer.mlp.gate(moe_hidden_state) # Size (BN, n_experts)

        # Softmax over experts in float32, then keep the top_k largest probabilities per token
        # NOTE(review): the top-k weights are not renormalised here; if the upstream block renormalises
        # under a `norm_topk_prob`-style flag, this replica would diverge for such configs - confirm.
        routing_weights = torch.nn.functional.softmax(router_logits, dim = 1, dtype = torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, layer.mlp.top_k, dim = -1, sorted = True)
        routing_weights = routing_weights.to(moe_hidden_state.dtype)

        final_hidden_states = torch.zeros((batch_size * sequence_length, hidden_dim), dtype = moe_hidden_state.dtype, device = moe_hidden_state.device)

        # One hot encode the selected experts to create an expert mask 
        # After the permute: expert_mask[e, k, t] is 1 when token t picked expert e as its k-th choice
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes = layer.mlp.num_experts).permute(2, 1, 0)

        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(layer.mlp.num_experts):
            expert_layer = layer.mlp.experts[expert_idx]
            # idx = top-k slot, top_x = token row, for every token routed to this expert
            idx, top_x = torch.where(expert_mask[expert_idx])
            # Index the correct hidden states and compute the expert hidden state for the current expert.
            current_state = moe_hidden_state[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
            # Accumulate this expert's weighted output into the rows of the tokens it served (indexed by `top_x`).
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(moe_hidden_state.dtype))

        # The shared expert processes every token; its output is scaled by a per-token sigmoid gate
        shared_expert_output = layer.mlp.shared_expert(moe_hidden_state)
        shared_expert_output = torch.nn.functional.sigmoid(layer.mlp.shared_expert_gate(moe_hidden_state)) * shared_expert_output

        final_hidden_states = (final_hidden_states + shared_expert_output).reshape(batch_size, sequence_length, hidden_dim)
        #######
        hidden_state = final_hidden_states
        hidden_state = residual + hidden_state

        # Store routing results off-GPU so they do not accumulate in device memory across layers
        all_topk_experts.append(selected_experts.detach().cpu())
        all_topk_weights.append(routing_weights.detach().cpu().to(torch.float32))

    hidden_state = model.model.norm(hidden_state)
    logits = model.lm_head(hidden_state)
    return {'logits': logits, 'all_topk_experts': all_topk_experts, 'all_topk_weights': all_topk_weights}


from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
@torch.no_grad()
def run_dsv2_return_topk(input_ids, attention_mask):
    """
    Layer-by-layer forward pass through the notebook-level `model` (DeepSeek v2) that also records routing decisions.

    Dense layers (`DeepseekV2MLP`) are called as-is. For MoE layers, the gate and the expert dispatch are
    executed manually so that the selected expert IDs and their weights can be captured.

    Params:
        @input_ids: B x N tensor of token IDs.
        @attention_mask: 2D mask, expanded here into a 4D causal mask.

    Returns:
        A dict with keys:
        - `logits`: Output of `model.lm_head` on the final normed hidden state.
        - `all_topk_experts`: One CPU tensor per MoE layer (dense layers contribute nothing), (B*N) x top_k expert IDs.
        - `all_topk_weights`: One CPU float32 tensor per MoE layer, (B*N) x top_k expert weights.
    """
    B, N = input_ids.shape[:2]
    # Build positions on the inputs' own device instead of relying on the notebook-level `main_device`
    position_ids = torch.arange(0, N, dtype=torch.long, device = input_ids.device).unsqueeze(0)

    inputs_embeds = model.model.embed_tokens(input_ids)
    attention_mask = _prepare_4d_causal_attention_mask(attention_mask, (B, N), inputs_embeds, 0,)

    hidden_state = inputs_embeds
    all_topk_experts = []
    all_topk_weights = []
    for layer in model.model.layers:
        residual = hidden_state
        hidden_state = layer.input_layernorm(hidden_state)
        # Self Attention (attention weights and KV cache outputs are not needed)
        hidden_state, _, _ = layer.self_attn(hidden_states = hidden_state, attention_mask = attention_mask, position_ids = position_ids)
        hidden_state = residual + hidden_state
        # Fully Connected
        residual = hidden_state
        hidden_state = layer.post_attention_layernorm(hidden_state)
        ## MLP
        if 'DeepseekV2MLP' in str(type(layer.mlp)):
            # Dense layer: nothing to record
            hidden_state = layer.mlp(hidden_state)
        else:
            identity = hidden_state
            orig_shape = hidden_state.shape
            ### Start MoeGate - originally topk_idx, topk_weight, aux_loss = layer.mlp.gate(hidden_state)
            # NOTE(review): only plain softmax + top-k gating is replicated here; confirm the loaded
            # checkpoint's gate does not use a different selection method.
            h = hidden_state.shape[-1]
            moe_hidden_state = hidden_state.view(-1, h)
            logits = torch.nn.functional.linear(moe_hidden_state.type(torch.float32), layer.mlp.gate.weight.type(torch.float32), None)
            scores = logits.softmax(dim=-1, dtype=torch.float32)
            topk_weight, topk_idx = torch.topk(scores, k=layer.mlp.gate.top_k, dim=-1, sorted=False)
            if layer.mlp.gate.top_k > 1 and layer.mlp.gate.norm_topk_prob:
                denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
                topk_weight = topk_weight / denominator
            else:
                topk_weight = topk_weight * layer.mlp.gate.routed_scaling_factor
            #### End MoeGate
            hidden_state = hidden_state.view(-1, hidden_state.shape[-1])
            ### Start moe_infer
            x = hidden_state
            topk_ids = topk_idx
            # Count how many (token, slot) pairs were routed to each expert
            cnts = topk_ids.new_zeros((topk_ids.shape[0], len(layer.mlp.experts)))
            cnts.scatter_(1, topk_ids, 1)
            tokens_per_expert = cnts.sum(dim=0)
            # Sort the flattened (token, slot) pairs by expert ID so each expert sees a contiguous slice
            idxs = topk_ids.view(-1).argsort()
            sorted_tokens = x[idxs // topk_ids.shape[1]]
            tokens_per_expert = tokens_per_expert.cpu().numpy()
            outputs = []
            start_idx = 0
            for i, num_tokens in enumerate(tokens_per_expert):
                end_idx = start_idx + num_tokens
                if num_tokens == 0:
                    continue
                expert = layer.mlp.experts[i + layer.mlp.ep_rank * layer.mlp.experts_per_rank]
                tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
                expert_out = expert(tokens_for_this_expert)
                outputs.append(expert_out)
                start_idx = end_idx
            outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
            # Undo the sort, then combine the top_k expert outputs of each token using the routing weights
            new_x = torch.empty_like(outs)
            new_x[idxs] = outs
            final_out = (new_x.view(*topk_ids.shape, -1).type(topk_weight.dtype).mul_(topk_weight.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype))
            ### End moe_infer
            y = final_out.view(*orig_shape)
            if layer.mlp.config.n_shared_experts is not None:
                y = y + layer.mlp.shared_experts(identity)
            hidden_state = y

            # Store routing results off-GPU (as the OlMoE/Qwen variants do) so they do not pile up in device memory
            all_topk_experts.append(topk_ids.detach().cpu())
            all_topk_weights.append(topk_weight.detach().cpu().to(torch.float32))

        hidden_state = residual + hidden_state

    hidden_state = model.model.norm(hidden_state)
    logits = model.lm_head(hidden_state)
    return {'logits': logits, 'all_topk_experts': all_topk_experts, 'all_topk_weights': all_topk_weights}


@torch.no_grad()
def run_dsv3_return_topk(input_ids, attention_mask):
    """
    Layer-by-layer forward pass through the notebook-level `model` (DeepSeek v3 style) that also records routing decisions.

    Dense layers (`DeepseekV3MLP`) are called as-is. For MoE layers, the sigmoid gate with group-limited
    top-k selection and the expert dispatch are executed manually so that the selected expert IDs and
    their weights can be captured.

    Params:
        @input_ids: B x N tensor of token IDs.
        @attention_mask: 2D mask; expanded into a 4D causal mask unless flash attention 2 is in use.

    Returns:
        A dict with keys:
        - `logits`: Output of `model.lm_head` on the final normed hidden state, cast to float32.
        - `all_topk_experts`: One CPU tensor per MoE layer (dense layers contribute nothing), (B*N) x top_k expert IDs.
        - `all_topk_weights`: One CPU float32 tensor per MoE layer, (B*N) x top_k expert weights.
    """
    B, N = input_ids.shape[:2]
    # Build positions on the inputs' own device instead of relying on the notebook-level `main_device`
    position_ids = torch.arange(0, N, dtype=torch.long, device = input_ids.device).unsqueeze(0)
    inputs_embeds = model.model.embed_tokens(input_ids)

    if model.model._use_flash_attention_2:
        # Flash attention takes the 2D mask directly, and only when something is actually masked out
        attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
    else:
        attention_mask = _prepare_4d_causal_attention_mask(attention_mask, (B, N), inputs_embeds, 0,)

    hidden_state = inputs_embeds
    all_topk_experts = []
    all_topk_weights = []
    for layer in model.model.layers:
        residual = hidden_state
        hidden_state = layer.input_layernorm(hidden_state)
        # Self Attention (attention weights and KV cache outputs are not needed)
        hidden_state, _, _ = layer.self_attn(hidden_states = hidden_state, attention_mask = attention_mask, position_ids = position_ids)
        hidden_state = residual + hidden_state
        # Fully Connected
        residual = hidden_state
        hidden_state = layer.post_attention_layernorm(hidden_state)
        ## MLP
        if 'DeepseekV3MLP' in str(type(layer.mlp)):
            # Dense layer: nothing to record
            hidden_state = layer.mlp(hidden_state)
        else:
            identity = hidden_state
            orig_shape = hidden_state.shape
            ### Start MoeGate - originally topk_idx, topk_weight = layer.mlp.gate(hidden_state)
            bsz, seq_len, h = hidden_state.shape
            moe_hidden_state = hidden_state.view(-1, h)
            logits = torch.nn.functional.linear(moe_hidden_state.type(torch.float32), layer.mlp.gate.weight.type(torch.float32), None)
            scores = logits.sigmoid()

            # The bias-corrected scores are used only to choose experts; the weights come from the raw scores
            scores_for_choice = scores.view(bsz * seq_len, -1) + layer.mlp.gate.e_score_correction_bias.unsqueeze(0)
            # Score each expert group by the sum of its two best experts, then keep the best `topk_group` groups
            group_scores = (scores_for_choice.view(bsz * seq_len, layer.mlp.gate.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1))  # [n, n_group]
            group_idx = torch.topk(group_scores, k = layer.mlp.gate.topk_group, dim=-1, sorted = False)[1]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (group_mask.unsqueeze(-1).expand(bsz * seq_len, layer.mlp.gate.n_group, layer.mlp.gate.n_routed_experts // layer.mlp.gate.n_group).reshape(bsz * seq_len, -1))  # [n, e]
            # Experts outside the selected groups are zeroed out before the final top-k
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
            _, topk_idx = torch.topk(tmp_scores, k=layer.mlp.gate.top_k, dim=-1, sorted=True)
            topk_weight = scores.gather(1, topk_idx)
            # NOTE(review): the scaling factor is applied only when the weights are NOT normalised (`else` branch).
            # Verify against the checkpoint's own gate implementation whether `routed_scaling_factor` should
            # instead be applied unconditionally after normalisation; if so, logits here will not match the reference.
            if layer.mlp.gate.top_k > 1 and layer.mlp.gate.norm_topk_prob:
                denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
                topk_weight = topk_weight / denominator
            else:
                topk_weight = topk_weight * layer.mlp.gate.routed_scaling_factor
            ### End MoeGate 
            hidden_state = hidden_state.view(-1, hidden_state.shape[-1])
            ### Start moe_infer - replaces layer.mlp.moe_infer(hidden_state, topk_idx, topk_weight).view(*orig_shape)
            x = hidden_state
            topk_ids = topk_idx
            # Count how many (token, slot) pairs were routed to each expert
            cnts = topk_ids.new_zeros((topk_ids.shape[0], len(layer.mlp.experts)))
            cnts.scatter_(1, topk_ids, 1)
            tokens_per_expert = cnts.sum(dim=0)
            # Sort the flattened (token, slot) pairs by expert ID so each expert sees a contiguous slice
            idxs = topk_ids.view(-1).argsort()
            sorted_tokens = x[idxs // topk_ids.shape[1]]
            tokens_per_expert = tokens_per_expert.cpu().numpy()
            outputs = []
            start_idx = 0
            for i, num_tokens in enumerate(tokens_per_expert):
                end_idx = start_idx + num_tokens
                if num_tokens == 0:
                    continue
                expert = layer.mlp.experts[i + layer.mlp.ep_rank * layer.mlp.experts_per_rank]
                tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
                expert_out = expert(tokens_for_this_expert)
                outputs.append(expert_out)
                start_idx = end_idx
            outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
            # Undo the sort, then combine the top_k expert outputs of each token using the routing weights
            new_x = torch.empty_like(outs)
            new_x[idxs] = outs
            final_out = (new_x.view(*topk_ids.shape, -1).type(topk_weight.dtype).mul_(topk_weight.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype))
            ### End moe_infer
            y = final_out.view(*orig_shape)
            if layer.mlp.config.n_shared_experts is not None:
                y = y + layer.mlp.shared_experts(identity)
            hidden_state = y

            # Store routing results off-GPU (as the OlMoE/Qwen variants do) so they do not pile up in device memory
            all_topk_experts.append(topk_ids.detach().cpu())
            all_topk_weights.append(topk_weight.detach().cpu().to(torch.float32))

        hidden_state = residual + hidden_state

    hidden_state = model.model.norm(hidden_state)
    logits = model.lm_head(hidden_state).float()
    return {'logits': logits, 'all_topk_experts': all_topk_experts, 'all_topk_weights': all_topk_weights}


In [None]:
""" 
Export expert selections
"""

@torch.no_grad()
def run_and_export_topk(model_prefix, c4_dl, max_batches = None):
    """
    Run forward passes over a dataloader and export outputs and expert routes to CSV files.

    For every batch, three files are written (created on the first batch, appended to afterwards):
    `{model_prefix}-c4-outputs.csv`, `{model_prefix}-c4-routes.csv` and `{model_prefix}-c4-routes-top1.csv`.
    On the first batch, up to two decoded inputs with the predicted next token and the batch perplexity
    are printed as a sanity check.

    Params:
        @model_prefix: Short model name; selects the forward-pass function and prefixes the output files.
          Both `qwenv15` and `qwen1.5moe` (the prefix returned by `get_model`) select the Qwen forward pass.
        @c4_dl: Dataloader yielding dicts with `input_ids` and `attention_mask`.
        @max_batches: Optional cap on the number of batches to process; None processes all of them.

    Returns:
        True once the loop has finished.

    Raises:
        Exception: If `model_prefix` has no matching forward-pass function.
    """
    forward_fns = {
        'olmoe': run_olmoe_return_topk,
        'qwenv15': run_qwenv15_return_topk,
        'qwen1.5moe': run_qwenv15_return_topk,
        'dsv2': run_dsv2_return_topk,
        'moonlight': run_dsv3_return_topk
    }
    # Resolve the forward function once, before any work is done, instead of on every batch
    if model_prefix not in forward_fns:
        raise Exception('Unsupported model!')
    fn = forward_fns[model_prefix]

    for batch_ix, batch in tqdm(enumerate(c4_dl), total = len(c4_dl)):

        input_ids = batch['input_ids'].to(main_device)
        attention_mask = batch['attention_mask'].to(main_device)

        output = fn(input_ids, attention_mask)

        # Check no bugs by validating output/perplexity
        if batch_ix == 0:
            # -100 labels are ignored by the loss; masked_fill keeps everything on the input's device
            label_ids = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100)
            base_loss = ForCausalLMLoss(output['logits'], label_ids, model.config.vocab_size).detach().cpu().item()
            for i in range(min(2, input_ids.size(0))):
                # Select real tokens through the mask itself: a `[:mask.sum()]` slice is only right for
                # right-padded inputs, and would drop the end of the text when padding is on the left
                real_token_ids = input_ids[i][attention_mask[i].bool()]
                decoded_input = tokenizer.decode(real_token_ids, skip_special_tokens = True)
                next_token_id = torch.argmax(output['logits'][i, -1, :]).item()
                print(decoded_input + colored(tokenizer.decode([next_token_id], skip_special_tokens = True), 'green'))
            print("PPL:", torch.exp(torch.tensor(base_loss)).item())

        output_df =\
            convert_outputs_to_df(input_ids, attention_mask, output['logits'])\
            .assign(
                batch_ix = batch_ix
            )

        topk_df =\
            convert_topk_to_df(input_ids, attention_mask, output['all_topk_experts'], output['all_topk_weights'])\
            .assign(
                batch_ix = batch_ix,
                weight = lambda df: df['weight'].round(3)
            )

        # The first batch (re)creates each file with a header row; later batches append rows only
        is_first_batch = (batch_ix == 0)
        write_mode = 'w' if is_first_batch else 'a'

        output_df\
            .to_csv(f'{model_prefix}-c4-outputs.csv', mode = write_mode, index = False, header = is_first_batch)

        topk_df\
            .to_csv(f'{model_prefix}-c4-routes.csv', mode = write_mode, index = False, header = is_first_batch)

        topk_df[topk_df['topk_ix'] == 1]\
            .to_csv(f'{model_prefix}-c4-routes-top1.csv', mode = write_mode, index = False, header = is_first_batch)

        # batch_ix is zero-based, so batch_ix + 1 batches have been processed at this point
        if max_batches is not None and batch_ix + 1 >= max_batches:
            break

    return True

# Write the tokenizer vocabulary next to the route files
export_vocab_as_csv(tokenizer, f'{model_prefix}-vocab.csv')
# Process the whole dataloader (no max_batches cap) and write the per-batch CSV exports
run_and_export_topk(model_prefix, c4_dl)