In [11]:
import os
from torch import nn
from transformers import AutoModel

import warnings
warnings.filterwarnings("ignore")

In [12]:
# Directory (relative to the notebook's working directory) that is scanned
# for per-model sub-folders by process_model_folders().
ALL_MODELS_FOLDER = "../models/"

In [23]:
def load_model_from_checkpoint(model_folder: str) -> str:
    """Return the path of the highest-numbered checkpoint in a model folder.

    Despite its name, this function does not load a model; it only resolves
    the directory of the most recent checkpoint.

    An entry counts as a checkpoint when its name starts with ``checkpoint-``
    and the text after the last ``-`` is a decimal step number (for example
    ``checkpoint-1500``). Entries such as ``checkpoint-best`` are ignored
    instead of aborting the lookup with a ``ValueError``.

    Args:
        model_folder (str): Model folder which may contain multiple checkpoints.

    Returns:
        str: ``model_folder`` joined with the name of the checkpoint that has
            the highest step number.

    Raises:
        FileNotFoundError: If ``model_folder`` contains no numbered checkpoint.
    """
    step_by_name = {}
    for entry in os.listdir(model_folder):
        if not entry.startswith("checkpoint-"):
            continue
        suffix = entry.rsplit("-", 1)[-1]
        # Only numeric suffixes can be ranked; skip anything else.
        if suffix.isdecimal():
            step_by_name[entry] = int(suffix)

    if not step_by_name:
        # Raise instead of calling exit(): terminating the interpreter (or the
        # notebook kernel) from a helper cannot be handled by the caller.
        raise FileNotFoundError(f"No checkpoints found in {model_folder}")

    # Compare step numbers numerically so that 100 ranks above 20.
    latest = max(step_by_name, key=step_by_name.get)
    return os.path.join(model_folder, latest)


def num_params(model) -> str:
    """Return the model's total parameter count formatted in millions.

    Nothing is printed; the formatted string is returned to the caller.

    Args:
        model: Object exposing ``parameters()``, an iterable whose items each
            provide ``numel()`` (for example a ``torch.nn.Module``).

    Returns:
        str: The total count divided by 1e6, rendered with two decimals and
            an ``M`` suffix (for example ``"7.95M"``).
    """
    total = sum(param.numel() for param in model.parameters())
    return f"{total / 1e6:.2f}M"


def process_model_folders(base_dir: str) -> dict:
    """
    Collects the parameter count of the latest checkpoint of every model folder.

    A sub-folder of ``base_dir`` is treated as a model folder when its name ends
    with a digit (for example ``model_es_BPE_vs10000``). Plain files are skipped,
    even if their name ends with a digit, because they cannot hold checkpoints.
    Folders are visited in sorted name order so the result is deterministic.

    Args:
        base_dir (str): The base directory containing model folders.

    Returns:
        dict: A dictionary with folder names as keys and the value returned by
            ``num_params`` for the corresponding model as values.
    """
    num_params_dict = {}
    for folder_name in sorted(os.listdir(base_dir)):
        model_folder = os.path.join(base_dir, folder_name)

        # Skip entries that do not follow the naming pattern or are not folders;
        # listing checkpoints inside a regular file would raise.
        if not (folder_name[-1].isdigit() and os.path.isdir(model_folder)):
            continue

        # Resolve the checkpoint with the highest step number and load it.
        checkpoint_dir = load_model_from_checkpoint(model_folder)
        model = AutoModel.from_pretrained(checkpoint_dir)

        num_params_dict[folder_name] = num_params(model)

    return num_params_dict

In [27]:
# Build the {folder name: parameter count} mapping for every model folder
# found under ALL_MODELS_FOLDER.
results = process_model_folders(ALL_MODELS_FOLDER)

In [28]:
# Bare expression: shows the mapping as the notebook cell's output.
results

{'model_es_BPE_vs10000': '7.95M',
 'model_es_BPE_vs20000': '11.07M',
 'model_es_BPE_vs30000': '14.19M',
 'model_es_BPE_vs40000': '17.31M',
 'model_es_Unigram_vs10000': '7.95M',
 'model_es_Unigram_vs20000': '11.07M',
 'model_es_Unigram_vs30000': '14.19M',
 'model_es_Unigram_vs40000': '17.31M',
 'model_es_Wordpiece_vs10000': '7.95M',
 'model_es_Wordpiece_vs20000': '11.07M',
 'model_es_Wordpiece_vs30000': '14.19M',
 'model_es_Wordpiece_vs40000': '17.31M',
 'model_tr_BPE_vs10000': '7.95M',
 'model_tr_BPE_vs20000': '11.07M',
 'model_tr_BPE_vs30000': '14.19M',
 'model_tr_BPE_vs40000': '17.31M',
 'model_tr_Unigram_vs10000': '7.95M',
 'model_tr_Unigram_vs20000': '11.07M',
 'model_tr_Unigram_vs30000': '14.19M',
 'model_tr_Unigram_vs40000': '17.31M',
 'model_tr_Wordpiece_vs10000': '7.95M',
 'model_tr_Wordpiece_vs20000': '11.07M',
 'model_tr_Wordpiece_vs30000': '14.19M',
 'model_tr_Wordpiece_vs40000': '17.31M'}