In [1]:
# Attach Google Drive to the Colab runtime so the project folder used by the
# later cells is reachable through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install Transformers
!pip install --upgrade accelerate

In [3]:
import os
import random
import re
from glob import glob

import torch
from accelerate import Accelerator
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling, TrainingArguments, LineByLineTextDataset, Trainer

In [4]:
# Root folder of the project on the mounted Google Drive.
PROJECT_PATH = '/content/drive/MyDrive/BabyLM_Final'

# Sub-folders of the project root: trained models are written below
# MODELS_PATH and the training text files are read from below TRAIN_DATA_PATH.
MODELS_PATH, TRAIN_DATA_PATH = (
    os.path.join(PROJECT_PATH, subdir)
    for subdir in ('model_folders', 'train_data')
)

# Experiment grid. One model is trained for every
# (curriculum, level, strategy) combination.
CURRICULUMS = [
    'C1',
    'C2',
    'C3',
    'C4',
]
LEVELS = [
    '5-LEVEL',
    '10-LEVEL',
    '20-LEVEL',
]
# File-ordering strategies as applied by the training loop:
# 'E2H' uses ascending sorted order, 'H2E' descending, 'RND' a shuffled order.
STRATEGIES = [
    'E2H',
    'H2E',
    'RND',
]

In [None]:
# Checkpoint that supplies the tokenizer and the model architecture.
# Only its vocabulary and configuration are reused: the training loop builds
# every model from `config` with freshly initialised weights.
PRETRAINED_NAME = "babylm/roberta-base-strict-small"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)

# Fetch just the configuration. Loading the full model and reading `.config`
# would download and instantiate all of the weights only to discard them.
config = AutoConfig.from_pretrained(PRETRAINED_NAME)

# Masked-language-modelling collator: masks tokens with probability 0.15 per batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# Seed used for weight initialisation, the Trainer, and the RND shuffle, so that
# a rerun reproduces the same models and every strategy starts from the same weights.
SEED = 42


def _natural_key(path):
    """Sort key that compares digit runs in the file name numerically.

    Plain string sorting places "10.train" before "2.train". Splitting the
    name into alternating text / integer chunks keeps numbered files in
    numeric order, and gives the same order as string sorting when the
    numbers are zero-padded. The full path is appended as a tie-breaker.
    """
    chunks = [int(chunk) if chunk.isdigit() else chunk
              for chunk in re.split(r'(\d+)', os.path.basename(path))]
    return (chunks, path)


def _order_files(files, strategy, seed=SEED):
    """Return a new list with `files` ordered according to `strategy`.

    'E2H' -> ascending natural order, 'H2E' -> descending natural order,
    'RND' -> a shuffle that is deterministic for a given `seed`.
    Raises ValueError for any other strategy name.
    """
    ordered = sorted(files, key=_natural_key)
    if strategy == 'E2H':
        return ordered
    if strategy == 'H2E':
        return ordered[::-1]
    if strategy == 'RND':
        # A private Random instance keeps the shuffle independent of whatever
        # else has consumed the global random state.
        random.Random(seed).shuffle(ordered)
        return ordered
    raise ValueError(f"Unknown strategy: {strategy!r}")


# Train one model per (curriculum, level, strategy) combination and save it
# under MODELS_PATH/<curriculum>/<level>/<strategy>.
for curriculum in CURRICULUMS:
    for level in LEVELS:
        # Get all training files for the current curriculum and level
        data_dir = os.path.join(TRAIN_DATA_PATH, curriculum, level)
        training_data_files = glob(os.path.join(data_dir, "*.train"))
        if not training_data_files:
            # Fail before any model is built; otherwise the save step below
            # would hit an unbound `trainer` after doing no training at all.
            raise FileNotFoundError(f"No '*.train' files found in {data_dir}")

        for strategy in STRATEGIES:
            print(f"Training on curriculum {curriculum}, level {level}, strategy {strategy} data")
            # Construct the output directory for the current curriculum, level, and strategy
            output_dir = os.path.join(MODELS_PATH, curriculum, level, strategy)
            os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist

            # Initialize a fresh model for the current strategy. Seeding first
            # gives every run the same starting weights.
            torch.manual_seed(SEED)
            model = AutoModelForMaskedLM.from_config(config)
            training_args = TrainingArguments(
                output_dir=output_dir,
                per_device_train_batch_size=64,
                num_train_epochs=1,
                seed=SEED,
            )

            # Determine the order of the files based on the strategy
            files = _order_files(training_data_files, strategy)

            # Train the same model instance on each file in turn.
            # NOTE(review): a new Trainer is built per file, so optimizer state
            # and the learning-rate schedule presumably restart for every file
            # - confirm this is the intended curriculum behaviour.
            for idx, file_path in enumerate(files, start=1):
                print(f"Training on file {idx} in level {level}, curriculum {curriculum}, strategy {strategy}")

                # Prepare dataset for the current file
                dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=128)

                # Set up Trainer for the current file with the model instance
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    data_collator=data_collator,
                    train_dataset=dataset,
                )

                # Train the model on the current file
                trainer.train()

            # Save the model AFTER training on all files in the current curriculum, level, and strategy
            trainer.save_model(output_dir)

            # Drop references and release cached GPU memory before the next run.
            del model
            del trainer
            torch.cuda.empty_cache()