## Train models for BERT ablation study, on Google Colab with TPUs

In [None]:
# Enter the Google Cloud project ID and storage bucket name here.
# `bucket_name` is interpolated into the gs:// paths and storage-client calls
# used by the cells below.
project_id = ""
bucket_name = ""
from google.colab import auth

# Authenticate the current Colab user for Google Cloud access.
auth.authenticate_user()

# Point the gcloud CLI at the project configured above.
!gcloud config set project {project_id}

In [None]:
# Pin numpy to 1.19.5 and install the TensorFlow 1.15 GPU package in this runtime.
!pip install -U numpy==1.19.5
!pip install tensorflow-gpu==1.15

import csv
import json
import os
import time

import regex as re
from google.cloud import storage

In [None]:
# Recursively copy the `electra-nas` directory from the bucket into the current
# working directory (-m: parallel transfer, -n: skip files that already exist).
# NOTE(review): later cells run `tensorflow-model/run_pretraining.py` and
# `tensorflow-model/run_finetuning.py` relative to the working directory --
# confirm that path exists after this copy.
!gsutil -m cp -nr gs://{bucket_name}/electra-nas .

In [None]:
def download_configs():
    """Download and parse the shared ablation configuration file.

    Reads ``BERT_benchmark_train_ablation.json`` from the bucket named by the
    module-level ``bucket_name`` and returns the decoded JSON document.
    Callers in this notebook iterate over the result and index it by
    configuration id, so it is presumably a list of config dicts.
    """
    raw_json = (
        storage.Client()
        .bucket(bucket_name)
        .blob("BERT_benchmark_train_ablation.json")
        .download_as_string()
    )
    return json.loads(raw_json)

In [None]:
def get_config():
    """Pick the next pending configuration and write its hparams to disk.

    Selects the first entry returned by ``download_configs()`` that is neither
    running nor completed, appends the ``COLAB_TPU_ADDR`` environment variable
    to its ``tpu_name`` hyper-parameter, and dumps the hyper-parameters to
    ``hparams.json`` in the working directory.

    This function does not mark the entry as running; ``run_pretraining``
    does that when it starts.

    Returns:
        The ``id`` field of the selected configuration.

    Raises:
        KeyError: if no configuration is pending (every entry is running or
            completed), or if ``COLAB_TPU_ADDR`` is not set in the environment.
    """
    pending = next(
        (
            candidate
            for candidate in download_configs()
            if not candidate["is_running"] and not candidate["completed"]
        ),
        None,
    )
    if pending is None:
        # Previously this fell through with an empty dict and failed with an
        # opaque KeyError: 'hparams'. Same exception type, clearer message.
        raise KeyError(
            "no pending configuration: every entry is running or completed"
        )

    hparams = pending["hparams"]
    # The stored tpu_name is extended with this runtime's TPU address
    # (presumably it holds a scheme prefix such as "grpc://" -- confirm).
    hparams["tpu_name"] += os.environ["COLAB_TPU_ADDR"]

    with open("hparams.json", "w") as f:
        json.dump(hparams, f, indent=4)

    return pending["id"]

In [None]:
def run_pretraining(config_id):
    model_name = "ablation" + str(config_id)
    
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    configs = download_configs()
    config = configs[config_id]

    !gsutil -m rm -r gs://{bucket_name}/electra_data/models/{model_name}
    
    start_time = time.time()
    config["is_running"] = True

    configs[config["id"]] = config
    blob = bucket.blob("BERT_benchmark_train_ablation.json")
    blob.upload_from_string(json.dumps(configs).encode('utf-8'))

    !python tensorflow-model/run_pretraining.py --data-dir gs://{bucket_name}/electra_data/ --model-name {model_name} --hparams hparams.json
    end_time = time.time()

    configs = download_configs()
    config = configs[config_id]
    config["time_to_train"] = end_time - start_time
    config["is_running"] = False
    config["completed"] = True

    configs[config["id"]] = config
    blob = bucket.blob("BERT_benchmark_train_ablation.json")
    blob.upload_from_string(json.dumps(configs).encode('utf-8'))

In [None]:
def run_finetuning(config_id):
    model_name = "ablation" + str(config_id)

    !gsutil -m cp -nr gs://{bucket_name}/electra_data/finetuning_tfrecords gs://{bucket_name}/electra_data/models/{model_name}/

    with open("hparams.json", "r") as f:
        hparams_finetuning = json.load(f)

    hparams_finetuning["eval_batch_size"] = 32
    hparams_finetuning["train_batch_size"] = 32
    num_train_steps = hparams_finetuning.pop("num_train_steps")
    hparams_finetuning.pop("keep_checkpoint_max")

    glue_tasks = ["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst", "sts"]
    
    for i in range(0, num_train_steps+1, hparams_finetuning['save_checkpoints_steps']):
        hparams_finetuning["init_checkpoint"] = "gs://{bucket_name}/electra_data/models/" + model_name + "/model.ckpt-" + str(i)

        for task in glue_tasks:
            hparams_finetuning["task_names"] = [task]
            hparams_finetuning["results_txt"] = (
                "gs://" + bucket_name + "/electra_data/models/"
                + model_name
                + "/results/model."
                + str(i)
                + "."
                + task
                + "_results.txt"
            )
            with open("hparams_finetuning.json", "w") as f:
                json.dump(hparams_finetuning, f, indent=4)

            !python3 tensorflow-model/run_finetuning.py --data-dir gs://{bucket_name}/electra_data/ --model-name {model_name} --hparams hparams_finetuning.json

In [None]:
def save_results(config_id):
    """Collect fine-tuning scores per checkpoint and store them in the shared config file.

    For every checkpoint step (0 to ``num_train_steps`` in increments of
    ``save_checkpoints_steps``) the result files under
    ``electra_data/models/<model>/results/model.<step>.`` are downloaded, one
    score per GLUE task is parsed from each, and the scores are stored under
    ``config["scores"][str(step)]``. A ``"glue"`` entry holds the sum of the
    found scores divided by the total number of GLUE tasks, so a task with no
    result file contributes 0. The shared JSON file is re-uploaded after every
    step.

    Args:
        config_id: id of the configuration; also used as its index in the
            downloaded configuration list.

    Raises:
        AttributeError: if a result file contains no text matching the score
            pattern.
    """
    model_name = "ablation" + str(config_id)

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    glue_tasks = ["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst", "sts"]
    # Variable-width look-behind: this needs the third-party `regex` module,
    # which the file imports under the name `re`.
    score_pattern = re.compile("(?<= [a-z]+: ).*?(?= )")

    configs = download_configs()
    config = configs[config_id]
    hparams = config["hparams"]
    # Tolerate config entries that have no "scores" dict yet.
    scores = config.setdefault("scores", {})

    for step in range(0, hparams['num_train_steps'] + 1, hparams['save_checkpoints_steps']):
        glue_score = 0
        step_scores = scores[str(step)] = {}
        # The trailing "." stops step 100 from also matching "model.1000.*",
        # which would overwrite scores and double-count the GLUE sum.
        prefix = "electra_data/models/" + model_name + "/results/model." + str(step) + "."
        for result_blob in bucket.list_blobs(prefix=prefix):
            # Default of None: a blob naming no known task is skipped instead
            # of raising StopIteration.
            task = next((name for name in glue_tasks if name in result_blob.name), None)
            if task is None:
                continue
            contents = result_blob.download_as_string()
            step_scores[task] = float(score_pattern.search(str(contents)).group())
            glue_score += step_scores[task]

        step_scores["glue"] = glue_score / len(glue_tasks)
        configs[config["id"]] = config
        # Upload after each step so partial results are saved as they are collected.
        state_blob = bucket.blob("BERT_benchmark_train_ablation.json")
        state_blob.upload_from_string(json.dumps(configs).encode('utf-8'))

In [None]:
# Process pending configurations one after another: pre-train, fine-tune,
# then collect the results. The loop has no exit condition of its own; it
# stops only when one of the calls raises.
while True:
    pending_id = get_config()
    for stage in (run_pretraining, run_finetuning, save_results):
        stage(pending_id)