In [10]:
import matplotlib.pyplot as plt

import random
import numpy as np

In [11]:
import requests
import csv
from io import StringIO

def fetch_google_sheet_data_as_dicts(url, timeout=30):
    """Download a CSV document and parse it into one dict per row.

    Args:
        url: Address of the CSV resource (in this notebook, a Google Sheet
            published as CSV).
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        A list of dicts produced by ``csv.DictReader``: keys come from the
        first CSV line, one dict per following line.

    Raises:
        AssertionError: If the response status code is not 200.
        requests.RequestException: On connection errors or timeout (as
            documented by the ``requests`` library).
    """
    response = requests.get(url, timeout=timeout)

    # Checked with an explicit `raise` instead of an `assert` statement so the
    # validation still runs under `python -O`. The exception type is kept as
    # AssertionError for backward compatibility with the previous behaviour.
    if response.status_code != 200:
        raise AssertionError(
            f'Failed to fetch the CSV data (HTTP {response.status_code})'
        )

    return list(csv.DictReader(StringIO(response.text)))

In [12]:
from dataclasses import dataclass

@dataclass
class Size:
    """Hyper-parameters describing one model configuration.

    The command generator in this notebook reads ``n_layers``, ``emb_size``,
    ``n_heads`` and the derived ``ffn_size``; the remaining fields are stored
    but not read by any cell visible here.
    """

    n_layers: int        # passed to --decoder-layers
    emb_size: int        # passed to --decoder-embed-dim
    n_heads: int         # passed to --decoder-attention-heads
    d_head: int          # per-head width (presumably emb_size / n_heads)
    batch_size: int      # stored only; unit not established here
    lr: float            # stored only; generated commands use the sheet's lr
    model_parallel: int  # stored only; commands hard-code --model-parallel-size 1

    @property
    def ffn_size(self) -> int:
        """Feed-forward width: four times the embedding size."""
        return self.emb_size * 4


# from appendix b of https://arxiv.org/pdf/2005.14165.pdf
# see table 2.1 in https://arxiv.org/pdf/2005.14165.pdf

M = 1024 * 1024  # 2**20 = 1,048,576 (binary "mega"), not exactly one million

# Keys are looked up with the sheet's 'model size' column when the training
# commands are generated, so they must match the spreadsheet spelling exactly.
MODEL_SIZES = {
    "800K": Size(4, 128, 2, 64, int(0.05 * M), 1.0e-3, 2),  # tiny 8m
    "85M": Size(12, 768, 12, 64, int(0.5 * M), 6.0e-4, 2),  # small 125m
    "302M": Size(24, 1024, 16, 64, int(0.5 * M), 3.0e-4, 2),  # medium 350m
    "680M": Size(24, 1536, 16, 96, int(0.5 * M), 2.5e-4, 2),  # large 760m
    "1.2B": Size(24, 2048, 32, 64, int(1.0 * M), 2.0e-4, 2),  # xl 1.2b

    "2.7b": Size(32, 2560, 32, 80, int(1.0 * M), 4.5e-4, 2),  # 2.7b
    "6.7b": Size(32, 4096, 32, 128, int(2.0 * M), 1.2e-4, 2),  # 6.7b
    "13b": Size(40, 5120, 40, 128, int(4.0 * M), 1.0e-4, 2),  # 13b
    "30b": Size(48, 7168, 56, 128, int(4.0 * M), 1.0e-4, 2),
    "66b": Size(64, 9216, 72, 128, int(1.0 * M), 1.0e-4, 4),  # 66b on 512 GPUs in RSC
    "175b": Size(96, 12288, 96, 128, int(0.25 * M), 3e-5, 8),  # GPTZ/GPT-3
}

# assert all sizes make sense, as the gpt-3 paper contains typos:
# the attention heads must exactly tile the embedding dimension.
_inconsistent_sizes = [
    name
    for name, size in MODEL_SIZES.items()
    if size.n_heads * size.d_head != size.emb_size
]
assert not _inconsistent_sizes, (
    f"n_heads * d_head != emb_size for: {_inconsistent_sizes}"
)

In [13]:
# Directory name interpolated into the `../{project_name}/data/...` paths
# (binarised datasets and tokenizers) of the generated training commands.
project_name = "Molecular_Generation_with_GDB13"

In [14]:
# # The link you got from the publish step
# r = random.randint(1,100000)
# url = f'https://docs.google.com/spreadsheets/d/e/2PACX-1vS6lVqvtxU0md0PqNN8MvKEcWD3YB4mwkN4sJAUP5ttj6yNdhP7xB6p0rwWI8yaKBmOuWx1zVuSiDQJ/pub?gid=846390475&single=true&output=csv&tm={r}'
# rows = fetch_google_sheet_data_as_dicts(url)


In [15]:
# # The link you got from the publish step
# r = random.randint(1,100000)
# url = f'https://docs.google.com/spreadsheets/d/e/2PACX-1vS6lVqvtxU0md0PqNN8MvKEcWD3YB4mwkN4sJAUP5ttj6yNdhP7xB6p0rwWI8yaKBmOuWx1zVuSiDQJ/pub?gid=219163838&single=true&output=csv'
# rows = fetch_google_sheet_data_as_dicts(url)


In [16]:
# The link you got from the publish step.
# The random `tm` query parameter (same pattern as the commented-out cell
# above that used `&tm={r}`) makes every run request a distinct URL —
# presumably so a cached copy of the sheet is not served (TODO confirm).
# Previously the random number was generated but never put into the URL.
cache_buster = random.randint(1, 100000)
url = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vS6lVqvtxU0md0PqNN8MvKEcWD3YB4mwkN4sJAUP5ttj6yNdhP7xB6p0rwWI8yaKBmOuWx1zVuSiDQJ'
    f'/pub?gid=2125070475&single=true&output=csv&tm={cache_buster}'
)
rows = fetch_google_sheet_data_as_dicts(url)

In [17]:
# Preview only the first few rows; dumping the whole sheet bloats the saved
# notebook and buries the cells that follow.
rows[:5]

[{'dataset': 'For  1.2B',
  'representation': '',
  'data size': '',
  'bsz': '',
  'epochs': '',
  'steps': '',
  'warmup': '',
  'lr': '',
  'model size': '',
  'GPU': '',
  'train ppl': 'not',
  'test ppl': '',
  'lowest test ppl': '',
  'generation': '',
  '1000': '',
  '10000': '',
  '100000': '',
  '1000000': '',
  '10000000': '',
  'laws': '',
  '': 'Equal distance from two molecules'},
 {'dataset': 'aspirin_0.4',
  'representation': 'sf',
  'data size': '4000',
  'bsz': '128',
  'epochs': '1',
  'steps': '31',
  'warmup': '3',
  'lr': '2.00E-05',
  'model size': '1.2B',
  'GPU': '7',
  'train ppl': '',
  'test ppl': '2.215',
  'lowest test ppl': '',
  'generation': '',
  '1000': '',
  '10000': '',
  '100000': '',
  '1000000': '',
  '10000000': '',
  'laws': 'chinchilla',
  '': '2.346'},
 {'dataset': 'aspirin_0.4',
  'representation': 'sf',
  'data size': '16000',
  'bsz': '128',
  'epochs': '1',
  'steps': '125',
  'warmup': '13',
  'lr': '2.00E-05',
  'model size': '1.2B',
  '

In [19]:
def _end_learning_rate(lr):
    """Return one tenth of ``lr`` as a string for ``--end-learning-rate``.

    The value is computed numerically instead of by editing characters of the
    string, so it is correct for any float spelling (``2.00E-05``, ``1e-4``,
    ``0.0002``, ``1.5``, two-digit exponents, ...). The number of mantissa
    decimals of the input is preserved, so the sheet's usual ``d.ddE-dd``
    format gives the same text as before (``2.00E-05`` -> ``2.00E-06``).

    Raises:
        ValueError: If ``lr`` is empty or not a number.
    """
    text = lr.strip()
    mantissa = text.lower().split('e')[0]
    decimals = len(mantissa.partition('.')[2])
    return f"{float(text) / 10:.{decimals}E}"


# Only rows whose 'GPU' column equals this value are emitted.
gpu = 0

for row in rows:
    # Rows with a non-empty 'train ppl' cell are skipped.
    if row['train ppl'] != "":
        continue

    if row['GPU'] != str(gpu):
        continue

    # Rows without a dataset or representation cannot form a data path.
    if row['dataset'] == "" or row['representation'] == '':
        continue

    # Data size in thousands, e.g. '4000' -> '4K' (integer division).
    data_size = f"{int(row['data size']) // 1000}K"

    lr = row['lr']
    end_learning_rate = _end_learning_rate(lr)

    # Look the architecture up once; raises KeyError for an unknown model size.
    size = MODEL_SIZES[row['model size']]

    # Shared by the wandb run name and the checkpoint directory.
    run_name = (
        f"OPT_{row['model size']}_ep_{row['epochs']}_{row['dataset']}"
        f"_{row['representation']}_{data_size}_{lr}"
    )

    # The trailing backslashes are Python line continuations inside the string
    # literal, so the whole command is printed as a single line.
    template = f"""

CUDA_VISIBLE_DEVICES="{row['GPU']}" \
metaseq-train --task streaming_language_modeling \
../{project_name}/data/data_bin_{row['dataset']}_{row['representation']}_{data_size}/ \
--sample-break-mode "eos_pad_8" \
--hf-tokenizer ../{project_name}/data/tokenizers/tokenizer_{row['representation']}/tokenizer.json \
--train-subset train \
--valid-subset valid \
--combine-valid-subsets \
--no-reshard-after-forward \
--use-sharded-state \
--checkpoint-activations \
--full-megatron-init \
--megatron-init-sigma 0.006 \
--activation-fn relu \
--arch transformer_lm \
--share-decoder-input-output-embed \
--decoder-layers {size.n_layers} \
--decoder-embed-dim {size.emb_size} \
--decoder-ffn-embed-dim {size.ffn_size} \
--decoder-attention-heads {size.n_heads} \
--decoder-learned-pos \
--no-scale-embedding \
--dropout 0.0 \
--attention-dropout 0.0 \
--no-emb-dropout \
--weight-decay 0.1 \
--optimizer adam \
--adam-betas  "(0.9, 0.95)" \
--adam-eps 1e-08 \
--clip-norm 1.0 \
--clip-norm-type l2 \
--criterion cross_entropy \
--required-batch-size-multiple 1 \
--distributed-world-size 1 \
--model-parallel-size 1 \
--ddp-backend pytorch_ddp \
--memory-efficient-fp16 \
--fp16-init-scale 4 \
--fp16 \
--seed 1 \
--num-workers 0 \
--num-workers-valid 0 \
--lr-scheduler polynomial_decay \
--lr {lr} \
--end-learning-rate {end_learning_rate} \
--warmup-updates {row['warmup']} \
--total-num-update {row['steps']} \
--max-update {row['steps']} \
--tokens-per-sample 64 \
--batch-size {row['bsz']} \
--update-freq 1 \
--log-format json \
--log-interval 1 \
--ignore-unused-valid-subsets \
--validate-interval-updates 500 \
--wandb-project Scaling_Laws \
--wandb-run-name {run_name} \
--save-interval-epochs 100 \
--keep-last-updates 1 \
--save-dir ./checkpoints/{run_name} \
--restore-file ""




    """
    print(template)

# Shell steps for saving the printed commands as a script and running it.
print(f"\n\nnano run_train_gpu_{gpu}_n1.sh")
print(f"chmod +x run_train_gpu_{gpu}_n1.sh")
print(f"./run_train_gpu_{gpu}_n1.sh")



CUDA_VISIBLE_DEVICES="0" metaseq-train --task streaming_language_modeling ../Molecular_Generation_with_GDB13/data/data_bin_aspirin_0.4_sf_4K/ --sample-break-mode "eos_pad_8" --hf-tokenizer ../Molecular_Generation_with_GDB13/data/tokenizers/tokenizer_sf/tokenizer.json --train-subset train --valid-subset valid --combine-valid-subsets --no-reshard-after-forward --use-sharded-state --checkpoint-activations --full-megatron-init --megatron-init-sigma 0.006 --activation-fn relu --arch transformer_lm --share-decoder-input-output-embed --decoder-layers 12 --decoder-embed-dim 768 --decoder-ffn-embed-dim 3072 --decoder-attention-heads 12 --decoder-learned-pos --no-scale-embedding --dropout 0.0 --attention-dropout 0.0 --no-emb-dropout --weight-decay 0.1 --optimizer adam --adam-betas  "(0.9, 0.95)" --adam-eps 1e-08 --clip-norm 1.0 --clip-norm-type l2 --criterion cross_entropy --required-batch-size-multiple 1 --distributed-world-size 1 --model-parallel-size 1 --ddp-backend pytorch_ddp --memory-eff

# Cloud dump

In [8]:
gpu = 1  # NOTE(review): not read in this cell; kept in case other cells rely on it


def _checkpoint_dir(row):
    """Build the run directory name for a sheet row.

    Uses the same ``OPT_<model>_ep_<epochs>_<dataset>_<repr>_<size>K_<lr>``
    pattern that the training-command cell passes to ``--save-dir``.
    """
    data_size = f"{int(row['data size']) // 1000}K"
    return (
        f"OPT_{row['model size']}_ep_{row['epochs']}_{row['dataset']}"
        f"_{row['representation']}_{data_size}_{row['lr']}"
    )


# Rows whose 'laws' column is exactly 'tocloud' are the ones to upload.
upload_rows = [row for row in rows if row['laws'] == 'tocloud']

# Print every upload command first ...
for row in upload_rows:
    print(f"gcloud storage cp -r {_checkpoint_dir(row)}  gs://models.storage.yerevann.com/molgen/\n")

# ... then the matching clean-up commands, so the uploads can be run and
# checked before anything is deleted.
for row in upload_rows:
    print(f"rm {_checkpoint_dir(row)}/*\n")
    

gcloud storage cp -r OPT_302M_ep_10_aspirin_0.4_sf_64K_1.80E-04  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_302M_ep_10_aspirin_0.4_sf_64K_1.00E-04  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_302M_ep_10_aspirin_0.4_sf_64K_3.00E-05  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_302M_ep_10_aspirin_0.4_sf_64K_1.00E-05  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_85M_ep_10_aspirin_0.4_sf_64K_1.00E-04  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_85M_ep_10_aspirin_0.4_sf_64K_6.00E-04  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_85M_ep_10_aspirin_0.4_sf_64K_1.20E-03  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_800K_ep_10_aspirin_0.4_sf_64K_3.00E-04  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_800K_ep_10_aspirin_0.4_sf_64K_6.00E-04  gs://models.storage.yerevann.com/molgen/

gcloud storage cp -r OPT_800K_ep_10_aspi