In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

!ln -s /content/gdrive/My\ Drive/igcar_ps/ /mydrive

In [None]:

# Fetch the reference BERT implementation; later cells import from the
# cloned `bert` directory.
!git clone https://github.com/google-research/bert

In [None]:

import os
import sys
import json
import random
import logging
import tensorflow as tf

from glob import glob
from google.colab import auth, drive
from tensorflow.keras.utils import Progbar

sys.path.append("bert")



from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

# Authenticate this Colab session; the TPU branch below reads the credentials
# file at /content/adc.json.
auth.authenticate_user()

# Route TensorFlow's INFO-level records through a single timestamped handler.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Keep the format on one line: a backslash continuation inside the string
# literal embeds the next line's indentation into every log record.
formatter = logging.Formatter('%(asctime)s : %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
log.handlers = [sh]

if 'COLAB_TPU_ADDR' in os.environ:
    log.info("Using TPU runtime")
    USE_TPU = True
    TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

    with tf.Session(TPU_ADDRESS) as session:
        log.info('TPU address is ' + TPU_ADDRESS)
        # Load the credentials and pass them to configure_gcs for this session.
        with open('/content/adc.json', 'r') as f:
            auth_info = json.load(f)

        tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
else:
    log.warning('Not connected to TPU runtime')
    USE_TPU = False
    # Define the name on both branches so later cells can test it instead of
    # raising NameError.
    TPU_ADDRESS = None

In [None]:
# Switch the working directory to /content and list its contents.
%cd /content/
!ls

In [None]:
# List the current working directory.
!ls

In [None]:

from bert import modeling, optimization, tokenization


In [None]:
import bert.run_pretraining
from bert.run_pretraining import input_fn_builder, model_fn_builder


In [None]:
# Settings used when building the pretraining examples.
PRETRAINING_DIR = "pretraining_data"
PROCESSES = 2
DO_LOWER_CASE = True
MAX_SEQ_LENGTH, MAX_PREDICTIONS = 128, 20
MASKED_LM_PROB = 0.15

In [None]:
# Count the words in final_data_newer.txt.
!wc -w final_data_newer.txt

In [None]:
!mkdir ./shards
!split -a 4 -l 256000 -d 'final_data_newer.txt' ./shards/shard_

In [None]:
# List the contents of ./shards/.
!ls ./shards/

In [None]:
# Name of the pretrained model and its directory under /mydrive.
BERT_MODEL = 'uncased_L-12_H-768_A-12'
BERT_PRETRAINED_DIR = '/mydrive/bert_uncased/{}'.format(BERT_MODEL)
print(
    '****** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR)
)

In [None]:
BERT_CONFIG = BERT_PRETRAINED_DIR + '/bert_config.json'
CHKPT_DIR = BERT_PRETRAINED_DIR + '/bert_model.ckpt.*'
VOCAB_FILE = BERT_PRETRAINED_DIR + '/vocab.txt'
INIT_CHECKPOINT = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
!ls $CHKPT_DIR

In [None]:
# Shell pipeline that feeds every file name in ./shards to
# create_pretraining_data.py, running PROCESSES copies in parallel and writing
# one .tfrecord per shard into PRETRAINING_DIR.
# Doubled braces survive .format() as the literal `{}` that `xargs -I{}`
# replaces with the shard name.
XARGS_CMD = (
    "ls ./shards/ | "
    "xargs -n 1 -P {processes} -I{{}} "
    "python3 bert/create_pretraining_data.py "
    "--input_file=./shards/{{}} "
    "--output_file={output_dir}/{{}}.tfrecord "
    "--vocab_file={vocab_file} "
    "--do_lower_case={do_lower_case} "
    "--max_predictions_per_seq={max_predictions} "
    "--max_seq_length={max_seq_length} "
    "--masked_lm_prob={masked_lm_prob} "
    "--random_seed=108 "
    # The script's flag is `dupe_factor` (singular); the misspelled plural
    # form does not set it.
    "--dupe_factor=5 "
).format(
    processes=PROCESSES,
    output_dir=PRETRAINING_DIR,
    vocab_file=VOCAB_FILE,
    do_lower_case=DO_LOWER_CASE,
    max_predictions=MAX_PREDICTIONS,
    max_seq_length=MAX_SEQ_LENGTH,
    masked_lm_prob=MASKED_LM_PROB,
)

print(XARGS_CMD)

In [None]:
# MakeDirs succeeds when the directory already exists, so re-running this
# cell does not raise.
tf.gfile.MakeDirs(PRETRAINING_DIR)


In [None]:
# Execute the shell pipeline stored in XARGS_CMD.
!$XARGS_CMD

In [None]:

# Names of the model directory and of the storage bucket.
MODEL_DIR = "bert_model"
BUCKET_NAME = "ayushjain1144-bucket"

# Flag a missing bucket name in the log.
if not BUCKET_NAME:
    log.warning("Warning: no bucket")



In [None]:
MODEL_DIR = "bert_model"
# MakeDirs succeeds when the directory already exists, so re-running this
# cell does not raise.
tf.gfile.MakeDirs(MODEL_DIR)

In [None]:

# BERT BASE hyperparameters, kept in the same key order as a config dict.
bert_base_config = dict(
    attention_probs_dropout_prob=0.1,
    directionality="bidi",
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=12,
    pooler_fc_size=768,
    pooler_num_attention_heads=12,
    pooler_num_fc_layers=3,
    pooler_size_per_head=128,
    pooler_type="first_token_transform",
    vocab_size=30522,
)



In [None]:

!cp $BERT_CONFIG $MODEL_DIR/
!cp $CHKPT_DIR $MODEL_DIR/
!ls $MODEL_DIR/
    
with open("{}/bert_vocab.txt".format(MODEL_DIR), "w") as vocab:
    vocab_bert = open(VOCAB_FILE, 'r').read()
    vocab.write(vocab_bert)
    

In [None]:
# Copy the model and pretraining-data directories to /mydrive.
!cp -r $MODEL_DIR $PRETRAINING_DIR /mydrive/

# When a bucket name is set, also copy both directories to its original/ prefix.
if BUCKET_NAME:
  !gsutil -m cp -r $MODEL_DIR $PRETRAINING_DIR gs://$BUCKET_NAME/original/
else:
  print("Not able to copy")



```
# This is formatted as code
```



In [None]:
# Training hyperparameters.
TRAIN_BATCH_SIZE = 128
MAX_PREDICTIONS = 20
MAX_SEQ_LENGTH = 128
MASKED_LM_PROB = 0.15

EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 40000
SAVE_CHECKPOINTS_STEPS = 5000
NUM_TPU_CORES = 8

# Locations of the model and data directories under /mydrive.
BERT_DRIVE_DIR = "{}/{}".format('/mydrive', MODEL_DIR)
DATA_DRIVE_DIR = "{}/{}".format('/mydrive', PRETRAINING_DIR)

# Every path below is derived from the bucket, so stop here with a clear
# error instead of failing a few lines later with NameError on BUCKET_PATH.
if not BUCKET_NAME:
    raise ValueError("bucket name not found")
BUCKET_PATH = "gs://{}/original".format(BUCKET_NAME)

BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)

PATH_TO_CHECKPOINT = os.path.join(BERT_GCS_DIR, "bert_model.ckpt")

# Use the latest checkpoint reported for the model directory; when there is
# none, fall back to the bert_model.ckpt path.
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)
if INIT_CHECKPOINT is None:
    print("no checkpoint found, loading the default")
    INIT_CHECKPOINT = PATH_TO_CHECKPOINT

CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")
VOCAB_FILE = os.path.join(BERT_GCS_DIR, "bert_vocab.txt")

# Load the model configuration and collect the tfrecord shards to train on.
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR, '*tfrecord'))

log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))

In [None]:
# Build the model function. TPU usage follows the USE_TPU flag computed during
# runtime detection instead of being hard-coded to True.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=TRAIN_STEPS,
    num_warmup_steps=4000,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=True)

# Resolve the TPU cluster only when a TPU is attached; TPU_ADDRESS is only
# meaningful on a TPU runtime.
tpu_cluster_resolver = (
    tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
    if USE_TPU else None)

# Checkpoints are written to the model directory every SAVE_CHECKPOINTS_STEPS
# steps; the same value is used for iterations_per_loop.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

# Input pipeline over the tfrecord shards, in training mode.
train_input_fn = input_fn_builder(
    input_files=input_files,
    max_seq_length=MAX_SEQ_LENGTH,
    max_predictions_per_seq=MAX_PREDICTIONS,
    is_training=True)


In [None]:

# Start training, with max_steps set to TRAIN_STEPS.
estimator.train(input_fn=train_input_fn, max_steps = TRAIN_STEPS )
