# GCS Auth

In [None]:
# Authenticate the Colab user.
import os
from google.colab import auth
# The environment variable must be set before authenticate_user() runs.
# NOTE(review): presumably USE_AUTH_EPHEM='0' switches off Colab's ephemeral
# auth flow so credentials are usable by other tools — confirm against the
# google.colab documentation.
os.environ['USE_AUTH_EPHEM'] = '0'
auth.authenticate_user()

# Set Up

In [None]:
# Install third-party packages with pip. The `!` lines are shell commands run
# by the notebook, not Python statements.
from IPython.display import clear_output 
# gcsfs and sentencepiece are installed unpinned; t5 and tensorflow-text are
# pinned to the exact versions given below.
!pip install gcsfs
!pip3 install t5==0.9.2
!pip install -q tensorflow-text==2.8.0rc0
!pip install sentencepiece 
# Clear the cell output produced by the installs above.
# NOTE(review): this also removes any pip warnings/errors from view — confirm
# that is acceptable.
clear_output()

In [None]:
# Import the libraries used by the rest of the notebook.
# NOTE: despite the message printed below, this cell installs nothing; it only
# performs imports and registers one flag.
print("Installing dependencies...")
import sentencepiece as spm
import tensorflow.compat.v1 as tf
# Register a string flag named 'f' with an empty default and empty help text.
# NOTE(review): presumably a workaround for the `-f <connection file>` argument
# the notebook kernel is started with, which the flag parser would otherwise
# not recognise — confirm.
tf.app.flags.DEFINE_string ('f', '', '')

In [None]:
# Locate the TPU attached to this runtime, connect to it, and configure GCS
# access. Requires `tf` (tensorflow.compat.v1) from the import cell.
print("Setting up TPU...")
import tensorflow_gcs_config

# Not referenced by any other cell visible in this notebook; kept for cells or
# code that may rely on it.
TPU_TOPOLOGY = "2x2"

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  TPU_ADDRESS = tpu.get_master()
except ValueError as err:
  # Raise a regular exception (still a BaseException subclass, so existing
  # handlers keep working) and chain the original error so its details are
  # not lost in the traceback.
  raise RuntimeError(
    'ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
else:
  print('Running on TPU:', TPU_ADDRESS)

tf.config.experimental_connect_to_host(TPU_ADDRESS)
# NOTE(review): presumably reuses the credentials obtained in the GCS auth
# cell — confirm against the tensorflow_gcs_config documentation.
tensorflow_gcs_config.configure_gcs_from_colab_auth()
# Switch TensorFlow to v1 behaviour; kept after the connection calls, as in
# the original cell.
tf.disable_v2_behavior()


# Paths

In [None]:
# Notebook form parameters; the trailing #@param annotations are kept verbatim.
representation = "tokens" #@param ["ast", "tokens"]
task = "masking" #@param ["masking"]
VOCAB_PREFIX = "tokenizer" #@param {type:"string"}

# Text corpus in GCS that the tokenizer is trained on.
tokenizer_source_file = f"gs://lance2/tokenizer/{representation}/{VOCAB_PREFIX}.txt"

# Local locations of the tokenizer model and vocabulary files.
tokenizer_model_local_path = f"/content/{VOCAB_PREFIX}.model"
tokenizer_vocab_local_path = f"/content/{VOCAB_PREFIX}.vocab"

# Training Tokenizer

In [None]:
# Train a SentencePiece tokenizer on the corpus stored in GCS.
# Requires `os`, `tf`, `spm`, `tokenizer_source_file` and
# `tokenizer_model_local_path` from the earlier cells.
VOCAB_SIZE = 32000 #@param { type: "integer" }

# SentencePiece writes "<model_prefix>.model" and "<model_prefix>.vocab".
# Derive the prefix from the local model path so the files land exactly where
# the message below (and the upload cell) expects them, regardless of the
# current working directory.
tokenizer_output_prefix = os.path.splitext(tokenizer_model_local_path)[0]

print("Training the tokenizer and building the vocabulary ...")
with tf.io.gfile.GFile(tokenizer_source_file, "r") as f:
  # The open file is handed to the trainer as its sentence iterator.
  spm.SentencePieceTrainer.train(
      sentence_iterator=f,
      model_prefix=tokenizer_output_prefix,
      # Special-token ids: padding=0, end-of-sequence=1, unknown=2;
      # -1 disables the beginning-of-sequence token.
      pad_id=0,
      bos_id=-1,
      eos_id=1,
      unk_id=2,
      character_coverage=1.0,
      vocab_size=VOCAB_SIZE,
  )
print(f"Training tokenizer finished, at: {tokenizer_model_local_path}")

# Storing results

In [None]:
tokenizer_model_path = f"gs://lance2/tokenizer/{representation}/{VOCAB_PREFIX}.model"
tokenizer_vocab_path = f"gs://lance2/tokenizer/{representation}/{VOCAB_PREFIX}.vocab"

## Store in GCS:
!gsutil cp $tokenizer_model_local_path $tokenizer_model_path
!gsutil cp $tokenizer_vocab_local_path $tokenizer_vocab_path

print("Copying Finished.")