# **Code Generation**
We shall build a scaled-down version of a code generation model using GPT-2 with the help of TensorFlow. We shall train both the tokenizer and the model from scratch using a subset of the CodeParrot dataset.

### **1. Install and Import Required Libraries**

In [None]:
!pip install datasets transformers

In [None]:
import tensorflow as tf
import numpy as np
import collections

from transformers import AutoTokenizer, PreTrainedTokenizerFast, AutoConfig, TFGPT2LMHeadModel, DataCollatorForLanguageModeling, create_optimizer, pipeline
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from datasets import load_dataset, Dataset

### **2. Load Data**

In [None]:
# Open the 'train' split in streaming mode; the samples are consumed one at a
# time by filter_streaming_dataset below rather than indexed directly.
raw_dataset = load_dataset('codeparrot/codeparrot-valid-near-deduplication', split='train', streaming=True)

### **3. Preprocess Data**

In [None]:
def any_keyword_in_string(string, keywords):
  """Tell whether `string` contains at least one of `keywords` as a substring.

  Returns True on the first keyword found in `string`, and False when no
  keyword matches (including when `keywords` is empty).
  """
  return any(candidate in string for candidate in keywords)

def filter_streaming_dataset(dataset, filters):
  """Collect the samples whose 'content' mentions any of `filters`.

  Iterates once over `dataset` (an iterable of dict-like samples that each
  have a 'content' entry), keeps every sample whose content contains at
  least one of the `filters` substrings, prints the kept fraction, and
  returns the kept samples as a column-oriented `Dataset`.

  An empty `dataset` is reported as 0.00% instead of raising
  ZeroDivisionError.
  """
  filtered_dict = collections.defaultdict(list)
  total = 0
  for sample in dataset:
    total += 1
    if any_keyword_in_string(sample['content'], filters):
      # Store column-wise: one list per key, which is what Dataset.from_dict expects.
      for key, value in sample.items():
        filtered_dict[key].append(value)
  # Indexing the defaultdict also guarantees a 'content' column exists even
  # when nothing matched.
  kept = len(filtered_dict['content'])
  ratio = kept / total if total else 0.0
  print(f"{ratio:.2%} of data after filtering.")
  return Dataset.from_dict(filtered_dict)

In [None]:
# Keep only the source files that mention at least one of these libraries.
filters = ['pandas', 'sklearn', 'matplotlib', 'seaborn']
filtered_dataset = filter_streaming_dataset(raw_dataset, filters)

6.18% of data after filtering.


In [None]:
# Draw a fixed-size, seeded sample: 1000 files for training and 100 held out.
filtered_dataset = filtered_dataset.train_test_split(train_size=1000, test_size=100, seed=44)
# Rename the held-out split from 'test' to 'validation'.
filtered_dataset['validation'] = filtered_dataset.pop('test')
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license', 'hash', 'line_mean', 'line_max', 'alpha_frac', 'autogenerated'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license', 'hash', 'line_mean', 'line_max', 'alpha_frac', 'autogenerated'],
        num_rows: 100
    })
})

### **4. Build and Train a New Tokenizer**
### Train a New Tokenizer from an Existing Tokenizer

In [None]:
def get_training_corpus(example, batch_size=1000):
  """Lazily produce the training split's 'content' column in batches.

  Args:
    example: mapping with a 'train' entry that supports `len()` and slicing,
      where a slice can be indexed with 'content' to get a batch of texts.
    batch_size: number of rows per batch; defaults to 1000.

  Returns:
    A generator that yields one batch of 'content' values at a time.
  """
  train_split = example['train']
  # A generator expression keeps only one batch of texts in memory at a time
  # while the tokenizer trainer consumes the corpus.
  return (
      train_split[start : start + batch_size]['content']
      for start in range(0, len(train_split), batch_size)
  )

In [None]:
# Load the pretrained GPT-2 tokenizer, then train a new tokenizer from it on
# the filtered code corpus with a target vocabulary of 52,000 tokens.
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(filtered_dataset), vocab_size=52000)

In [None]:
# Sample Python snippet used to compare how the two tokenizers split source code.
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

# Tokens produced by the pretrained GPT-2 tokenizer.
print(old_tokenizer.tokenize(example))

['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [None]:
# Tokens produced by the tokenizer retrained on the code corpus, for comparison.
print(new_tokenizer.tokenize(example))

['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


### Build a New Tokenizer from Scratch

In [None]:
# Start from an empty, untrained BPE model.
tokenizer = Tokenizer(models.BPE())

In [None]:
# GPT-2 does not use a normalizer, so that step is skipped and we go straight
# to byte-level pre-tokenization (without adding a prefix space).
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Sanity check: show the pre-tokens and their character offsets for a sample sentence.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

In [None]:
# Train the BPE model on the filtered corpus with a 25,000-token vocabulary,
# registering "<|endoftext|>" as a special token.
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(filtered_dataset), trainer=trainer)

# Quick check of the trained tokenizer on a sample sentence.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['L', 'et', "'s", 'Ġtest', 'Ġthis', 'Ġtokenizer', '.']


In [None]:
# Byte-level post-processing with trim_offsets=False: a token's offsets keep
# the leading space that belongs to it.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
# Map the token at index 4 back to the character span it covers in `sentence`.
start, end = encoding.offsets[4]
sentence[start : end]

' this'

In [None]:
# Attach the byte-level decoder that matches the byte-level pre-tokenizer.
tokenizer.decoder = decoders.ByteLevel()

# Round-trip check: decoding the ids should reproduce the original sentence.
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

In [None]:
# Wrap the raw `tokenizers` object in a transformers fast tokenizer;
# "<|endoftext|>" serves as both the beginning- and end-of-sequence token.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>"
)

# From here on, `tokenizer` refers to the wrapped tokenizer.
tokenizer = wrapped_tokenizer

In [None]:
# Number of tokens per training chunk; also passed to the model config as n_ctx.
context_length = 128

def tokenizer_function(example):
  """Tokenize a batch of files into chunks of exactly `context_length` tokens.

  Uses the module-level `tokenizer` on `example['content']` with truncation
  and overflow enabled, then returns {'input_ids': [...]} holding only the
  chunks whose reported length equals `context_length`.
  """
  encoded = tokenizer(
      example['content'],
      max_length=context_length,
      truncation=True,
      return_overflowing_tokens=True,
      return_length=True
  )

  # Chunks of any other length are dropped.
  full_chunks = [
      ids
      for n_tokens, ids in zip(encoded['length'], encoded['input_ids'])
      if n_tokens == context_length
  ]
  return {'input_ids': full_chunks}

In [None]:
# Tokenize both splits in batches. The original columns are removed, since
# tokenizer_function returns a list of chunks rather than one row per input file.
tokenized_dataset = filtered_dataset.map(tokenizer_function, batched=True, remove_columns=filtered_dataset['train'].column_names)

In [None]:
# Display the resulting splits and their row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 23839
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 2709
    })
})

### **5. Initialize a New Model and Train from Scratch**

In [None]:
# Reuse the GPT-2 architecture settings, overriding the vocabulary size and the
# special-token ids so they match the tokenizer trained above.
# NOTE(review): the model summary in this notebook reports 105,042,432
# parameters; confirm that `n_ctx` actually changes the position-embedding
# size in the installed transformers version (newer releases appear to use
# `n_positions` for this setting).
config = AutoConfig.from_pretrained(
    'gpt2',
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [None]:
# Instantiate the model from the config alone, i.e. without loading
# pretrained weights.
model = TFGPT2LMHeadModel(config)
model(model.dummy_inputs) # Building the model
# Print the layer and parameter overview of the built model.
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 105042432 
 r)                                                              
                                                                 
Total params: 105,042,432
Trainable params: 105,042,432
Non-trainable params: 0
_________________________________________________________________


In [None]:
batch_size = 32
# The wrapped tokenizer was created with only bos/eos tokens, so reuse the
# end-of-text token as the padding token for the collator.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')

# Training batches, shuffled.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_dataset['train'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=batch_size
)

# Validation batches, kept in their original order.
tf_validation_dataset = model.prepare_tf_dataset(
    tokenized_dataset['validation'],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=batch_size
)

In [None]:
num_epochs = 1
# Total optimizer steps: one step per batch, for every epoch.
num_train_steps = len(tf_train_dataset) * num_epochs
# Warm up for at most 1000 steps, capped at 10% of the run. With roughly
# 24k training chunks and a batch size of 32, one epoch is only a few hundred
# steps, so a fixed 1000-step warm-up would outlast the whole run: the
# learning rate would never reach `init_lr` and the decay would never start.
num_warmup_steps = min(1000, num_train_steps // 10)

# Optimizer with weight decay, plus the warm-up-then-decay schedule it uses.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=0.01
)

# No loss is passed to compile() here.
model.compile(optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Training in mixed-precision float16
# NOTE(review): this policy is set after the model was created and compiled in
# the earlier cells. Keras documents that the global policy should be set
# before layers are constructed, so it may have no effect here — confirm, and
# consider moving this line above the model creation.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Train for `num_epochs`, using the validation set for evaluation.
history = model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs, verbose=1)



### **6. Predict using the Trained Model**

In [None]:
# Text-generation pipeline around the freshly trained model and tokenizer.
# NOTE(review): device=0 presumably targets the first GPU — confirm one is available.
code_generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

In [None]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""

# Model needs to be retrained with additional data for improving accuracy
print(code_generator(txt, num_return_sequences=1, pad_token_id=code_generator.tokenizer.eos_token_id)[0]['generated_text'])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
def test_file_file1]
def return y=0
