<a href="https://colab.research.google.com/github/agemagician/CodeTrans/blob/main/prediction/single%20task/function%20documentation%20generation/python/small_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the library and download the pretrained models

In [1]:
# Setup cell: installs t5, imports the libraries used below, and downloads
# two zip archives into the working directory.
print("Installing dependencies...")
# Colab line magic requesting the TensorFlow 2.x runtime.
%tensorflow_version 2.x

# t5 is pinned to an exact release; -q keeps pip's output short.
!pip install -q t5==0.6.4

import functools
import os
import time
import warnings
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# The TF1-compatibility API is what the rest of the notebook refers to as `tf`.
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Fetch, unpack and delete the first archive (saved as vocabulary.zip).
# NOTE(review): presumably this provides code_spm_unigram_40M.model, which a
# later cell loads — confirm against the archive contents.
!wget "https://www.dropbox.com/sh/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa?dl=1" -O vocabulary.zip
!unzip vocabulary.zip
!rm vocabulary.zip
# Fetch, unpack and delete the second archive (saved as python.zip).
# NOTE(review): presumably this provides the pretrained checkpoints read from
# the "small" model directory later on — confirm against the archive contents.
!wget "https://www.dropbox.com/sh/yawxwkcsjwhv1zz/AAD76qQ60rswiN28hXpiDPBPa?dl=1" -O python.zip
!unzip python.zip
!rm python.zip

Installing dependencies...
[K     |████████████████████████████████| 163kB 5.9MB/s 
[K     |████████████████████████████████| 2.6MB 42.0MB/s 
[K     |████████████████████████████████| 348kB 53.8MB/s 
[K     |████████████████████████████████| 1.1MB 50.3MB/s 
[K     |████████████████████████████████| 71kB 9.0MB/s 
[K     |████████████████████████████████| 3.7MB 48.9MB/s 
[K     |████████████████████████████████| 1.3MB 45.1MB/s 
[K     |████████████████████████████████| 2.9MB 49.8MB/s 
[K     |████████████████████████████████| 890kB 52.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: transformers 3.5.1 has requirement sentencepiece==0.1.91, but you'll have sentencepiece 0.1.94 which is incompatible.[0m
INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 
--2020-11-20 13:03:31--  https://www.dropbox.com/sh/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa?dl=1
Resolving www.dropbox.co

## Set sentencepiece model

In [2]:
from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary

# SentencePiece model file expected in the working directory.
vocab_model_path = 'code_spm_unigram_40M.model'

# Build the vocabulary object with 100 extra ids on top of the model's pieces.
vocab = SentencePieceVocabulary(
    vocab_model_path,
    extra_ids=100,
)

# Report the resulting vocabulary size as a quick sanity check.
print("Vocab has a size of %d\n" % (vocab.vocab_size,))

Vocab has a size of 32100



## Set t5 small model

In [3]:
# Directory holding the model files, plus the sizing knobs passed to MtfModel.
MODEL_DIR = "small"
model_parallelism = 1
train_batch_size = 256

# Make sure the model directory exists before the model object is created.
tf.io.gfile.makedirs(MODEL_DIR)

# Collect the constructor arguments first so they can be read as one table.
# No TPU is configured; the mesh is a single device, GPU:0.
mtf_model_kwargs = dict(
    model_dir=MODEL_DIR,
    tpu=None,
    tpu_topology=None,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    mesh_shape="model:1,batch:1",
    mesh_devices=["GPU:0"],
    learning_rate_schedule=0.003,
    save_checkpoints_steps=5000,
    keep_checkpoint_max=None,
    iterations_per_loop=100,
)

model = t5.models.MtfModel(**mtf_model_kwargs)

## Code Documentation Summarization

### Provide the code to summarize

In [4]:
# Sample function to document. The body uses a consistent 4-space indent so the
# string is itself valid Python source.
code = "def e(message, exit_code=None):\n    print_log(message, YELLOW, BOLD)\n    if exit_code is not None:\n        sys.exit(exit_code)" #@param {type:"raw"}


### Parsing and Tokenization

In [5]:
!pip install tree_sitter
!git clone https://github.com/tree-sitter/tree-sitter-python

Collecting tree_sitter
[?25l  Downloading https://files.pythonhosted.org/packages/3f/52/26d11536a8fafaadabe9deeb0611abdd71e11602904f60a6debdde053e6f/tree_sitter-0.2.0.tar.gz (110kB)
[K     |███                             | 10kB 18.3MB/s eta 0:00:01[K     |██████                          | 20kB 21.5MB/s eta 0:00:01[K     |█████████                       | 30kB 9.3MB/s eta 0:00:01[K     |███████████▉                    | 40kB 8.9MB/s eta 0:00:01[K     |██████████████▉                 | 51kB 4.4MB/s eta 0:00:01[K     |█████████████████▉              | 61kB 4.9MB/s eta 0:00:01[K     |████████████████████▉           | 71kB 4.9MB/s eta 0:00:01[K     |███████████████████████▊        | 81kB 5.3MB/s eta 0:00:01[K     |██████████████████████████▊     | 92kB 5.4MB/s eta 0:00:01[K     |█████████████████████████████▊  | 102kB 4.3MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 4.3MB/s 
[?25hBuilding wheels for collected packages: tree-sitter
  Building wheel 

In [6]:
from tree_sitter import Language, Parser

# Path of the shared library that holds the compiled grammar.
GRAMMAR_LIBRARY = 'build/my-languages.so'

# Compile the cloned tree-sitter-python directory into that shared library.
Language.build_library(GRAMMAR_LIBRARY, ['tree-sitter-python'])

# Load the Python grammar from the library and hand it to a fresh parser.
PYTHON_LANGUAGE = Language(GRAMMAR_LIBRARY, 'python')
parser = Parser()
parser.set_language(PYTHON_LANGUAGE)

In [7]:
def get_string_from_code(node, lines, code_list=None):
  """Append the source text covered by ``node`` to ``code_list``.

  Args:
    node: object exposing ``start_point`` and ``end_point`` as
      ``(row, column)`` pairs.
    lines: the source split into lines; the rows index into this list.
    code_list: list that receives the extracted text. When omitted, the
      module-level ``code_list`` is used, which is what the original
      two-argument form relied on implicitly.

  A node spanning several lines is flattened onto one line, with the pieces
  joined by single spaces.
  """
  if code_list is None:
    # Legacy fallback: keep the two-argument call working.
    code_list = globals()['code_list']
  line_start = node.start_point[0]
  line_end = node.end_point[0]
  char_start = node.start_point[1]
  char_end = node.end_point[1]
  # NOTE(review): tree-sitter reports columns as byte offsets, while these
  # slices index characters; the two agree for ASCII-only source — confirm
  # before feeding non-ASCII code.
  if line_start != line_end:
    pieces = [lines[line_start][char_start:]]
    pieces.extend(lines[line_start + 1:line_end])
    pieces.append(lines[line_end][:char_end])
    code_list.append(' '.join(pieces))
  else:
    code_list.append(lines[line_start][char_start:char_end])


def _collect_tokens(node, lines, code_list):
  """Depth-first walk appending the text of every leaf/string node."""
  # 'string' nodes are taken whole instead of descending into their children.
  if node.child_count == 0 or node.type == 'string':
    get_string_from_code(node, lines, code_list)
    return
  for child in node.children:
    _collect_tokens(child, lines, code_list)


def my_traverse(node, code_list, lines=None):
  """Tokenize the subtree under ``node`` into a space-separated string.

  Args:
    node: root of the subtree to walk (``child_count``, ``type``,
      ``children``, ``start_point`` and ``end_point`` are read).
    code_list: list that accumulates the tokens; it is extended in place.
    lines: source lines the node positions refer to. When omitted, the
      module-level ``code`` string is split on newlines, as before.

  Returns:
    All entries of ``code_list`` joined by single spaces.
  """
  if lines is None:
    lines = code.split('\n')
  # Split once and join once; the walk itself only appends to code_list.
  _collect_tokens(node, lines, code_list)
  return ' '.join(code_list)

In [8]:
# Fresh accumulator for the tokens collected during the tree walk.
code_list = []

# Parse the UTF-8 encoded sample, then flatten the syntax tree into tokens.
tree = parser.parse(code.encode("utf8"))
tokenized_code = my_traverse(tree.root_node, code_list)

print("Output after tokenization: " + tokenized_code)

Output after tokenization: def e ( message , exit_code = None ) : print_log ( message , YELLOW , BOLD ) if exit_code is not None : sys . exit ( exit_code )


### Write the tokenized code, prefixed with the task name, to a text file

In [9]:
# File names used for the model input and for the prediction output.
inputs_path = 'input.txt'
predict_outputs_path = 'MtfModel-output.txt'

# One entry per snippet to document (a single snippet here).
codes = [tokenized_code]

# Write one line per snippet, each starting with the task prefix.
with tf.io.gfile.GFile(inputs_path, "w") as input_file:
  for tokenized in codes:
    input_file.write("function documentation generation python: %s\n" % tokenized)


### Running the model with the best checkpoint to summarize the given code

In [10]:
# Lower the batch size from the value given at model construction before
# predicting. (The model above is configured for GPU:0, not a TPU.)
model.batch_size = 8
model.predict(
    # Read the file written in the previous step instead of repeating its name.
    input_file=inputs_path,
    output_file=predict_outputs_path,
    checkpoint_steps=20000,
    beam_size=4,
    vocabulary=vocab,
    # NOTE(review): temperature=0 means greedy selection when sampling;
    # whether it has any effect together with beam_size=4 should be confirmed
    # against the t5 / mesh-tensorflow decoding code.
    temperature=0,
)

INFO:tensorflow:Using config: {'_model_dir': 'small', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_name=None, initial_infeed_sleep_s

### Code Summarization Result

In [11]:
import ast


def _decode_prediction(line):
  """Return a prediction line as plain text.

  The output file stores each prediction as the repr of a bytes object
  (``b'...'``). This strips the trailing newline and, when the line is such a
  literal, evaluates it safely and decodes it as UTF-8. Any other line is
  returned unchanged apart from the stripped newline.
  """
  text = line.rstrip("\n")
  if text.startswith(("b'", 'b"')):
    try:
      value = ast.literal_eval(text)
    except (ValueError, SyntaxError):
      return text
    if isinstance(value, bytes):
      return value.decode("utf-8", errors="replace")
  return text


checkpoint_step = 20000
# The file read here is the prediction output path with the checkpoint step
# appended, i.e. "MtfModel-output.txt-20000" for the values in this notebook.
prediction_file = "%s-%d" % (predict_outputs_path, checkpoint_step)
print("\nPredictions using checkpoint %d:\n" % checkpoint_step)
with tf.io.gfile.GFile(prediction_file) as f:
  # Pair each input snippet with the prediction on the matching line.
  for c, d in zip(codes, f):
    if c:
      print("Code for prediction: " + c + '\n')
      print("Generated Documentation: " + _decode_prediction(d) + '\n')



Predictions using checkpoint 20000:

Code for prediction: def e ( message , exit_code = None ) : print_log ( message , YELLOW , BOLD ) if exit_code is not None : sys . exit ( exit_code )

Generated Documentation: b'Prints an error and exits with an optional exit code .'

