<a href="https://colab.research.google.com/github/agemagician/CodeTrans/blob/main/prediction/single%20task/source%20code%20summarization/sql/small_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the library and download the pretrained models

In [153]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5==0.6.4

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

!wget "https://www.dropbox.com/sh/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa?dl=1" -O vocabulary.zip
!unzip vocabulary.zip
!rm vocabulary.zip
!wget "https://www.dropbox.com/sh/9ua4fa2sj1m6jla/AAA0mHkL9MY_FLNbiyloo6xaa?dl=1" -O sql.zip
!unzip sql.zip
!rm sql.zip

Installing dependencies...
[K     |████████████████████████████████| 163kB 2.8MB/s 
[K     |████████████████████████████████| 1.1MB 7.9MB/s 
[K     |████████████████████████████████| 71kB 6.4MB/s 
[K     |████████████████████████████████| 2.6MB 17.9MB/s 
[K     |████████████████████████████████| 348kB 37.3MB/s 
[K     |████████████████████████████████| 1.3MB 34.9MB/s 
[K     |████████████████████████████████| 3.6MB 47.9MB/s 
[K     |████████████████████████████████| 890kB 46.2MB/s 
[K     |████████████████████████████████| 2.9MB 44.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: transformers 3.5.0 has requirement sentencepiece==0.1.91, but you'll have sentencepiece 0.1.94 which is incompatible.[0m
INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 
--2020-11-10 21:18:56--  https://www.dropbox.com/sh/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa?dl=1
Resolving www.dropbox.com

## Set sentencepiece model

In [154]:
from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary

# SentencePiece model file, looked up relative to the working directory
# (presumably extracted from vocabulary.zip by the download cell — confirm).
vocab_model_path = 'code_spm_unigram_40M.model'
# Build the vocabulary with 100 extra ids on top of the SentencePiece pieces.
vocab = SentencePieceVocabulary(vocab_model_path, extra_ids=100)

# Sanity check; the recorded output of this cell reports a size of 32100.
print("Vocab has a size of %d\n" % vocab.vocab_size)

Vocab has a size of 32100



## Set the preprocessors and the task registry for the t5 model

In [155]:
def sql_sourceSum_dataset_fn(split, shuffle_files=True):
    """Load one split of the SQL summarization data as a dataset of dicts.

    Each line of the file at ``sql_path[split]`` is parsed as two
    tab-separated string fields (quotes are not treated specially) and
    emitted as ``{"code": ..., "docstring": ...}``.

    NOTE(review): ``sql_path`` is not defined in the visible part of this
    notebook; it must exist as a mapping from split name to file path
    before this function is called.

    Args:
        split: key used to look up the file in ``sql_path``.
        shuffle_files: accepted but ignored.

    Returns:
        A ``tf.data`` dataset of ``{"code", "docstring"}`` dictionaries.
    """
    del shuffle_files  # part of the expected signature, deliberately unused

    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False,
    )
    lines = tf.data.TextLineDataset(sql_path[split])
    pairs = lines.map(
        parse_line,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return pairs.map(lambda *fields: dict(zip(["code", "docstring"], fields)))


def sql_preprocessor(ds):
    """Turn ``{"code", "docstring"}`` examples into model inputs and targets.

    The code is prefixed with the task string and becomes ``"inputs"``;
    the docstring is passed through unchanged as ``"targets"``.

    Args:
        ds: dataset whose elements have ``"code"`` and ``"docstring"`` entries.

    Returns:
        The result of mapping the conversion over ``ds``.
    """
    task_prefix = "source code summarization sql: "

    def to_inputs_and_targets(ex):
        # No text normalisation is applied to either field.
        prefixed_code = tf.strings.join([task_prefix, ex["code"]])
        return {"inputs": prefixed_code, "targets": ex["docstring"]}

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )


# Drop any earlier registration of this task name before adding it again
# (presumably so the cell can be re-run in the same kernel — confirm that
# `remove` tolerates a name that was never registered).
t5.data.TaskRegistry.remove('sourcesum_sql_code')
# Register the SQL source-code summarization task: data comes from
# sql_sourceSum_dataset_fn, text is shaped by sql_preprocessor, and both
# inputs and targets share the same SentencePiece vocabulary.
t5.data.TaskRegistry.add(
    "sourcesum_sql_code",
    dataset_fn=sql_sourceSum_dataset_fn,
    output_features={
        "inputs": t5.data.utils.Feature(vocabulary=vocab),
        "targets": t5.data.utils.Feature(vocabulary=vocab),
    },
    splits=["train", "validation"],
    text_preprocessor=[sql_preprocessor],
    # Predictions are lower-cased before the metrics below are applied.
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.bleu, t5.evaluation.metrics.accuracy, t5.evaluation.metrics.rouge],
)

## Set t5 small model

In [156]:
# Directory holding the model checkpoints (presumably populated from sql.zip
# by the download cell — confirm).
MODEL_DIR = "small"
model_parallelism = 1
# Batch size passed to the constructor; the prediction cell later overrides
# model.batch_size before calling predict.
train_batch_size = 256

# Make sure the model directory exists before MtfModel is constructed.
tf.io.gfile.makedirs(MODEL_DIR)
# Build the model on a single GPU (no TPU).
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=None,
    tpu_topology=None,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    mesh_shape="model:1,batch:1",
    mesh_devices=["GPU:0"],
    # NOTE(review): the settings below look training-related; this notebook
    # only calls predict — confirm whether they matter for inference.
    learning_rate_schedule=0.003,
    save_checkpoints_steps=500,
    keep_checkpoint_max=None,
    iterations_per_loop=100,
)

## Source Code Summarization

### Give the code for summarization

In [157]:
code = "select time (fieldname) from tablename" #@param {type:"raw"}


### Parsing and Tokenization

In [158]:
import re
import sqlparse

def _emit(tag):
  """Build a scanner action that ignores the matched text and yields ``tag``."""
  return lambda _scanner, _text: tag


# Lexer for the contents of SQL string literals. Bracketed character classes
# are kept verbatim, regex/LIKE metacharacters become R_* tags, runs of
# ordinary characters collapse to R_FREE, and any other single character is
# dropped (an action result of None is not added to the output).
scanner = re.Scanner([
  (r"\[[^\]]*\]", lambda _scanner, text: text),
  (r"\+", _emit("R_PLUS")),
  (r"\*", _emit("R_KLEENE")),
  (r"%", _emit("R_WILD")),
  (r"\^", _emit("R_START")),
  (r"\$", _emit("R_END")),
  (r"\?", _emit("R_QUESTION")),
  (r"[\.~``;_a-zA-Z0-9\s=:\{\}\-\\]+", _emit("R_FREE")),
  (r'.', _emit(None)),
])


def tokenizeRegex(s):
  """Scan ``s`` with ``scanner`` and return the list of emitted tags.

  Any text the scanner could not consume is discarded.
  """
  tags, _unscanned = scanner.scan(s)
  return tags

def my_traverse(token_list, statement_list, result_list):
  """Flatten a token tree into parallel lists of token types and texts.

  Tokens whose ``ttype`` is None are treated as containers and traversed
  recursively. Every other token except whitespace contributes its
  ``ttype`` to ``statement_list`` and ``str(token)`` to ``result_list``.

  Args:
    token_list: iterable of tokens exposing a ``ttype`` attribute.
    statement_list: list extended in place with token types.
    result_list: list extended in place with token texts.

  Returns:
    The same ``(statement_list, result_list)`` objects that were passed in.
  """
  for t in token_list:
    if t.ttype is None:
      my_traverse(t, statement_list, result_list)
    elif t.ttype != sqlparse.tokens.Whitespace:
      statement_list.append(t.ttype)
      result_list.append(str(t))
  return statement_list, result_list

def sanitizeSql(sql):
  """Normalise a raw SQL string before it is parsed.

  The query is stripped, lower-cased, terminated with ';' if it is not
  already, has every parenthesis padded with spaces, and has all '#'
  characters removed.

  Args:
    sql: raw SQL text; may be empty or whitespace only.

  Returns:
    The normalised SQL string. Empty input yields ";".
  """
  s = sql.strip().lower()
  # endswith() is safe on an empty string, whereas s[-1] raises IndexError.
  if not s.endswith(";"):
    s += ';'
  s = s.replace('(', ' ( ').replace(')', ' ) ')
  return s.replace('#', '')

In [159]:
# Normalise the query, parse it, and flatten the parse tree into two parallel
# lists: token types (`statements`) and token texts (`result`).
statement_list = []
result_list = []
code = sanitizeSql(code)
tokens = sqlparse.parse(code)
statements, result = my_traverse(tokens, statement_list, result_list)

# column_map: original identifier text -> 'colN' placeholder.
# table_map:  'colN' placeholder       -> 'tabN' placeholder.
table_map = {}
column_map = {}
for i in range(len(statements)):
  if statements[i] in [sqlparse.tokens.Number.Integer, sqlparse.tokens.Literal.Number.Integer]:
    result[i] = "CODE_INTEGER"
  elif statements[i] in [sqlparse.tokens.Number.Float, sqlparse.tokens.Literal.Number.Float]:
    result[i] = "CODE_FLOAT"
  elif statements[i] in [sqlparse.tokens.Number.Hexadecimal, sqlparse.tokens.Literal.Number.Hexadecimal]:
    result[i] = "CODE_HEX"
  elif statements[i] in [sqlparse.tokens.String.Symbol, sqlparse.tokens.String.Single, sqlparse.tokens.Literal.String.Single, sqlparse.tokens.Literal.String.Symbol]:
    # tokenizeRegex returns a list of tags. Store them as one space-separated
    # string so the .startswith() checks below and the final ' '.join(result)
    # keep working when the query contains a string literal.
    result[i] = ' '.join(tokenizeRegex(result[i]))
  elif statements[i] in[sqlparse.tokens.Name, sqlparse.tokens.Name.Placeholder, sqlparse.sql.Identifier]:
    # Replace each distinct identifier with a stable 'colN' placeholder.
    old_value = result[i]
    if old_value in column_map:
      result[i] = column_map[old_value]
    else:
      result[i] = 'col'+ str(len(column_map))
      column_map[old_value] = result[i]
  elif (result[i] == "." and statements[i] == sqlparse.tokens.Punctuation and i > 0 and result[i-1].startswith('col')):
    # In a qualified name "x.y" the part before the dot is relabelled as a table.
    old_value = result[i-1]
    if old_value in table_map:
      result[i-1] = table_map[old_value]
    else:
      result[i-1] = 'tab'+ str(len(table_map))
      table_map[old_value] = result[i-1]
  # A placeholder directly after "from" is relabelled as a table as well.
  if (result[i].startswith('col') and i > 0 and (result[i-1] in ["from"])):
    old_value = result[i]
    if old_value in table_map:
      result[i] = table_map[old_value]
    else:
      result[i] = 'tab'+ str(len(table_map))
      table_map[old_value] = result[i]

tokenized_code = ' '.join(result)
print("SQL after tokenized: " + tokenized_code)

SQL after tokenized: select time ( col0 ) from tab0 ;


### Write the code to summarize, with the task prefix, to a text file

In [160]:
# File read by the prediction step, and base name for the prediction output.
inputs_path = 'input.txt'
predict_outputs_path = 'MtfModel-output.txt'

# One line per snippet, each carrying the task prefix.
codes = [tokenized_code]
with tf.io.gfile.GFile(inputs_path, "w") as f:
  for snippet in codes:
    f.write("source code summarization sql: %s\n" % snippet)


### Running the model with the best checkpoint to summarize the given code

In [161]:
# Batch size used for inference.
# NOTE(review): the original comment called this the minimum for a TPU v2-8,
# but the model is built with tpu=None on a single GPU — confirm the value.
model.batch_size = 8
model.predict(
    # Read the file written by the previous cell rather than repeating its name.
    input_file=inputs_path,
    output_file=predict_outputs_path,
    checkpoint_steps=500,
    beam_size=4,
    vocabulary=vocab,
    # Select the most probable output token at each step.
    temperature=0,
)

INFO:tensorflow:Using config: {'_model_dir': 'small', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_name=None, initial_infeed_sleep_s

### Code Summarization Result

In [162]:
# Predictions for checkpoint 500 are read from the output base name plus "-500".
prediction_file = "MtfModel-output.txt-500"
print("\nPredictions using checkpoint 500:\n" )
with tf.io.gfile.GFile(prediction_file) as f:
  # Pair each submitted snippet with the corresponding line of predictions.
  for source_sql, summary in zip(codes, f):
    if not source_sql:
      continue
    print("Code for prediction: " + source_sql + '\n')
    print("Generated Summarization: " + summary)



Predictions using checkpoint 500:

Code for prediction: select time ( col0 ) from tab0 ;

Generated Summarization: b'mysql : how to get the difference between two dates ?'

