### Change query to make it larger and stratify sampling via window function

In [1]:
# BigQuery query (legacy-SQL table syntax: [project:dataset.table]) that draws
# a per-suffix random sample of source files.
#   * Inner query: keeps files whose content is at least 1024 characters long
#     and whose path suffix is one of '.py', '.c', '.rb'; takes the first 1024
#     characters of the content; numbers the rows in random order within each
#     suffix (ROW_NUMBER over PARTITION BY suffix).
#   * Outer query: keeps rows with pos <= 5000/3 per suffix and attaches a
#     random train/test flag (RAND() > 0.1, i.e. roughly 90% flagged as train).
# The suffix list in the WHERE clause defines the label set of the data; keep
# `language_keys` in make_net in sync with it.
# The string is raw (r''') so the regex backslashes reach BigQuery unchanged.
s=r'''
SELECT suffix, processed_content, RAND() >0.1 as is_train
FROM (
  SELECT 
          REGEXP_EXTRACT(sample_path, '.*(\\..+)$') as suffix,
          substr(content,1,1024)  as processed_content,
          RAND() AS rnd, 
          ROW_NUMBER() OVER(PARTITION BY suffix ORDER BY rnd) AS pos
        FROM  [bigquery-public-data:github_repos.sample_contents]
        WHERE 
            LENGTH(content) >= 1024
            AND content IS NOT NULL
            AND content != ''
            AND REGEXP_EXTRACT(sample_path, '.*(\\..+)$') in ('.py','.c','.rb')
            AND REGEXP_MATCH(sample_path, '.*(\\..+)$')
        ) a
     WHERE 
        pos <= 5000/3
'''

In [2]:
from google.cloud import bigquery
# Submit the sampling query `s`; the CSV-writing cell below iterates over
# `query.rows`, yielding (suffix, processed_content, is_train) tuples.
# NOTE(review): presumably `rows` holds the complete result once run() returns;
# confirm that the synchronous query has finished and is not paginated.
c = bigquery.Client()
query = c.run_sync_query(s)
query.run()

### Split the data into two CSV files according to the random train/test flag

In [3]:
import csv
from base64 import urlsafe_b64encode


def transform_content(content, max_bytes=1024):
    """Return ``content`` as UTF-8 bytes, cut to at most ``max_bytes`` bytes.

    Args:
        content: text to encode. A value that is already ``bytes`` is assumed
            to be encoded and is only truncated.
        max_bytes: maximum number of bytes to keep (default 1024).

    Returns:
        The first ``max_bytes`` bytes of the encoded content. Truncation is
        byte-based, so a multi-byte character at the cut may be split.
    """
    # Encoding an already-encoded value fails (AttributeError on Python 3,
    # UnicodeDecodeError on Python 2 for non-ASCII input), so pass bytes through.
    if not isinstance(content, bytes):
        content = content.encode('utf-8')
    return content[:max_bytes]

# Route every query row to the train or the test file according to the
# `is_train` flag computed in the query. Each output line is
# "<suffix>\t<url-safe base64 of the truncated content>", with all fields
# quoted. Base64 keeps tabs/newlines in the source code out of the CSV.
row_counts = {'train': 0, 'test': 0}
with open('/tmp/train.csv', 'wb') as train_out_file, \
        open('/tmp/test.csv', 'wb') as test_out_file:
    w_train = csv.writer(train_out_file, quoting=csv.QUOTE_ALL, delimiter='\t')
    w_test = csv.writer(test_out_file, quoting=csv.QUOTE_ALL, delimiter='\t')
    for language, content, is_train in query.rows:
        w = w_train if is_train else w_test
        w.writerow([language, urlsafe_b64encode(transform_content(content))])
        row_counts['train' if is_train else 'test'] += 1

# One summary line instead of one printed line per row, which would flood the
# notebook output with thousands of lines.
print("Wrote %d train rows and %d test rows" % (row_counts['train'], row_counts['test']))

### Package the network-building code into a function

In [None]:
import tensorflow as tf
BATCH_SIZE = 128  # default number of examples per batch in make_net
LEARNING_RATE = 0.001  # learning rate passed to the Adam optimizer in the training cell
TRAIN_ITERATIONS = 500  # number of training steps run by the training cell
MAX_STRING_SIZE = 1024  # same value as the 1024 truncation applied to content; not referenced in the cells shown here

def make_net(input_path, batch_size=BATCH_SIZE):
    """Build the CSV input pipeline and the byte-embedding language classifier.

    Each line of ``input_path`` is ``<suffix>\\t<base64 content>``. Lines are
    shuffled into batches, every content byte is embedded, the embeddings are
    averaged per example and one dense layer turns the average into
    per-language logits.

    Args:
        input_path: path of the tab-separated file to read.
        batch_size: number of examples per batch.

    Returns:
        A tuple ``(prediction, logits, loss_op, content_batch_op)``:
        ``prediction`` is the arg-max class index per example (index ``i``
        stands for ``language_keys[i]``), ``logits`` are the unscaled class
        scores, ``loss_op`` is the mean softmax cross-entropy over the batch,
        and ``content_batch_op`` is the batch of decoded content strings.
    """
    filenames_queue = tf.train.string_input_producer([input_path])
    reader = tf.TextLineReader()
    _, value = reader.read(filenames_queue)
    default_values = [['UNKNOWN'], ['']]

    # Decode one CSV line into its language suffix and raw content bytes.
    language, b64_content = tf.decode_csv(value, default_values, field_delim='\t')
    content = tf.decode_base64(b64_content)

    language_batch_op, content_batch_op = tf.train.shuffle_batch(
        [language, content],
        batch_size=batch_size,
        capacity=1000,
        min_after_dequeue=100)

    # Hash table from language suffix to class index. The keys must match the
    # suffixes selected by the sampling query. Codes are 0-based so every known
    # language gets a valid one-hot row; unknown suffixes map to -1, which
    # tf.one_hot turns into an all-zero row.
    language_keys = ['.py', '.c', '.rb']
    language_codes = list(range(len(language_keys)))
    language_codes_table = tf.contrib.lookup.HashTable(
        tf.contrib.lookup.KeyValueTensorInitializer(language_keys, language_codes), -1)

    # Embed every byte of the content and average over the byte positions.
    # NOTE(review): decode_raw presumably needs all strings in a batch to have
    # the same length; the data preparation truncates content to 1024 bytes.
    byte_codes = tf.transpose(tf.decode_raw(content_batch_op, tf.uint8))
    bytes_embedding_weights = tf.Variable(
        name="embedding_weights",
        initial_value=tf.random_uniform(shape=(256, 64), minval=-0.1, maxval=0.1))
    bytes_embedding = tf.nn.embedding_lookup(bytes_embedding_weights,
                                             tf.cast(byte_codes, tf.int32))
    embedding_mean = tf.reduce_mean(bytes_embedding, axis=0)

    # Convert the language suffixes to one-hot labels.
    language_codes_indices = language_codes_table.lookup(language_batch_op)
    language_codes_batch_op = tf.one_hot(language_codes_indices, len(language_keys))

    dense_weights = tf.get_variable(name='dense_weights',
                                    shape=[64, len(language_keys)],
                                    initializer=tf.contrib.layers.xavier_initializer())
    biases = tf.Variable(tf.zeros([len(language_keys)]), name='biases')
    # Softmax cross-entropy expects unscaled logits, so no ReLU is applied here:
    # clamping negative scores to zero would distort the softmax.
    logits = tf.add(tf.matmul(embedding_mean, dense_weights), biases, name='logits')
    prediction = tf.argmax(logits, 1)

    batch_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                         labels=language_codes_batch_op)
    loss_op = tf.reduce_mean(batch_loss, name='loss')
    return prediction, logits, loss_op, content_batch_op
    

### Create test batch by running content batch op only on test csv

In [None]:
# Fetch one fixed batch of test content by building the net on the test CSV
# and running only its content batch op. The session is closed on leaving the
# `with` block, before the default graph is reset: its graph is discarded by
# the reset and the training cell builds a fresh one.
with tf.Session() as sess:
    _, _, _, content_batch_op = make_net('/tmp/test.csv')
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        test_batch = sess.run(content_batch_op)
    finally:
        # Always stop the queue-runner threads, even if the fetch fails.
        coord.request_stop()
        coord.join(threads)

tf.reset_default_graph()
    

### Train as usual, but add summaries for both train and test; for the test loss, feed the pre-fetched test content batch

In [None]:
# Build the training graph, train for TRAIN_ITERATIONS steps and log the train
# and test losses to TensorBoard.
# The session is deliberately NOT used as a context manager: closing it would
# discard the trained variable values, which the export cell still needs.
train_sess = tf.Session()
prediction, logits, loss_op, content_batch_op = make_net('/tmp/train.csv')

optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss_op, global_step=global_step)

init_vars_op = tf.global_variables_initializer()
init_tables_op = tf.tables_initializer()

summary_writer = tf.summary.FileWriter('/tmp/tf_tutorial/logs/train2', train_sess.graph)
loss_summary = tf.summary.scalar('loss', loss_op)
merge_summaries_op = tf.summary.merge_all()

train_sess.run([init_vars_op, init_tables_op])

coord = tf.train.Coordinator()
# No default session is installed here, so the session is passed explicitly.
threads = tf.train.start_queue_runners(sess=train_sess, coord=coord)
try:
    for i in range(TRAIN_ITERATIONS):
        train_loss, _, merged_summary, current_global_step = train_sess.run(
            [loss_op, train_op, merge_summaries_op, global_step])
        summary_writer.add_summary(merged_summary, current_global_step)
        print("Train loss: %s" % train_loss)

        # NOTE(review): only the content tensor is fed, so the labels inside
        # loss_op still come from a freshly dequeued *training* batch. This
        # "test loss" therefore pairs test content with training labels.
        # Fixing it needs the test labels, which make_net does not return.
        test_loss = train_sess.run(loss_op, feed_dict={content_batch_op: test_batch})
        test_summary = tf.Summary(
            value=[tf.Summary.Value(tag="test_loss", simple_value=test_loss)])
        summary_writer.add_summary(test_summary, current_global_step)
        print("Test loss: %s" % test_loss)
finally:
    # Flush summaries and stop the input threads even if training fails.
    summary_writer.flush()
    coord.request_stop()
    coord.join(threads)

# Later cells export the model through `sess`; bind it to the session that
# holds the trained weights instead of the stale session of the test-batch cell.
sess = train_sess

In [None]:
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.saved_model import signature_def_utils
from tensorflow.python.saved_model import utils

! rm -rf /tmp/tf_tutorial/model/0

# Export the graph and its variables as a SavedModel with a single prediction
# signature mapping the content batch tensor to the logits and the predicted
# class index.
# NOTE(review): the builder saves variable values from `sess`, so `sess` must
# be the session in which the model was trained — confirm before exporting.
with sess.as_default():
    # Table initializers are grouped into one op and passed as `main_op` below.
    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder = saved_model_builder.SavedModelBuilder('/tmp/tf_tutorial/model/0')

    signature_inputs = {'content': utils.build_tensor_info(content_batch_op)}
    signature_outputs = {
        'logits': utils.build_tensor_info(logits),
        'prediction': utils.build_tensor_info(prediction),
    }
    signature = signature_def_utils.build_signature_def(
        inputs=signature_inputs,
        outputs=signature_outputs,
        method_name=signature_constants.PREDICT_METHOD_NAME)

    builder.add_meta_graph_and_variables(
        sess,
        [tag_constants.SERVING],
        signature_def_map={'predict_language': signature},
        main_op=legacy_init_op)
    builder.save()