In [1]:
! rm -rf /tmp/tf_tutorial/logs/*

### Download data as usual but add to query a random train test split

In [2]:
s=r'''SELECT regexp_extract(sample_path, '.*(\\..+)$') as suffix,
             content,
             RAND() >0.1 as is_train
      FROM  [bigquery-public-data:github_repos.sample_contents]      
      WHERE 
         content IS NOT NULL
          AND content != ''
          and length(content) > 1024
      HAVING 
             suffix IS NOT NULL
             AND suffix in ('.py','.c','rb')
      LIMIT 1000;'''

In [3]:
from google.cloud import bigquery
c = bigquery.Client()
query = c.run_sync_query(s)
query.run()

### Split data to 2 csvs according to random split

In [4]:
import csv
from base64 import urlsafe_b64encode


def transform_content(content):
    content = content.encode('utf-8')    
    content = content[:1024]    
    return content

with file('/tmp/train.csv','wb') as train_out_file:
    with file('/tmp/test.csv','wb') as test_out_file:
        w_train = csv.writer(train_out_file, quoting=csv.QUOTE_ALL, delimiter='\t')
        w_test = csv.writer(test_out_file, quoting=csv.QUOTE_ALL, delimiter='\t')
        for language,content, is_train in query.rows:
            # decide where to write by the is_train value
            w = w_train if is_train else w_test
            w.writerow([language,urlsafe_b64encode(transform_content(content))])

### Packaged net building into method

In [5]:
import tensorflow as tf
BATCH_SIZE = 128
LEARNING_RATE = 0.001
TRAIN_ITERATIONS = 500
MAX_STRING_SIZE = 1024

def make_net(input_path, batch_size=BATCH_SIZE):
    filenames_queue = tf.train.string_input_producer([input_path])
    reader = tf.TextLineReader()
    key, value = reader.read(filenames_queue)
    default_values=[['UNKNOWN'], ['']]


    # decode content
    language, b64_content = tf.decode_csv(value,default_values, field_delim='\t')
    content = tf.decode_base64(b64_content)

    language_batch_op, content_batch_op = tf.train.shuffle_batch([language,content], 
                                                                 batch_size=batch_size, 
                                                                 capacity=1000, 
                                                                 min_after_dequeue=100)
    #Make hash table for langauges
    language_keys=['.py','.c','.h']
    values=range(1, len(language_keys)+1)
    language_codes_table = tf.contrib.lookup.HashTable(
        tf.contrib.lookup.KeyValueTensorInitializer(language_keys, values), 0)

    #Make embeddings for the characters
    bytes = tf.transpose(tf.decode_raw(content_batch_op, tf.uint8))
    bytes_embedding_weights = tf.Variable(name="embedding_weights",
                                          initial_value=tf.random_uniform(shape=(256, 64),
                                                                          minval=-0.1, 
                                                                          maxval=0.1))
    bytes_embedding = tf.nn.embedding_lookup(bytes_embedding_weights, tf.cast(bytes,tf.int32))
    embedding_mean = tf.reduce_mean(bytes_embedding,axis=0)

    # Convert languages to numeric codes
    language_codes_indices = language_codes_table.lookup(language_batch_op)
    language_codes_batch_op = tf.one_hot(language_codes_indices, len(language_keys))
    dense_weights = tf.get_variable(name='dense_weights',
                              shape=[64, len(language_keys)],
                              initializer=tf.contrib.layers.xavier_initializer())
    biases = tf.Variable(tf.zeros([len(language_keys)]), name='biases')
    logits = tf.nn.relu(tf.matmul(embedding_mean, dense_weights) + biases, name='logits')
    prediction = tf.argmax(logits, 1)

    
    batch_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=language_codes_batch_op)
    loss_op = tf.reduce_mean(batch_loss, name='loss')
    return prediction, logits, loss_op, content_batch_op
    

### Create test batch by running content batch op only on test csv

In [6]:
sess = tf.Session()
with sess.as_default():
    _,_,_, content_batch_op = make_net('/tmp/test.csv')
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    test_batch = sess.run([content_batch_op])[0] 
    coord.request_stop()
    coord.join(threads)

tf.reset_default_graph()
print test_batch.shape    

(128,)


### Train as usual but add summaries for both train and test, where for test only run the content batch op

In [7]:
with tf.Session() as train_sess:
        prediction, logits, loss_op, content_batch_op = make_net('/tmp/train.csv')

        optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        global_step = tf.Variable(0,name='global_step', trainable=False)
        train_op = optimizer.minimize(loss_op,global_step=global_step)

        init_vars_op = tf.global_variables_initializer()
        init_tables_op = tf.tables_initializer()


        summary_writer = tf.summary.FileWriter('/tmp/tf_tutorial/logs/train1', train_sess.graph)
        loss_summary = tf.summary.scalar('loss', loss_op)
        merge_summaries_op = tf.summary.merge_all()


        train_sess.run([init_vars_op, init_tables_op])


        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        for i in range(TRAIN_ITERATIONS):
            train_loss,_, merged_summary, current_global_step = train_sess.run([loss_op, train_op, merge_summaries_op, global_step])
            summary_writer.add_summary(merged_summary,current_global_step)
            print "Train loss: %s" % train_loss
            
            test_loss = train_sess.run(loss_op, feed_dict={content_batch_op: test_batch})
            test_summary = tf.Summary(
                value=[tf.Summary.Value(tag="test_loss", simple_value=test_loss)])
            summary_writer.add_summary(test_summary, current_global_step)

            print "Test loss: %s" % test_loss
        
        summary_writer.flush()    
        coord.request_stop()
        coord.join(threads)

Train loss: 1.09824
Test loss: 1.09412
Train loss: 1.09699
Test loss: 1.09377
Train loss: 1.09531
Test loss: 1.09323
Train loss: 1.09262
Test loss: 1.08845
Train loss: 1.09379
Test loss: 1.09083
Train loss: 1.0868
Test loss: 1.0865
Train loss: 1.08976
Test loss: 1.086
Train loss: 1.08051
Test loss: 1.0831
Train loss: 1.07761
Test loss: 1.07878
Train loss: 1.07296
Test loss: 1.07903
Train loss: 1.07852
Test loss: 1.07742
Train loss: 1.06761
Test loss: 1.07229
Train loss: 1.06623
Test loss: 1.06916
Train loss: 1.06464
Test loss: 1.06839
Train loss: 1.06584
Test loss: 1.0695
Train loss: 1.06328
Test loss: 1.06484
Train loss: 1.05362
Test loss: 1.0598
Train loss: 1.04758
Test loss: 1.05528
Train loss: 1.05838
Test loss: 1.05722
Train loss: 1.04954
Test loss: 1.05279
Train loss: 1.04881
Test loss: 1.04928
Train loss: 1.04094
Test loss: 1.04929
Train loss: 1.04054
Test loss: 1.03907
Train loss: 1.03294
Test loss: 1.03194
Train loss: 1.0342
Test loss: 1.03755
Train loss: 1.03126
Test loss: 1.

Train loss: 0.549408
Test loss: 0.818002
Train loss: 0.568108
Test loss: 0.779191
Train loss: 0.565863
Test loss: 0.794401
Train loss: 0.57149
Test loss: 0.73776
Train loss: 0.546763
Test loss: 0.706812
Train loss: 0.556873
Test loss: 0.755346
Train loss: 0.537296
Test loss: 0.767871
Train loss: 0.556796
Test loss: 0.759543
Train loss: 0.527063
Test loss: 0.749928
Train loss: 0.530049
Test loss: 0.756794
Train loss: 0.565602
Test loss: 0.76086
Train loss: 0.518429
Test loss: 0.753129
Train loss: 0.533867
Test loss: 0.793632
Train loss: 0.542042
Test loss: 0.795173
Train loss: 0.531332
Test loss: 0.777279
Train loss: 0.5405
Test loss: 0.810058
Train loss: 0.509232
Test loss: 0.74187
Train loss: 0.539004
Test loss: 0.789165
Train loss: 0.530973
Test loss: 0.785293
Train loss: 0.541583
Test loss: 0.79798
Train loss: 0.539074
Test loss: 0.731506
Train loss: 0.544586
Test loss: 0.789455
Train loss: 0.533756
Test loss: 0.790195
Train loss: 0.534239
Test loss: 0.765993
Train loss: 0.516432
Te

Train loss: 0.280596
Test loss: 0.79832
Train loss: 0.364241
Test loss: 0.897583
Train loss: 0.341997
Test loss: 1.06486
Train loss: 0.312194
Test loss: 0.914403
Train loss: 0.309416
Test loss: 1.02652
Train loss: 0.275949
Test loss: 1.03842
Train loss: 0.34881
Test loss: 1.04011
Train loss: 0.32238
Test loss: 1.02425
Train loss: 0.354302
Test loss: 1.14068
Train loss: 0.265619
Test loss: 0.863553
Train loss: 0.342802
Test loss: 1.12241
Train loss: 0.295224
Test loss: 0.996993
Train loss: 0.294705
Test loss: 0.959664
Train loss: 0.312817
Test loss: 1.16316
Train loss: 0.304694
Test loss: 1.02184
Train loss: 0.317895
Test loss: 1.05181
Train loss: 0.30572
Test loss: 1.05307
Train loss: 0.267944
Test loss: 1.00639
Train loss: 0.319356
Test loss: 1.14049
Train loss: 0.298283
Test loss: 0.864371
Train loss: 0.2961
Test loss: 1.14088
Train loss: 0.285131
Test loss: 0.915367
Train loss: 0.303817
Test loss: 1.05664
Train loss: 0.280475
Test loss: 0.97179
Train loss: 0.261344
Test loss: 1.1967

In [1]:
import subprocess
cmd = 'tensorboard --logdir /tmp/tf_tutorial/logs'
# Run our docker
p = subprocess.Popen(cmd, shell=True)