## First we'll define our query

### Github Repo Dataset, includes files and metadata
### Predict language given file contents
### We want to find the suffix of each file, and make sure the content is more than 1024 characters long

In [1]:
# Legacy-SQL query over the public GitHub sample: extract each file's suffix
# (the capture group includes the leading dot, e.g. '.py') together with its
# content, keeping only non-empty files longer than 1024 characters.
# The suffix whitelist must therefore list '.rb' with its dot; a bare 'rb'
# can never match the captured suffix and silently drops every Ruby file.
s=r'''SELECT regexp_extract(sample_path, '.*(\\..+)$') as suffix,
             content
      FROM  [bigquery-public-data:github_repos.sample_contents]
      where 
          content IS NOT NULL
          AND content != ''
          and length(content) > 1024
      HAVING 
             suffix IS NOT NULL
             AND suffix in ('.py','.c','.rb')
      LIMIT 1000;'''

# Next we'll obtain the default BigQuery client and run the query

In [2]:
from google.cloud import bigquery
# Create a BigQuery client using the default constructor arguments.
c = bigquery.Client()
# Wrap the SQL text held in `s` in a query object.
query = c.run_sync_query(s)
# Execute the query; its result rows are later read from `query.rows`.
query.run()

# We then want to truncate each file's content to a fixed size, base64-encode it, and write it to a tab-separated CSV

In [3]:
import csv
from base64 import urlsafe_b64encode


def transform_content(content, max_bytes=1024):
    """Return the first ``max_bytes`` bytes of ``content`` as a byte string.

    Args:
        content: File contents. Text is encoded as UTF-8; a value that is
            already a byte string is used as-is instead of being re-encoded.
        max_bytes: Maximum number of bytes to keep (defaults to 1024).

    Returns:
        A byte string of at most ``max_bytes`` bytes. Truncation happens on
        byte boundaries, so a trailing multi-byte character may be cut.
    """
    # Only text needs encoding; `type(u'')` is the text type on both
    # Python 2 (unicode) and Python 3 (str).
    if isinstance(content, type(u'')):
        content = content.encode('utf-8')
    return content[:max_bytes]

# Write one tab-separated, fully quoted row per query result:
#   <suffix> \t <URL-safe base64 of the truncated content>
# `open` replaces the Python-2-only `file` builtin; the mode and the bytes
# written are unchanged.
with open('/tmp/data.csv','wb') as out_file:
    w = csv.writer(out_file, quoting=csv.QUOTE_ALL, delimiter='\t')
    for language,content in query.rows:
        w.writerow([language,urlsafe_b64encode(transform_content(content))])

### Import and set constants

In [4]:
import tensorflow as tf
# Number of examples per batch passed to tf.train.shuffle_batch.
BATCH_SIZE = 128
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 0.01
# Number of training steps executed by the training loop.
TRAIN_ITERATIONS = 1000
# NOTE(review): not referenced anywhere in the visible notebook; the value
# matches the 1024-byte truncation applied when the CSV is written.
MAX_STRING_SIZE = 1024

### Start reading data into queue

In [5]:
# Queue of input file names; here just the single CSV written earlier.
filenames_queue = tf.train.string_input_producer(['/tmp/data.csv'])
# Read the file one text line (one CSV record) at a time.
reader = tf.TextLineReader()
key, value = reader.read(filenames_queue)
# Per-column defaults for decode_csv: label column, then base64 content column.
default_values=[['UNKNOWN'], ['']]
# Split each tab-separated record into its label and base64 content fields.
language, b64_content = tf.decode_csv(value,default_values, field_delim='\t')
# Undo the URL-safe base64 encoding applied when the CSV was written.
content = tf.decode_base64(b64_content)

### Create batches from queues by shuffling data

In [6]:
# Batch (label, content) pairs in shuffled order, BATCH_SIZE examples at a
# time. `capacity` and `min_after_dequeue` are the shuffle queue's size
# settings passed to tf.train.shuffle_batch.
language_batch_op, content_batch_op = tf.train.shuffle_batch([language,content], 
                                                             batch_size=BATCH_SIZE, 
                                                             capacity=1000, 
                                                             min_after_dequeue=100)

#### Make hash table and one hot encoding for languages

In [7]:
# Map each file suffix to a zero-based class index and one-hot encode it.
# The indices must lie in [0, len(language_keys)) because that is the depth
# handed to tf.one_hot: with one-based codes the last language would fall
# outside the depth and become an all-zero label, while class 0 would never
# correspond to a real language.
language_keys=['.py','.c','.rb']
values=list(range(len(language_keys)))
# Unknown suffixes map to -1, which tf.one_hot turns into an all-zero row
# instead of silently labelling them as the first language.
language_codes_table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(language_keys, values), -1)

language_codes_indices = language_codes_table.lookup(language_batch_op)
language_codes_batch_op = tf.one_hot(language_codes_indices, len(language_keys))

### Make embeddings for the characters


In [8]:
# Reinterpret each content string as a vector of uint8 byte values and
# transpose the result. Named `content_bytes` so the Python builtin `bytes`
# is not shadowed for the rest of the notebook.
# (assumes every record in a batch has the same byte length, so decode_raw
# yields [batch, n_bytes] and the transpose is [n_bytes, batch] - TODO confirm)
content_bytes = tf.transpose(tf.decode_raw(content_batch_op, tf.uint8))
# One trainable 64-dimensional embedding per possible byte value (256 rows),
# initialised uniformly in [-0.1, 0.1).
bytes_embedding_weights = tf.Variable(name="embedding_weights",
                                      initial_value=tf.random_uniform(shape=(256, 64),
                                                                      minval=-0.1,
                                                                      maxval=0.1))
# embedding_lookup needs integer ids, hence the cast from uint8 to int32.
bytes_embedding = tf.nn.embedding_lookup(bytes_embedding_weights,
                                         tf.cast(content_bytes, tf.int32))
# Average the embeddings over axis 0 (the byte-position axis after the
# transpose) to get one 64-dimensional vector per example.
embedding_mean = tf.reduce_mean(bytes_embedding,axis=0)

### Create dense layer with Relu and bias, from embedding length to number of categories

In [9]:
# Dense layer from the 64-dimensional mean embedding to one score per
# language, with Xavier-initialised weights and zero-initialised biases.
dense_weights = tf.get_variable(name='dense_weights',
                          shape=[64, len(language_keys)],
                          initializer=tf.contrib.layers.xavier_initializer())
biases = tf.Variable(tf.zeros([len(language_keys)]), name='biases')
# NOTE(review): the ReLU clamps the scores fed to the softmax loss to be
# non-negative; confirm this is intended for the output layer.
logits = tf.nn.relu(tf.matmul(embedding_mean, dense_weights) + biases, name='logits')
# Index of the highest-scoring class for each example (argmax along axis 1).
prediction = tf.argmax(logits, 1)

### Create loss

In [10]:
# Softmax cross-entropy between the logits and the one-hot language labels.
batch_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=language_codes_batch_op)
# Scalar training loss: the mean of batch_loss.
loss_op = tf.reduce_mean(batch_loss, name='loss')

### Define optimizer, global step counter, and train operation

In [11]:
# Adam optimizer configured with the LEARNING_RATE constant.
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
# Non-trainable step counter handed to minimize() below.
global_step = tf.Variable(0,name='global_step', trainable=False)
# Op that minimises loss_op; this is what the training loop runs.
train_op = optimizer.minimize(loss_op,global_step=global_step)

### Initialize variables, tables, and session 

In [12]:
# Build the initialisation ops for variables and lookup tables. They are only
# created here; the training cell runs them inside the session.
init_vars_op = tf.global_variables_initializer()
init_tables_op = tf.tables_initializer()

# Session shared by the training and model-export cells.
sess = tf.Session()

### We will now start the session, using a train Coordinator and Queue runners
### We'll run for all the train iterations and then stop all threads

In [13]:
# Initialise variables and lookup tables, start the input queue-runner
# threads, then run TRAIN_ITERATIONS optimisation steps, printing the loss
# of every step.
with sess.as_default():
    sess.run([init_vars_op, init_tables_op])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        for i in range(TRAIN_ITERATIONS):
            train_loss, _ = sess.run([loss_op, train_op])
            # Single-argument call form prints the same text on Python 2 and 3.
            print("Train loss: %s" % train_loss)
    finally:
        # Always stop and join the reader threads, even if a training step
        # raised, so no background threads are left running in the kernel.
        coord.request_stop()
        coord.join(threads)

Train loss: 1.09998
Train loss: 1.08718
Train loss: 1.06466
Train loss: 1.03933
Train loss: 1.02273
Train loss: 1.00025
Train loss: 0.961968
Train loss: 0.939344
Train loss: 0.905729
Train loss: 0.890576
Train loss: 0.849384
Train loss: 0.833741
Train loss: 0.803506
Train loss: 0.795069
Train loss: 0.752052
Train loss: 0.748417
Train loss: 0.721226
Train loss: 0.700206
Train loss: 0.670752
Train loss: 0.64038
Train loss: 0.641447
Train loss: 0.631414
Train loss: 0.596992
Train loss: 0.575163
Train loss: 0.544281
Train loss: 0.552259
Train loss: 0.54449
Train loss: 0.522265
Train loss: 0.515425
Train loss: 0.484299
Train loss: 0.472563
Train loss: 0.460844
Train loss: 0.469132
Train loss: 0.457314
Train loss: 0.409448
Train loss: 0.437277
Train loss: 0.404841
Train loss: 0.404376
Train loss: 0.370697
Train loss: 0.3507
Train loss: 0.36521
Train loss: 0.375835
Train loss: 0.326528
Train loss: 0.355624
Train loss: 0.320836
Train loss: 0.316329
Train loss: 0.285463
Train loss: 0.299892
Tra

Train loss: 0.0143603
Train loss: 0.0197304
Train loss: 0.0128187
Train loss: 0.0106895
Train loss: 0.0228384
Train loss: 0.023421
Train loss: 0.0128262
Train loss: 0.0115354
Train loss: 0.0125654
Train loss: 0.0112655
Train loss: 0.0137601
Train loss: 0.0185784
Train loss: 0.00765675
Train loss: 0.0102258
Train loss: 0.0154883
Train loss: 0.01793
Train loss: 0.0136768
Train loss: 0.0125636
Train loss: 0.0147032
Train loss: 0.0153072
Train loss: 0.0100479
Train loss: 0.014395
Train loss: 0.0111869
Train loss: 0.0129578
Train loss: 0.0210022
Train loss: 0.0104114
Train loss: 0.0172104
Train loss: 0.016571
Train loss: 0.0199602
Train loss: 0.0180722
Train loss: 0.0118401
Train loss: 0.0114885
Train loss: 0.011522
Train loss: 0.0101551
Train loss: 0.00919563
Train loss: 0.0123015
Train loss: 0.0134097
Train loss: 0.0200256
Train loss: 0.0146669
Train loss: 0.00771676
Train loss: 0.0159156
Train loss: 0.0146457
Train loss: 0.00746825
Train loss: 0.0100763
Train loss: 0.00802957
Train loss:

Train loss: 0.00869631
Train loss: 0.00673452
Train loss: 0.00493908
Train loss: 0.00608286
Train loss: 0.00320008
Train loss: 0.0054401
Train loss: 0.0028852
Train loss: 0.00893378
Train loss: 0.0054515
Train loss: 0.00390156
Train loss: 0.00480551
Train loss: 0.00687917
Train loss: 0.00385056
Train loss: 0.00597552
Train loss: 0.000951224
Train loss: 0.00816755
Train loss: 0.0028105
Train loss: 0.00340622
Train loss: 0.00538887
Train loss: 0.00676869
Train loss: 0.00209056
Train loss: 0.00134794
Train loss: 0.00764507
Train loss: 0.0045591
Train loss: 0.00386891
Train loss: 0.00298083
Train loss: 0.00281659
Train loss: 0.00425498
Train loss: 0.00271393
Train loss: 0.00735442
Train loss: 0.0047339
Train loss: 0.00501415
Train loss: 0.0045275
Train loss: 0.00195174
Train loss: 0.00729796
Train loss: 0.0066807
Train loss: 0.00175155
Train loss: 0.00292856
Train loss: 0.00667825
Train loss: 0.00396888
Train loss: 0.00440776
Train loss: 0.00438817
Train loss: 0.00329923
Train loss: 0.0024

### Saving model using tf.saved_model so we can serve easily

In [14]:
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.saved_model import signature_def_utils
from tensorflow.python.saved_model import utils

# Remove any previous export at the target path before writing a new one
# (presumably because the builder expects a fresh export directory - confirm).
! rm -rf /tmp/tf_tutorial/model/0

with sess.as_default():    
    # Group the lookup-table initialiser into a single op; it is passed to the
    # builder below as `main_op`.
    legacy_init_op = tf.group(
          tf.tables_initializer(), name='legacy_init_op')
    
    builder = saved_model_builder.SavedModelBuilder('/tmp/tf_tutorial/model/0')
    
    # Prediction signature: the batched content tensor is the input named
    # 'content'; the outputs are the raw logits and the argmax prediction.
    signature = signature_def_utils.build_signature_def(
          inputs={'content': utils.build_tensor_info(content_batch_op)},
          outputs={'logits': utils.build_tensor_info(logits),
                   'prediction': utils.build_tensor_info(prediction)},
          method_name=signature_constants.PREDICT_METHOD_NAME)    
    
    # Attach the graph and the session's variables under the SERVING tag and
    # register the signature under the key 'predict_language'.
    builder.add_meta_graph_and_variables(
                        sess, 
                        [tag_constants.SERVING],
                        signature_def_map={
                            'predict_language': signature,
                        },
                        main_op=legacy_init_op
                        )                
    # Write the SavedModel to the export directory.
    builder.save()

INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /tmp/tf_tutorial/model/0/saved_model.pb
