In [1]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable), so
# PyAthena is installed into the same environment the notebook imports from.
!{sys.executable} -m pip install PyAthena


You are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [None]:
from pyathena import connect
import pandas as pd
# Open the Athena connection; query results are staged in the given S3 prefix.
conn = connect(
    region_name='us-west-2',
    s3_staging_dir='s3://aws-athena-query-results-984073016564-us-west-2/sagemaker',
)
# Load every distinct San Diego county row into a DataFrame.
df = pd.read_sql(
    sql="SELECT distinct * FROM retsdata.union_geo3 WHERE county = 'San Diego'",
    con=conn,
)

In [None]:
# Number of rows returned by the query.
len(df)

In [None]:
# Preview the first ten rows of the query result.
df.head(10)

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
def pad(df):
    """Render each address row as a single fixed-width string.

    The ``streetnumber``, ``streetname`` and ``city`` columns are each
    converted with ``str``, cut to 6 / 20 / 16 characters and right-padded
    with spaces to that width, then joined with single spaces. Street name and
    city are upper-cased after padding; the street number is left as is.

    Returns a Series of strings that shares ``df``'s index.
    """
    def fixed(value, width):
        # Truncate, then pad on the right: same result as the '{!s:W.W}' spec.
        return str(value)[:width].ljust(width)

    number = df['streetnumber'].map(lambda value: fixed(value, 6))
    street = df['streetname'].map(lambda value: fixed(value, 20).upper())
    town = df['city'].map(lambda value: fixed(value, 16).upper())
    return number + ' ' + street + ' ' + town

# Rows with a real ZIP are training data; rows whose zip5 is the placeholder
# '0' are the ones to predict.
has_zip = df['zip5'] != '0'

df_train   = pad(df.loc[has_zip])
# Labels must come from the same filtered rows as df_train: encode() discards
# the index, so inputs and labels are paired purely by position. Taking labels
# from the unfiltered frame shifts them as soon as a '0' row appears.
df_labels  = df.loc[has_zip, 'zip5'].map(str)
df_predict = pad(df.loc[df['zip5'] == '0'])

In [None]:
# First ten padded training addresses.
df_train.head(10)

In [None]:
# First ten ZIP labels.
df_labels.head(10)

In [None]:
# First ten padded addresses whose zip5 is '0'.
df_predict.head(10)

In [None]:
# The three addresses used for the encoding demo in the following cells.
df_train.head(3)

In [None]:
# Size of the one-hot alphabet: one slot per code point 0..127.
N_CODES = 128

def encode(df):
    """One-hot encode every string of a Series, character by character.

    Each character becomes a vector of ``N_CODES`` floats holding 1.0 at the
    index of its code point (``ord``). A code point >= ``N_CODES`` yields an
    all-zero vector, which is what ``tf.one_hot`` produces for out-of-range
    indices. The vectors of one string are concatenated into one flat list.

    This is done in NumPy rather than with ``tf.one_hot`` inside a fresh
    ``tf.Session``: the previous version never closed its session and added
    one op per row to the default graph on every call.

    Args:
        df: pandas Series (or other iterable) of strings.

    Returns:
        pandas Series with a fresh 0..n-1 index; element ``i`` is a list of
        ``len(string_i) * N_CODES`` ``numpy.float32`` values.
    """
    rows = []
    for text in df:
        codes = np.array([ord(ch) for ch in text], dtype=np.int64)
        onehot = np.zeros((len(codes), N_CODES), dtype=np.float32)
        in_range = codes < N_CODES
        onehot[np.flatnonzero(in_range), codes[in_range]] = 1.0
        # Keep float32 scalars so np.reshape() downstream yields float32 arrays.
        rows.append(list(onehot.ravel()))
    return pd.Series(rows, dtype=object)
    

In [None]:
# Encode the three sample addresses to inspect the output format.
encode(df_train.head(3))

In [None]:
# Only the first SUBSET rows are encoded.
SUBSET = 1000
train_inputs = encode(df_train.head(SUBSET))

In [None]:
# First three encoded inputs.
train_inputs.head(3)

In [None]:
# Encode the first SUBSET labels the same way as the inputs, then preview them.
train_labels = encode(df_labels.head(SUBSET))
train_labels.head(3)

In [None]:
# Length of one flattened one-hot label row.
len(train_labels[0])

In [None]:
# Number of encoded samples.
SAMPLES = len(train_inputs)

# Length of one flattened one-hot row, taken from the first sample.
INPUT_VAR_CODES = len(train_inputs[0])
OUTPUT_VAR_CODES = len(train_labels[0])

# Number of characters per row (each character occupies N_CODES slots).
INPUT_VARS, OUTPUT_VARS = INPUT_VAR_CODES // N_CODES, OUTPUT_VAR_CODES // N_CODES

In [None]:
# Graph inputs for one sample at a time: a single encoded row and its label.
X = tf.placeholder(dtype=tf.float32, shape=(1, INPUT_VAR_CODES))
Y = tf.placeholder(dtype=tf.float32, shape=(1, OUTPUT_VAR_CODES))

In [None]:
# Width of the single hidden layer.
HIDDEN = 5

# Randomly initialised weights and biases: input -> hidden -> output.
parameters = {
    'W1': tf.Variable(tf.random_normal((INPUT_VAR_CODES, HIDDEN))),
    'b1': tf.Variable(tf.random_normal((HIDDEN,))),
    'W2': tf.Variable(tf.random_normal((HIDDEN, OUTPUT_VAR_CODES))),
    'b2': tf.Variable(tf.random_normal((OUTPUT_VAR_CODES,))),
}

In [None]:
# Training hyperparameters.
num_epochs = 2000       # number of passes over the training samples
learning_rate = 5e-4    # step size handed to the Adam optimizer

In [None]:
def neural_net(X,parameters):
    """Forward pass of the one-hidden-layer network.

    Computes ``relu(X @ W1 + b1) @ W2 + b2`` from the entries of
    ``parameters`` and returns the raw output (no softmax applied).
    """
    hidden = tf.nn.relu(tf.add(tf.matmul(X, parameters['W1']), parameters['b1']))
    return tf.add(tf.matmul(hidden, parameters['W2']), parameters['b2'])

In [None]:
import datetime

def train():
    """Train the network one sample at a time and save it to 'model.ckpt'.

    The loss is the sum, over the OUTPUT_VARS label characters, of the softmax
    cross-entropy between the matching N_CODES-wide slice of the network
    output and of the one-hot label. A single Adam optimizer minimises that
    sum. Progress is printed every 200 epochs and after the final epoch; the
    value printed is the cost of the last sample processed in that epoch.
    """
    print (datetime.datetime.now())
    Z = neural_net(X,parameters)

    # One cross-entropy term per label character.
    costs = []
    for i in range(OUTPUT_VARS):
        start, stop = N_CODES*i, N_CODES*i+N_CODES
        costs.append(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=Z[0][start:stop], labels=Y[0][start:stop])))
    cost = tf.reduce_sum(costs)

    # Only this optimizer is ever run. The per-character optimizers that used
    # to be built here were discarded and only added unused Adam variables to
    # the graph and the checkpoint.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(num_epochs):
            c = float('nan')  # stays NaN only when there are no samples
            for i in range(SAMPLES):
                _ , c = sess.run([optimizer, cost], feed_dict={
                    X: np.reshape(train_inputs[i],[1,INPUT_VAR_CODES]),
                    Y: np.reshape(train_labels[i],[1,OUTPUT_VAR_CODES])
                })
            # The last epoch index is num_epochs - 1; comparing against
            # num_epochs never matched, so the final cost was never reported.
            if epoch % 200 == 0 or epoch == num_epochs - 1:
                print (str(datetime.datetime.now()) +  " Cost after epoch %i: %f" % (epoch, c))
        saver.save(sess, 'model.ckpt')


In [None]:
# Run the training loop; it writes the checkpoint 'model.ckpt' when done.
train()

In [None]:
def toheno(onehot, n_codes=None):
    """Reverse the one-hot encoding: turn a flat score vector back into text.

    The vector is read as consecutive groups of ``n_codes`` values, one group
    per character; each group is decoded to ``chr`` of the index of its
    largest value (the first one on ties). Values left over after the last
    complete group are ignored.

    Implemented with NumPy so that decoding does not add ops to the
    TensorFlow graph or open a new session on every call.

    Args:
        onehot: flat sequence (list or 1-D array) of numbers.
        n_codes: group width; defaults to the module-level ``N_CODES``.

    Returns:
        The decoded string; empty when there is no complete group.
    """
    if n_codes is None:
        n_codes = N_CODES
    scores = np.asarray(onehot, dtype=np.float64).ravel()
    n_chars = scores.size // n_codes
    if n_chars == 0:
        return ''
    per_char = scores[:n_chars * n_codes].reshape(n_chars, n_codes)
    return ''.join(chr(code) for code in per_char.argmax(axis=1).tolist())
        
# Sanity check: decoding the first encoded input should give back its text.
toheno(train_inputs[0])

In [None]:
def test():
    """Print expected vs. predicted labels for every training sample.

    The weights are restored from 'model.ckpt' straight into the variables in
    ``parameters``, so the predictions come from the trained model.
    ``import_meta_graph`` is avoided because it loads a second copy of the
    saved graph instead of binding the checkpoint to the variables that
    ``neural_net`` uses.

    The inference ops are built once and each row is fed through the ``X``
    placeholder; building them inside the loop added new ops to the graph for
    every row. Probabilities go to ``toheno`` unrounded, because rounding to
    one decimal before the argmax can create ties and change the decoded
    character.
    """
    logits = neural_net(X, parameters)
    # One softmax per label character: row k holds the scores of character k.
    probabilities = tf.nn.softmax(tf.reshape(logits, [OUTPUT_VARS, N_CODES]))
    # A list var_list restores each variable under its own graph name, which
    # is how a default tf.train.Saver() stores them.
    saver = tf.train.Saver(var_list=list(parameters.values()))

    with tf.Session() as sess:
        saver.restore(sess, 'model.ckpt')
        for row in range(len(train_inputs)):
            features = np.reshape(train_inputs[row], [1, INPUT_VAR_CODES])
            out = sess.run(probabilities, feed_dict={X: features})

            print("\nROW #" + str(row))
            print("Expected: " + toheno(train_labels[row]))
            print("Actual..: " + toheno(out.ravel()))


In [None]:
# Compare predictions with the expected labels on the training samples.
test()