In [None]:
import tensorflow as tf
import pandas as pd
import os

# Switch TensorFlow to eager mode up front, before any other TensorFlow call
# in this notebook is made.
tf.enable_eager_execution()

In [178]:
from sklearn import preprocessing, cross_validation
import numpy as np

# CSV file read by raw_data_fn().
data_path = '/data/users-csv.csv'
print("TensorFlow version: {}".format(tf.VERSION))
print("Eager execution: {}".format(tf.executing_eagerly()))

# Column names of the CSV; not referenced anywhere in the visible code.
COLUMN_NAMES = ['screen_name','name','description', 'class']

# Earlier, larger label set (kept for reference):
# CLASSES = ['sports','technology','entertainment', 'politics',
#            'music','legal','medical','education','journalism']

# Label set the encoder is fitted on and predictions are mapped back to.
# NOTE(review): sklearn's LabelBinarizer stores its classes sorted, so the
# column order of the encoded labels is alphabetical, not this list's order.
CLASSES = ['sports', 'politics', 'education','journalism']

# Width of the softmax output layer.
NUM_CLASSES = len(CLASSES)
# Tokenizer vocabulary cap; also the width of the model's input layer.
MAX_VOCABULARY_SIZE = 1000
# Training hyper-parameters passed to model.fit() and the RMSProp optimizer.
BATCH_SIZE = 100
EPOCHS = 10
LEARNING_RATE = 0.001
    

def raw_data_fn(y_name='class'):
    """Load the user CSV and turn it into TF-IDF features and encoded labels.

    Rows with a missing name, description or label are dropped. The name and
    description of each remaining row are joined into one text, a Keras
    ``Tokenizer`` limited to ``MAX_VOCABULARY_SIZE`` words is fitted on those
    texts, and the label column is binarized against ``CLASSES``.

    Args:
        y_name: Name of the CSV column that holds the class label.

    Returns:
        A ``(features, labels, tokenizer)`` tuple: the TF-IDF matrix produced
        by the tokenizer (one row per retained CSV row), the binarized labels
        in the same row order, and the fitted tokenizer so that new texts can
        be vectorised the same way.
    """
    data = pd.read_csv(data_path, header=0)

    # Rows lacking any field used below can be neither vectorised nor labelled.
    data = data.dropna(subset=['name', 'description', y_name])

    # Report how many usable rows remain.
    print("Count for data: %s" % len(data))

    # One text per user: name followed by description. Both sides are cast to
    # str so a non-string description cannot break the concatenation.
    name_description = (
        data['name'].astype(str) + ' ' + data['description'].astype(str)
    )

    # Label column, selected through the parameter instead of a literal.
    classes = data[y_name]

    # Tokenizer restricted to the configured vocabulary size.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_VOCABULARY_SIZE)
    tokenizer.fit_on_texts(name_description)

    # tokenizer.word_index holds the integer equivalent of each unique word.

    # Fit the encoder on the configured label set rather than on the data, so
    # the label width always equals len(CLASSES).
    # NOTE(review): rows whose label is not in CLASSES presumably come out as
    # all-zero label rows -- confirm whether such rows should be filtered.
    encoder = preprocessing.LabelBinarizer()
    encoder.fit(CLASSES)

    labels = encoder.transform(classes)
    features = tokenizer.texts_to_matrix(name_description, mode='tfidf')

    return (features, labels, tokenizer)

def load_data_fn(features, labels, test_size=0.3):
    """Split features and labels into train and test subsets.

    Args:
        features: Feature rows, aligned with ``labels``.
        labels: Label rows, aligned with ``features``.
        test_size: Share of the rows reserved for the test subset.

    Returns:
        ``[train_features, test_features, train_labels, test_labels]`` as
        returned by scikit-learn's ``train_test_split``. The split uses the
        fixed seed 42, so it is reproducible between runs.
    """
    try:
        # sklearn.cross_validation was deprecated in scikit-learn 0.18 and
        # removed in 0.20; model_selection is its replacement.
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Very old scikit-learn: fall back to the legacy module.
        train_test_split = cross_validation.train_test_split

    return train_test_split(features, labels, test_size=test_size, random_state=42)

def train_input_fn(features, labels, batch_size):
    """Build a shuffled, repeating, batched ``tf.data.Dataset`` for training.

    Args:
        features: Feature rows to slice into examples.
        labels: Label rows, aligned with ``features``.
        batch_size: Number of examples per emitted batch.

    Returns:
        A dataset of ``(features, labels)`` batches. Because of ``repeat()``
        without a count, the dataset never runs out.
    """
    examples = tf.data.Dataset.from_tensor_slices((features, labels))

    # Shuffle with a 1000-element buffer, then loop over the data, then batch.
    shuffled = examples.shuffle(1000)
    endless = shuffled.repeat()
    return endless.batch(batch_size)

def model_fn():
    """Build, compile and summarise the dense classifier.

    The network takes ``MAX_VOCABULARY_SIZE`` inputs, passes them through a
    512-unit ReLU layer and a 256-unit layer, and ends in a softmax over
    ``NUM_CLASSES`` outputs.

    Returns:
        The compiled ``tf.keras.Sequential`` model; its summary is printed as
        a side effect.
    """
    dense = tf.keras.layers.Dense
    network = tf.keras.Sequential([
        dense(512, activation=tf.nn.relu, input_shape=(MAX_VOCABULARY_SIZE,)),
        # NOTE(review): no activation is given here, so this layer is linear
        # -- confirm that is intended.
        dense(256),
        dense(NUM_CLASSES, activation=tf.nn.softmax),
    ])

    # A TensorFlow optimizer is used instead of a Keras one; per the original
    # author this is currently necessary when working in eager mode.
    rms_prop = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)

    # Compile, then print a summary of the model.
    network.compile(
        loss='categorical_crossentropy',
        optimizer=rms_prop,
        metrics=['accuracy'],
    )
    network.summary()

    return network

def predict_fn(model, labels, tokenizer, name, description):
    """Predict the class name for a single user.

    Args:
        model: Trained model exposing ``predict``.
        labels: Unused; kept so existing callers keep working.
        tokenizer: Fitted tokenizer used to vectorise the training texts.
        name: The user's display name.
        description: The user's profile description.

    Returns:
        The entry of ``CLASSES`` with the highest predicted score.
    """
    # Same "name description" text layout that the training features use.
    name_description = name + ' ' + description

    feature = tokenizer.texts_to_matrix(np.array([name_description]), mode='tfidf')
    prediction = model.predict(feature)

    # The labels are encoded with sklearn's LabelBinarizer, which stores its
    # classes sorted. Output column i therefore corresponds to the i-th class
    # in sorted order, not to CLASSES[i] in declaration order.
    class_order = sorted(CLASSES)
    predicted_label = class_order[np.argmax(prediction[0])]
    return predicted_label

    
# Build the TF-IDF feature matrix, the encoded labels and the fitted tokenizer.
features, labels, tokenizer = raw_data_fn()
#print(features[0])
#print(labels[0])

# Hold out part of the data for evaluation (load_data_fn defaults to
# test_size=0.3) and report the size of each split.
train_features, test_features, train_labels, test_labels = load_data_fn(features, labels)
print("Train count {features/labels}: %s/%s" % (len(train_features), len(train_labels)))
print("Test count {features/labels}: %s/%s" % (len(test_features), len(test_labels)))

# Disabled tf.data input pipeline; the arrays are passed to model.fit() directly.
# #train_dataset = train_input_fn(train_features, train_labels, BATCH_SIZE)
# #print(train_dataset)

# Create the model to train.
model = model_fn()

# Create a saver to save the model.
# NOTE(review): `saver` is never used in the visible code -- the model is
# persisted through model.save() below. Confirm whether it can be dropped.
saver = tf.train.Saver()

# Fit the model on the training split.
model.fit(train_features, train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS)

# Evaluate the model on the held-out test split.
loss, accuracy = model.evaluate(test_features, test_labels)
print("Loss : %s   Accuracy: %s" % (loss, accuracy))


# Persist the trained model.
# NOTE(review): Keras `Model.save` presumably writes a Keras model file, not a
# TensorFlow checkpoint, despite the ".ckpt" suffix -- confirm.
model.save("user_topic.ckpt")

# Sample profile used as a single-prediction smoke test.
name = "Garrett Mitchell"
description = "Professional hockey player. Washington Capitals"

predicted_label = predict_fn(model, labels, tokenizer, name, description)
print("\nname: %s \ndescription: %s \npredicted class: %s" % (name, description, predicted_label))


TensorFlow version: 1.8.0
Eager execution: True
Count for data: 63
Train count {features/labels}: 44/44
Test count {features/labels}: 19/19
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_185 (Dense)            (None, 512)               512512    
_________________________________________________________________
dense_186 (Dense)            (None, 256)               131328    
_________________________________________________________________
dense_187 (Dense)            (None, 4)                 1028      
Total params: 644,868
Trainable params: 644,868
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss : 1.2782313823699951   Accuracy: 0.3684210479259491

name: Garrett Mitchell 
description: Professional hockey player. Washington Capitals 
predi