In [21]:
## Problem statement - classify an input sentence into one of 15 categories such as People, Company, etc. The input data
## is the DBpedia dataset loaded through TensorFlow. The training data contains a description and a category tag for each
## example. For the test set only the description is fed to the model; its category tags are used solely to score accuracy.
## Algorithms used here: 1) a bag-of-words or skip-gram model by TensorFlow (skip gram is independent of the left/right
## words, i.e. basically independent of context)
## 2) an RNN, to take the context into account
## Future improvements - use GloVe word embeddings for each sentence by: 1) adding the embeddings of each word, or
## 2) concatenating the embeddings to create a bigger-dimension vector

import tensorflow as tf
import pandas as pd
import numpy as np
learn = tf.contrib.learn
from sklearn import metrics

# Load the DBpedia dataset only when it is not already present in the kernel
# namespace. A fresh kernel (Restart & Run All) gets the data, while re-running
# this cell in a live session skips the reload.
if 'dbpedia' not in globals():
  dbpedia = learn.datasets.load_dataset('dbpedia')

# Maximum document length handed to the vocabulary processor.
MAX_DOCUMENT_LENGTH = 50
# Width of the word-embedding vectors (the RNN model also uses it as the GRU size).
EMBEDDING_SIZE = 100

In [22]:
## Future improvements: apply lemmatization, stemming and stop-word removal.
## As per prior experience, the context of the sentence is lost when lemmatization, stemming and stop-word removal are
## applied, so remove only the 6-7 most common stop words.
## Porter stemming is not that great. For the bag-of-words model all of this can be applied, since context is not
## needed (skip gram).

# Column 1 of the raw data is used as the text of each example; the targets are
# the integer class labels.
x_train = pd.DataFrame(dbpedia.train.data)[1]
y_train = pd.Series(dbpedia.train.target)
x_test = pd.DataFrame(dbpedia.test.data)[1]
y_test = pd.Series(dbpedia.test.target)

# Preview the first five rows of every split. print() is called with a single
# argument so the output is the same under Python 2 and Python 3, and the cell
# matches the print(...) calls used elsewhere in the notebook.
print(x_train[:5])
print(y_train[:5])
print(x_test[:5])
print(y_test[:5])


0     Abbott of Farnham E D Abbott Limited was a Br...
1     Michael Best & Friedrich LLP is a U.S. law fi...
2     The Centurion Bank of Punjab (formerly Centur...
3     Carlsberg Srbija (full legal name: Carlsberg ...
4     Amdocs Limited is a provider of software and ...
Name: 1, dtype: object
0    1
1    1
2    1
3    1
4    1
dtype: int32
0     TY KU /taɪkuː/ is an American alcoholic bever...
1     Palace Software was a British video game publ...
2     NIKI Luftfahrt GmbH also known as flyNiki is ...
3     Dirgantara Air Service was an airline based i...
4     Nash Timbers is a global and domestic distrib...
Name: 1, dtype: object
0    1
1    1
2    1
3    1
4    1
dtype: int32


In [23]:
## Build the vocabulary from the training text and encode both splits with it.
## NOTE: x_train / x_test are rebound from raw text to word-id arrays here, so
## this cell must be run exactly once after the data-loading cell.
vocab_processor = learn.preprocessing.VocabularyProcessor(
    max_document_length=MAX_DOCUMENT_LENGTH)

# fit_transform is applied to the training text; the test text goes through
# transform only, using the already-fitted processor.
x_train = np.array(
    list(vocab_processor.fit_transform(x_train)))
x_test = np.array(
    list(vocab_processor.transform(x_test)))

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

Total words: 7664


In [24]:
def bag_of_words_model(features, target):
  """Model function: bag-of-words encoder followed by a linear softmax layer.

  Args:
    features: word-id input supplied by the estimator (assumed to be a
      [batch_size, MAX_DOCUMENT_LENGTH] integer tensor -- TODO confirm).
    target: integer class ids; one-hot encoded here with depth 15.

  Returns:
    A (predictions, loss, train_op) tuple. predictions is a dict with the
    arg-max class under 'class' and the softmax probabilities under 'prob'.
  """
  num_classes = 15  # depth of the one-hot targets and width of the output layer

  # One-hot labels: on value 1 at the target index, off value 0 elsewhere.
  onehot_target = tf.one_hot(target, depth=num_classes, on_value=1, off_value=0)

  # Encode each row of word ids as a single EMBEDDING_SIZE-wide vector.
  doc_vector = tf.contrib.layers.bow_encoder(
      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

  # Linear projection (no activation) to one logit per class.
  logits = tf.contrib.layers.fully_connected(
      doc_vector, num_classes, activation_fn=None)

  # Softmax cross-entropy between the logits and the one-hot targets.
  loss = tf.contrib.losses.softmax_cross_entropy(logits, onehot_target)

  # Training op: Adam on the loss, advancing the global step.
  train_op = tf.contrib.layers.optimize_loss(
      loss,
      tf.contrib.framework.get_global_step(),
      optimizer='Adam',
      learning_rate=0.01)

  predictions = {
      'class': tf.argmax(logits, 1),
      'prob': tf.nn.softmax(logits),
  }
  return predictions, loss, train_op

In [26]:
def rnn_model(features, target):
  """Model function: word embeddings -> GRU -> linear softmax layer.

  Args:
    features: word-id input supplied by the estimator (assumed to be a
      [batch_size, MAX_DOCUMENT_LENGTH] integer tensor -- TODO confirm).
    target: integer class ids; one-hot encoded here with depth 15.

  Returns:
    A (predictions, loss, train_op) tuple. predictions is a dict with the
    arg-max class under 'class' and the softmax probabilities under 'prob'.
  """
  num_classes = 15  # depth of the one-hot targets and width of the output layer

  # Look up an embedding for every word id: an [n_words, EMBEDDING_SIZE]
  # embedding matrix maps the id sequence to
  # [batch_size, sequence_length, EMBEDDING_SIZE].
  embedded = tf.contrib.layers.embed_sequence(
      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')

  # Unstack along the sequence axis: one [batch_size, EMBEDDING_SIZE] tensor
  # per word position.
  steps = tf.unstack(embedded, axis=1)

  # GRU cell whose hidden size equals EMBEDDING_SIZE.
  gru_cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)

  # Unroll the RNN over all word positions and keep only the second element
  # of the (outputs, state) pair, i.e. the encoding after the last step.
  final_state = tf.contrib.rnn.static_rnn(gru_cell, steps, dtype=tf.float32)[1]

  # The final encoding is the feature vector for a linear classifier over the
  # output classes.
  onehot_target = tf.one_hot(target, depth=num_classes, on_value=1, off_value=0)
  logits = tf.contrib.layers.fully_connected(
      final_state, num_classes, activation_fn=None)
  loss = tf.contrib.losses.softmax_cross_entropy(logits, onehot_target)

  # Training op: Adam on the loss, advancing the global step.
  train_op = tf.contrib.layers.optimize_loss(
      loss,
      tf.contrib.framework.get_global_step(),
      optimizer='Adam',
      learning_rate=0.01)

  predictions = {
      'class': tf.argmax(logits, 1),
      'prob': tf.nn.softmax(logits),
  }
  return predictions, loss, train_op

In [27]:
## Future improvements: apply k-fold cross-validation to guard against over-fitting.
## 87% accuracy was reported with the bag-of-words model (figure from the original run).

# Mini-batch size used for training. When fit() is given x/y without a
# batch_size, the contrib.learn data feeder falls back to a batch equal to the
# whole training set, so every one of the 10000 steps would process all
# training rows. NOTE(review): 128 is a conventional starting point, not a
# tuned value -- adjust as needed.
BATCH_SIZE = 128

# classifier = learn.Estimator(model_fn=bag_of_words_model)
classifier = learn.Estimator(model_fn=rnn_model)  ## originally noted as taking 1 hour+ to train (full-batch run)

# Train on mini-batches, then predict the class of every test document.
classifier.fit(x_train, y_train, steps=10000, batch_size=BATCH_SIZE)

y_predicted = [p['class'] for p in classifier.predict(x_test, as_iterable=True)]

# Fraction of test documents whose predicted class equals the true label.
score = metrics.accuracy_score(y_test, y_predicted)

print('Accuracy: {0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': None, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1196bcd90>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving int

KeyboardInterrupt: 