[View in Colaboratory](https://colab.research.google.com/github/acesaif/ml_crash_course/blob/master/ml_embeddings.ipynb)

## Setup

In [0]:
import collections
import math

from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics

tf.logging.set_verbosity(tf.logging.ERROR)

## Load Dataset

In [0]:
train_url = 'https://storage.googleapis.com/mledu-datasets/sparse-data-embedding/train.tfrecord'
train_path = tf.keras.utils.get_file(train_url.split('/')[-1], train_url)
test_url = 'https://storage.googleapis.com/mledu-datasets/sparse-data-embedding/test.tfrecord'
test_path = tf.keras.utils.get_file(test_url.split('/')[-1], test_url)

In [12]:
print(train_path)
print(test_path)

/content/.keras/datasets/train.tfrecord
/content/.keras/datasets/test.tfrecord


## Building a Sentiment Analysis Model

Vocabulary: list of each term we expect to see in data. We will turn our string-values into features vectors by using vocabulary. Each term in vocabulary is mapped to a cooridinate of feature vector. To convert string-value terms for an example into this vector format, we encode.

Encoding happens this way...

* Each coordinate gets a value of `0` if, vocabulary terms not found in the given example. Elsewhere `1` if found.
* If examples terms not in vocabulary, then `throw away` those example terms.

## Building the Input Pipeline

In [0]:
def _parse_function(record):
  """Extracts features and labels.
  
  Args:
    record: File path to a TFRecord file    
  Returns:
    A `tuple` `(labels, features)`:
      features: A dict of tensors representing the features
      labels: A tensor with the corresponding labels.
  """
  features = {
      "terms": tf.VarLenFeature(dtype=tf.string), # terms are strings of varying lengths
      "labels": tf.FixedLenFeature(shape=[1], dtype=tf.float32) # labels are 0 or 1
  }
  
  parsed_features = tf.parse_single_example(record, features)
  
  terms = parsed_features['terms'].values
  labels = parsed_features['labels']
  
  return {'terms': terms}, labels

In [15]:
# Create the Dataset object
ds = tf.data.TFRecordDataset(train_path)
# Map the features and labels with parse function
ds = ds.map(_parse_function)
ds

<MapDataset shapes: ({terms: (?,)}, (1,)), types: ({terms: tf.string}, tf.float32)>

In [16]:
n = ds.make_one_shot_iterator().get_next() # retrieve the first example of training dataset.
sess = tf.Session()
sess.run(n)

({'terms': array([b'but', b'it', b'does', b'have', b'some', b'good', b'action',
         b'and', b'a', b'plot', b'that', b'is', b'somewhat', b'interesting',
         b'.', b'nevsky', b'acts', b'like', b'a', b'body', b'builder',
         b'and', b'he', b'isn', b"'", b't', b'all', b'that', b'attractive',
         b',', b'in', b'fact', b',', b'imo', b',', b'he', b'is', b'ugly',
         b'.', b'(', b'his', b'acting', b'skills', b'lack', b'everything',
         b'!', b')', b'sascha', b'is', b'played', b'very', b'well', b'by',
         b'joanna', b'pacula', b',', b'but', b'she', b'needed', b'more',
         b'lines', b'than', b'she', b'was', b'given', b',', b'her',
         b'character', b'needed', b'to', b'be', b'developed', b'.',
         b'there', b'are', b'way', b'too', b'many', b'men', b'in', b'this',
         b'story', b',', b'there', b'is', b'zero', b'romance', b',', b'too',
         b'much', b'action', b',', b'and', b'way', b'too', b'dumb', b'of',
         b'an', b'ending', b'.', b'

## Input Function

In [0]:
# Create an input_fn that parses the tf.Examples from the given files,
# and split them into features and targets
def _input_fn(input_filenames, num_epochs=None, shuffle=True):
  
  # Create a dataset and map features and labels
  ds = tf.data.TFRecordDataset(input_filenames)
  ds = ds.map(_parse_function)
  
  if shuffle:
    ds = ds.shuffle(10000)
  
  # Our feature data is variable-length, so we pad and batch
  # each field of the dataset structure to whatever size is necessary.
  ds = ds.padded_batch(25, ds.output_shapes)
  
  ds = ds.repeat(num_epochs)
  
  # Return the next batch of data.
  features, labels = ds.make_one_shot_iterator().get_next()
  return features, labels

## Linear Model with Sparse Inputs and Explicit Vocabulary

In [0]:
# 50 informative terms that compose our model vocabulary 
informative_terms = ("bad", "great", "best", "worst", "fun", "beautiful",
                     "excellent", "poor", "boring", "awful", "terrible",
                     "definitely", "perfect", "liked", "worse", "waste",
                     "entertaining", "loved", "unfortunately", "amazing",
                     "enjoyed", "favorite", "horrible", "brilliant", "highly",
                     "simple", "annoying", "today", "hilarious", "enjoyable",
                     "dull", "fantastic", "poorly", "fails", "disappointing",
                     "disappointment", "not", "him", "her", "good", "time",
                     "?", ".", "!", "movie", "film", "action", "comedy",
                     "drama", "family")

terms_feature_column = tf.feature_column.categorical_column_with_vocabulary_list(key="terms", vocabulary_list=informative_terms)

In [22]:
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

feature_columns = [ terms_feature_column ]

classifier = tf.estimator.LinearClassifier(
    feature_columns = feature_columns,
    optimizer = my_optimizer
)

classifier.train(
    input_fn = lambda: _input_fn([train_path]),
    steps = 1000
)

evaluation_metrics = classifier.evaluate(
    input_fn = lambda: _input_fn([train_path]),
    steps = 1000
)
print("Training set metrics: ")
for m in evaluation_metrics:
  print(m, evaluation_metrics[m])
print("---")

evaluation_metrics = classifier.evaluate(
    input_fn = lambda: _input_fn([test_path]),
    steps = 1000
)
print("Testing set metrics: ")
for m in evaluation_metrics:
  print(m, evaluation_metrics[m])
print("---")

Training set metrics: 
accuracy 0.78824
accuracy_baseline 0.5
auc 0.87203777
auc_precision_recall 0.8627088
average_loss 0.45079547
label/mean 0.5
loss 11.269887
prediction/mean 0.5136876
global_step 1000
---
Testing set metrics: 
accuracy 0.78616
accuracy_baseline 0.5
auc 0.87018645
auc_precision_recall 0.8601884
average_loss 0.45149922
label/mean 0.5
loss 11.28748
prediction/mean 0.51235497
global_step 1000
---


## Deep Neural Network Model

`tf.feature_column.indicator_column(categorical_column)` as input and represents multi-hot representation of given `categorical_column`

In [25]:
classifier = tf.estimator.DNNClassifier(
    feature_columns = [tf.feature_column.indicator_column(terms_feature_column)],
    hidden_units = [20, 20],
    optimizer = my_optimizer
)

try:
  classifier.train(
      input_fn = lambda: _input_fn([train_path]),
      steps = 1000
  )
  
  evaluation_metrics = classifier.evaluate(
      input_fn = lambda: _input_fn([train_path]),
      steps = 1000
  )
  print("Training set metrics: ")
  for m in evaluation_metrics:
    print(m, evaluation_metrics[m])
  print("---")
  evaluation_metrics = classifier.evaluate(
      input_fn = lambda: _input_fn([test_path]),
      steps = 1
  )
  print("Testing set metrics: ")
  for m in evaluation_metrics:
    print(m, evaluation_metrics[m])
  print("---")
except ValueError as err:
  print(err)

Training set metrics: 
accuracy 0.78804
accuracy_baseline 0.5
auc 0.8722286
auc_precision_recall 0.86412716
average_loss 0.44860134
label/mean 0.5
loss 11.215034
prediction/mean 0.50501287
global_step 1000
---
Testing set metrics: 
accuracy 0.68
accuracy_baseline 0.52
auc 0.7307692
auc_precision_recall 0.71444154
average_loss 0.6154636
label/mean 0.52
loss 15.38659
prediction/mean 0.52615196
global_step 1000
---


## Use an Embedding with Deep Neural Network Model

An `embedding_column` takes sparse data as input and returns a lower-dimensional dense vector as output.

In [27]:
terms_embedding_column = tf.feature_column.embedding_column(terms_feature_column, dimension=2)
feature_columns = [ terms_embedding_column ]

my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
classier = tf.estimator.DNNClassifier(
    feature_columns = feature_columns,
    hidden_units = [20, 20],
    optimizer = my_optimizer
)

try:
  classifier.train(
      input_fn = lambda: _input_fn([train_path]),
      steps = 1000
  )
  
  evaluation_metrics = classifier.evaluate(
      input_fn = lambda: _input_fn([train_path]),
      steps = 1000
  )
  print("Training set metrics: ")
  for m in evaluation_metrics:
    print(m, evaluation_metrics[m])
  print("---")
  
  evaluation_metrics = classifier.evaluate(
      input_fn = lambda: _input_fn([test_path]),
      steps = 1000
  )
  print("Testing set metrics: ")
  for m in evaluation_metrics:
    print(m, evaluation_metrics[m])
  print("---")
except ValueError as err:
  print(err)

Training set metrics: 
accuracy 0.788
accuracy_baseline 0.5
auc 0.87665486
auc_precision_recall 0.86694
average_loss 0.44380242
label/mean 0.5
loss 11.09506
prediction/mean 0.53582823
global_step 3000
---
Testing set metrics: 
accuracy 0.78408
accuracy_baseline 0.5
auc 0.8737385
auc_precision_recall 0.86340797
average_loss 0.4469021
label/mean 0.5
loss 11.172553
prediction/mean 0.5349425
global_step 3000
---
