In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [7]:
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

def download_and_load_datasets(force_download=False):
    # dataset = tf.keras.utils.get_file(
    #     fname="aclImdb.tar.gz", 
    #     origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    #     extract=True)
  # dataset_train = open("aclimdb/train", 'rt')
  # dataset_train = load_dataset(os.path.join("aclimdb", "train"))
  # dataset_test = load_dataset(os.path.join("aclimdb", "test"))

  
  train_df = load_dataset(os.path.join(os.path.dirname("./aclimdb"), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname("./aclimdb"), 
                                      "aclImdb", "test"))
  
  return train_df, test_df

tf.logging.set_verbosity(tf.logging.ERROR)

train_df, test_df = download_and_load_datasets()
train_df.head()

Unnamed: 0,sentence,sentiment,polarity
0,"Well, TiVo recorded this because of Angelina J...",2,0
1,There are no spoilers in this review because e...,1,0
2,Have never understood why the MacDonald-Eddy s...,9,1
3,"I grew up in Royersford, Pa. The town where Je...",8,1
4,Flight of Fury starts as General Tom Barnes (A...,2,0


In [8]:
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    train_df, train_df["polarity"], num_epochs=None, shuffle=True)

predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    train_df, train_df["polarity"], shuffle=False)

predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    test_df, test_df["polarity"], shuffle=False)


In [9]:
embedded_text_feature_column = hub.text_embedding_column(
    key="sentence", 
    module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")


In [26]:
estimator = tf.estimator.DNNClassifier(
    hidden_units=[500, 100],
    feature_columns=[embedded_text_feature_column],
    n_classes=2,
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))
model_name = "tf_testing"
path = "./" + model_name + "-cnn/"

load_checkpoint = False


In [29]:
estimator.train(input_fn=train_input_fn, steps=5000)
saver = tf.train.Saver(max_to_keep=2)
saver.save(sess, path + model_name)


ValueError: No variables to save

In [30]:
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

print("Training set accuracy: {accuracy}".format(**train_eval_result))
print("Test set accuracy: {accuracy}".format(**test_eval_result))


 
if not os.path.exists(path):
    os.makedirs(path)
     
with tf.Session() as sess:
     
    if load_checkpoint:
        checkpoint = tf.train.get_checkpoint_state(path)
        saver.restore(sess, checkpoint.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())



Training set accuracy: 0.8259999752044678
Test set accuracy: 0.7991999983787537


In [14]:
def get_predictions(estimator, input_fn):
    return [x["class_ids"][0] for x in estimator.predict(input_fn=input_fn)]


LABELS = [
    "negative", "positive"
]

with tf.Graph().as_default():
  cm = tf.confusion_matrix(train_df["polarity"], 
                           get_predictions(estimator, predict_train_input_fn))
  
with tf.Session() as session:
    cm_out = session.run(cm)


cm_out = cm_out.astype(float) / cm_out.sum(axis=1)[:, np.newaxis]

sns.heatmap(cm_out, annot=True, xticklabels=LABELS, yticklabels=LABELS)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


RuntimeError: The Session graph is empty.  Add operations to the graph before calling run().