In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
import numpy as np

import tensorflow as tf
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import pickle

import matplotlib.pyplot as plt
import joblib
import itertools

import utils
import util

import sys
import argparse
import random

import nltk, re, time
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from tqdm import tqdm

# `tf` is only a local alias created by `import tensorflow as tf`; the import
# system needs the real package name, so these come from `tensorflow.keras`.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from collections import namedtuple

from contractions import get_contractions

# When True, the data-loading cell restores previously pickled artifacts;
# the pickling cell further down only writes files while this is False.
alreadyPickled = False

## Variable Initialization

In [None]:
# Settings shared by the rest of the notebook, grouped by what they control.

# Network shape
embedding_dim = 300
rnn_size = 256
num_layers = 2

# Training
epochs = 100
batch_size = 64
learning_rate = 0.01
keep_probability = 0.75

# Input handling
max_sequence_length = 1000


# Data Loading
If the data has already been pickled, the embedding and data-processing steps can be skipped.

In [None]:
# Restore previously pickled artifacts instead of recomputing them.
# NOTE(review): `loadfiles` is defined in a later cell, so on a fresh kernel this
# cell raises NameError whenever alreadyPickled is True — run the helper cell
# first or move it above this one.
# NOTE(review): these paths live under ./data/, while the pickling cell writes
# to ./data/good_pickles/ — confirm which location is intended.
if alreadyPickled:
    clean_reviews = loadfiles("./data/clean_reviews.p")

    sorted_reviews = loadfiles("./data/sorted_reviews.p")
    word_embedding_matrix = loadfiles("./data/word_embedding_matrix.p")
    

In [None]:
# Build the contraction lookup that clean_text consults for every token
# (token -> replacement text). get_contractions is imported from contractions.py.
contractions = get_contractions()

In [2]:
def clean_text(text, remove_stopwords = True, contraction_map = None):
    """Lower-case a review, expand contractions and strip markup/punctuation.

    :param text: raw review string
    :param remove_stopwords: when True, drop NLTK English stop words; the
        remaining tokens are re-joined with single spaces
    :param contraction_map: optional mapping from a contraction token to its
        replacement text; defaults to the module-level ``contractions``
    :return: the cleaned string (it may contain runs of spaces when
        ``remove_stopwords`` is False, because removed characters are
        replaced by spaces)
    """
    if contraction_map is None:
        contraction_map = contractions

    # Expand contractions token by token. Note that split()/join() also
    # collapses every run of whitespace, newlines included, to one space.
    words = [contraction_map[word] if word in contraction_map else word
             for word in text.lower().split()]
    text = " ".join(words)

    # Line-break tags go first: the punctuation pass below replaces '/', and
    # the URL pattern would swallow a tag glued to the end of a link, so the
    # tag could never be matched if it were handled later.
    text = re.sub(r'<br />', ' ', text)
    # Remove only the URL token itself. Newlines are already gone at this
    # point, so a greedy '.*' would delete the rest of the review.
    text = re.sub(r'https?://\S*', '', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Format words and remove unwanted characters
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)
    return text

In [3]:
def picklefiles(filename, stuff):
    """Serialise `stuff` to `filename` with pickle.

    The file is opened in binary write mode (replacing any existing file)
    inside a `with` block, so the handle is closed even if pickling raises.

    :param filename: path of the file to write
    :param stuff: any picklable object
    """
    with open(filename, "wb") as save_stuff:
        pickle.dump(stuff, save_stuff)
def loadfiles(filename):
    """Load and return the object pickled in `filename`.

    The file is read inside a `with` block, so the handle is closed even if
    unpickling raises.

    :param filename: path of a file written with pickle
    :return: the unpickled object
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this project produced itself.
    with open(filename, "rb") as saved_stuff:
        return pickle.load(saved_stuff)

In [None]:
# Clean every review with clean_text using its defaults (stop words removed).
# This rebuilds clean_reviews even when it was restored from a pickle above.
# NOTE(review): `reviews` is not defined in any cell visible above this one —
# presumably a DataFrame with a `Text` column loaded elsewhere; confirm, since
# a fresh kernel would raise NameError here.
# NOTE(review): the loop variable `text` stays defined after the loop and a
# later cell reads it; do not rename it without checking that cell.
clean_reviews = []
for text in reviews.Text:
    clean_reviews.append(clean_text(text))
print("Texts are complete.")

## Embeddings
We use ConceptNet Numberbatch embeddings instead of GloVe (Numberbatch reportedly outperforms GloVe).  
  
  
On top of the embeddings, we also keep track of commonly used words in the reviews that the embeddings don't cover. This way we could achieve higher test accuracy when we come across such words. A word counts as commonly used once it reaches a threshold number of occurrences; the threshold is currently set to 20.  
  
  
We also process the reviews a bit more, sorting them into groups of comparable length. This way, less padding is necessary and computation may be faster when training/testing.

In [None]:
def count_words(count_dict, text):
    '''Tally every whitespace-separated word of each sentence in `text`.

    `count_dict` is updated in place (word -> running count); nothing is
    returned.
    '''
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1

In [None]:
# Count how often each word occurs across all cleaned reviews; the number of
# distinct keys is reported as the vocabulary size.
word_counts = {}
count_words(word_counts, clean_reviews)            
print("Size of Vocabulary:", len(word_counts))

In [None]:
# Location of the ConceptNet Numberbatch text embeddings that are loaded below.
embed_path='./embeddings/numberbatch-en-17.02.txt'
def load_embeddings(path='./embeddings/numberbatch-en-17.02.txt'):
    """Read word vectors from a space-separated text file.

    Every data line has the form ``word v1 v2 ... vN``. A leading
    ``<count> <dim>`` header line (two integers, word2vec text format) and
    blank lines are skipped instead of being stored as words.

    :param path: path to the embeddings text file (read as UTF-8)
    :return: dict mapping each word to a 1-D float32 numpy array
    """
    # io.open accepts `encoding` on both Python 2 and 3. Decoding explicitly
    # as UTF-8 keeps non-ASCII words readable regardless of the locale.
    import io

    embeddings_index = {}
    with io.open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            values = line.rstrip().split(' ')
            if values == ['']:
                continue  # blank line
            is_header = (line_number == 0 and len(values) == 2
                         and values[0].isdigit() and values[1].isdigit())
            if is_header:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# word -> vector lookup; the embedding-matrix cell queries it with .get(word).
embeddings_index = load_embeddings(embed_path)

In [None]:
def get_features_for_layer(X, trained_model, layer_number, batches=256):
    """
    Evaluate one layer of `trained_model` on `X`.

    A backend function is built from the input of the model's first layer
    (plus the learning-phase flag, always fed as 0) to the output of
    `trained_model.layers[layer_number]`.

    NOTE(review): `K` and `array_batch_yield` are neither imported nor defined
    in the visible part of this notebook — confirm where they come from.
    NOTE(review): the two branches return different types (see :return:).

    :param X: Batch with dimensions according to the models first layer input-shape
    :param trained_model: Model to extract data from
    :param layer_number: Index of the layer we want to extract features from.
    :param batches: Passed to `array_batch_yield` to split X; a falsy value
        evaluates X in a single call
    :return: with batching, the per-batch results concatenated along axis 1
        and indexed with [0]; without batching, the raw return value of the
        backend function
    """
    extract = K.function(
        [trained_model.layers[0].input, K.learning_phase()],
        [trained_model.layers[layer_number].output],
    )

    if not batches:
        return extract([X, 0])

    per_batch = [extract([chunk, 0]) for chunk in array_batch_yield(X, batches)]
    return np.concatenate(per_batch, axis=1)[0]

# Balance the Dataset
We want to balance the dataset so that it contains an equal number of reviews for each category.  
For example, if the review counts are [200, 500, 100, 300, 400] for [1, 2, 3, 4, 5] stars respectively, we keep only 100 reviews per star rating (the size of the smallest class), before any additional `downsample_by` reduction is applied.

In [None]:
def minority_balance_dataframe_by_multiple_categorical_variables(df, categorical_columns=None, downsample_by=0.1):
    """
    Down-sample `df` so that every combination of the categorical columns
    contributes the same number of rows.

    Each group is sampled without replacement to
    ``int(smallest_group_size * downsample_by)`` rows. Rows are taken from
    `df` unchanged, so the result keeps the original columns, column order and
    index labels (no `level_N` index is produced), and works for any number of
    categorical columns.

    :param df: pandas.DataFrame
    :param categorical_columns: iterable of categorical columns names contained in {df}
    :param downsample_by: fraction of the smallest group's size kept per group
    :return: balanced pandas.DataFrame, sorted by index
    :raises ValueError: if no columns are given or one is missing from {df}
    """
    columns = list(categorical_columns) if categorical_columns is not None else []
    if not columns or not all(c in df.columns for c in columns):
        raise ValueError('Please provide one or more columns containing categorical variables')

    grouped = df.groupby(columns)
    group_sizes = grouped.size()
    if group_sizes.empty:
        # Nothing to balance (e.g. an empty frame): return an empty copy.
        return df.iloc[0:0].copy()

    per_group = int(group_sizes.min() * downsample_by)

    # Sampling each group directly avoids groupby.apply, whose treatment of
    # the grouping columns and of the result index differs across pandas
    # versions.
    sampled = [group.sample(per_group) for _, group in grouped]
    return pd.concat(sampled).sort_index()

In [None]:
# Load the raw reviews and keep those whose text is 10 to 4000 characters long.
df_reviews = pd.read_csv('reviews.csv')#, encoding='utf-8')
df_reviews['len'] = df_reviews.text.str.len()
df_reviews = df_reviews[df_reviews['len'].between(10, 4000)]

# Equalise the number of reviews per star rating, then keep a copy on disk.
df_balanced = minority_balance_dataframe_by_multiple_categorical_variables(
    df_reviews,
    categorical_columns=['stars'],
    downsample_by=0.1)

df_balanced.to_csv('balanced_reviews.csv', encoding='utf-8')

# Vocabulary cap for the tokenizer. It is defined here because the tokenizer
# has to exist before the embedding-matrix cell (which uses the same value)
# can derive NB_WORDS from the fitted word index.
MAX_NB_WORDS = 20000

# Fit on the balanced review texts themselves, i.e. the same texts that are
# converted to sequences below.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df_balanced.text.values)

joblib.dump(tokenizer, 'tokenizer.pickle')

# (word, index) pairs ordered by tokenizer index.
WORD_INDEX_SORTED = sorted(tokenizer.word_index.items(), key=lambda item: item[1])
seqs = tokenizer.texts_to_sequences(df_balanced.text.values)

# Pad/truncate every review to the length configured in the settings cell.
padReviews = pad_sequences(seqs, maxlen=max_sequence_length)

reviewsLength = max_sequence_length

ratings = df_balanced.stars.values.astype(int)
# NOTE(review): to_categorical sizes the one-hot width as max(rating) + 1, so
# ratings starting at 1 leave column 0 unused — confirm this is acceptable.
ratings_cat = to_categorical(ratings)

# Check alignment before anything is split or written to disk.
assert padReviews.shape[0] == ratings.shape[0]

X_train, X_test, y_train, y_test = train_test_split(padReviews, ratings_cat, test_size=0.2, random_state=9)
with pd.HDFStore('x_y_test_train.h5') as store:
    store['X_train'] = pd.DataFrame(X_train)
    store['X_test'] = pd.DataFrame(X_test)
    store['y_train'] = pd.DataFrame(y_train)
    store['y_test'] = pd.DataFrame(y_test)

In [None]:
embedding_dim = 300
MAX_NB_WORDS = 20000

# Tokenizer word indices start at 1 (vocab_size below is likewise
# len(WORD_INDEX_SORTED) + 1), so a vocabulary of N words needs N + 1 rows.
# Without the + 1, a vocabulary smaller than MAX_NB_WORDS would index one row
# past the end of the matrix.
NB_WORDS = min(len(WORD_INDEX_SORTED) + 1, MAX_NB_WORDS)

# Rows stay all-zero for index 0 and for words without a pretrained vector.
word_embedding_matrix = np.zeros((NB_WORDS, embedding_dim),
                                 dtype=np.float32)

for word, i in tokenizer.word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [None]:
def create_lengths(text):
    '''Return a DataFrame with one column, `counts`, holding len() of each item of `text`.'''
    return pd.DataFrame([len(sentence) for sentence in text], columns=['counts'])

In [None]:
# Summarise the distribution of text lengths.
# NOTE(review): `int_texts` is only assigned in a commented-out cell further
# down, so this raises NameError on a fresh kernel — possibly `seqs` from the
# tokenizer cell was intended; confirm.
lengths_texts = create_lengths(int_texts)
print("Texts:")
print(lengths_texts.describe())

In [None]:
# Inspect the upper tail of the text-length distribution, one percentile per line.
for pct in (80, 85, 90, 95, 99):
    print(np.percentile(lengths_texts.counts, pct))

In [None]:
# Persist the processed artifacts so later runs can set alreadyPickled = True.
# NOTE(review): in this branch (alreadyPickled is False) `sorted_reviews` is
# never assigned, and `int_to_vocab` only appears in commented-out cells, so
# those two lines raise NameError on a fresh kernel — confirm what should be
# saved.
# NOTE(review): files are written to ./data/good_pickles/, whereas the loading
# cell reads from ./data/ — confirm which location is intended.
if alreadyPickled == False:
    picklefiles("./data/good_pickles/clean_reviews.p",clean_reviews)
    picklefiles("./data/good_pickles/sorted_reviews.p",sorted_reviews)
    picklefiles("./data/good_pickles/word_embedding_matrix.p",word_embedding_matrix)
    picklefiles("./data/good_pickles/int_to_vocab.p",int_to_vocab)

# Building the Model

In [None]:
def model_inputs():
    """Create the placeholders that are fed into the graph.

    :return: tuple ``(input_data, labels, lr, keep_prob)``: two int32
        placeholders with two unconstrained dimensions, named 'input' and
        'labels', followed by two float32 placeholders named 'learning_rate'
        and 'keep_prob'
    """
    # Intended layouts: [batch_size x review length] for the reviews and
    # [batch_size x num_classes] for the labels.
    review_ids = tf.placeholder(tf.int32, shape=[None, None], name='input')
    targets = tf.placeholder(tf.int32, shape=[None, None], name='labels')

    step_size = tf.placeholder(tf.float32, name='learning_rate')
    dropout_keep = tf.placeholder(tf.float32, name='keep_prob')

    return review_ids, targets, step_size, dropout_keep

In [None]:
def gru_lstm(rnn_inputs, rnn_size, num_layers, keep_prob):
    '''Run the inputs through a stack of GRU layers with input dropout.

    Layer 0 consumes `rnn_inputs`; every later layer consumes the output of
    the layer before it. No sequence lengths are passed to dynamic_rnn, so
    every time step of the (padded) input is processed.

    :param rnn_inputs: float tensor fed to the first layer
    :param rnn_size: number of units in each GRU cell
    :param num_layers: number of stacked GRU layers (at least 1)
    :param keep_prob: keep probability for dropout on each layer's inputs
    :return: (output, states) of the last layer as returned by dynamic_rnn
    :raises ValueError: if num_layers is smaller than 1
    '''
    if num_layers < 1:
        raise ValueError('num_layers must be at least 1')

    layer_input = rnn_inputs
    for layer in range(num_layers):
        # Each layer gets its own variable scope: re-entering one shared scope
        # makes dynamic_rnn try to create the same variables a second time.
        with tf.variable_scope('GRU_lstm_{}'.format(layer)):
            # GRUCell takes `kernel_initializer`; it has no `initializer` argument.
            cell_fw = tf.contrib.rnn.GRUCell(
                rnn_size,
                kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw,
                                                    input_keep_prob = keep_prob)

            output, states = tf.nn.dynamic_rnn(cell_fw,
                                               layer_input,
                                               dtype=tf.float32)
        layer_input = output
    return output, states

# Getting Batches
Splits the (already padded) ratings and reviews into batches of equal size.

In [None]:
def get_batches(ratings, reviews, batch_size):
    """Yield (rating_batch, reviews_batch) pairs of `batch_size` consecutive rows.

    Both inputs are sliced as 2-D arrays (``[rows, :]``). Only full batches
    are produced: trailing rows that do not fill a whole batch are dropped.

    :param ratings: 2-D array of labels, one row per review
    :param reviews: 2-D array of reviews, one row per review
    :param batch_size: number of rows per batch
    """
    full_batches = len(reviews) // batch_size
    for start_i in range(0, full_batches * batch_size, batch_size):
        rating_batch = ratings[start_i:start_i + batch_size, :]
        reviews_batch = reviews[start_i:start_i + batch_size, :]

        yield rating_batch, reviews_batch

# Building the Graph

In [None]:
# One entry per tokenizer word plus one extra slot — presumably index 0, which
# the tokenizer does not assign to any word (confirm).
vocab_size = len(WORD_INDEX_SORTED)+1
def prediction_model(input_data, target_data, keep_prob, num_classes,
                     vocab_size, rnn_size, num_layers, batch_size):
    """Build the review classifier: embedding lookup -> GRU stack -> linear layer.

    `target_data`, `vocab_size` and `batch_size` are accepted for interface
    compatibility but are not used when building the graph.

    :param input_data: int tensor of word indices into `word_embedding_matrix`
    :param target_data: unused
    :param keep_prob: dropout keep probability passed on to gru_lstm
    :param num_classes: number of output classes (width of the logits)
    :param vocab_size: unused
    :param rnn_size: number of units per GRU layer
    :param num_layers: number of GRU layers
    :param batch_size: unused
    :return: (training_logits, inference_logits): the class scores computed
        from the final GRU state, and the index of the highest-scoring class
        for each row
    """
    # Output projection from the final GRU state to the class scores.
    W2 = tf.Variable(np.random.rand(rnn_size, num_classes),dtype=tf.float32)
    b2 = tf.Variable(np.zeros((1, num_classes)), dtype=tf.float32)

    # The matrix built from the Numberbatch vectors is used as a fixed lookup table.
    embeddings = word_embedding_matrix

    embs = tf.nn.embedding_lookup(embeddings, input_data)
    # The recurrent layers consume the embedded words, not the raw indices.
    output, state = gru_lstm(embs, rnn_size, num_layers, keep_prob)
    states_series = tf.reshape(state, [-1, rnn_size])

    training_logits = tf.matmul(states_series, W2) + b2
    inference_logits = tf.argmax(training_logits, axis=1)

    return training_logits, inference_logits

In [None]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():

    # Load the model inputs
    input_data, labels, lr, keep_prob = model_inputs()

    # One output unit per column of the one-hot label matrix.
    num_classes = y_train.shape[1]

    # Build the classifier inside this graph.
    training_logits, inference_logits = prediction_model(input_data,
                                                         labels,
                                                         keep_prob,
                                                         num_classes,
                                                         vocab_size,
                                                         rnn_size,
                                                         num_layers,
                                                         batch_size)

    # Give the outputs stable names so they can be looked up in the graph by name.
    training_logits = tf.identity(training_logits, 'logits')
    inference_logits = tf.identity(inference_logits, name='predictions')

    with tf.name_scope("optimization"):
        # Loss function: each review has a single label row, so the loss is
        # computed per review and needs no sequence mask.
        cost = tf.losses.softmax_cross_entropy(labels,
                                               training_logits)

        # Optimizer (the TF 1.x class is RMSPropOptimizer).
        # NOTE(review): this uses the module-level `learning_rate` constant;
        # the `lr` placeholder is created but not connected to the optimizer —
        # confirm which one is intended.
        optimizer = tf.train.RMSPropOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

print("Graph is built.")
graph_location = "./graph"
print(graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(train_graph)

In [None]:
# # Find missing embedding words
# missing_words = 0
# threshold = 20

# for word, count in word_counts.items():
#     if count > threshold:
#         if word not in embeddings_index:
#             missing_words += 1
            
# missing_ratio = round(missing_words/len(word_counts),4)*100    

# print("Number of words missing from CN:", missing_words)

In [None]:
# Apply convert_to_ints to clean_summaries and clean_texts
# word_count = 0
# unk_count = 0

# int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)

# unk_percent = round(unk_count/word_count,4)*100

# print("Total number of words in headlines:", word_count)
# print("Total number of UNKs in headlines:", unk_count)
# print("Percent of words that are UNK: {}%".format(unk_percent))

In [None]:
# def pad_sentence_batch(sentence_batch):
#     """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
#     max_sentence = max([len(sentence) for sentence in sentence_batch])
#     return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [None]:
# # Sort the summaries and texts by the length of the texts, shortest to longest
# # Limit the length of summaries and texts based on the min and max ranges.
# # Remove reviews that include too many UNKs
# sorted_reviews = []
# max_text_length = 200
# min_length = 2
# unk_text_limit = 1
# unk_summary_limit = 0

# for length in range(min(lengths_texts.counts), max_text_length): 
#     for count, words in enumerate(int_summaries):
#         if (len(int_texts[count]) >= min_length and
#             length == len(int_texts[count])
#            ):
#             sorted_texts.append(int_texts[count])

In [None]:
# vocab_to_int = {} 
# value = 0
# for word, count in word_counts.items():
#     if count >= threshold or word in embeddings_index:
#         vocab_to_int[word] = value
#         value += 1

# # Special tokens that will be added to our vocab
# codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

# # Add codes to vocab
# for code in codes:
#     vocab_to_int[code] = len(vocab_to_int)

# # Dictionary to convert integers to words
# int_to_vocab = {}
# for word, value in vocab_to_int.items():
#     int_to_vocab[value] = word

# usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

# print("Total number of unique words:", len(word_counts))
# print("Number of words we will use:", len(vocab_to_int))
# print("Percent of words we will use: {}%".format(usage_ratio))

In [None]:
# def unk_counter(sentence):
#     '''Counts the number of time UNK appears in a sentence.'''
#     unk_count = 0
#     for word in sentence:
#         if word == vocab_to_int["<UNK>"]:
#             unk_count += 1
#     return unk_count