In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import numpy as np
import pandas as pd
import tensorflow as tf
import chakin
import time

import json
import os
import os.path  # for manipulation of file path names
import re  # regular expressions

from collections import defaultdict
import nltk
from nltk.tokenize import TreebankWordTokenizer

import time

import itertools

RANDOM_SEED = 9999

def reset_graph(seed= RANDOM_SEED):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [2]:
chakin.search(lang= 'English')

                   Name  Dimension                     Corpus VocabularySize  \
2          fastText(en)        300                  Wikipedia           2.5M   
11         GloVe.6B.50d         50  Wikipedia+Gigaword 5 (6B)           400K   
12        GloVe.6B.100d        100  Wikipedia+Gigaword 5 (6B)           400K   
13        GloVe.6B.200d        200  Wikipedia+Gigaword 5 (6B)           400K   
14        GloVe.6B.300d        300  Wikipedia+Gigaword 5 (6B)           400K   
15       GloVe.42B.300d        300          Common Crawl(42B)           1.9M   
16      GloVe.840B.300d        300         Common Crawl(840B)           2.2M   
17    GloVe.Twitter.25d         25               Twitter(27B)           1.2M   
18    GloVe.Twitter.50d         50               Twitter(27B)           1.2M   
19   GloVe.Twitter.100d        100               Twitter(27B)           1.2M   
20   GloVe.Twitter.200d        200               Twitter(27B)           1.2M   
21  word2vec.GoogleNews        300      

In [3]:


CHAKIN_INDEX = 11
NUMBER_OF_DIMENSIONS = 50
SUBFOLDER_NAME = "glove.6B"

DATA_FOLDER = "embeddings"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(
        UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.{}d.txt".format(
        SUBFOLDER_NAME, NUMBER_OF_DIMENSIONS))
    
if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile
    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

print('\nRun complete')

Embeddings already downloaded.
Embeddings already extracted.

Run complete


In [4]:
REMOVE_STOPWORDS = False
EVOCABSIZE = 10000  # specify desired size of pre-defined embedding vocabulary 


In [5]:
# Select the pre-defined embeddings source        
# Define vocabulary size for the language model    
# Create a word_to_embedding_dict for GloVe.6B.50d
embeddings_directory = 'embeddings/gloVe.6B'
filename = 'glove.6B.50d.txt'
embeddings_filename = os.path.join(embeddings_directory, filename)

In [6]:
# For word embeddings, this default value is a vector of zeros
# Documentation for the Python standard library:
#   Hellmann, D. 2017. The Python 3 Standard Library by Example. Boston: 
#     Addison-Wesley. [ISBN-13: 978-0-13-429105-5]
def load_embedding_from_disks(embeddings_filename, with_indexes=True):
    """
    Read a embeddings txt file. If `with_indexes=True`, 
    we return a tuple of two dictionnaries
    `(word_to_index_dict, index_to_embedding_array)`, 
    otherwise we return only a direct 
    `word_to_embedding_dict` dictionnary mapping 
    from a string to a numpy array.
    """
    if with_indexes:
        word_to_index_dict = dict()
        index_to_embedding_array = []
  
    else:
        word_to_embedding_dict = dict()

    with open(embeddings_filename, 'r', encoding='utf-8') as embeddings_file:
        for (i, line) in enumerate(embeddings_file):

            split = line.split(' ')

            word = split[0]

            representation = split[1:]
            representation = np.array(
                [float(val) for val in representation]
            )

            if with_indexes:
                word_to_index_dict[word] = i
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    # Empty representation for unknown words.
    _WORD_NOT_FOUND = [0.0] * len(representation)
    if with_indexes:
        _LAST_INDEX = i + 1
        word_to_index_dict = defaultdict(
            lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(
            index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
        return word_to_embedding_dict

print('\nLoading embeddings from', embeddings_filename)
word_to_index, index_to_embedding = \
    load_embedding_from_disks(embeddings_filename, with_indexes=True)
print("Embedding loaded from disks.")


Loading embeddings from embeddings/gloVe.6B\glove.6B.50d.txt
Embedding loaded from disks.


In [7]:
# Additional background code from
# https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer
# shows the general structure of the data structures for word embeddings
# This code is modified for our purposes in language modeling 
vocab_size, embedding_dim = index_to_embedding.shape
print("Embedding is of shape: {}".format(index_to_embedding.shape))
print("This means (number of words, number of dimensions per word)\n")
print("The first words are words that tend occur more often.")

print("Note: for unknown words, the representation is an empty vector,\n"
      "and the index is the last one. The dictionnary has a limit:")
print("    {} --> {} --> {}".format("A word", "Index in embedding", 
      "Representation"))
word = "worsdfkljsdf"  # a word obviously not in the vocabulary
idx = word_to_index[word] # index for word obviously not in the vocabulary
complete_vocabulary_size = idx 
embd = list(np.array(index_to_embedding[idx], dtype=int)) # "int" compact print
print("    {} --> {} --> {}".format(word, idx, embd))
word = "the"
idx = word_to_index[word]
embd = list(index_to_embedding[idx])  # "int" for compact print only.
print("    {} --> {} --> {}".format(word, idx, embd))

Embedding is of shape: (400001, 50)
This means (number of words, number of dimensions per word)

The first words are words that tend occur more often.
Note: for unknown words, the representation is an empty vector,
and the index is the last one. The dictionnary has a limit:
    A word --> Index in embedding --> Representation
    worsdfkljsdf --> 400000 --> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    the --> 0 --> [0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.044457, -0.49688, -0.17862, -0.00066023, -0.6566, 0.27843, -0.14767, -0.55677, 0.14658, -0.0095095, 0.011658, 0.10204, -0.12792, -0.8443, -0.12181, -0.016801, -0.33279, -0.1552, -0.23131, -0.19181, -1.8823, -0.76746, 0.099051, -0.42125, -0.19526, 4.0071, -0.18594, -0.52287, -0.31681, 0.00059213, 0.0074449, 0.17778, -0.15897, 0.012041, -0.054223, -0.29871, -0.15749, -0.34758, -0.045637, -0.44251, 0.18785, 0.0027849, -0.18

In [8]:
# Show how to use embeddings dictionaries with a test sentence
# This is a famous typing exercise with all letters of the alphabet
# https://en.wikipedia.org/wiki/The_quick_brown_fox_jumps_over_the_lazy_dog
a_typing_test_sentence = 'The quick brown fox jumps over the lazy dog'
print('\nTest sentence: ', a_typing_test_sentence, '\n')
words_in_test_sentence = a_typing_test_sentence.split()

print('Test sentence embeddings from complete vocabulary of', 
      complete_vocabulary_size, 'words:\n')
for word in words_in_test_sentence:
    word_ = word.lower()
    embedding = index_to_embedding[word_to_index[word_]]
    print(word_ + ": ", embedding)


Test sentence:  The quick brown fox jumps over the lazy dog 

Test sentence embeddings from complete vocabulary of 400000 words:

the:  [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]
quick:  [ 0.13967   -0.53798   -0.18047   -0.25142    0.16203   -0.13868
 -0.24637    0.75111    0.27264    0.61035   -0.82548    0.038647
 -0.32361    0.30373   -0.14598   -0.23551    0.39267   -1.1287
 -0.23636   -1.0629     0.046277   0.29143   -0.25

In [9]:
# Define vocabulary size for the language model    
# To reduce the size of the vocabulary to the n most frequently used words

def default_factory():
    return EVOCABSIZE  # last/unknown-word row in limited_index_to_embedding
# dictionary has the items() function, returns list of (key, value) tuples
limited_word_to_index = defaultdict(default_factory, \
    {k: v for k, v in word_to_index.items() if v < EVOCABSIZE})

In [10]:
# Select the first EVOCABSIZE rows to the index_to_embedding
limited_index_to_embedding = index_to_embedding[0:EVOCABSIZE,:]
# Set the unknown-word row to be all zeros as previously
limited_index_to_embedding = np.append(limited_index_to_embedding, 
    index_to_embedding[index_to_embedding.shape[0] - 1, :].\
        reshape(1,embedding_dim), 
    axis = 0)

In [11]:
# Delete large numpy array to clear some CPU RAM
del index_to_embedding

In [12]:
# Verify the new vocabulary: should get same embeddings for test sentence
# Note that a small EVOCABSIZE may yield some zero vectors for embeddings
print('\nTest sentence embeddings from vocabulary of', EVOCABSIZE, 'words:\n')
for word in words_in_test_sentence:
    word_ = word.lower()
    embedding = limited_index_to_embedding[limited_word_to_index[word_]]
    print(word_ + ": ", embedding)


Test sentence embeddings from vocabulary of 10000 words:

the:  [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]
quick:  [ 0.13967   -0.53798   -0.18047   -0.25142    0.16203   -0.13868
 -0.24637    0.75111    0.27264    0.61035   -0.82548    0.038647
 -0.32361    0.30373   -0.14598   -0.23551    0.39267   -1.1287
 -0.23636   -1.0629     0.046277   0.29143   -0.25819   -0.094902
  0.79478   -1.2095    -0.01039   -0.092086   0.84322   

In [13]:
# code for working with movie reviews data 
# Source: Miller, T. W. (2016). Web and Network Data Science.
#    Upper Saddle River, N.J.: Pearson Education.
#    ISBN-13: 978-0-13-388644-3
# This original study used a simple bag-of-words approach
# to sentiment analysis, along with pre-defined lists of
# negative and positive words.        
# Code available at:  https://github.com/mtpa/wnds       
# ------------------------------------------------------------
# Utility function to get file names within a directory
def listdir_no_hidden(path):
    start_list = os.listdir(path)
    end_list = []
    for file in start_list:
        if (not file.startswith('.')):
            end_list.append(file)
    return(end_list)

In [14]:
# define list of codes to be dropped from document
# carriage-returns, line-feeds, tabs
codelist = ['\r', '\n', '\t']   

In [15]:
# We will not remove stopwords in this exercise because they are
# important to keeping sentences intact
if REMOVE_STOPWORDS:
    print(nltk.corpus.stopwords.words('english'))
    # previous analysis of a list of top terms showed a number of words, along 
    # with contractions and other word strings to drop from further analysis, add
    # these to the usual English stopwords to be dropped from a document collection
    more_stop_words = ['cant','didnt','doesnt','dont','goes','isnt','hes',\
        'shes','thats','theres','theyre','wont','youll','youre','youve', 'br'\
        've', 're', 'vs'] 

    some_proper_nouns_to_remove = ['dick','ginger','hollywood','jack',\
        'jill','john','karloff','kudrow','orson','peter','tcm','tom',\
        'toni','welles','william','wolheim','nikita']

    # start with the initial list and add to it for movie text work 
    stoplist = nltk.corpus.stopwords.words('english') + more_stop_words +\
        some_proper_nouns_to_remove

In [16]:
# text parsing function for creating text documents 
# there is more we could do for data preparation 
# stemming... looking for contractions... possessives... 
# but we will work with what we have in this parsing function
# if we want to do stemming at a later time, we can use
#     porter = nltk.PorterStemmer()  
# in a construction like this
#     words_stemmed =  [porter.stem(word) for word in initial_words]  
def text_parse(string):
    # replace non-alphanumeric with space 
    temp_string = re.sub('[^a-zA-Z]', '  ', string)    
    # replace codes with space
    for i in range(len(codelist)):
        stopstring = ' ' + codelist[i] + '  '
        temp_string = re.sub(stopstring, '  ', temp_string)      
    # replace single-character words with space
    temp_string = re.sub('\s.\s', ' ', temp_string)   
    # convert uppercase to lowercase
    temp_string = temp_string.lower()    
    if REMOVE_STOPWORDS:
        # replace selected character strings/stop-words with space
        for i in range(len(stoplist)):
            stopstring = ' ' + str(stoplist[i]) + ' '
            temp_string = re.sub(stopstring, ' ', temp_string)        
    # replace multiple blank characters with one blank character
    temp_string = re.sub('\s+', ' ', temp_string)    
    return(temp_string)    


In [17]:
# -----------------------------------------------
# gather data for 500 negative movie reviews
# -----------------------------------------------
dir_name = 'movie-reviews-negative'
    
filenames = listdir_no_hidden(path=dir_name)
num_files = len(filenames)

In [18]:
for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name, filenames[i]))
    assert file_exists
print('\nDirectory:',dir_name)    
print('%d files found' % len(filenames))



Directory: movie-reviews-negative
500 files found


In [19]:
# Read data for negative movie reviews
# Data will be stored in a list of lists where the each list represents 
# a document and document is a list of words.
# We then break the text into words.

def read_data(filename):

  with open(filename, encoding='utf-8') as f:
    data = tf.compat.as_str(f.read())
    data = data.lower()
    data = text_parse(data)
    data = TreebankWordTokenizer().tokenize(data)  # The Penn Treebank

  return data

In [20]:
negative_documents = []

print('\nProcessing document files under', dir_name)
for i in range(num_files):
    ## print(' ', filenames[i])

    words = read_data(os.path.join(dir_name, filenames[i]))

    negative_documents.append(words)
    # print('Data size (Characters) (Document %d) %d' %(i,len(words)))
    # print('Sample string (Document %d) %s'%(i,words[:50]))




Processing document files under movie-reviews-negative


In [21]:
# -----------------------------------------------
# gather data for 500 positive movie reviews
# -----------------------------------------------
dir_name = 'movie-reviews-positive'  
filenames = listdir_no_hidden(path=dir_name)
num_files = len(filenames)

for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name, filenames[i]))
    assert file_exists
print('\nDirectory:',dir_name)    
print('%d files found' % len(filenames))



Directory: movie-reviews-positive
500 files found


In [22]:
# Read data for positive movie reviews
# Data will be stored in a list of lists where the each list 
# represents a document and document is a list of words.
# We then break the text into words.

def read_data(filename):

  with open(filename, encoding='utf-8') as f:
    data = tf.compat.as_str(f.read())
    data = data.lower()
    data = text_parse(data)
    data = TreebankWordTokenizer().tokenize(data)  # The Penn Treebank

  return data


In [23]:
positive_documents = []

print('\nProcessing document files under', dir_name)
for i in range(num_files):
    ## print(' ', filenames[i])

    words = read_data(os.path.join(dir_name, filenames[i]))

    positive_documents.append(words)
    # print('Data size (Characters) (Document %d) %d' %(i,len(words)))
    # print('Sample string (Document %d) %s'%(i,words[:50]))



Processing document files under movie-reviews-positive


In [24]:
# -----------------------------------------------------
# convert positive/negative documents into numpy array
# note that reviews vary from 22 to 1052 words   
# so we use the first 20 and last 20 words of each review 
# as our word sequences for analysis
# -----------------------------------------------------
max_review_length = 0  # initialize
for doc in negative_documents:
    max_review_length = max(max_review_length, len(doc))    
for doc in positive_documents:
    max_review_length = max(max_review_length, len(doc)) 
print('max_review_length:', max_review_length) 


max_review_length: 1052


In [25]:
min_review_length = max_review_length  # initialize
for doc in negative_documents:
    min_review_length = min(min_review_length, len(doc))    
for doc in positive_documents:
    min_review_length = min(min_review_length, len(doc)) 
print('min_review_length:', min_review_length) 


min_review_length: 22


In [26]:
# construct list of 1000 lists with 40 words in each list
from itertools import chain
documents = []
for doc in negative_documents:
    doc_begin = doc[0:20]
    doc_end = doc[len(doc) - 20: len(doc)]
    documents.append(list(chain(*[doc_begin, doc_end])))    
for doc in positive_documents:
    doc_begin = doc[0:20]
    doc_end = doc[len(doc) - 20: len(doc)]
    documents.append(list(chain(*[doc_begin, doc_end])))    


In [27]:
# create list of lists of lists for embeddings
embeddings = []    
for doc in documents:
    embedding = []
    for word in doc:
       embedding.append(limited_index_to_embedding[limited_word_to_index[word]]) 
    embeddings.append(embedding)

In [28]:
# -----------------------------------------------------    
# Check on the embeddings list of list of lists 
# -----------------------------------------------------
# Show the first word in the first document
test_word = documents[0][0]    
print('First word in first document:', test_word)    
print('Embedding for this word:\n', 
      limited_index_to_embedding[limited_word_to_index[test_word]])
print('Corresponding embedding from embeddings list of list of lists\n',
      embeddings[0][0][:])


First word in first document: story
Embedding for this word:
 [ 0.48251    0.87746   -0.23455    0.0262     0.79691    0.43102
 -0.60902   -0.60764   -0.42812   -0.012523  -1.2894     0.52656
 -0.82763    0.30689    1.1972    -0.47674   -0.46885   -0.19524
 -0.28403    0.35237    0.45536    0.76853    0.0062157  0.55421
  1.0006    -1.3973    -1.6894     0.30003    0.60678   -0.46044
  2.5961    -1.2178     0.28747   -0.46175   -0.25943    0.38209
 -0.28312   -0.47642   -0.059444  -0.59202    0.25613    0.21306
 -0.016129  -0.29873   -0.19468    0.53611    0.75459   -0.4112
  0.23625    0.26451  ]
Corresponding embedding from embeddings list of list of lists
 [ 0.48251    0.87746   -0.23455    0.0262     0.79691    0.43102
 -0.60902   -0.60764   -0.42812   -0.012523  -1.2894     0.52656
 -0.82763    0.30689    1.1972    -0.47674   -0.46885   -0.19524
 -0.28403    0.35237    0.45536    0.76853    0.0062157  0.55421
  1.0006    -1.3973    -1.6894     0.30003    0.60678   -0.46044
  2.596

In [29]:
# Show the seventh word in the tenth document
test_word = documents[6][9]    
print('Seventh word in tenth document:', test_word)    
print('Embedding for this word:\n', 
      limited_index_to_embedding[limited_word_to_index[test_word]])
print('Corresponding embedding from embeddings list of list of lists\n',
      embeddings[6][9][:])


Seventh word in tenth document: but
Embedding for this word:
 [ 0.35934   -0.2657    -0.046477  -0.2496     0.54676    0.25924
 -0.64458    0.1736    -0.53056    0.13942    0.062324   0.18459
 -0.75495   -0.19569    0.70799    0.44759    0.27031   -0.32885
 -0.38891   -0.61606   -0.484      0.41703    0.34794   -0.19706
  0.40734   -2.1488    -0.24284    0.33809    0.43993   -0.21616
  3.7635     0.19002   -0.12503   -0.38228    0.12944   -0.18272
  0.076803   0.51579    0.0072516 -0.29192   -0.27523    0.40593
 -0.040394   0.28353   -0.024724   0.10563   -0.32879    0.10673
 -0.11503    0.074678 ]
Corresponding embedding from embeddings list of list of lists
 [ 0.35934   -0.2657    -0.046477  -0.2496     0.54676    0.25924
 -0.64458    0.1736    -0.53056    0.13942    0.062324   0.18459
 -0.75495   -0.19569    0.70799    0.44759    0.27031   -0.32885
 -0.38891   -0.61606   -0.484      0.41703    0.34794   -0.19706
  0.40734   -2.1488    -0.24284    0.33809    0.43993   -0.21616
  3.76

In [30]:
# Show the last word in the last document
test_word = documents[999][39]    
print('Last word in last document:', test_word)    
print('Embedding for this word:\n', 
      limited_index_to_embedding[limited_word_to_index[test_word]])
print('Corresponding embedding from embeddings list of list of lists\n',
      embeddings[999][39][:]) 

Last word in last document: from
Embedding for this word:
 [ 0.41037   0.11342   0.051524 -0.53833  -0.12913   0.22247  -0.9494
 -0.18963  -0.36623  -0.067011  0.19356  -0.33044   0.11615  -0.58585
  0.36106   0.12555  -0.3581   -0.023201 -1.2319    0.23383   0.71256
  0.14824   0.50874  -0.12313  -0.20353  -1.82      0.22291   0.020291
 -0.081743 -0.27481   3.7343   -0.01874  -0.084522 -0.30364   0.27959
  0.043328 -0.24621   0.015373  0.49751   0.15108  -0.01619   0.40132
  0.23067  -0.10743  -0.36625  -0.051135  0.041474 -0.36064  -0.19616
 -0.81066 ]
Corresponding embedding from embeddings list of list of lists
 [ 0.41037   0.11342   0.051524 -0.53833  -0.12913   0.22247  -0.9494
 -0.18963  -0.36623  -0.067011  0.19356  -0.33044   0.11615  -0.58585
  0.36106   0.12555  -0.3581   -0.023201 -1.2319    0.23383   0.71256
  0.14824   0.50874  -0.12313  -0.20353  -1.82      0.22291   0.020291
 -0.081743 -0.27481   3.7343   -0.01874  -0.084522 -0.30364   0.27959
  0.043328 -0.24621   0.01

In [31]:
# -----------------------------------------------------    
# Make embeddings a numpy array for use in an RNN 
# Create training and test sets with Scikit Learn
# -----------------------------------------------------
embeddings_array = np.array(embeddings)


In [32]:
# Define the labels to be used 500 negative (0) and 500 positive (1)
thumbs_down_up = np.concatenate((np.zeros((500), dtype = np.int32), 
                      np.ones((500), dtype = np.int32)), axis = 0)

In [33]:
# Scikit Learn for random splitting of the data  
from sklearn.model_selection import train_test_split

## Basic RNN 50d

In [34]:
df_basic_50d = pd.DataFrame(columns = ['RNN', 'Word Vector', 'Iteration', 'Random Seed','Learning Rate', 'Vocabulary Size', 'Train Accuracy', 'Test Accuracy'])
RANDOM_SEEDS = [9999, 9999, 9999, 9999, 64, 64, 64, 64]
Learn_Rate = [.01, .001, .01, .001, .01, .001, .01, .001]
VOCAB_SIZE = [10000, 10000, 20000, 20000, 10000, 10000, 20000, 20000]  # specify desired size of pre-defined embedding vocabulary 
#Epochs = [30, 50]
#Batch = [100, 200]
z = 1
while z <= 2:
    for a, b, c in zip(RANDOM_SEEDS, Learn_Rate, VOCAB_SIZE):
        EVOCABSIZE = c
        RANDOM_SEED = a
            # Random splitting of the data in to training (80%) and test (20%)  
        X_train, X_test, y_train, y_test = train_test_split(embeddings_array, thumbs_down_up, test_size=0.20, 
                                                            random_state = RANDOM_SEED)
        reset_graph()

        n_steps = embeddings_array.shape[1]  # number of words per document 
        n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
        n_neurons = 20  # analyst specified number of neurons
        n_outputs = 2  # thumbs-down or thumbs-up

        learning_rate = b

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.int32, [None])

        basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
        outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

        logits = tf.layers.dense(states, n_outputs)
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                                  logits=logits)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        init = tf.global_variables_initializer()

        n_epochs = 50
        batch_size = 100
        start_time = time.time()

        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                #print('\n  ---- Epoch ', epoch, ' ----\n')
                for iteration in range(y_train.shape[0] // batch_size):          
                    X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
                    y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]
                    #print('  Batch ', iteration, ' training observations from ',  
                          #iteration*batch_size, ' to ', (iteration + 1)*batch_size-1,)
                    sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
                acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
                #print("Epoch", epoch, "Train accuracy =", acc_train, "Test accuracy =", acc_test)
            print("Iteration", z, "Random Seed", a, "Learning Rate", b, "Vocabulary Size", c, "Train accuracy", acc_train, "Test accuracy =", acc_test)
        end_time = time.time()
        run_time = end_time-start_time
        run_time
        
        result = ({'RNN': 'Basic', 'Word Vector': 'Glove 50d', 'Iteration': z, 'Random Seed': a, 'Learning Rate': b, 'Vocabulary Size': c, 'Train Accuracy': acc_train, 'Test Accuracy': acc_test})

        df_basic_50d = df_basic_50d.append(result, ignore_index=True)
        
        #results_frame = pd.DataFrame(result)
        #print(results_frame)
        #data = pd.DataFrame({'RNN': "Basic"}, {'Word Vector': "Glove 50d"}, {'Iteration': z}, {'Random Seed': a}, {'Learning Rate': b}, {'Vocab Size': c}, {'Train Accuracy': acc_train}, {'Test Accuracy': acc_test})

    z = z + 1

W0818 17:12:50.015140  8112 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0818 17:12:50.015140  8112 deprecation.py:323] From <ipython-input-34-2ee99865daaa>:27: BasicRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.
W0818 17:12:50.015140  8112 deprecation.py:323] From <ipython-input-34-2ee99865daaa>:28: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(

Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.56
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.8 Test accuracy = 0.68
Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 1.0 Test accuracy = 0.56
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.8 Test accuracy = 0.68
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.95 Test accuracy = 0.59
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.72 Test accuracy = 0.67
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.95 Test accuracy = 0.59
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.72 Test accuracy = 0.67
Iteration 2 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.56
Iteration 

## LSTM Glove 50d

In [35]:
df_LTSM_50d = pd.DataFrame(columns = ['RNN', 'Word Vector', 'Iteration', 'Random Seed','Learning Rate', 'Vocabulary Size', 'Train Accuracy', 'Test Accuracy'])

RANDOM_SEEDS = [9999, 9999, 9999, 9999, 64, 64, 64, 64]
Learn_Rate = [.01, .001, .01, .001, .01, .001, .01, .001]
VOCAB_SIZE = [10000, 10000, 20000, 20000, 10000, 10000, 20000, 20000]  # specify desired size of pre-defined embedding vocabulary 
#Epochs = [30, 50]
#Batch = [100, 200]
z = 1
while z <= 2:
    for a, b, c in zip(RANDOM_SEEDS, Learn_Rate, VOCAB_SIZE):
        EVOCABSIZE = c
        RANDOM_SEED = a
            # Random splitting of the data in to training (80%) and test (20%)  
        X_train, X_test, y_train, y_test = train_test_split(embeddings_array, thumbs_down_up, test_size=0.20, 
                                                            random_state = RANDOM_SEED)

        reset_graph()

        lstm_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_neurons)

        n_steps = embeddings_array.shape[1]  # number of words per document 
        n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
        n_neurons = 20  # analyst specified number of neurons
        n_outputs = 2  # thumbs-down or thumbs-up
        n_layers = 3

        learning_rate = b

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.int32, [None])

        lstm_cells = [tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_neurons)
                      for layer in range(n_layers)]
        multi_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)
        outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)
        top_layer_h_state = states[-1][1]
        logits = tf.layers.dense(top_layer_h_state, n_outputs, name="softmax")
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        init = tf.global_variables_initializer()

        n_epochs = 50
        batch_size = 100
        start_time = time.time()

        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                for iteration in range(y_train.shape[0] // batch_size):          
                    X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
                    y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]

                    sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
                acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            #print("Epoch", epoch, "Train accuracy =", acc_train, "Test accuracy =", acc_test)
        print("Iteration", z, "Random Seed", a, "Learning Rate", b, "Vocabulary Size", c, "Train accuracy", acc_train, "Test accuracy =", acc_test)
        end_time = time.time()
        run_time = end_time-start_time
        run_time
    
        result = ({'RNN': 'LTSM', 'Word Vector': 'Glove 50d', 'Iteration': z, 'Random Seed': a, 'Learning Rate': b, 'Vocabulary Size': c, 'Train Accuracy': acc_train, 'Test Accuracy': acc_test})
        df_LTSM_50d = df_LTSM_50d.append(result, ignore_index=True)

    z = z + 1

W0818 17:13:27.534057  8112 deprecation.py:323] From <ipython-input-35-51280e2e0852>:19: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0818 17:13:27.534057  8112 deprecation.py:323] From <ipython-input-35-51280e2e0852>:34: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.73
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.79 Test accuracy = 0.67
Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 1.0 Test accuracy = 0.73
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.79 Test accuracy = 0.67
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.68
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.91 Test accuracy = 0.69
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 1.0 Test accuracy = 0.68
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.91 Test accuracy = 0.69
Iteration 2 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.73
Iteration 

## Using Dropout With Basic RNN and Glove 50d

In [36]:
df_Dropout_50d = pd.DataFrame(columns = ['RNN', 'Word Vector', 'Iteration', 'Random Seed','Learning Rate', 'Vocabulary Size', 'Train Accuracy', 'Test Accuracy'])

RANDOM_SEEDS = [9999, 9999, 9999, 9999, 64, 64, 64, 64]
Learn_Rate = [.01, .001, .01, .001, .01, .001, .01, .001]
VOCAB_SIZE = [10000, 10000, 20000, 20000, 10000, 10000, 20000, 20000]  # specify desired size of pre-defined embedding vocabulary 
#Epochs = [30, 50]
#Batch = [100, 200]
z = 1
while z <= 2:
    for a, b, c in zip(RANDOM_SEEDS, Learn_Rate, VOCAB_SIZE):
        EVOCABSIZE = c
        RANDOM_SEED = a
            # Random splitting of the data in to training (80%) and test (20%)  
        X_train, X_test, y_train, y_test = train_test_split(embeddings_array, thumbs_down_up, test_size=0.20, 
                                                            random_state = RANDOM_SEED)
        reset_graph()

        n_steps = embeddings_array.shape[1]  # number of words per document 
        n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
        n_neurons = 20  # analyst specified number of neurons
        n_outputs = 2  # thumbs-down or thumbs-up
        n_layers = 3

        learning_rate = b

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.int32, [None])

        keep_prob = tf.placeholder_with_default(.5, shape=())
        cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
                 for layer in range(n_layers)]
        cells_drop = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
                      for cell in cells]
        multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells_drop)
        outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

        states_concat = tf.concat(axis=1, values=states)
        logits = tf.layers.dense(states_concat, n_outputs)
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        init = tf.global_variables_initializer()

        n_epochs = 50
        batch_size = 100
        train_keep_prob = 0.5
        start_time = time.time()
        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                for iteration in range(y_train.shape[0] // batch_size):
                    X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
                    y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]
                    X_batch = X_batch.reshape((-1, n_steps, n_inputs))
                    sess.run(training_op, feed_dict={X: X_batch, y: y_batch,keep_prob: train_keep_prob})
                acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            print("Iteration", z, "Random Seed", a, "Learning Rate", b, "Vocabulary Size", c, "Train accuracy", acc_train, "Test accuracy =", acc_test)
        
        end_time = time.time()
        run_time = end_time-start_time
        run_time
        
        result = ({'RNN': 'Dropout', 'Word Vector': 'Glove 50d', 'Iteration': z, 'Random Seed': a, 'Learning Rate': b, 'Vocabulary Size': c, 'Train Accuracy': acc_train, 'Test Accuracy': acc_test})
        df_Dropout_50d = df_Dropout_50d.append(result, ignore_index=True)
    z = z + 1
    


Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.73 Test accuracy = 0.635
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.59 Test accuracy = 0.53
Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.69 Test accuracy = 0.62
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.59 Test accuracy = 0.535
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.72 Test accuracy = 0.62
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.63 Test accuracy = 0.58
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.62 Test accuracy = 0.575
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.65 Test accuracy = 0.585
Iteration 2 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.62 Test accuracy = 0.645


# Using GloVe.6B.300d

In [37]:
CHAKIN_INDEX = 21
NUMBER_OF_DIMENSIONS = 300
SUBFOLDER_NAME = "glove.6B"

DATA_FOLDER = "embeddings"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(
        UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.{}d.txt".format(
        SUBFOLDER_NAME, NUMBER_OF_DIMENSIONS))
    
if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile
    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

print('\nRun complete')

Embeddings already downloaded.
Embeddings already extracted.

Run complete


In [38]:
REMOVE_STOPWORDS = False
#EVOCABSIZE = 100000  # specify desired size of pre-defined embedding vocabulary 

In [39]:
# Select the pre-defined embeddings source        
# Define vocabulary size for the language model    
# Create a word_to_embedding_dict for GloVe.6B.50d
embeddings_directory = 'embeddings/gloVe.6B'
filename = 'glove.6B.300d.txt'
embeddings_filename = os.path.join(embeddings_directory, filename)

In [40]:
print('\nLoading embeddings from', embeddings_filename)
word_to_index, index_to_embedding = \
    load_embedding_from_disks(embeddings_filename, with_indexes=True)
print("Embedding loaded from disks.")


Loading embeddings from embeddings/gloVe.6B\glove.6B.300d.txt
Embedding loaded from disks.


In [41]:
del index_to_embedding

In [42]:
# create list of lists of lists for embeddings
embeddings = []    
for doc in documents:
    embedding = []
    for word in doc:
       embedding.append(limited_index_to_embedding[limited_word_to_index[word]]) 
    embeddings.append(embedding)

In [43]:
len(embeddings)

1000

In [44]:
embeddings_array = np.array(embeddings)

In [45]:
embeddings_array.shape

(1000, 40, 50)

In [46]:
thumbs_down_up = np.concatenate((np.zeros((500), dtype = np.int32), 
                      np.ones((500), dtype = np.int32)), axis = 0)

In [47]:
X_train.shape

(800, 40, 50)

## Basic RNN, word vector 300d

In [48]:
df_basic_300d = pd.DataFrame(columns = ['RNN', 'Word Vector', 'Iteration', 'Random Seed','Learning Rate', 'Vocabulary Size', 'Train Accuracy', 'Test Accuracy'])

RANDOM_SEEDS = [9999, 9999, 9999, 9999, 64, 64, 64, 64]
Learn_Rate = [.01, .001, .01, .001, .01, .001, .01, .001]
VOCAB_SIZE = [10000, 10000, 20000, 20000, 10000, 10000, 20000, 20000]  # specify desired size of pre-defined embedding vocabulary 
#Epochs = [30, 50]
#Batch = [100, 200]

z = 1
while z <= 2:
    for a, b, c in zip(RANDOM_SEEDS, Learn_Rate, VOCAB_SIZE):
        EVOCABSIZE = c
        RANDOM_SEED = a
            # Random splitting of the data in to training (80%) and test (20%)  
        X_train, X_test, y_train, y_test = train_test_split(embeddings_array, thumbs_down_up, test_size=0.20, 
                                                            random_state = RANDOM_SEED)

        reset_graph()

        n_steps = embeddings_array.shape[1]  # number of words per document 
        n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
        n_neurons = 20  # analyst specified number of neurons
        n_outputs = 2  # thumbs-down or thumbs-up

        learning_rate = b

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.int32, [None])

        basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
        outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

        logits = tf.layers.dense(states, n_outputs)
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                                  logits=logits)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        init = tf.global_variables_initializer()

        n_epochs = 50
        batch_size = 100
        start_time = time.time()
        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                #print('\n  ---- Epoch ', epoch, ' ----\n')
                for iteration in range(y_train.shape[0] // batch_size):          
                    X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
                    y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]
                    #print('  Batch ', iteration, ' training observations from ',  
                          #iteration*batch_size, ' to ', (iteration + 1)*batch_size-1,)
                    sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
                acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            print("Iteration", z, "Random Seed", a, "Learning Rate", b, "Vocabulary Size", c, "Train accuracy", acc_train, "Test accuracy =", acc_test)

        end_time = time.time()
        run_time = end_time-start_time
        run_time
        
        result = ({'RNN': 'Basic', 'Word Vector': 'Glove 300d', 'Iteration': z, 'Random Seed': a, 'Learning Rate': b, 'Vocabulary Size': c, 'Train Accuracy': acc_train, 'Test Accuracy': acc_test})
        df_basic_300d = df_basic_300d.append(result, ignore_index=True)
    z = z + 1

Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.56
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.8 Test accuracy = 0.68
Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 1.0 Test accuracy = 0.56
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.8 Test accuracy = 0.68
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.95 Test accuracy = 0.59
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.72 Test accuracy = 0.67
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.95 Test accuracy = 0.59
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.72 Test accuracy = 0.67
Iteration 2 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 1.0 Test accuracy = 0.56
Iteration 

## LSTM RNN, word vector 300d Learning Rate

In [49]:
df_LTSM_300d = pd.DataFrame(columns = ['RNN', 'Word Vector', 'Iteration', 'Random Seed','Learning Rate', 'Vocabulary Size', 'Train Accuracy', 'Test Accuracy'])

RANDOM_SEEDS = [9999, 9999, 9999, 9999, 64, 64, 64, 64]
Learn_Rate = [.01, .001, .01, .001, .01, .001, .01, .001]
VOCAB_SIZE = [10000, 10000, 20000, 20000, 10000, 10000, 20000, 20000]  # specify desired size of pre-defined embedding vocabulary 
#Epochs = [30, 50]
#Batch = [100, 200]

z = 1
while z <= 2:
    for a, b, c in zip(RANDOM_SEEDS, Learn_Rate, VOCAB_SIZE):
        EVOCABSIZE = c
        RANDOM_SEED = a
            # Random splitting of the data in to training (80%) and test (20%)  
        X_train, X_test, y_train, y_test = train_test_split(embeddings_array, thumbs_down_up, test_size=0.20, 
                                                            random_state = RANDOM_SEED)
    
        reset_graph()

        lstm_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_neurons)

        n_steps = embeddings_array.shape[1]  # number of words per document 
        n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
        n_neurons = 20  # analyst specified number of neurons
        n_outputs = 2  # thumbs-down or thumbs-up
        n_layers = 3

        learning_rate = b

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.int32, [None])

        lstm_cells = [tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_neurons)
                      for layer in range(n_layers)]
        multi_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)
        outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)
        top_layer_h_state = states[-1][1]
        logits = tf.layers.dense(top_layer_h_state, n_outputs, name="softmax")
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        init = tf.global_variables_initializer()

        n_epochs = 30
        batch_size = 200
        start_time = time.time()
        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                for iteration in range(y_train.shape[0] // batch_size):          
                    X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
                    y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]

                    sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
                acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            print("Iteration", z, "Random Seed", a, "Learning Rate", b, "Vocabulary Size", c, "Train accuracy", acc_train, "Test accuracy =", acc_test)

        end_time = time.time()
        run_time = end_time-start_time
        run_time
        
        result = ({'RNN': 'LTSM', 'Word Vector': 'Glove 300d', 'Iteration': z, 'Random Seed': a, 'Learning Rate': b, 'Vocabulary Size': c, 'Train Accuracy': acc_train, 'Test Accuracy': acc_test})
        df_LTSM_300d = df_LTSM_300d.append(result, ignore_index=True)
    z = z + 1
        

Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.99 Test accuracy = 0.7
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.825 Test accuracy = 0.725
Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.99 Test accuracy = 0.7
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.825 Test accuracy = 0.725
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.975 Test accuracy = 0.7
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.805 Test accuracy = 0.685
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.975 Test accuracy = 0.7
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.805 Test accuracy = 0.685
Iteration 2 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.99 Test accuracy = 0.7


## Dropout With Basic RNN, Word Vector Glove 300d


In [50]:
df_Dropout_300d = pd.DataFrame(columns = ['RNN', 'Word Vector', 'Iteration', 'Random Seed','Learning Rate', 'Vocabulary Size', 'Train Accuracy', 'Test Accuracy'])

RANDOM_SEEDS = [9999, 9999, 9999, 9999, 64, 64, 64, 64]
Learn_Rate = [.01, .001, .01, .001, .01, .001, .01, .001]
VOCAB_SIZE = [10000, 10000, 20000, 20000, 10000, 10000, 20000, 20000]  # specify desired size of pre-defined embedding vocabulary 
#Epochs = [30, 50]
#Batch = [100, 200]

z = 1
while z <= 2:
    for a, b, c in zip(RANDOM_SEEDS, Learn_Rate, VOCAB_SIZE):
        EVOCABSIZE = c
        RANDOM_SEED = a
            # Random splitting of the data in to training (80%) and test (20%)  
        X_train, X_test, y_train, y_test = train_test_split(embeddings_array, thumbs_down_up, test_size=0.20, 
                                                            random_state = RANDOM_SEED)
    

        reset_graph()

        n_steps = embeddings_array.shape[1]  # number of words per document 
        n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
        n_neurons = 20  # analyst specified number of neurons
        n_outputs = 2  # thumbs-down or thumbs-up
        n_layers = 3

        learning_rate = b

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.int32, [None])

        keep_prob = tf.placeholder_with_default(.5, shape=())
        cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
                 for layer in range(n_layers)]
        cells_drop = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
                      for cell in cells]
        multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells_drop)
        outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

        states_concat = tf.concat(axis=1, values=states)
        logits = tf.layers.dense(states_concat, n_outputs)
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        init = tf.global_variables_initializer()

        n_epochs = 50
        batch_size = 200
        train_keep_prob = 0.5
        start_time = time.time()
        with tf.Session() as sess:
            init.run()
            for epoch in range(n_epochs):
                for iteration in range(y_train.shape[0] // batch_size):
                    X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
                    y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]
                    X_batch = X_batch.reshape((-1, n_steps, n_inputs))
                    sess.run(training_op, feed_dict={X: X_batch, y: y_batch,keep_prob: train_keep_prob})
                acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
                acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            print("Iteration", z, "Random Seed", a, "Learning Rate", b, "Vocabulary Size", c, "Train accuracy", acc_train, "Test accuracy =", acc_test)

        end_time = time.time()
        run_time = end_time-start_time
        run_time
        
        result = ({'RNN': 'Dropout', 'Word Vector': 'Glove 300d', 'Iteration': z, 'Random Seed': a, 'Learning Rate': b, 'Vocabulary Size': c, 'Train Accuracy': acc_train, 'Test Accuracy': acc_test})
        df_Dropout_300d = df_Dropout_300d.append(result, ignore_index=True)
    z = z + 1
        
        

Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.68 Test accuracy = 0.615
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.565 Test accuracy = 0.535
Iteration 1 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.685 Test accuracy = 0.585
Iteration 1 Random Seed 9999 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.565 Test accuracy = 0.55
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.705 Test accuracy = 0.58
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 10000 Train accuracy 0.575 Test accuracy = 0.595
Iteration 1 Random Seed 64 Learning Rate 0.01 Vocabulary Size 20000 Train accuracy 0.595 Test accuracy = 0.605
Iteration 1 Random Seed 64 Learning Rate 0.001 Vocabulary Size 20000 Train accuracy 0.625 Test accuracy = 0.57
Iteration 2 Random Seed 9999 Learning Rate 0.01 Vocabulary Size 10000 Train accuracy 0.645 Test accuracy

In [52]:
df_total = pd.concat([df_basic_50d, df_LTSM_50d, df_Dropout_50d, df_basic_300d, df_LTSM_300d, df_Dropout_300d])
print(df_total)

df_total.to_csv("Results.csv", index=False)

        RNN Word Vector Iteration Random Seed  Learning Rate Vocabulary Size  \
0     Basic   Glove 50d         1        9999          0.010           10000   
1     Basic   Glove 50d         1        9999          0.001           10000   
2     Basic   Glove 50d         1        9999          0.010           20000   
3     Basic   Glove 50d         1        9999          0.001           20000   
4     Basic   Glove 50d         1          64          0.010           10000   
..      ...         ...       ...         ...            ...             ...   
11  Dropout  Glove 300d         2        9999          0.001           20000   
12  Dropout  Glove 300d         2          64          0.010           10000   
13  Dropout  Glove 300d         2          64          0.001           10000   
14  Dropout  Glove 300d         2          64          0.010           20000   
15  Dropout  Glove 300d         2          64          0.001           20000   

    Train Accuracy  Test Accuracy  
0  