In [0]:
# Initialize drive: mount Google Drive into the Colab runtime at /content/drive.
# force_remount=True asks Colab to redo the mount even when one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Move to the appropriate directory
# NOTE(review): no directory change is actually performed in this cell, yet the
# cells below use relative paths ('./data/...', './checkpoints/...'). Change into
# the project directory (e.g. with `%cd <project dir>`) before running them.

In [0]:
# Imports
from __future__ import absolute_import, division, print_function
# Shell escape: installs the pinned TensorFlow 2.0 alpha GPU build into the runtime.
!pip install tensorflow-gpu==2.0.0a
# NOTE(review): this star import presumably provides the names the rest of the
# notebook uses without importing them itself (warnings, re, np, tf, Tokenizer,
# pad_sequences, Embedding, Constant, BATCH_SIZE, create_embedding_indexmatrix)
# -- confirm against constants.py.
from constants import *
# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [0]:
RUN_NUMBER = 2
# The seed constant simply takes the run number's value.
DEFAULT_GLOBAL_SEED = RUN_NUMBER
ENCODER_MODEL = 'BASELINE'
# Every tokenized sentence is padded/truncated to this many tokens.
MAX_INPUT_SIZE = 35

ENCODING_PATH = None # Make this the path where you want the encodings saved

BCN_DATASET = "SST-2" # or "SST-5"

# Prefix of the data files for each supported dataset; the split name
# ("train", "val", "test") and a suffix are appended when files are read.
_DATA_PREFIXES = {
  "SST-5": "./data/SST-5/sst5_",
  "SST-2": "./data/SST-2/sst2_",
}

# Fail right here on an unsupported dataset name instead of leaving
# dataPrefix undefined and hitting a NameError in a later cell.
if BCN_DATASET not in _DATA_PREFIXES:
  raise ValueError(
      "Unsupported BCN_DATASET %r; expected one of %s"
      % (BCN_DATASET, sorted(_DATA_PREFIXES)))

dataPrefix = _DATA_PREFIXES[BCN_DATASET]

In [0]:
# Load the baseline model from the CoVe paper
# The checkpoint path is relative, so the working directory must contain
# ./checkpoints/Keras_CoVe_py36.h5 when this cell runs.
from tensorflow.keras.models import load_model
cove_model = load_model('./checkpoints/Keras_CoVe_py36.h5')

In [0]:
# Read the data from files and clean them
def read_sentiment_data(file):
  """Read the sentence file of one data split and return its cleaned lines.

  Args:
    file: split name (e.g. "train"); the path read is
      dataPrefix + file + '.txt'.

  Returns:
    The list produced by clean() for the file's full text, i.e. one entry
    per newline-separated line (including a trailing '' entry when the
    file ends with a newline).
  """
  sent_path = dataPrefix + file + '.txt'
  # The context manager closes the handle even if read() raises.
  with open(sent_path, 'r', encoding='utf-8') as f_sent:
    raw_text = f_sent.read()
  return clean(raw_text)

def read_sentiment_labels(file):
  """Read the label file of one data split as a list of floats.

  Args:
    file: split name (e.g. "train"); the path read is
      dataPrefix + file + '_label.txt'.

  Returns:
    One float per non-blank line of the file, in file order.

  Raises:
    ValueError: if a non-blank line cannot be parsed by float().
  """
  labels_path = dataPrefix + file + '_label.txt'
  # The context manager closes the handle even if parsing fails part-way.
  with open(labels_path, 'r', encoding='utf-8') as f_labels:
    # Blank lines (e.g. a stray empty line at the end of the file) are
    # skipped instead of crashing float().
    return [float(line) for line in f_labels if line.strip()]

def clean(data=None):
    """Normalize raw text and split it into lines.

    Steps, in order:
      1. every run of digits (with any directly following 'p' characters,
         as in "1080p") becomes the single token 'n';
      2. every run of two or more spaces collapses to one space;
      3. apostrophes are removed;
      4. the text is split on newlines.

    Args:
        data: the raw text as a single string.

    Returns:
        List of cleaned lines. Text ending with a newline yields a final ''
        entry.
    """
    data = re.sub('[0-9]+p*', 'n', data)  # replace all numbers with n
    # ' {2,}' collapses a whole run at once; replacing only the pair '  '
    # left double spaces behind for runs of three or more.
    data = re.sub(' {2,}', ' ', data)  # remove repeated spaces
    data = re.sub("'", '', data)  # remove apostrophe
    data = data.split('\n')
    return data

# Read the SST data
def _drop_trailing_blank(sentences):
  """Remove the empty last entry left by splitting text that ends with a newline.

  Only an actually empty last entry is removed, so a file that does not end
  with a newline keeps its final sentence.
  """
  if sentences and sentences[-1] == '':
    return sentences[:-1]
  return sentences

train_sent = _drop_trailing_blank(read_sentiment_data("train"))
train_sent_labels = read_sentiment_labels("train")
val_sent = _drop_trailing_blank(read_sentiment_data("val"))
val_sent_labels = read_sentiment_labels("val")
test_sent = _drop_trailing_blank(read_sentiment_data("test"))
test_sent_labels = read_sentiment_labels("test")

# Double check that we have the same number of sentences as we do labels
assert len(train_sent) == len(train_sent_labels), \
    "train: %d sentences vs %d labels" % (len(train_sent), len(train_sent_labels))
assert len(val_sent) == len(val_sent_labels), \
    "val: %d sentences vs %d labels" % (len(val_sent), len(val_sent_labels))
assert len(test_sent) == len(test_sent_labels), \
    "test: %d sentences vs %d labels" % (len(test_sent), len(test_sent_labels))
print("train len", len(train_sent))
print("val len", len(val_sent))
print("test len", len(test_sent))

# Fit tokenizer to generate number of words
# All sentences of the three splits as ONE flat list of strings. Passing a
# list of lists to fit_on_texts makes Keras treat every inner element (here:
# a whole sentence) as a single token, so the resulting count would be the
# number of distinct sentences rather than the number of distinct words.
all_sent = list(train_sent) + list(val_sent) + list(test_sent)

_counting_tokenizer = Tokenizer(num_words=None, lower=True, oov_token='<UNK>')
_counting_tokenizer.fit_on_texts(all_sent)

# Tokenize with appropriate max_word length
# num_words caps the ids emitted by texts_to_sequences strictly below the
# vocabulary size, which keeps them inside the embedding layer's
# input_dim=vocab_len; anything at or above the cap maps to the '<UNK>' id.
tokenizer = Tokenizer(num_words=len(_counting_tokenizer.word_counts),
                      lower=True, oov_token='<UNK>')
tokenizer.fit_on_texts(all_sent)

train_sent_tok = tokenizer.texts_to_sequences(train_sent)
val_sent_tok = tokenizer.texts_to_sequences(val_sent)
test_sent_tok = tokenizer.texts_to_sequences(test_sent)

# Plain-dict copy of word -> occurrence count; its size is the vocabulary size.
vocab = dict(tokenizer.word_counts)
vocab_len = len(vocab)

# Generate tokenized sentences
def _pad_to_max_input(sequences):
  """Pad with 0 / truncate each sequence at its end to MAX_INPUT_SIZE tokens."""
  return pad_sequences(sequences, maxlen=MAX_INPUT_SIZE, truncating='post',
                       padding='post', value=0)

train_sent_tok = _pad_to_max_input(train_sent_tok)
val_sent_tok = _pad_to_max_input(val_sent_tok)
test_sent_tok = _pad_to_max_input(test_sent_tok)

# Inverse of the tokenizer's word -> id mapping.
i2w = dict((word_id, word) for word, word_id in tokenizer.word_index.items())

# Create Glove Embedding dictionary
glove_embedding_matrix_n = create_embedding_indexmatrix(
    vocab_len,
    embedding_dim=300,
    dict_en=i2w)

In [0]:
# Initialize embedding layer
# Frozen (trainable=False) lookup layer whose weights are initialised from the
# GloVe matrix built in the previous cell; it accepts token ids in
# [0, vocab_len) and maps each to a 300-dimensional vector.
# NOTE(review): assumes glove_embedding_matrix_n has shape (vocab_len, 300)
# -- confirm against create_embedding_indexmatrix.
embed_glove = Embedding(input_dim=vocab_len, output_dim=300,
                               embeddings_initializer=Constant(glove_embedding_matrix_n),
                               input_length=MAX_INPUT_SIZE,
                               trainable=False)

In [0]:
# Generate datasets and initialize embeddings
def _batched(token_matrix):
  """Wrap a matrix of padded token ids in a tf.data pipeline of BATCH_SIZE rows."""
  return tf.data.Dataset.from_tensor_slices(token_matrix).batch(BATCH_SIZE)

train_dataset = _batched(train_sent_tok)
val_dataset = _batched(val_sent_tok)
test_dataset = _batched(test_sent_tok)

# First training batch; its rows are reused as filler for short batches.
example_input_batch = next(iter(train_dataset))

# embeds gets one (initially empty) result list per entry of sets, same order.
sets = [train_dataset, val_dataset, test_dataset]
embeds = [[] for _ in sets]

# For every split: embed each batch with GloVe, run the CoVe model on it and
# store, per sentence, the GloVe vectors concatenated with the CoVe output
# along the feature axis.
for j, dataset in enumerate(sets):
  for batch in dataset:
    glove_embeddings = embed_glove(batch)
    num_in_embedded = glove_embeddings.shape[0]  # real sentences in this batch

    # A short (final) batch is filled up to BATCH_SIZE rows with sentences
    # from the first training batch. The fill target is BATCH_SIZE, the
    # value the datasets were batched with, rather than a hard-coded 128.
    num_padding = max(BATCH_SIZE - num_in_embedded, 0)
    if num_padding > 0:
      padding = embed_glove(example_input_batch[:num_padding])
      glove_embeddings = np.append(glove_embeddings, padding, axis=0)

    H = cove_model.predict(glove_embeddings)

    # Only the real sentences are kept; filler rows are never concatenated.
    for index in range(num_in_embedded):
      embeds[j].append(
          np.concatenate((glove_embeddings[index], H[index]), axis=1))

In [0]:
# Pickle the data
import pickle

# ENCODING_PATH defaults to None in the configuration cell; fail with a clear
# message instead of an opaque TypeError from None + str below.
if ENCODING_PATH is None:
  raise ValueError(
      "ENCODING_PATH is not set; point it at the directory prefix where the "
      "encodings should be saved.")

# NOTE(review): set_labels is not defined in the visible cells -- presumably it
# comes from `constants` and names the splits in the same order as `embeds`
# (train, val, test); confirm.
# The file name is built by plain string concatenation, so ENCODING_PATH must
# already end with a path separator.
for split_index, split_label in enumerate(set_labels):
  # The with-block closes the file; no explicit close() is needed.
  with open(ENCODING_PATH +BCN_DATASET+ 'encodings_' + split_label, 'wb') as fp:
      pickle.dump(embeds[split_index], fp)