In [0]:
import numpy as np
import argparse
import os
import io
import re
import nltk
import gensim
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional

# The string literal below is never executed: it only preserves the one-time
# NLTK resource downloads. Run the two calls by hand if the tokenizer or the
# perceptron tagger data are missing on this machine.
"""
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
"""

# Stanford POS tagger
# `modelfile` and `jarfile` are blank placeholders and must be replaced with
# real paths to the Stanford tagger model and jar; `stanford_dir` is assigned
# but not read anywhere in this notebook.
# NOTE(review): the tagger is constructed here unconditionally, even though
# TAGGER defaults to 'NLTK'. With the placeholder paths the constructor is
# expected to fail to locate the jar — confirm, and consider building it only
# when the Stanford tagger is actually selected.
from nltk.tag import StanfordPOSTagger
stanford_dir = " "
modelfile = " "
jarfile = " "
S_tagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)

# Load Google News Word Embeddings
# The binary word2vec file is read from the current working directory;
# `embed_model.similarity(a, b)` is what ReplaceNoun uses to score noun/object pairs.
embed_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True) 

# --- Tunable settings for the whole notebook ---------------------------------
# Number of words in each seed window fed to the model.
SEQUENCE_LEN = 10
# Words occurring fewer than this many times are dropped from the vocabulary by
# the set-up cell. That cell reads MIN_WORD_FREQUENCY, but it was never defined,
# so a fresh kernel stopped with a NameError.
# NOTE(review): 10 is an assumed value. It must equal the threshold used when
# `model_file` was trained, otherwise the rebuilt vocabulary will not line up
# with the model's input layer — confirm against the training run.
MIN_WORD_FREQUENCY = 10
# Sampling temperature passed to generate_text as `diversity`.
DIVERSITY = 0.7
# Number of words generated after each seed.
QUANTITY = 15
# Which tagger postag() uses: 'NLTK' or 'Stanford'.
TAGGER = 'NLTK'
# How many sentences containing exactly two 'NN' tokens to collect.
NUM_GENERATE_TEXT = 100
# Corpus the vocabulary and seed windows are rebuilt from.
corpusName = 'jokes.txt'
# Saved Keras model passed to load_model().
model_file = "LSTM_JOKE-epoch037-val_acc0.2754"

In [0]:
# Set up

# Read the corpus in lower case; padding every newline with spaces makes '\n'
# survive the space-split below as a token of its own.
with io.open(corpusName, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower().replace('\n', ' \n ')

# Split on single spaces, dropping whitespace-only pieces except the '\n' token,
# then strip every character that is not a word character, whitespace,
# apostrophe, hyphen or slash. A token made only of punctuation becomes ''
# and is kept as such.
raw_tokens = [tok for tok in text.split(' ') if tok == '\n' or tok.strip() != '']
text_in_words = [re.sub(r'[^\w\s\'\-\/]', '', tok) for tok in raw_tokens]

# Occurrence count per token.
word_freq = {}
for token in text_in_words:
    if token in word_freq:
        word_freq[token] += 1
    else:
        word_freq[token] = 1

# Tokens rarer than MIN_WORD_FREQUENCY are excluded from the vocabulary.
# MIN_WORD_FREQUENCY must already be defined when this cell runs.
ignored_words = {token for token, count in word_freq.items() if count < MIN_WORD_FREQUENCY}

# Sorted vocabulary and the two lookup tables between words and one-hot indices.
words = sorted(set(text_in_words) - ignored_words)
word_indices = {token: position for position, token in enumerate(words)}
indices_word = dict(enumerate(words))

# Slide a window of SEQUENCE_LEN words (+1 for the word that follows) over the
# corpus; any window touching an ignored word is skipped and counted.
STEP = 1
sentences = []
next_words = []
ignored = 0
for start in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    window = text_in_words[start: start + SEQUENCE_LEN + 1]
    if ignored_words.isdisjoint(window):
        sentences.append(window[:SEQUENCE_LEN])
        next_words.append(window[SEQUENCE_LEN])
    else:
        ignored += 1

def shuffle_and_split_training_set(sentences_original, labels_original, percentage_test=10):
    """Shuffle sentences and labels together and split them into train/test parts.

    Args:
        sentences_original: indexable sequence of input windows.
        labels_original: labels aligned index-by-index with `sentences_original`.
        percentage_test: share of the data (in percent) placed in the test part.

    Returns:
        (x_train, y_train, x_test, y_test) as plain lists; the pairing between
        each sentence and its label is preserved.
    """
    total = len(sentences_original)
    # One permutation drives both lists so they stay aligned.
    order = np.random.permutation(total)
    shuffled_sentences = [sentences_original[k] for k in order]
    shuffled_labels = [labels_original[k] for k in order]

    # Everything before the cut is training data, the rest is test data.
    cut_index = int(total * (1.-(percentage_test/100.)))
    return (
        shuffled_sentences[:cut_index],
        shuffled_labels[:cut_index],
        shuffled_sentences[cut_index:],
        shuffled_labels[cut_index:],
    )

# Shuffle the (window, next word) pairs and hold out the default 10% as a test split.
# This rebinds `sentences` and `next_words` to the training portion only, so
# re-running this line without re-running the window construction above
# splits already-split data again.
sentences, next_words, sentences_test, next_words_test = shuffle_and_split_training_set(sentences, next_words)

In [0]:
# Load the trained model

# `model_file` names the saved Keras model; generate_text later feeds it
# one-hot arrays whose last dimension is the vocabulary size.
model = load_model(model_file)
# Show the loaded architecture in the cell output.
model.summary()

In [0]:
# Tag each word in the generated sentences

def postag(taggerchoice, text):
  """Tokenize `text` and return its (token, tag) pairs.

  Args:
    taggerchoice: 'NLTK' to tag with nltk.pos_tag, 'Stanford' to tag with the
      module-level Stanford tagger `S_tagger`.
    text: the sentence to tokenize and tag, as a single string.

  Returns:
    A list of (token, tag) tuples.

  Raises:
    ValueError: if `taggerchoice` is not one of the two supported names.
      Previously this case silently returned None, which only surfaced later
      as an unrelated TypeError when the result was iterated.
  """
  # Validate before tokenizing so a bad setting fails fast and clearly.
  if taggerchoice not in ('NLTK', 'Stanford'):
    raise ValueError(
        "Unknown tagger %r: expected 'NLTK' or 'Stanford'" % (taggerchoice,))

  tokens = nltk.word_tokenize(text)
  if taggerchoice == 'NLTK':
    return nltk.pos_tag(tokens)
  return S_tagger.tag(tokens)

In [0]:
# Generate sentences with the random seed

def sample(preds, temperature=1.0):
    """Draw one vocabulary index from the probability vector `preds`.

    The distribution is re-weighted as p ** (1 / temperature) and renormalised
    before a single multinomial draw, so a temperature below 1 sharpens it and
    a temperature above 1 flattens it.

    This helper was called by generate_text but never defined in the notebook.
    """
    # float64 keeps the renormalised vector close enough to 1.0 for multinomial.
    logits = np.log(np.asarray(preds).astype('float64')) / temperature
    # Shifting by the maximum leaves the normalised result unchanged but stops
    # every term from underflowing to zero at very low temperatures.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probabilities = weights / np.sum(weights)
    draw = np.random.multinomial(1, probabilities, 1)
    return int(np.argmax(draw))


def generate_text(model, indices_word, word_indices, seed, sequence_length, diversity, quantity):
    """Extend `seed` by `quantity` words sampled from `model`.

    Args:
        model: object whose `predict(x, verbose=0)` returns one probability
            row per input sequence.
        indices_word: mapping from vocabulary index to word.
        word_indices: mapping from word to vocabulary index.
        seed: the starting words, either as one space-separated string or as a
            sequence of words. Every word must be a key of `word_indices`.
        sequence_length: number of time steps in the model input.
        diversity: sampling temperature handed to `sample`.
        quantity: number of words to generate.

    Returns:
        The generated words only (the seed is not included), each preceded by
        a single space.

    Raises:
        KeyError: if a seed word is missing from `word_indices`.
    """
    # Accept the seed as a word list as well as a string.
    sentence = seed.split(" ") if isinstance(seed, str) else list(seed)
    # Size the one-hot axis from the mapping that was passed in, not from a
    # notebook-level global.
    vocab_size = len(word_indices)

    generated_words = []
    for _ in range(quantity):
        # One-hot encode the current window.
        x_pred = np.zeros((1, sequence_length, vocab_size))
        for t, word in enumerate(sentence):
            x_pred[0, t, word_indices[word]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_word = indices_word[sample(preds, diversity)]

        # Slide the window forward by one word.
        sentence = sentence[1:] + [next_word]
        generated_words.append(next_word)

    return "".join(" " + word for word in generated_words)

In [0]:
# Test if the sentence has exactly 2 nouns

def test_noun(tagged_text):
  num_noun = 0
  for word, tag in tagged_text:
    if tag == 'NN':
      num_noun += 1
  if num_noun == 2:
    return True
  else:
    return False

In [0]:
# Generate Sentences

generated = 0
sentence_list = []

# Seeds are drawn from every window, train and test alike. Build the combined
# list once instead of concatenating the two lists twice per iteration.
seed_pool = sentences + sentences_test

# The loop only ends once NUM_GENERATE_TEXT generated sentences have passed
# the two-noun test; rejected sentences are simply discarded.
while generated < NUM_GENERATE_TEXT:
  seed_index = np.random.randint(len(seed_pool))
  # Each pool entry is a list of words, but generate_text splits its seed on
  # spaces and the concatenation below needs a string, so join it first.
  seed = " ".join(seed_pool[seed_index])
  new_sentence = seed + generate_text(model, indices_word, word_indices, seed, SEQUENCE_LEN, DIVERSITY, QUANTITY)
  tagged_sentence = postag(TAGGER, new_sentence)
  if test_noun(tagged_sentence):
    generated += 1
    sentence_list.append(tagged_sentence)

In [0]:
# Retrieve identified objects

# One object name per line. The `with` block closes the file even if reading
# fails, and blank lines are skipped so they cannot become empty object names.
with open("identified.txt", "r") as file:
  objects = [obj.strip() for obj in file if obj.strip()]

# ReplaceNoun reads objects[0] and objects[1]; fail here with a clear message
# rather than with an IndexError deep inside it.
if len(objects) < 2:
  raise ValueError("identified.txt must list at least two objects, one per line")

In [0]:
# Replace nouns in the generated sentences with identified objects and calculate sums of cosine similarities

def ReplaceNoun(objects, tag_sentences):
  """Swap the first two 'NN' tokens of every sentence for the two objects.

  For each tagged sentence the first two tokens tagged 'NN' are compared with
  objects[0] and objects[1] through `embed_model.similarity`. The noun whose
  similarity ratio (to objects[0] over objects[1]) is larger is replaced by
  objects[0]; the other noun is replaced by objects[1].

  Args:
    objects: sequence whose first two items are the replacement words.
    tag_sentences: iterable of sentences, each a sequence of (word, tag) pairs
      with at least two 'NN' tags.

  Returns:
    (replaced, sum_similarity): the rewritten sentences, each a string in
    which every token is preceded by one space, and for each sentence the sum
    of the two similarities belonging to the chosen pairing.
  """
  replaced = []
  sum_similarity = []
  for sent in tag_sentences:
    noun_idx = [pos for pos, pair in enumerate(sent) if pair[1] == 'NN']

    # Similarities of the first noun to both objects, and their ratio.
    first_noun = sent[noun_idx[0]][0]
    first_to_obj0 = float(embed_model.similarity(first_noun, objects[0]))
    first_to_obj1 = float(embed_model.similarity(first_noun, objects[1]))
    relative_a = first_to_obj0 / first_to_obj1

    # Same for the second noun.
    second_noun = sent[noun_idx[1]][0]
    second_to_obj0 = float(embed_model.similarity(second_noun, objects[0]))
    second_to_obj1 = float(embed_model.similarity(second_noun, objects[1]))
    relative_b = second_to_obj0 / second_to_obj1

    if relative_a > relative_b:
      # First noun leans more towards objects[0].
      slot_for_obj0, slot_for_obj1 = noun_idx[0], noun_idx[1]
      sum_similarity.append(first_to_obj0 + second_to_obj1)
    else:
      # Second noun leans more towards objects[0] (or the ratios tie).
      slot_for_obj0, slot_for_obj1 = noun_idx[1], noun_idx[0]
      sum_similarity.append(first_to_obj1 + second_to_obj0)

    # Rebuild the sentence with the two nouns substituted.
    pieces = []
    for pos, pair in enumerate(sent):
      if pos == slot_for_obj0:
        pieces.append(objects[0])
      elif pos == slot_for_obj1:
        pieces.append(objects[1])
      else:
        pieces.append(pair[0])
    replaced.append("".join(" " + piece for piece in pieces))
  return replaced, sum_similarity


In [0]:
# candidate_sentence[i] is sentence_list[i] with its two nouns replaced by the
# identified objects; sim_score[i] is the similarity sum for that sentence.
candidate_sentence, sim_score = ReplaceNoun(objects, sentence_list)

In [0]:
# Find the optimal sentence

# Highest similarity sum seen so far and the index of the sentence it belongs
# to. Starting from 0.0 means index 0 is kept when no score is positive.
watermark = 0.0
choice = 0

# Walk the scores that actually exist rather than range(NUM_GENERATE_TEXT), so
# the scan cannot raise IndexError if the constant and the score list disagree
# (for example after changing the constant without re-running generation).
for idx, score in enumerate(sim_score):
  if score > watermark:
    watermark = score
    choice = idx

In [0]:
# Write the best candidate; [1:] drops the leading space that ReplaceNoun puts
# in front of the first token. The `with` block closes the file even if the
# write fails.
with open("output.txt", "w") as file1:
  file1.write(candidate_sentence[choice][1:])