**Imports**


In [None]:
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Flatten, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle
import re
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize

# Download NLTK's "popular" data collection; the stop-word list and the
# tokenizer / POS-tagger used by later cells rely on NLTK data being present.
nltk.download('popular')
# Load spaCy's small English pipeline once; `nlp` is reused by later cells
# to tokenize titles and look up token vectors.
nlp = spacy.load("en_core_web_sm")


Using TensorFlow backend.


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

**Code for Cleaning up Titles and Averaging the Vectors**
1. Remove all punctuation, numbers in titles
2. Tokenize titles by space
3. Import word2vec/glove vectors and average them



In [None]:
def generateAverageVector(test_string, nlp_pipeline=None, vector_dim=96):
  """Return the mean token vector of a cleaned-up title.

  Every run of non-alphanumeric characters in ``test_string`` is replaced by a
  single space, the result is run through the spaCy pipeline, and the vectors
  of all tokens that report ``has_vector`` are averaged.

  Args:
    test_string: Raw title text.
    nlp_pipeline: Optional callable mapping text to an iterable of tokens that
      expose ``has_vector`` and ``vector``. Defaults to the notebook-level
      ``nlp`` pipeline.
    vector_dim: Length of the zero vector returned when no token has a vector
      (96 matches the size previously hard-coded here).

  Returns:
    A 1-D float64 ``numpy`` array holding the average vector, or a zero vector
    of length ``vector_dim`` when the title yields no usable tokens.
  """
  if nlp_pipeline is None:
    nlp_pipeline = nlp  # notebook-level pipeline loaded in the imports cell

  cleaned = re.sub('[^A-Za-z0-9]+', ' ', test_string)
  vectors = [token.vector for token in nlp_pipeline(cleaned) if token.has_vector]

  # Previously an empty title divided by zero and returned a vector of NaNs;
  # fall back to an all-zero vector instead.
  if not vectors:
    return np.zeros(vector_dim)

  # Accumulate in float64, as the original zero-initialised running sum did.
  return np.mean(vectors, axis=0, dtype=np.float64)


# Code for POS Filtering
1. Separate title into POS using NLTK
2. Remove words in list of POS
3. Return filtered title

POS Tags
POS tag list:

CC coordinating conjunction
CD cardinal digit
DT determiner
EX existential there (like: "there is" ... think of it like "there exists")
FW foreign word
IN preposition/subordinating conjunction
JJ adjective 'big'
JJR adjective, comparative 'bigger'
JJS adjective, superlative 'biggest'
LS list marker 1)
MD modal could, will
NN noun, singular 'desk'
NNS noun plural 'desks'
NNP proper noun, singular 'Harrison'
NNPS proper noun, plural 'Americans'
PDT predeterminer 'all the kids'
POS possessive ending parent's
PRP personal pronoun I, he, she
PRP$ possessive pronoun my, his, hers
RB adverb very, silently,
RBR adverb, comparative better
RBS adverb, superlative best
RP particle give up
TO to go 'to' the store.
UH interjection errrrrrrrm
VB verb, base form take
VBD verb, past tense took
VBG verb, gerund/present participle taking
VBN verb, past participle taken
VBP verb, sing. present, non-3d take
VBZ verb, 3rd person sing. present takes
WDT wh-determiner which
WP wh-pronoun who, what
WP$ possessive wh-pronoun whose
WRB wh-adverb where, when



In [None]:

# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Penn Treebank tag groups that can be passed to filterTitle as its POS list.
# Note that `nouns` also includes personal pronouns (PRP).
nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'PRP']
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
# Adjectives (JJ*) together with adverbs (RB*).
adjv = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']

def filterTitle(title, POS, max_words=10):
    """Keep only the words of ``title`` whose part-of-speech tag is in ``POS``.

    The title is word-tokenized with NLTK, English stop words are dropped,
    the remaining words are POS-tagged, and the words carrying one of the
    requested tags are joined back into a single space-separated string.

    Args:
        title: Title text to filter.
        POS: Collection of POS tags to keep (e.g. ``nouns``, ``verbs``, ``adjv``).
        max_words: Maximum number of kept words in the result (default 10,
            the value previously hard-coded).

    Returns:
        The filtered title as a string; empty if no word matches.
    """
    wordsList = nltk.word_tokenize(title)

    # The NLTK stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" or "Very" survive.
    wordsList = [w for w in wordsList if w.lower() not in stop_words]

    # Tag what is left and keep only the requested parts of speech.
    tagged = nltk.pos_tag(wordsList)
    filtered = [word for word, tag in tagged if tag in POS]
    return ' '.join(filtered[:max_words])



**Creating LSTM Input**

The method cleans up each title and stacks its word vectors into an n x 96 matrix per title, where n is the prespecified number of words per title.


In [None]:
import re
import spacy
import numpy as np

def generate_ltsm_input(titles, POS=None, plen=10, nlp_pipeline=None):
    """Turn a list of titles into a fixed-size tensor of word vectors.

    Each title is stripped of punctuation and of any word containing a digit,
    filtered by part of speech with ``filterTitle``, and tokenized with spaCy.
    The first ``plen`` tokens contribute their 96-dimensional vectors; tokens
    without a vector and unused trailing slots are left as zero vectors.

    Args:
        titles: Iterable of Airbnb titles (strings). Non-string entries
            (e.g. None or NaN for a missing title) are treated as empty titles.
        POS: POS tags to keep; defaults to the notebook-level ``adjv`` list,
            which was previously hard-coded.
        plen: Number of word slots per title (default 10).
        nlp_pipeline: Optional pre-loaded spaCy pipeline. When omitted,
            ``en_core_web_sm`` is loaded on every call, as before.

    Returns:
        A float64 ``numpy`` array of shape ``(len(titles), plen, 96)``.
    """
    vector_dim = 96  # vector size this notebook assumes for en_core_web_sm

    if nlp_pipeline is None:
        nlp_pipeline = spacy.load("en_core_web_sm")
    if POS is None:
        POS = adjv

    def stripTitle(doc):
        # Drop punctuation first, then every word that contains a digit.
        strip_punc = re.sub(r'[^\w\s]', '', doc)
        return re.sub(r'\w*\d\w*', '', strip_punc).strip()

    titles = list(titles)
    # Allocate the whole result up front rather than growing it with
    # np.vstack for every token and title, which copied the array each time.
    titlevecs = np.zeros(shape=(len(titles), plen, vector_dim))

    for row, title in enumerate(titles):
        # Guard against missing titles, which would make re.sub raise TypeError.
        cleaned = stripTitle(title) if isinstance(title, str) else ""
        tokens = nlp_pipeline(filterTitle(cleaned, POS))
        for slot in range(min(plen, len(tokens))):
            if tokens[slot].has_vector:
                titlevecs[row, slot] = tokens[slot].vector
            # otherwise the slot keeps its zero "null token" vector

    return titlevecs


**Code for Creating Training Data**
1. Loop through dataset(s) for titles. 
2. Generate title vectors for each title and create X_train. 
3. Normalize average reviews and create Y_train. 


In [None]:
import pandas as pd

import spacy


# Load the "high" and "low" listing CSVs for each borough. The paths are
# relative, so the files must sit in the notebook's working directory
# (the captured output of this cell shows a FileNotFoundError when they do not).
brooklynhigh = pd.read_csv('Brooklyn_high.csv', encoding = "ISO-8859-1")
brooklynlow = pd.read_csv('Brooklyn_low.csv', encoding = "ISO-8859-1")

manhattanhigh = pd.read_csv('Manhattan_high.csv', encoding = "ISO-8859-1")
manhattanlow = pd.read_csv('Manhattan_low.csv', encoding = "ISO-8859-1")

queenshigh = pd.read_csv('Queens_high.csv', encoding = "ISO-8859-1")
queenslow = pd.read_csv('Queens_low.csv', encoding = "ISO-8859-1")

bronxhigh = pd.read_csv('Bronx_high.csv', encoding = "ISO-8859-1")
bronxlow = pd.read_csv('Bronx_low.csv', encoding = "ISO-8859-1")

statenhigh = pd.read_csv('StatenIsland_high.csv', encoding = "ISO-8859-1")
statenlow = pd.read_csv('StatenIsland_low.csv', encoding = "ISO-8859-1")

dataset = brooklynhigh  # put the dataset you want to use here
# Titles come from the 'name' column; each one becomes a block of word vectors.
titles = [title for title in dataset['name']]
print(len(titles))
combined_data = generate_ltsm_input(titles)  # dataset
# Shape of the array returned by generate_ltsm_input for the chosen dataset.
print(combined_data.shape)

# combined_data = pd.DataFrame(summed_vectors)



FileNotFoundError: ignored

**Creating Categorical Output**

The following method assigns each continuous output value to one of five equally sized discrete bins based on its position in the (sorted) data, and one-hot encodes the bin.

In [None]:

def convert_categorical(dataset, n_bins=5):  # dataset must be sorted
    """One-hot encode each entry of ``dataset`` into equal-sized rank bins.

    Bins are assigned purely by position: the first ``len(dataset) / n_bins``
    entries go to bin 0, the next group to bin 1, and so on. The values
    themselves are never inspected, which is why the input must already be
    sorted for the bins to be meaningful.

    Args:
        dataset: Sized collection of output values, sorted.
        n_bins: Number of bins / one-hot columns (default 5, the value
            previously hard-coded).

    Returns:
        A ``numpy`` array of shape ``(len(dataset), n_bins)`` with exactly one
        1 per row. Unlike before, the all-zero seed row is no longer included,
        so the rows line up one-to-one with the entries of ``dataset``.
    """
    n_rows = len(dataset)
    one_hot = np.zeros((n_rows, n_bins))
    if n_rows == 0:
        return one_hot

    positions = np.arange(n_rows)
    # Integer form of int(i / (n_rows / n_bins)), avoiding float division.
    bin_ids = (positions * n_bins) // n_rows
    one_hot[positions, bin_ids] = 1
    return one_hot

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Features: the per-title word-vector array built in the earlier cell.
X_df = combined_data
# Target: the raw 'reviews_per_month' column.
# NOTE(review): convert_categorical is not applied here, so the target stays continuous — confirm intent.
Y_df = dataset['reviews_per_month']

# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X_df, Y_df, test_size=0.1, random_state=99)
print(Y_df.shape)

**Code for Training Keras Model**


In [None]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Pass the dataset you want to use as the `dataset` argument below.
def eval_lstm(dataset, epochs=100, batch_size=100, test_size=0.25,
              loss='hinge', output_activation='sigmoid'):
  """Train an LSTM on the title vectors of ``dataset`` and return the model.

  Titles from the 'name' column are converted to word-vector sequences with
  ``generate_ltsm_input``; the 'reviews_per_month' column is the target.

  Args:
    dataset: DataFrame with 'name' and 'reviews_per_month' columns.
    epochs: Number of training epochs (default 100).
    batch_size: Mini-batch size (default 100).
    test_size: Fraction of rows held out by the split (default 0.25).
    loss: Keras loss name (default 'hinge', as before).
    output_activation: Activation of the output layer (default 'sigmoid').

  Returns:
    The fitted Keras ``Sequential`` model. The held-out split is not returned.
  """
  titles = list(dataset['name'])
  print(len(titles))
  X_df = generate_ltsm_input(titles)

  # A pandas Series is 1-D, so the former `Y_df.shape[1]` raised IndexError.
  # Reshape the target into an explicit (n, 1) column instead.
  # NOTE(review): missing values in 'reviews_per_month' would pass through as NaN — confirm upstream cleaning.
  Y_df = np.asarray(dataset['reviews_per_month'], dtype=float).reshape(-1, 1)

  X_train, X_test, Y_train, Y_test = train_test_split(
      X_df, Y_df, test_size=test_size, random_state=99)

  # NOTE(review): hinge loss with a sigmoid output and an 'accuracy' metric
  # does not suit a continuous target; the defaults are kept for backward
  # compatibility, but consider e.g. loss='mse' with a linear output.
  model = Sequential()
  model.add(LSTM(100, input_shape=(X_df.shape[1], X_df.shape[2])))
  model.add(Dropout(0.2))
  model.add(Dense(Y_df.shape[1], activation=output_activation))
  model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
  model.summary()  # prints itself; wrapping it in print() also printed "None"

  # `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
  model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)
  return model


In [None]:
# Reload the Brooklyn "high" listings and train an LSTM on them.
brooklynhigh = pd.read_csv('Brooklyn_high.csv', encoding = "ISO-8859-1")
brooklynhigh_model = eval_lstm(brooklynhigh)
# NOTE(review): X_test is the notebook-level split from an earlier cell
# (test_size=0.1), not the split made inside eval_lstm, so these rows are not
# guaranteed to be unseen by the model — confirm intent.
brooklynhigh_model.predict(X_test[1:])

In [None]:
import pandas as pd

# Load the "high" and "low" listing CSVs for each borough (same files as the
# earlier data-loading cell).
brooklynhigh = pd.read_csv('Brooklyn_high.csv', encoding = "ISO-8859-1")
brooklynlow = pd.read_csv('Brooklyn_low.csv', encoding = "ISO-8859-1")

manhattanhigh = pd.read_csv('Manhattan_high.csv', encoding = "ISO-8859-1")
manhattanlow = pd.read_csv('Manhattan_low.csv', encoding = "ISO-8859-1")

queenshigh = pd.read_csv('Queens_high.csv', encoding = "ISO-8859-1")
queenslow = pd.read_csv('Queens_low.csv', encoding = "ISO-8859-1")

bronxhigh = pd.read_csv('Bronx_high.csv', encoding = "ISO-8859-1")
bronxlow = pd.read_csv('Bronx_low.csv', encoding = "ISO-8859-1")

statenhigh = pd.read_csv('StatenIsland_high.csv', encoding = "ISO-8859-1")
statenlow = pd.read_csv('StatenIsland_low.csv', encoding = "ISO-8859-1")

# All ten frames collected in one list.
# NOTE(review): `datasets` is not used in the visible cells; only brooklynhigh is trained on below.
datasets = [brooklynhigh, brooklynlow, manhattanhigh, manhattanlow, queenshigh, queenslow, 
            bronxhigh, bronxlow, statenhigh, statenlow]

brooklynhigh_model = eval_lstm(brooklynhigh)


**Code for Testing Title Replacements Based on Best Possible Word**
1. Loop through each word in given title.
2. For each word, generate list of 10 best/closest words
3. Replace current word with each closest word and score the new title based on RNN
4. Return best title based on best score


In [None]:
# import spacy
!pip install sense2vec
# from sense2vec import Sense2VecComponent

# nlp = spacy.load("en_core_web_sm")
# s2v = Sense2VecComponent(nlp.vocab).from_disk("s2v_old")
# nlp.add_pipe(s2v)

def optimal_replacement(title, score_fn=None, nlp_pipeline=None, n_similar=10):
    """Find the best-scoring single-word replacement for ``title``.

    Each whitespace-separated word is looked up through the pipeline's
    ``_.in_s2v`` / ``_.s2v_most_similar`` extensions. Every suggested similar
    word is swapped in, one position at a time, and the resulting title is
    scored with ``score_fn``.

    Args:
        title: Original title text.
        score_fn: Callable mapping a title string to a comparable score.
            Required: the original body called the placeholder
            ``RNN/LTSM(testtitle)``, which raised NameError.
        nlp_pipeline: Optional pipeline whose docs expose the ``in_s2v`` and
            ``s2v_most_similar`` extensions. Defaults to the notebook-level
            ``nlp``. NOTE(review): presumably sense2vec's component must be
            added to the pipeline first; that setup is commented out — confirm.
        n_similar: Number of similar words requested per word (default 10).

    Returns:
        ``(besttitle, bestscore)``. The original title is the baseline, so if
        no replacement scores strictly higher the title is returned unchanged
        with its own score, rather than an empty string with a score of 0.

    Raises:
        ValueError: If ``score_fn`` is not supplied.
    """
    if score_fn is None:
        raise ValueError("optimal_replacement requires a score_fn that scores a title string")
    if nlp_pipeline is None:
        nlp_pipeline = nlp  # notebook-level spaCy pipeline

    words = title.split()
    besttitle = title
    bestscore = score_fn(title)

    for index, word in enumerate(words):
        span = nlp_pipeline(word)[:]
        if not span._.in_s2v:
            continue  # no suggestions for this word; leave it as is
        for similar in span._.s2v_most_similar(n_similar):
            replacement = similar[0][0]
            # Build the candidate on a copy so `words` is never left modified.
            candidate = words[:index] + [replacement] + words[index + 1:]
            testtitle = ' '.join(candidate)
            score = score_fn(testtitle)
            if score > bestscore:
                bestscore = score
                besttitle = testtitle

    return besttitle, bestscore

# print(most_similar)

Collecting sense2vec
[?25l  Downloading https://files.pythonhosted.org/packages/52/bf/5b776ad825e30e6fa5e86a74711caa84bde65b22047868e588290367253f/sense2vec-1.0.2.tar.gz (54kB)
[K     |████████████████████████████████| 61kB 2.2MB/s 
[?25hCollecting spacy<3.0.0,>=2.2.3
[?25l  Downloading https://files.pythonhosted.org/packages/47/13/80ad28ef7a16e2a86d16d73e28588be5f1085afd3e85e4b9b912bd700e8a/spacy-2.2.3-cp36-cp36m-manylinux1_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 9.4MB/s 
Collecting catalogue>=0.0.4
  Downloading https://files.pythonhosted.org/packages/e3/8e/9391f722c58dc202cb7980a3a1f0e2499cc91e1fbda2c17632dad1b6e299/catalogue-2.0.0-py3-none-any.whl
Collecting preshed<3.1.0,>=3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/db/6b/e07fad36913879757c90ba03d6fb7f406f7279e11dcefc105ee562de63ea/preshed-3.0.2-cp36-cp36m-manylinux1_x86_64.whl (119kB)
[K     |████████████████████████████████| 122kB 44.6MB/s 
[?25hCollecting blis<0.5.0,>=0.