In [1]:
# Importing ML libraries
import numpy as np
from collections import Counter
from gensim.models import Word2Vec  #For word2vec
import pandas as pd
import re

In [6]:
# Import Deep Learning Libraries

import keras
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Activation, Dense, Flatten
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import one_hot
from keras.utils import np_utils

In [4]:
# Getting a random noise as the initialization for hidden state

n = 64 #Dimension of output from the Transform layer of the TransNet
h0 = np.random.normal(0,1,39).reshape(1,39) 

In [5]:
h0.shape

(1, 39)

### Getting the embeddings for required data

In [5]:
review_tips_data = pd.read_csv('review_tips.csv', index_col=0)

In [6]:
review_tips_data.head()

Unnamed: 0,reviews,tips
0,Location is everything and this hotel has it! ...,Awesome location!!
1,Location is everything and this hotel has it! ...,The cheap room advertised on the window is for...
2,Location is everything and this hotel has it! ...,Free wifi and full on continental breakfast wi...
3,This is a fairly new property I think. It is a...,"Good price, good location"
4,Location location location! \n\nMotel One is j...,"Nice location, but the beds are very hard agai..."


#### Input Embeddings

In [7]:
# Getting embeddings for all the reviews

review_data = list(review_tips_data['reviews'])

In [8]:
# Using a sample of data for checking if LSTM works

review_data_sample = review_data[:5]
print len(review_data_sample)

5


In [9]:
# Generating embeddings for the review data
words_list = []
for i in range(len(review_data_sample)):
    sentences = review_data_sample[i].split('.')
    word_sentences = [x.split() for x in sentences]
    words_list.append(word_sentences)

In [10]:
# Modifying this one
words_list = []
for i in range(len(review_data_sample)):
    sentences = review_data_sample[i].split('.')
    sentences = [re.sub(r'[^\w\s]','',sentence) for sentence in sentences]
    word_sentences = [x.split() for x in sentences]
    words_list.append(word_sentences)

In [11]:
np.array(words_list).shape

(5,)

In [12]:
embeddings = []
n_vocab = 0
for i in range(len(words_list)):
    w2v_model_sam = Word2Vec(words_list[i], min_count = 1, size = n)
    words = list(w2v_model_sam.wv.vocab)
    n_vocab += len(words)
    X = np.zeros(n)
    for w in words:
        X += w2v_model_sam[w]
    embeddings.append(X.reshape(1,n))

In [13]:
embeddings_np = np.array(embeddings)

In [14]:
embeddings_np.shape

(5, 1, 64)

#### Output Embeddings

In [15]:
# Generating the labels 

tips_data = list(review_tips_data['tips'])

In [16]:
tips_data_sam = tips_data[:5]

In [17]:
tips_data_sam = [re.sub(r'[^\w\s]','',sentence) for sentence in tips_data_sam]

In [18]:
# Generating word_lists for tips 

tips_word_list = []
for i in range(len(tips_data_sam)):
    words = tips_data_sam[i].split()
    tips_word_list.append(words)

In [19]:
tips_word_list = [tips_word_list]

tips_word_list = np.array(tips_word_list)
tips_word_list = tips_word_list.reshape(1,len(tips_word_list),)

In [20]:
# Generating embeddings for tips

tips_embeddings = {}
for i in range(len(tips_word_list)):
    w2v_model_tips = Word2Vec(tips_word_list[i], min_count = 1, size = n)
    words = list(w2v_model_tips.wv.vocab)
    for w in words:
        tips_embeddings[w] = w2v_model_tips[w]

In [21]:
# Generating label for LSTM

label_words = [x.split(',') for x in tips_data_sam]

In [22]:
label_words

[['Awesome location'],
 ['The cheap room advertised on the window is for basic room offseason'],
 ['Free wifi and full on continental breakfast with eggs'],
 ['Good price good location'],
 ['Nice location but the beds are very hard against your back']]

In [23]:
Y = []
n_vocab = 0
for t in label_words:
    sent = ''.join(t)
    words_tip = sent.split()
    tip_emb = []
    n_vocab += len(words_tip)
    for w in words_tip:
        tip_emb.append(tips_embeddings[w].reshape(n))
    
    Y.append(np.array(tip_emb))
    

In [24]:
# labels is the label for our output data
labels = np.array(Y)

In [25]:
labels[1].shape

(12, 64)

### Building the network

In [26]:
print n_vocab
print embeddings_np.shape

38
(5, 1, 64)


In [115]:
# Building the LSTM model

model = Sequential()
model.add(LSTM(256,input_shape = (1,39),return_sequences = False,return_state=True)(initial_state=Input(shape=(1,39)))) # This may put the output as the input in the next time step
# model.add(Dropout(0.2)) 
#model.add(Flatten())
# model.add(Dense(n_vocab))
# model.add(Activation('softmax'))
# model.compile(loss = 'mean_squared_error', optimizer= 'adadelta')

TypeError: __call__() takes at least 2 arguments (2 given)

In [119]:
inputs = Input(shape=(16,1024))
encoder = LSTM(256, return_state=True, return_sequences=True)
outputs = encoder(inputs)
output, state = outputs[0], outputs[1:]
decoder = LSTM(256, return_sequences=True)(output, initial_state=state)
output_words = keras.layers.TimeDistributed(Dense(activation='softmax'))(decoder)
model = Model(inputs, output_words)

TypeError: __init__() takes at least 2 arguments (2 given)

In [27]:
from keras.models import Model
from keras.layers import Input

In [198]:
decoder_inputs = Input( (1,100))

In [199]:
decoder_lstm = LSTM(39,return_sequences = True, return_state= True)

In [200]:
initial = Input((39,))

In [201]:
decoder_output,_,_ = decoder_lstm(decoder_inputs)

In [202]:
decoder_dense = Dense(10,activation='softmax')

In [203]:
decoder_output = decoder_dense(decoder_output)

In [204]:
model = Model(decoder_inputs,decoder_output)

In [205]:
model.compile(optimizer='adadelta', loss = 'categorical_crossentropy')

In [None]:
model.fit(h0, )

In [197]:
h0.shape

(1, 39)

In [207]:
embeddings_np.shape

(5, 1, 64)

In [76]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 1, 256)            328704    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 256)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 38)             9766      
_________________________________________________________________
activation_1 (Activation)    (None, 1, 38)             0         
Total params: 338,470
Trainable params: 338,470
Non-trainable params: 0
_________________________________________________________________


In [77]:
# Creating checkpoint to save the weight files

dest_path = 'weights_file-{epoch:02d}-{loss:.4f}.hdf5'
checkpoint = ModelCheckpoint(dest_path,monitor = 'loss',save_best_only= True, mode= 'min')
callback_list = [checkpoint]

In [81]:
model.fit(embeddings_np, labels, batch_size = 1, epochs= 10, callbacks = callback_list )

ValueError: Error when checking target: expected activation_2 to have shape (None, 38) but got array with shape (5, 1)

# What to do next

- Import tips.json file and choose the first tip as y - Done
- Check if it's actually using output as the input in the next sequence
- Display the output