In [1]:
import pandas as pd
import numpy as np

In [2]:
neg = pd.read_csv("/data/negative.csv")
pos = pd.read_csv("/data/positive.csv")
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

In [3]:
neg_drop_df = neg_df.drop_duplicates(inplace = False)
pos_drop_df = pos_df.drop_duplicates(inplace = False)

In [4]:
neg_df = neg_df.sample(n=25000)
pos_df = pos_df.sample(n=25000)


In [5]:
neg_df.head()
neg_df = neg_df.rename(columns={"Negative" : "Score"})

In [6]:
pos_df = pos_df.rename(columns={"Positive" : "Score"})

In [7]:
combined_df = pd.concat([pos_df, neg_df])

In [8]:
combined_df = combined_df.reset_index(drop= True)

In [9]:
max_length= combined_df.reviews.map(lambda x: len(x)).max()

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(analyzer='word', ngram_range=(1,1),
                      token_pattern=r'\b\w{1,}\b', min_df=27,
                      strip_accents='ascii', encoding='utf-8',
                      stop_words='english')

In [11]:
vect.fit(combined_df['reviews'])
word_dict = vect.vocabulary_

In [12]:
import json
len(word_dict)

8073

In [13]:
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)

In [14]:
import json
json = json.dumps(word_dict, cls=MyEncoder)
#json = json.dumps(dict)
f = open("/output/google_testdic.json","w")
f.write(json)
f.close()

# loaded_words = json.loads(words)
# type(words) #Output str
# type(loaded_words) #Output dict


In [15]:
def encode_sentence(text):
    result = []
    arr = text_to_word_sequence(text, lower=True, split=" ")
    for word in arr:
        w = encode_word(word)
        if w is not None:
            result.append(w)
    return result


In [16]:
def encode_word(word):
    if word not in word_dict:
        return None
    return word_dict[word]

In [17]:
def build_dataset(max_len):
    Xts = combined_df["reviews"].values
    arr = []
    for text in Xts:
        arr.append(encode_sentence(text))
    X = sequence.pad_sequences(arr, maxlen=max_len)
    y = combined_df["Score"].values
    return (X, y)

In [18]:
from keras.models import Sequential
from keras.layers import MaxPooling1D, Conv1D, Flatten, Dropout, Dense
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [19]:
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence

In [20]:
def build_model(words, vec_len, review_len):
    model = Sequential()
    model.add(Embedding(words, vec_len, input_length=review_len))
    model.add(Dropout(0.25))
    model.add(Conv1D(32, 3, padding="same"))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(16, 3, padding="same"))
    model.add(Flatten())
    model.add(Dropout(0.25))
    model.add(Dense(100, activation="sigmoid"))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()
    return model

In [21]:
# Parameters
version = 4
words = len(word_dict)
review_len = 1000
vec_len = 300
patience = 5
batch_size = 40
epochs = 3

In [22]:
# Load data
X, y = build_dataset(review_len)

In [23]:
# Build model
model = build_model(words, vec_len, review_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 300)         2421900   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000, 300)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 32)          28832     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 500, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 16)           1552      
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 8000)              0         
__________

In [None]:
# Early stopping
early_stopping_monitor = EarlyStopping(patience=patience, monitor="loss", mode="auto")

In [None]:
# Fit model
model.fit(X, y, epochs=epochs, callbacks=[early_stopping_monitor], batch_size=batch_size, verbose=2, validation_split=0.25)

Train on 37500 samples, validate on 12500 samples
Epoch 1/3
74s - loss: 0.2219 - acc: 0.9083 - val_loss: 0.2310 - val_acc: 0.8997
Epoch 2/3
33s - loss: 0.1191 - acc: 0.9561 - val_loss: 0.2443 - val_acc: 0.8940
Epoch 3/3
33s - loss: 0.0774 - acc: 0.9718 - val_loss: 0.2548 - val_acc: 0.9179


<keras.callbacks.History at 0x7f694c3e0828>

In [None]:
import tensorflow as tf

In [None]:
model_builder = tf.saved_model.builder.SavedModelBuilder("GoogleTestModel")

In [None]:
inputs = {
    'input': tf.saved_model.utils.build_tensor_info(model.input)
}

In [None]:
outputs = {
    'batch': tf.saved_model.utils.build_tensor_info(model.output)
}

In [None]:
signature_def = tf.saved_model.signature_def_utils.build_signature_def(
    inputs=inputs,
    outputs=outputs,
    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
)

In [None]:
from keras import backend as K
model_builder.add_meta_graph_and_variables(
    K.get_session(),
    tags=[tf.saved_model.tag_constants.SERVING],
    signature_def_map={tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def}
)

INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.


In [None]:
model_builder.save()

INFO:tensorflow:SavedModel written to: b'GoogleTestModel/saved_model.pb'


b'GoogleTestModel/saved_model.pb'

In [None]:
# from keras.preprocessing import sequence
# from keras.models import load_model

In [None]:
def encode_batch(arr):
    result = []
    for sentence in arr:
        result.append(encode_sentence(sentence))
    return sequence.pad_sequences(result, maxlen=review_len)

In [None]:
def predict_batch(arr):
    batch = encode_batch(arr)
    result = model.predict(batch, batch_size=len(batch), verbose=0)
    return result

In [None]:
print(predict_batch([
"yes",
"good",
"this is the best thing ever",
"nice",
"bad",
"such a horrible judgement",
"no",
"shitty"
]))

[[ 0.80668312]
 [ 0.82716978]
 [ 0.74110323]
 [ 0.94436091]
 [ 0.06486244]
 [ 0.06973273]
 [ 0.56603307]
 [ 0.56603307]]


In [None]:
print(predict_batch(["The rooms are big, the general set up of the outside is great, good colors, nice design but the food in the water trough and the breakfast are very average. The free breakfast is very basic with over cooked and crumbly scrambled eggs, cheap bread and flavored yogurt, very disappointing for a room at over 300 dollars. No real info from front desk for transport to airport or downtown."]))

[[ 0.7682057]]


In [None]:
print(predict_batch(["The door to my room was left open by housekeeping and my computer was stolen. Management swore up and down that they would make things right ?!?! After being asked my story and made to send invoices the Insurance refused any culpable negligence...management quickly followed suit and refused to help i anyway.This MOTEL has declined quickly and become very dilapidated in a short time because of poor maintenance(my room safe was broken) there are cockroaches in bathroom. Do not stay here."]))

[[ 0.00188351]]


In [None]:
print(predict_batch(["My family and I stayed at this hotel last weekend for a football game. Location is close to the university and easy access to I-35. Staff was friendly and offered me a late checkout, which most hotels are not willing to do. Room is spacious and there is a full kitchen, though we did not need it for this trip. We will choose this hotel again the next time we are on town."]))

[[ 0.98797995]]


In [None]:
# import h5py


In [None]:
# #save model
# model.save("/output/optimalfloyds3.h5")