In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [43]:
test = pd.read_csv("../../../data/external/Yelp/test.csv")
train = pd.read_csv("../../../data/external/Yelp/train.csv")

In [44]:
def tokenize_corpus(corpus, num_words=1000, oov_token="<OOV>"):
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(corpus)
    return tokenizer

def create_review_corpus(df):
    # remove punctuation
    df["text"] = df["text"].str.replace("[^\w\s]", "")
    # remove numbers
    df["text"] = df["text"].str.replace("\d+", "")
    # make lowercase
    df["text"] = df["text"].str.lower()
    # make one long line
    df["text"] = df["text"].str.replace("\s+", " ")
    # remove leading and trailing whitespace
    df["text"] = df["text"].str.strip()
    # remove any empty reviews
    df = df[df["text"] != ""]

    return df["text"].values

create_review_corpus(train[:10])




  df["text"] = df["text"].str.replace("[^\w\s]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].str.replace("[^\w\s]", "")
  df["text"] = df["text"].str.replace("\d+", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].str.replace("\d+", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["t

array(['unfortunately the frustration of being dr goldbergs patient is a repeat of the experience ive had with so many other doctors in nyc good doctor terrible staff it seems that his staff simply never answers the phone it usually takes hours of repeated calling to get an answer who has time for that or wants to deal with it i have run into this problem with many other doctors and i just dont get it you have office workers you have patients with medical needs why isnt anyone answering the phone its incomprehensible and not work the aggravation its with regret that i feel that i have to give dr goldberg stars',
       'been going to dr goldberg for over years i think i was one of his st patients when he started at mhmg hes been great over the years and is really all about the big picture it is because of him not my now former gyn dr markoff that i found out i have fibroids he explores all options with you and is very patient and understanding he doesnt judge and asks all the right que

In [45]:
# get reviews which are shorter than 50 words
short_reviews = train[train["text"].str.split().str.len() < 50]
# print average length of short reviews
print(short_reviews["text"].str.split().str.len().mean())

test_review_corpus = create_review_corpus(short_reviews[:10000])
test_tokenizer = tokenize_corpus(test_review_corpus, num_words=10000, oov_token="<OOV>")
print(test_tokenizer.num_words)


29.522418806660568


  df["text"] = df["text"].str.replace("[^\w\s]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].str.replace("[^\w\s]", "")
  df["text"] = df["text"].str.replace("\d+", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].str.replace("\d+", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["t

10000


In [46]:
# get average length of reviews
review_lengths = [len(review.split()) for review in test_review_corpus]
print(np.mean(review_lengths))

28.782056411282255


In [61]:
# get n_grams
def get_n_grams(corpus, tokenizer, length_of_input):
    n_grams = []
    for review in corpus:
        tokens = tokenizer.texts_to_sequences([review])[0]
        for i in range(len(tokens)+1):
            n_grams.append(tokens[:i+1])
    
    # pad sequences
    n_grams = pad_sequences(n_grams, maxlen=length_of_input, padding="pre")

    # split sequences by input and output
    n_grams = np.array(n_grams)
    X = n_grams[:,:-1]
    y = n_grams[:,-1]
    y = tf.keras.utils.to_categorical(y, num_classes=tokenizer.num_words)

    return X, y

# max_length_of_input = max([len(i) for i in test_review_corpus])
max_length_of_input = 50
inputs, outputs = get_n_grams(test_review_corpus, test_tokenizer, length_of_input=max_length_of_input)


In [62]:
print(inputs.shape)
print(outputs.shape)

(297767, 49)
(297767, 10000)


In [63]:
def create_model(n_grams, num_words, max_len, embedding_dim=64):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(num_words, embedding_dim, input_length=max_len-1),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(outputs.shape[1], activation="softmax")
    ])

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

model = create_model(inputs, test_tokenizer.num_words, max_length_of_input)

# Set up early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, mode='max', restore_best_weights=True)

# Train the model with validation data
history = model.fit(inputs, outputs, epochs=100, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# save best model
model.save("../../../models/yelp/review-generator.h5")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


In [65]:
seed_text = "bad food"

def generate_text(model, tokenizer, seed_text, max_len, num_words):
    for _ in range(max_len):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding="pre")
        predicted = model.predict(token_list, verbose=0)
        predicted = np.random.choice(num_words, p=predicted[0])
        # print(predicted)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

model = tf.keras.models.load_model("../../../models/yelp/review-generator.h5")
print(generate_text(model, test_tokenizer, seed_text, max_length_of_input, test_tokenizer.num_words))

bad food the interior is super sundae <OOV> when the service and ambiance is large and prices who didnt waste your time i expected because i will your beat that it had a day to listen to sedona next time the sandwich to salsa thanks experience it was salad time try go
