In [None]:
import gensim
import keras.preprocessing as sequence
import numpy
import pandas
from keras import Sequential
from keras.layers import LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [None]:
"""
DATA PREPARATION CODE
"""
# Read the Yelp review dump; lines=True means one JSON record per line.
df = pandas.read_json("yelp_academic_dataset_review.json", lines=True)

# Discard the columns that are not used later, to save space and time.
df = df.drop(columns=['review_id', 'user_id', 'business_id', 'useful',
                      'funny', 'cool', 'date'])

# Keep only rows that have a star rating, then store the rating as an int.
df = df[df['stars'].notnull()]
df['stars'] = df['stars'].map(int)

# Keep only rows that have review text.
df = df[df['text'].notnull()]
print('dataset loaded with shape:', df.shape)

In [None]:
def tokenize_review(review):
    """Lower-case a review and split it into word tokens.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list
        The tokens produced by NLTK's ``word_tokenize`` (punkt) for the
        lower-cased text.
    """
    # Punctuation and stop words are kept: the optional filtering steps that
    # would remove them are currently disabled.
    return word_tokenize(review.lower())

In [None]:
def postprocess(data, n=1000000):
    """Tokenize the first ``n`` reviews for Gensim W2V.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'text'`` column holding the raw review strings.
    n : int, optional
        Number of leading rows to keep (default 1,000,000).

    Returns
    -------
    pandas.DataFrame
        A new frame with the first ``n`` rows of ``data`` plus a ``'tokens'``
        column holding ``tokenize_review(text)`` for each row. The input
        frame is not modified.
    """
    # Work on an explicit copy so that adding a column neither touches the
    # caller's frame nor triggers pandas' SettingWithCopyWarning.
    subset = data.head(n).copy()
    text = subset['text']
    # ``progress_map`` only exists once ``tqdm.pandas()`` has been called.
    # Use it when available (progress bar), otherwise fall back to ``map``.
    mapper = getattr(text, 'progress_map', text.map)
    subset['tokens'] = mapper(tokenize_review)
    return subset

In [None]:
# Keep the first n reviews (postprocess default: 1,000,000) and add the
# 'tokens' column; df is rebound to the returned frame.
df = postprocess(df)

In [None]:
# Load the pre-trained Word2Vec embeddings as read-only keyed vectors.
# binary=True because the file is stored in the word2vec binary format.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Generating embeddings for all the reviews in the dataset

data = list()
# For each review in the corpus. Iterate over the column values rather than
# indexing by row number: the null-filtering above leaves gaps in the index,
# so label-based lookups such as df[col][row] can raise KeyError.
for tokens in df['tokens']:
    review = []
    # For each token produced by tokenize_review (already lower-cased)
    for w in tokens:
        # Append the w2v vector for the word to the review embedding
        try:
            review.append(w2v_model.get_vector(w))
        except KeyError:
            # Word is not in the pre-trained vocabulary: skip it
            continue
    data.append(review)
# data is deliberately left as a list of per-review lists: reviews differ in
# length, so they cannot be packed into a rectangular array before padding.

# The frame only carries 'stars' and 'text' (plus 'tokens'); there is no
# 'label' column. The classifier below is binary (sigmoid + binary
# cross-entropy), so the star rating is binarised here.
# NOTE(review): treating >= 4 stars as the positive class is an assumption;
# confirm the intended threshold.
labels = numpy.array((df['stars'] >= 4).astype(int))

print(len(data))
print(len(labels))

vocab_size=3000000
# Take the embedding width from the loaded model instead of peeking at
# data[0][0], which fails when the first review has no in-vocabulary word.
embedding_dim = w2v_model.vector_size
print(embedding_dim)


In [None]:
# Neural network classifier model

model = Sequential()
# Each sample is a sequence of embedding vectors, i.e. (timesteps,
# embedding_dim). The LSTM therefore needs a 2-D per-sample input shape;
# None leaves the number of timesteps unspecified. Passing input_dim alone
# describes only the feature axis and is not accepted as a sequence input
# by current Keras releases.
model.add(LSTM(100, input_shape=(None, embedding_dim)))
# Single sigmoid unit: probability of the positive class
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None").
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Shuffle data and labels with the same permutation so that every review
# stays paired with its label. The global RNG state is captured once and
# restored before each in-place shuffle, so both shuffles consume the same
# random draws (this relies on data and labels having the same length).
rng_state = numpy.random.get_state()
for paired in (data, labels):
    numpy.random.set_state(rng_state)
    numpy.random.shuffle(paired)

In [None]:
# Find the longest embedded review so the others can be padded to match it.
# The length is measured on `data` (the sequences that are actually padded)
# rather than on the raw text, so out-of-vocabulary words that were skipped
# do not inflate the padding length. default=0 covers an empty dataset.
max_input_words = max((len(review) for review in data), default=0)

print(max_input_words)

# Fall back to a fixed length when there is nothing to measure
if max_input_words == 0:
    max_input_words = 500

In [None]:
# Hold out a validation split (train_test_split's default proportions), then
# pad every review to max_input_words timesteps.
traind, vald, trainl, vall = train_test_split(data, labels)
# dtype must be a float type: pad_sequences defaults to int32, which would
# truncate the real-valued embedding components to integers.
traind = pad_sequences(traind, maxlen=max_input_words, dtype='float32')
vald = pad_sequences(vald, maxlen=max_input_words, dtype='float32')

In [None]:
# Train on the padded training split. Epochs and batch size are not set
# here, so the Keras defaults apply; verbose=1 shows progress output.
model.fit(traind, trainl, verbose=1)

In [None]:
# Score the trained model on the held-out validation split. As the last
# expression in the cell, the returned value is shown as the cell output.
model.evaluate(vald, vall, verbose=1)

