In [None]:
import numpy as np
import pandas
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dropout,LSTM
import matplotlib.pyplot as plt
import spacy 
nlp = spacy.load("en_core_web_sm")
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preparing data

In [None]:
def preprocess(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Tokens flagged as stop words, punctuation or non-ASCII are discarded; the
    lemmas of the remaining tokens are joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip anything that is a stop word, contains non-ASCII characters,
        # or is punctuation.
        if tok.is_stop or not tok.is_ascii or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

In [None]:
def preprocess_data(df):
    """Turn the raw frame into a cleaned-text frame and a label frame.

    For every row, the first data column is passed through ``preprocess`` and
    the second data column is taken unchanged as the label.

    Returns a pair of single-column DataFrames, ``seq`` and ``label``, both in
    the original row order.
    """
    # itertuples() places the index at position 0, so data columns start at 1.
    pairs = [(preprocess(record[1]), record[2]) for record in df.itertuples()]
    cleaned = [text for text, _ in pairs]
    targets = [target for _, target in pairs]
    return pandas.DataFrame({"seq": cleaned}), pandas.DataFrame({"label": targets})

In [None]:
# Load the raw data; the path is relative to the current working directory.
df = pandas.read_csv("tripadvisor_hotel_reviews.csv",encoding="utf8")
 
# Lemmatize every row and split the frame into a "seq" frame and a "label" frame.
sequences, labels = preprocess_data(df)


In [None]:
# Class balance of the labels. The Axes handle is kept so the figure can be
# titled and labelled; the trailing semicolon keeps the cell from echoing a
# bare object repr under the plot.
ax = labels["label"].value_counts().plot(kind='bar')
ax.set(title='Label distribution', xlabel='label', ylabel='number of samples');

In [None]:
# Vocabulary / sequence settings shared by the cells below.
MAX_NB_WORDS = 5000         # vocabulary cap handed to the Tokenizer (num_words)
MAX_SEQUENCE_LENGTH = 100   # length every sequence is padded/truncated to
EMBEDDING_DIM = 200         # output width of the Embedding layer

# The backslash in the filter set is escaped explicitly: a bare "\]" is an
# invalid escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning
# from 3.12). The resulting filter string is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(sequences["seq"])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Encode each cleaned text as a list of word indices, then pad/truncate to a
# fixed length so the result is one rectangular array.
X = pad_sequences(
    tokenizer.texts_to_sequences(sequences["seq"]),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

In [None]:
# One-hot encode the labels: one indicator column per distinct label value.
Y = pandas.get_dummies(labels["label"]).values
print('Shape of label tensor:', Y.shape)


In [None]:
# Hold out 20% of the samples for the final test. The split is seeded so the
# reported metrics are reproducible across kernel restarts (same seed value as
# the split inside data()).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
# Embedding -> LSTM classifier with a softmax output, one unit per label column.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    Dropout(0.2),
    tf.keras.layers.LSTM(128),
    Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(Y.shape[1], activation="softmax"),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Keras sets aside a further 20% of the training portion for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=4,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# Render the layer graph of `model` to model.png, annotated with layer names
# and tensor shapes (dtypes are left out).
tf.keras.utils.plot_model(
    model,
    to_file="model.png",
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=True,
    dpi=96,
)

In [None]:
# Class probabilities for the held-out set -> predicted class index per sample.
y_pred = model.predict(X_test, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

# Undo the one-hot encoding of the true labels in one vectorized call instead
# of a per-row Python loop.
decoded_Y_test = np.argmax(Y_test, axis=1)

# Column i of the one-hot matrix corresponds to the i-th get_dummies column,
# so those column names are used to show the real label values in the report
# rather than bare 0..n-1 indices. Passing `labels` explicitly keeps the name
# list aligned even if a class happens to be missing from the test split.
class_ids = np.arange(Y_test.shape[1])
class_names = [str(name) for name in pandas.get_dummies(labels["label"]).columns]

print(classification_report(decoded_Y_test, y_pred_bool, labels=class_ids, target_names=class_names))

In [None]:
# Loss and accuracy on the held-out test set. The result is bound to
# `test_metrics` rather than `eval`, which shadowed the Python builtin for the
# rest of the kernel session.
test_metrics = model.evaluate(X_test,Y_test)
print(test_metrics)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n'.format(test_metrics[0],test_metrics[1]))

In [None]:
# Training vs. validation loss per epoch. The second curve comes from the
# validation_split used during fit(), not from the held-out test set, so it is
# labelled accordingly.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch. The second curve comes from the
# validation_split used during fit(), not from the held-out test set, so it is
# labelled accordingly.
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

# Creating baseline

In [None]:
import numpy as np
from sklearn.dummy import DummyClassifier


In [None]:

# Chance-level baseline: the "uniform" strategy draws predictions at random
# over the observed classes.
# - random_state pins the draw so the reported score is reproducible.
# - the label column is passed as a 1-D Series rather than a one-column frame.
# - the earlier bare predict() call was dropped; its result was never used.
dummy_clf = DummyClassifier(strategy="uniform", random_state=42)
dummy_clf.fit(sequences, labels["label"])

# Accuracy of the random guesses (shown as the cell output).
dummy_clf.score(sequences, labels["label"])

# Naive Bayes baseline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Seeded 80/20 split of the cleaned text for the bag-of-words baseline, so the
# baseline score is reproducible across runs.
X_train_baseline, X_test_baseline, Y_train_baseline, Y_test_baseline = train_test_split(sequences["seq"], labels["label"], test_size=0.20, random_state=42)

In [None]:
# Inspect the baseline training labels (rendered as the cell output).
Y_train_baseline

In [None]:
# Bag-of-words counts fed into a multinomial Naive Bayes classifier.
pipe = Pipeline([('vectorizer', CountVectorizer()), ('naive', MultinomialNB())])
# fit() hands back the pipeline itself, so its result is no longer bound to a
# throwaway `tmp` name.
pipe.fit(X_train_baseline, Y_train_baseline)
pred = pipe.predict(X_test_baseline)
score = pipe.score(X_test_baseline, Y_test_baseline)
print(score)
print(classification_report(Y_test_baseline, pred))

# Evaluation of hyperparameters

In [None]:
!pip install hyperas

In [None]:

# Sklearn tools
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Keras preprocessing, models, evaluators
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Activation, Dropout, GlobalMaxPool1D, Conv1D
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from keras.preprocessing import text, sequence
from keras import utils
from keras.models import load_model


import hyperas
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe

In [None]:

def data():
  """Load, clean, encode and split the review data for the hyperas search.

  Self-contained counterpart of the earlier notebook cells. It reads the CSV,
  lemmatizes the first data column with spaCy (dropping stop words,
  punctuation and non-ASCII tokens), converts the text into padded index
  sequences and one-hot encodes the second data column.

  Result is the tuple (X_train, Y_train, X_test, Y_test) from a 90/10 split
  seeded with random_state=42.
  """

  import spacy
  nlp = spacy.load("en_core_web_sm")
  # NOTE(review): this Drive path differs from the relative path used where the
  # CSV is first loaded in this notebook - confirm which location is current.
  df = pandas.read_csv("drive/MyDrive/Skola/TDDE16/tripadvisor_hotel_reviews.csv",encoding="utf8")
  # NOTE(review): the preprocess_data() logic is repeated inline below instead
  # of being called - presumably because hyperas re-executes this body outside
  # the notebook namespace; TODO confirm before de-duplicating.

  tokenized_text = []
  labels = []
  for row in df.itertuples():
    tokenized_text.append(" ".join([token.lemma_ for token in nlp(row[1]) if not token.is_stop and token.is_ascii and not token.is_punct]))
    labels.append(row[2])
  sequences = pandas.DataFrame({"seq": tokenized_text})
  labels = pandas.DataFrame({"label": labels})


  # Vocabulary cap passed to the Tokenizer as num_words.
  MAX_NB_WORDS = 5000
  # Length every encoded review is padded/truncated to.
  MAX_SEQUENCE_LENGTH = 100
  # Embedding width; defined here but not referenced again inside this function.
  EMBEDDING_DIM = 200
  tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
  tokenizer.fit_on_texts(sequences["seq"])
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  X = tokenizer.texts_to_sequences(sequences["seq"])
  X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
  print('Shape of data tensor:', X.shape)
  # One indicator column per distinct label value.
  Y = pandas.get_dummies(labels["label"]).values
  print('Shape of label tensor:', Y.shape)


  # Seeded 90/10 split, so repeated calls give the same partition.
  X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
  print(X_train.shape,Y_train.shape)
  print(X_test.shape,Y_test.shape)
  return X_train, Y_train, X_test, Y_test


In [None]:
def create_model(X_train, Y_train, X_test, Y_test):
    """Build, train and score one LSTM candidate for the hyperas search.

    The two Dropout rates are hyperas template slots (double-brace syntax), so
    this function is meant to be handed to ``optim.minimize`` rather than
    called directly. The input length and the number of output units are read
    from the ``X_train`` / ``Y_train`` arguments instead of the notebook-level
    ``X`` / ``Y`` arrays, so the network always matches the data it is fitted
    on. ``MAX_NB_WORDS`` is still looked up from the enclosing namespace.

    X_test and Y_test are accepted to mirror what ``data()`` yields but are
    not used here.

    Gives back a dict with the negated best validation accuracy as ``loss``
    (so that minimizing it maximizes accuracy), ``STATUS_OK`` and the model.
    """
    model = tf.keras.Sequential()
    # Sequence length comes from the training array that was actually passed in.
    model.add(tf.keras.layers.Embedding(MAX_NB_WORDS, output_dim = 200 , input_length=X_train.shape[1]))
    model.add(Dropout({{choice([0,0.2, 0.4, 0.5, 0.6])}}))
    model.add(tf.keras.layers.LSTM(units = 100))
    model.add(Dropout({{choice([0,0.2, 0.4, 0.5, 0.6])}}))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    # One softmax unit per one-hot label column.
    model.add(tf.keras.layers.Dense(Y_train.shape[1], activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 4
    batch_size = 128
    # 10% of the training data is set aside by Keras for validation.
    result = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)
    # Best validation accuracy seen over the epochs of this trial.
    validation_acc = np.amax(result.history['val_accuracy'])
    print('Best validation acc of epoch:', validation_acc)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [None]:

# Run the hyperas/TPE search over the Dropout choices declared in create_model.
try:
  best_run, best_model, space = optim.minimize(model=create_model,
                                           data=data,
                                           algo=tpe.suggest,
                                           max_evals=20,
                                           trials=Trials(),
                                           notebook_name='drive/MyDrive/Colab Notebooks/TDDE16_project',
                                           eval_space=True,
                                           return_space=True)
except Exception as e:
  # Report the failure, then re-raise: best_run / best_model are only bound on
  # success, so swallowing the error would merely defer it to a NameError in
  # the cells that use them and would hide the original traceback.
  print(e)
  raise

In [None]:
# Rebuild the split produced by data(). This rebinds the notebook-level
# X_train, Y_train, X_test and Y_test to the arrays created inside data().
X_train, Y_train, X_test, Y_test = data()    

In [None]:
# Score the winning model on the held-out split and show the chosen
# hyper-parameters. (Spelling of the first message corrected.)
print("Evaluation of best performing model:")
print(best_model.evaluate(X_test, Y_test))
print("Best performing model chosen hyper-parameters:")
print(best_run)

In [None]:
# NOTE(review): `model` is the network built in the first training cell, not
# `best_model` from the search - confirm which summary is intended here.
model.summary()

In [None]:
# `features_test` / `labels_test` are not defined anywhere in this notebook;
# the held-out arrays produced by data() are X_test / Y_test. The tuned
# `best_model` is evaluated because the search was run on the data() split.
# NOTE(review): the original cell evaluated `model`; confirm `best_model` is
# the intended target.
test_loss, test_acc = best_model.evaluate(X_test, Y_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))