In [3]:
# Mount Google Drive so the project files are reachable from this Colab session.
# force_remount=True re-mounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Project and dataset locations on the mounted drive.
# NOTE(review): data() re-declares both paths locally instead of reading these.
root_folder = "/content/drive/My Drive/CS182-Spring2020-NLP-Project/"
dataset_folder = "/content/drive/My Drive/CS182-Spring2020-NLP-Project/dataset/"
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging

# Emit INFO-level log records (the INFO lines in later cell output come from this).
logging.basicConfig(level=logging.INFO)

Mounted at /content/drive


In [4]:
# Install bert-for-tf2; presumably this provides the `bert` package imported in
# the next cell. NOTE(review): version is not pinned.
!pip install bert-for-tf2



In [5]:
# NOTE(review): tensorflow_hub is already imported in the first cell; this
# repeat is harmless but redundant.
import tensorflow_hub as hub 
# Tokenizer implementation used by data() (FullTokenizer).
from bert import bert_tokenization
from tensorflow import keras
# keras-tuner must be installed before `kerastuner` can be imported below.
# NOTE(review): version is not pinned.
!pip install -q -U keras-tuner
import kerastuner as kt

In [6]:
# Enable the IPython autoreload extension; mode 2 reloads all imported modules
# before each cell execution.
%load_ext autoreload
%autoreload 2

In [7]:
# Download tokenization.py from the tensorflow/models repository (master branch).
# NOTE(review): nothing in the visible cells imports this file; the tokenizer
# used by data() comes from `bert.bert_tokenization` instead.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [8]:
# Install hyperas. NOTE(review): hyperas is not imported anywhere in the visible
# cells (tuning below uses kerastuner); version is not pinned.
!pip install hyperas



In [23]:
def data():
  """Load a class-balanced Yelp review subset and encode it for BERT.

  Reads the training JSONL file from the mounted Google Drive, keeps at most
  5000 reviews per star rating (1-5), splits them 70/30 into train and test
  sets, one-hot encodes the star labels, and converts the review text into
  BERT input arrays using the vocabulary shipped with the TF-Hub BERT layer.

  Takes no arguments; every setting is a local constant below.

  Returns:
    (X_train, y_train, X_test, y_test, bert_layer) where each X_* is a tuple
    (token_ids, input_mask, segment_ids) of arrays with 128 columns, each y_*
    is a one-hot array with 5 columns, and bert_layer is the trainable
    hub.KerasLayer whose vocabulary file was used for tokenization.
  """
  def bert_preprocessing(reviews, tokenizer, length):
    """Encode reviews as fixed-width BERT inputs.

    Args:
      reviews: iterable of review strings.
      tokenizer: object exposing tokenize() and convert_tokens_to_ids().
      length: maximum number of tokens kept per review; every output row has
        length + 2 entries because [CLS] and [SEP] are added around the text.

    Returns:
      Tuple (token_ids, input_mask, segment_ids) of numpy arrays.
    """
    width = length + 2
    tkns, msks, sids = [], [], []
    for review in reviews:
      input_sequence = ["[CLS]"] + tokenizer.tokenize(review)[:length] + ["[SEP]"]
      pad = width - len(input_sequence)
      tkns.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad)
      # Mask is 1 over real tokens (including [CLS]/[SEP]) and 0 over padding.
      msks.append([1] * len(input_sequence) + [0] * pad)
      # Every review is a single segment, so segment ids are all zero.
      sids.append([0] * width)
    return np.array(tkns), np.array(msks), np.array(sids)

  length = 128              # total sequence width, including [CLS] and [SEP]
  samples_per_class = 5000  # upper bound on reviews kept per star rating
  split_seed = 42           # fixed so the train/test split is reproducible
  dataset_folder = "/content/drive/My Drive/CS182-Spring2020-NLP-Project/dataset/"
  d = pd.read_json(path_or_buf=dataset_folder+"yelp_review_training_dataset.jsonl", lines=True)

  per_class = []
  for i in [1, 2, 3, 4, 5]:
    # .copy() so adding a column never writes into a slice of `d`.
    temp = d[d['stars'] == i].head(samples_per_class).copy()
    # Binary sentiment flag: 0 for 1-2 stars, 1 otherwise. A scalar broadcasts
    # to however many rows exist, so a rating with fewer than
    # samples_per_class reviews no longer raises a length-mismatch error.
    # NOTE(review): 'sent' is not read anywhere else in this function.
    temp['sent'] = 0 if i in (1, 2) else 1
    per_class.append(temp)
  # Concatenate once (highest rating first, the row order the previous
  # prepend-in-a-loop produced) and actually keep the re-numbered index.
  small_df = pd.concat(per_class[::-1]).reset_index(drop=True)

  X_train, X_test, y_train, y_test = train_test_split(
      small_df['text'], small_df['stars'], test_size=0.3, random_state=split_seed)
  # Stars 1..5 become class indices 0..4 before one-hot encoding.
  y_train = tf.keras.utils.to_categorical(y_train.values - 1, num_classes=5)
  y_test = tf.keras.utils.to_categorical(y_test.values - 1, num_classes=5)

  BertTokenizer = bert_tokenization.FullTokenizer
  module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
  bert_layer = hub.KerasLayer(module_url, trainable=True)
  # Build the tokenizer from the vocabulary and casing flag stored in the module.
  tokenizer = BertTokenizer(bert_layer.resolved_object.vocab_file.asset_path.numpy(), bert_layer.resolved_object.do_lower_case.numpy())

  # Reserve two positions for [CLS] and [SEP] so rows are exactly `length` wide.
  X_train = bert_preprocessing(X_train.values, tokenizer, length - 2)
  X_test = bert_preprocessing(X_test.values, tokenizer, length - 2)
  return X_train, y_train, X_test, y_test, bert_layer


def create_models(hp):
    """Build and compile the BERT classifier for one Keras Tuner trial.

    The model feeds three int32 inputs (token ids, input mask, segment ids)
    through the notebook-level `bert_layer`, takes the first-position vector
    of the sequence output, and applies two Dense+Dropout stages followed by
    a 5-way softmax.

    Args:
      hp: Keras Tuner hyperparameter container. Registers 'units', 'units_2',
        'dropout_rate_1', 'dropout_rate_2' and 'learning_rate'.

    Returns:
      A compiled tf.keras Model (Adam, categorical cross-entropy, accuracy).

    Notes:
      Reads the global `bert_layer`, which the notebook binds from the return
      value of data(); that cell must run first.
      NOTE(review): the same trainable `bert_layer` object is reused by every
      trial, so fine-tuned weights presumably carry over between trials -
      confirm whether that is intended.
    """
    length = 128  # must equal the row width of the arrays produced by data()
    tkns = tf.keras.Input(shape=(length,), dtype=tf.int32)
    msks = tf.keras.Input(shape=(length,), dtype=tf.int32)
    sids = tf.keras.Input(shape=(length,), dtype=tf.int32)

    # min_value == max_value for both unit counts and each dropout has a single
    # choice, so only the learning rate actually varies across trials.
    hp_units_1 = hp.Int('units', min_value=64, max_value=64, step=16)
    do_units_1 = hp.Choice('dropout_rate_1', values=[0.2])
    hp_units_2 = hp.Int('units_2', min_value=32, max_value=32, step=16)
    do_units_2 = hp.Choice('dropout_rate_2', values=[0.2])
    hp_lr = hp.Choice('learning_rate', values=[1e-4, 1e-5, 1e-6])

    # The layer returns two tensors; the second (per-token sequence output) is
    # used and its position-0 slice serves as the classification vector.
    _, sequence_output = bert_layer([tkns, msks, sids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(units=hp_units_1, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(rate=do_units_1)(net)
    net = tf.keras.layers.Dense(units=hp_units_2, activation='relu')(net)
    net = tf.keras.layers.Dropout(rate=do_units_2)(net)
    out = tf.keras.layers.Dense(5, activation='softmax')(net)
    model = tf.keras.models.Model(inputs=[tkns, msks, sids], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(tf.keras.optimizers.Adam(learning_rate=hp_lr), loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    return model

In [12]:
# Build the train/test arrays and the shared BERT layer. `bert_layer` becomes a
# notebook-level global here; create_models() reads it by name.
X_train, y_train, X_test, y_test, bert_layer = data()

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [24]:
# Delete the `untitled_project` directory before building a new tuner.
# Presumably this is where Keras Tuner stores trial state when no project name
# is given (none is passed below) - TODO confirm.
!rm -r untitled_project

In [25]:
# Hyperband tuner over create_models, selecting trials by validation accuracy.
# max_epochs=3 and factor=3 are passed as Hyperband settings; no directory or
# project name is given, so the library defaults apply.
# Constructing the tuner prints a model summary (create_models calls
# model.summary()).
tuner = kt.Hyperband(create_models,
                     objective='val_accuracy',
                     max_epochs=3,
                     factor=3)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_1[0][0]                    
                                                                 input_2[0][0]                

In [None]:
# Stop a trial when validation accuracy has not improved for 5 epochs.
# NOTE(review): patience=5 is larger than the tuner's max_epochs=3, so this
# callback looks like it can never trigger - confirm the intended value.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# Run the search on the training arrays, holding out 20% for validation.
# NOTE(review): Hyperband is believed to choose the epoch count per trial
# itself, which would make epochs=1 ineffective - confirm against the
# Keras Tuner documentation.
tuner.search(X_train, y_train, epochs=1, validation_split=0.2, callbacks=[earlystopping], verbose=1, batch_size=16)

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the best trial; the keys match the names registered in create_models.
print(f"""
The hyperparameter search is complete. The optimal number of units is {best_hps.get('units')}, {best_hps.get('units_2')}, {best_hps.get('dropout_rate_1')}, {best_hps.get('dropout_rate_2')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

In [None]:
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): `res` is not defined in any visible cell, so this fails on a
# fresh kernel. The key 'val_' also looks truncated (possibly 'val_accuracy'
# or 'val_loss') - confirm and define `res` from a fit() call, or remove.
res.history['val_']

In [None]:
# Reload a saved model; custom_objects maps the 'KerasLayer' name to
# hub.KerasLayer for deserialization.
# NOTE(review): no visible cell writes 'model.h5', so this depends on a file
# produced outside the shown notebook.
new_model = tf.keras.models.load_model('model.h5',custom_objects={'KerasLayer':hub.KerasLayer})
new_model.summary()

In [None]:
# Tabulate predicted class indices (argmax over model outputs) next to `d`.
# NOTE(review): `c` and `d` are not defined at notebook scope in the visible
# cells (`d` only exists as a local inside data()); presumably they are the
# test inputs and test labels - confirm before relying on this cell.
predictions = pd.DataFrame({"pred": np.argmax(new_model.predict(c), axis=1), "actual": d})

In [None]:
# Predicted class index per row of `c` (argmax over the model's outputs).
# NOTE(review): `c` is not defined in the visible cells - presumably the test
# inputs; confirm.
pred = np.argmax(new_model.predict(c), axis=1)

In [None]:
# Fraction of rows where the predicted class equals the argmax of `d`.
# NOTE(review): `d` is not defined at notebook scope in the visible cells;
# the argmax over axis 1 suggests it is expected to be one-hot labels - confirm.
sum(pred == np.argmax(d, axis=1))/len(pred)