### Mounting Drive and Importing Libraries

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Importing Libraries

In [1]:
import pandas as pd
import json
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

%tensorflow_version 2.x
import tensorflow as tf
!pip install -q tensorflow-hub
import tensorflow_hub as hub

from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.2.0-rc1
Eager mode:  True
Hub version:  0.7.0
GPU is available


### Loading IMDb Review Data

In [0]:
data = []
for outfile in open('/content/drive/My Drive/IMDB_reviews.json','r'):
  data.append(json.loads(outfile))

### Splitting Data

In [0]:
def SplitData(id):
  D1_text = []
  D1_spoiler = []
  D2_text = []
  D2_spoiler = []
  for d in data:
    if d['movie_id'] == id:
      D2_text.append(d['review_text'])
      D2_spoiler.append(d['is_spoiler'])
    else:
      D1_text.append(d['review_text'])
      D1_spoiler.append(d['is_spoiler'])
  X1_train, X1_test, y1_train, y1_test = train_test_split(D1_text, D1_spoiler, test_size = 0.15, random_state = 20)
  X2_train, X2_test, y2_train, y2_test = train_test_split(D2_text, D2_spoiler, test_size = 0.15, random_state = 20)
  y1_train = pd.get_dummies(np.asarray(y1_train).astype(int))
  y1_test = pd.get_dummies(np.asarray(y1_test).astype(int))
  y2_train = pd.get_dummies(np.asarray(y2_train).astype(int))
  y2_test = pd.get_dummies(np.asarray(y2_test).astype(int))
  return X1_train, X1_test, y1_train, y1_test, X2_train, X2_test, y2_train, y2_test

### Building the Model

In [0]:
def X2Model(l):
  embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
  hub_layer = hub.KerasLayer(embedding, input_shape = [], output_shape = [128], dtype=tf.string, trainable = True)
  if l < 100:
    X2_Input = Input(shape=(), dtype=tf.string)
    X2_HubLayer = hub_layer(X2_Input)
    X2_Dense1 = Dense(16, activation='relu')(X2_HubLayer)
    X2_Output = Dense(2, activation='softmax')(X2_Dense1)
  elif l < 1000:
    X2_Input = Input(shape=(), dtype=tf.string)
    X2_HubLayer = hub_layer(X2_Input)
    X2_Dense1 = Dense(36, activation='relu')(X2_HubLayer)
    X2_Dense2 = Dense(12, activation='relu')(X2_Dense1)
    X2_Output = Dense(2, activation='softmax')(X2_Dense2)
  else:
    X2_Input = Input(shape=(), dtype=tf.string)
    X2_HubLayer = hub_layer(X2_Input)
    X2_Dense1 = Dense(48, activation='relu')(X2_HubLayer)
    X2_Dense2 = Dense(20, activation='relu')(X2_Dense1)
    X2_Dense3 = Dense(8, activation='relu')(X2_Dense2)
    X2_Output = Dense(2, activation='softmax')(X2_Dense3)
  return X2_Input, X2_Output

In [0]:
def BuildModel():
  embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
  hub_layer = hub.KerasLayer(embedding, input_shape = [], output_shape = [128], dtype=tf.string, trainable = True)
  X1_Input = Input(shape=(), dtype=tf.string)
  X1_HubLayer = hub_layer(X1_Input)
  X1_Dense1 = Dense(48, activation='relu')(X1_HubLayer)
  X1_Dense2 = Dense(20, activation='relu')(X1_Dense1)
  X1_Dense3 = Dense(8, activation='relu')(X1_Dense2)
  X1_Output = Dense(2, activation='softmax')(X1_Dense3)
  X2_Input, X2_Output = X2Model(len(X2_train))
  model = Model([X1_Input,X2_Input],[X1_Output,X2_Output])
  model.compile(optimizer='adam', loss='binary_crossentropy', loss_weights=[1, 0.01*float(len(X2_train_initial)/len(X1_train))], metrics=['accuracy'])
  return model

### Training the Model

In [0]:
def TrainModel(model):
  epoch = 1
  batch_size = 1000
  sum1 = 0
  sum2 = 0
  for i in range(int(len(X1_train)/batch_size)):
    model.train_on_batch([np.asarray(X1_train[batch_size*i:batch_size*i + batch_size]),np.asarray(X2_train[batch_size*i:batch_size*i + batch_size])],
                         [y1_train[batch_size*i:batch_size*i + batch_size],y2_train[batch_size*i:batch_size*i + batch_size]])
  for i in range(int(len(X1_train)/batch_size)):
    result = model.evaluate([np.asarray(X1_train[batch_size*i:batch_size*i + batch_size]),np.asarray(X2_train[batch_size*i:batch_size*i + batch_size])],
                            [y1_train[batch_size*i:batch_size*i + batch_size],y2_train[batch_size*i:batch_size*i + batch_size]], verbose = 1)
    sum1 = sum1 + result[3]      
    sum2 = sum2 + result[4]
  testresult = model.evaluate([np.asarray(X1_test),np.asarray(X2_test)],[y1_test,y2_test], verbose = 1)
  print("Large Train Accuracy: %0.3f Large Test Accuracy: %0.3f Small Train Accuracy: %0.3f Small Test Accuracy: %0.3f" % (sum1/int(len(X1_train)/batch_size),sum2/int(len(X2_train)/batch_size),testresult[3],testresult[4]))
  return testresult[3], testresult[4]

###Running the Model

In [10]:
id_list = []
count = 0
sum1 = 0
sum2 = 0
for d in data:
  id_list.append(d['movie_id'])
id_list = np.unique(id_list)
for i in random.choices(id_list, k=10):
  count = count + 1
  X1_train, X1_test, y1_train, y1_test, X2_train, X2_test, y2_train, y2_test = SplitData(i)
  X2_train_initial = X2_train
  X2_train, y2_train = resample(X2_train, y2_train, n_samples = len(X1_train))
  X2_test, y2_test = resample(X2_test, y2_test, n_samples = len(X1_test))
  print("Movie Number : %3d Movie ID : %s Percent Complete : %0.2f" % (count, i, count/len(id_list)))
  model = BuildModel()
  accuracy1, accuracy2 = TrainModel(model)
  sum1 = sum1 + accuracy1
  sum2 = sum2 + accuracy2
print("Final Large Dataset Accuracy : %0.3f Small Dataset Accuracy : %0.3f" % (sum1/len(id_list),sum2/len(id_list)))

Movie Number :   1 Movie ID : tt0099423 Percent Complete : 0.00
Large Train Accuracy: 0.796 Large Test Accuracy: 0.751 Small Train Accuracy: 0.780 Small Test Accuracy: 0.844
Movie Number :   2 Movie ID : tt0146336 Percent Complete : 0.00
Large Train Accuracy: 0.793 Large Test Accuracy: 0.786 Small Train Accuracy: 0.779 Small Test Accuracy: 0.805
Movie Number :   3 Movie ID : tt2719848 Percent Complete : 0.00
Large Train Accuracy: 0.795 Large Test Accuracy: 0.710 Small Train Accuracy: 0.779 Small Test Accuracy: 0.716
Movie Number :   4 Movie ID : tt0258463 Percent Complete : 0.00
Large Train Accuracy: 0.795 Large Test Accuracy: 0.832 Small Train Accuracy: 0.780 Small Test Accuracy: 0.734
Movie Number :   5 Movie ID : tt0017925 Percent Complete : 0.00
Large Train Accuracy: 0.792 Large Test Accuracy: 0.724 Small Train Accuracy: 0.781 Small Test Accuracy: 0.827
Movie Number :   6 Movie ID : tt0383534 Percent Complete : 0.00
Large Train Accuracy: 0.793 Large Test Accuracy: 0.853 Small Train

ResourceExhaustedError: ignored

In [13]:
(sum2 - .5)/7

0.7755840931619916