In [1]:
# import packages to use during training

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import tensorflow as tf
from transformers import TFBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the review index CSV. The preview below shows its columns, including
# `path`, `fold` and `label`, which the later cells rely on.
df = pd.read_csv("IMDB-Movie-Reviews/supervised.csv") 
df.head()

Unnamed: 0,path,fold,label,rating,review_id,url,preview
0,data/test/neg/1821_4.txt,test,neg,4,1821,http://www.imdb.com/title/tt0138541/usercomments,Alan Rickman & Emma Thompson give good perform...
1,data/test/neg/9487_1.txt,test,neg,1,9487,http://www.imdb.com/title/tt0202521/usercomments,I have seen this movie and I did not care for ...
2,data/test/neg/4604_4.txt,test,neg,4,4604,http://www.imdb.com/title/tt0417658/usercomments,In Los Angeles the alcoholic and lazy Hank Ch...
3,data/test/neg/2828_2.txt,test,neg,2,2828,http://www.imdb.com/title/tt0066105/usercomments,"This film is bundled along with ""Gli fumavano ..."
4,data/test/neg/10890_1.txt,test,neg,1,10890,http://www.imdb.com/title/tt0787505/usercomments,I only comment on really very good films and o...


In [4]:
# Split on the `fold` column; only the first 1000 training rows are kept.
# NOTE(review): the preview rows above all share the same fold and label, so the
# CSV may be grouped by label. If so, this positional slice could contain a
# single class -- confirm with train.label.value_counts() before training.
train = df[df.fold == 'train'][:1000]
test = df[df.fold == 'test']

In [5]:
# Tokenizer for the 'bert-base-uncased' checkpoint, loaded with lower-casing enabled.
# Shared by convert_example_to_feature and the prediction cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
# function for encoding of the text reviews
def convert_example_to_feature(review):
  """Tokenize one review into fixed-length inputs for BERT.

  Args:
    review: Raw review text.

  Returns:
    The tokenizer's encoding for the review. Callers read its 'input_ids',
    'token_type_ids' and 'attention_mask' entries, each padded or truncated
    to the module-level `max_length`.
  """
  return tokenizer.encode_plus(
      review,
      add_special_tokens=True,     # add [CLS], [SEP]
      max_length=max_length,       # module-level setting, looked up at call time
      padding='max_length',        # add [PAD] tokens (replaces deprecated pad_to_max_length=True)
      truncation=True,             # explicitly cut reviews longer than max_length
      return_attention_mask=True,  # add attention mask to not focus on pad tokens
  )

In [7]:
# Token length passed to the tokenizer as `max_length` in convert_example_to_feature.
max_length = 512
# Batch size used by the (currently commented-out) dataset cell when batching.
batch_size = 32

In [8]:
# reshape one encoded example into a (features, label) pair; applied via Dataset.map
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Bundle the three encoded inputs into a dict and pair it with the label.

  Returns:
    A 2-tuple `(features, label)` where `features` maps "input_ids",
    "token_type_ids" and "attention_mask" to the corresponding arguments.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label


In [9]:
# putting all the functions together to complete the tokenization process
label_map = {'pos': 1, 'neg': 0}

def encode_examples(ds, limit=-1):
  """Read, tokenize and label every review listed in `ds`.

  Args:
    ds: DataFrame with a `path` column (review file path relative to
      "IMDB-Movie-Reviews/") and a `label` column whose values are keys of
      `label_map`.
    limit: If positive, only the first `limit` rows of `ds` are encoded;
      otherwise all rows are used.

  Returns:
    A tf.data.Dataset built from the encoded examples and mapped through
    map_example_to_dict, i.e. yielding (features dict, label) pairs.

  Raises:
    KeyError: If a label is not present in `label_map`.
  """
  if limit > 0:
    # DataFrame.take expects positional indices, not a row count; head() keeps the first rows.
    ds = ds.head(limit)

  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []

  # Iterate the frame that was passed in, not the global `df`, so the train and
  # test splits are encoded separately.
  for path, label in zip(ds['path'], ds['label']):
    # Context manager closes the file handle; the encoding is pinned so reads do
    # not depend on the platform default.
    with open("IMDB-Movie-Reviews/" + path, "r", encoding="utf-8") as review_file:
      review = review_file.read()

    bert_input = convert_example_to_feature(review)
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])

    # Convert the string label to its integer class id.
    label_list.append([label_map[label]])

  encoded = tf.data.Dataset.from_tensor_slices(
      (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
  )
  return encoded.map(map_example_to_dict)

In [None]:
# train dataset
# ds_train_encoded = encode_examples(train).shuffle(10000).batch(batch_size)
# test dataset
# ds_test_encoded = encode_examples(test).batch(batch_size)

In [10]:
# recommended learning rates for Adam: 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# train for just 1 epoch; more epochs may help as long as the model does not overfit
number_of_epochs = 1
# model initialization (the load message below reports the classifier weights as newly initialized)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# labels are integer class ids rather than one-hot vectors, so use sparse categorical
# cross-entropy and accuracy; from_logits=True because no softmax is applied before the loss here
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# model.fit(
#       ds_train_encoded,
#       batch_size=32,
#       epochs=number_of_epochs)

# model.save("model.keras")

In [11]:

# NOTE(review): this rebinds `model` to a fresh load of the base checkpoint. With
# load_weights commented out, the load message below reports the classifier
# weights as newly initialized, so predictions from this model do not reflect
# any fine-tuning. Re-enable the weight loading once a trained model is saved.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
# model.load_weights("model.keras")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
test_sentence = "this movie was so good, love the acting and soundtrack"

# Encode the single sentence as a TensorFlow tensor of token ids.
predict_input = tokenizer.encode(
    test_sentence,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# The first element of the prediction output is treated as per-class logits
# and turned into probabilities with a softmax over the class axis.
tf_output = model.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)

# Class id -> display name (0: negative, 1: positive).
labels = ['Negative', 'Positive']
label = tf.argmax(tf_prediction, axis=1).numpy()
print(labels[label[0]])

Positive
