<a href="https://colab.research.google.com/github/boyuan5022/BERT-trained-with-Stanford-data/blob/main/BERT_trained_with_Stanford_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train Model

In [None]:
!pip install transformers
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; the final save_weights call in this
# cell writes the trained weights to a path under that mount point.
drive.mount('/content/drive')

def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of the train and test frames in an ``InputExample``.

    Args:
        train: Frame whose rows become the training examples.
        test: Frame whose rows become the validation examples.
        DATA_COLUMN: Name of the column holding the text.
        LABEL_COLUMN: Name of the column holding the label.

    Returns:
        A ``(train_InputExamples, validation_InputExamples)`` pair, each the
        result of a row-wise ``apply`` over the corresponding frame.
    """

    def row_to_example(row):
        # No GUID and no second text segment are supplied; only the text and
        # its label are carried over from the row.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    train_InputExamples = train.apply(row_to_example, axis=1)
    validation_InputExamples = test.apply(row_to_example, axis=1)
    return train_InputExamples, validation_InputExamples

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every encoded sequence is truncated or padded to.

    Returns:
        An unbatched dataset yielding ``(inputs, label)`` pairs, where
        ``inputs`` is a dict with the keys ``input_ids``, ``attention_mask``
        and ``token_type_ids`` (int32) and ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures collected up front, replayed by gen() below

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if the sequence is longer than max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True`; both pad every sequence to max_length.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replay the pre-tokenized features in the (inputs, label) layout
        # declared to from_generator below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

# Load the classifier and its tokenizer from the same pretrained checkpoint
# name, then print the layer summary. The logged output below reports that the
# 'classifier' layer is newly initialized, so it still has to be trained.
PRETRAINED_NAME = "bert-base-uncased"
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_NAME)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model.summary()

URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the review archive into the current directory and extract it there.
dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz",
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

# os builds the dataset paths; shutil provides the recursive directory removal.
import os
import shutil
# Main directory path (".../aclImdb"), next to the downloaded archive
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Sub directory path (".../aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove the unsup folder since this is a supervised learning task.
# The existence check keeps the cell re-runnable: once 'unsup' has been
# deleted, an unguarded rmtree would raise FileNotFoundError.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

# Create a training dataset and a validation dataset from the train directory
# with an 80/20 split. The same seed is passed to both calls so the two
# subsets come from one consistent split.
# `train_dir` (computed above) is used instead of a hard-coded relative path so
# the loader reads exactly the directory that the cleanup step operated on.
# batch_size=30000 exceeds the 25,000 files found, so each subset arrives as a
# single batch that can be pulled out with take(1).
train = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, batch_size=30000, validation_split=0.2,
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, batch_size=30000, validation_split=0.2,
    subset='validation', seed=123)

# Pull the single training batch out as NumPy arrays (texts, labels).
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

# Two rows (texts, labels) transposed into two columns; the texts are bytes
# and are decoded to str.
train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")

# Same extraction for the validation subset.
for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")

DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

# Turn both frames into InputExamples, then into tokenized tf.data pipelines.
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# repeat(2) comes after batching, so the single fit() epoch below walks the
# training batches twice.
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples), tokenizer).batch(32)

# The loss is configured with from_logits=True, i.e. it consumes the model
# output without a prior softmax.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

# Weights-only checkpoint written by the callback during fit().
checkpoint_path = "training_1/cp.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)
model.fit(
    train_data,
    epochs=1,
    validation_data=validation_data,
    callbacks=[cp_callback],
)

# NOTE(review): this writes under ".../Checkpoint/", while the later cells load
# from ".../Checkpoints/cp.ckpt" - confirm which directory is intended.
model.save_weights('/content/drive/MyDrive/Colab Notebooks/Checkpoint/my_checkpoint')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Model: "tf_bert_for_sequence_classification_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_189 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
['urls_unsup.txt', 'neg', 'urls_pos.txt', 'unsupBow.feat', 'pos', 'urls_neg.txt', 'labeledBow.feat']
Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.




Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Epoch 00001: saving model to training_1/cp.ckpt


# Apply Model to Excel

In [10]:
!pip install transformers
!pip install openpyxl
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
import os
import numpy as np
from string import ascii_lowercase
from numpy import concatenate

def excel_sentiment(input_file,output_file,model_file,column_letter):
    """Classify the text in one Excel column and save a copy with the results.

    Reads ``input_file``, runs the fine-tuned BERT classifier over every cell
    of the chosen column, and writes ``output_file``: the input workbook with
    two extra columns, "Sentiment" and "Confidence", appended after the last
    used column of the active sheet. Rows whose text cell is empty get empty
    result cells.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path the annotated workbook is saved to.
        model_file: Checkpoint path; only its directory is used, to locate the
            latest checkpoint stored there.
        column_letter: Excel column letter of the text column. Accepted in
            either case and with more than one letter (e.g. "g", "G", "AA").

    Raises:
        ValueError: If ``column_letter`` is not a valid Excel column name.
        FileNotFoundError: If no checkpoint is found next to ``model_file``.
    """
    # Local import: the cell-level imports only bring in get_column_letter.
    from openpyxl.utils import column_index_from_string

    labels = ['Negative', 'Positive']
    # GPU restrictions mean only 200 strings can be plugged into the model at a time.
    chunk_size = 200

    # Build the list of input strings for the model.
    data_table = pd.read_excel(input_file)
    column_position = column_index_from_string(column_letter) - 1  # 0-based
    raw_cells = data_table.iloc[:, column_position].to_list()
    # pd.isna catches every missing-value representation; an identity check
    # against np.nan misses NaN floats coming from numeric columns.
    # Quotes are stripped and multi-line cells are flattened onto one line.
    pred_sentences = [
        "" if pd.isna(x) else " ".join(str(x).replace("\'", "").replace("\"", "").splitlines())
        for x in raw_cells
    ]

    # Create the model and load the previously saved weights.
    checkpoint_dir = os.path.dirname(model_file)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest is None:
        # Fail with a clear message instead of passing None to load_weights.
        raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir!r}")
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    model.load_weights(latest).expect_partial()
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Run inference chunk by chunk, collecting plain Python ints and floats.
    label = []
    confidence = []
    for start in range(0, len(pred_sentences), chunk_size):
        pred_sentences_chunk = pred_sentences[start:start + chunk_size]
        tf_batch = tokenizer(pred_sentences_chunk, padding=True, truncation=True, return_tensors='tf')
        tf_outputs = model(tf_batch)
        probabilities = tf.nn.softmax(tf_outputs[0], axis=-1).numpy()
        label.extend(int(k) for k in probabilities.argmax(axis=1))
        confidence.extend(float(p) for p in probabilities.max(axis=1))

    # Write the results into the two columns after the last used column.
    # NOTE(review): pandas reads the first sheet by default while the results
    # go to the workbook's active sheet - presumably the same sheet; confirm
    # for multi-sheet workbooks.
    wb = load_workbook(filename=input_file)
    ws = wb.active
    sentiment_column = ws.max_column + 1
    confidence_column = sentiment_column + 1
    ws.cell(row=1, column=sentiment_column, value="Sentiment")
    ws.cell(row=1, column=confidence_column, value="Confidence")
    # Row 1 holds the headers, so the first data row is sheet row 2.
    for row, (sentence, label_index, score) in enumerate(
            zip(pred_sentences, label, confidence), start=2):
        is_blank = sentence == ""
        ws.cell(row=row, column=sentiment_column,
                value="" if is_blank else labels[label_index])
        ws.cell(row=row, column=confidence_column,
                value="" if is_blank else score)
    wb.save(filename=output_file)



# Test

In [6]:
!pip install transformers
!pip install openpyxl
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
import os
import numpy as np
from string import ascii_lowercase
from numpy import concatenate

def test_excel_sentiment(model_file,sentences):
    """Classify sentences with the fine-tuned model and describe each result.

    Args:
        model_file: Checkpoint path; only its directory is used, to locate the
            latest checkpoint stored there.
        sentences: List of strings to classify in a single batch.

    Returns:
        A NumPy array of strings, one per sentence, of the form
        ``"<label> with <percent>% confidence"``.

    Raises:
        FileNotFoundError: If no checkpoint is found next to ``model_file``.
    """
    labels = ['Negative', 'Positive']

    # Create the model and load the previously saved weights.
    checkpoint_dir = os.path.dirname(model_file)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest is None:
        # Fail with a clear message instead of passing None to load_weights.
        raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir!r}")
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    model.load_weights(latest).expect_partial()
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    tf_batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    probabilities = tf.nn.softmax(tf_outputs[0], axis=-1).numpy()

    # The predicted class is the arg-max probability; that probability is
    # reported as the confidence, rounded to a whole percent.
    output = [
        labels[int(k)] + " with " + str(round(float(p) * 100)) + "% confidence"
        for k, p in zip(probabilities.argmax(axis=1), probabilities.max(axis=1))
    ]
    # Callers receive an array of strings, as before.
    return np.array(output)

# Three sample reviews used as a quick check of the classifier.
test1 = "The steering wheel is unpredictable."
test2 = "The audio quality of my new laptop is so cool but the display colors are not too good."
test3 = "This is an awesome product ! I've tried several BT options for the switch and this one seems to have the best quality with no lag and extremely easy to use ! Even has the mini mic so you can use in game chat however only in handheld unless the switch is docked right next to you. I had some questions about the mic and dealing with customer service was great ! They were extremely friendly, quick to respond and great at answering my questions."

sample_reviews = [test1, test2, test3]
sample_predictions = test_excel_sentiment(
    "/content/drive/MyDrive/Colab Notebooks/Checkpoints/cp.ckpt",
    sample_reviews,
)
print(sample_predictions)



All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['Negative with 85% confidence' 'Negative with 52% confidence'
 'Positive with 99% confidence']


# Full Demo

In [11]:
from google.colab import drive
# The workbook and the checkpoint both live on Google Drive, so mount it first.
drive.mount('/content/drive')

# Annotate column "g" of Reviews.xlsx and save the result as output.xlsx.
excel_sentiment(
    input_file="/content/drive/MyDrive/Colab Notebooks/Reviews.xlsx",
    output_file="/content/drive/MyDrive/Colab Notebooks/output.xlsx",
    model_file="/content/drive/MyDrive/Colab Notebooks/Checkpoints/cp.ckpt",
    column_letter="g",
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
