<a href="https://colab.research.google.com/github/buczekEngineering/Chatbot/blob/main/SentimentAnalysisBertTokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing dependencies

In [2]:
import numpy as np
import math
import re
import pandas as pd
import random
from google.colab import drive

In [4]:
!pip install bert-for-tf2
!pip install sentencepiece



In [5]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

# 1. Get the data


In [6]:
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
import chardet

data_path = "/content/drive/MyDrive/SentimentData/train.csv"

def check_encoding(data_path): 
  with open(data_path, "rb")as file: 
    encoding = chardet.detect(file.read(100000000000))
    print(encoding)
  return encoding["encoding"]

In [8]:
encoding="latin1"


In [10]:
#encoding = check_encoding(data_path)
data = pd.read_csv(data_path, encoding = encoding, header=None, index_col=False)

In [11]:
data = data.drop(columns= [1,2,3,4], axis=1)

new_columns_names = ["sentiment", "text"]
data.columns = new_columns_names
data.columns

Index(['sentiment', 'text'], dtype='object')

In [12]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


# 2.Cleaning the data


In [13]:
def clean_data(tweet): 
  tweet = re.sub(r"@[A-Za-z0-9]+", " ", tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", " ",tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']", " ", tweet)
  tweet = re.sub(r" +", " ", tweet)
  return tweet.lower()

In [14]:
data.text = [clean_data(tweet) for tweet in data.text]
data.head()

Unnamed: 0,sentiment,text
0,0,awww that's a bummer. you shoulda got david c...
1,0,is upset that he can't update his facebook by ...
2,0,i dived many times for the ball. managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,no it's not behaving at all. i'm mad. why am ...


In [15]:
data.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [16]:
data.sentiment.loc[data.sentiment == 4] =1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


# 3.Tokenization, Word2Int, Shuffling, Padding, Converting to Tensors


In [17]:
# Wraps a SavedModel as a Keras Layer, to have access to meta data for the tokenizer (like vocab size).
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# extract bert vocab size
vocab_size = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()


def create_tokenizer(vocab_size, do_lower_case): 
  FullTokenizer = bert.bert_tokenization.FullTokenizer
  tokenizer = FullTokenizer(vocab_size, do_lower_case)
  return tokenizer

tokenizer = create_tokenizer(vocab_size, do_lower_case)

In [18]:
# tokenize the sequence and convert words to id
def tokenize_tweets(tweet): 
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet))

data_input = [tokenize_tweets(tweet) for tweet in data.text]

In [19]:
import pickle

with open("data_input.pickle", "wb") as file: 
  pickle.dump(data_input, file)

In [None]:
labels_input = data.sentiment.values

In [79]:
with open("data_input.pickle", "rb") as f: 
  loaded_data = pickle.load(f)
print(loaded_data[6])
print(labels_input[6])

[2342, 1037, 8549]
0


In [80]:
# put encoded/tokenized data and lables into a list

encoded_data = [[tweet, labels_input[i], len(tweet)] for i, tweet in enumerate(loaded_data)]

random.shuffle(encoded_data)

#[[[2054, 1045, 2081, 2005, 4596], 1],....]

In [84]:
# we will sort the data so that the sequences with similar size will be padded together
encoded_data.sort(key= lambda x: x[2])
sorted_all = [(ele[0], ele[1]) for ele in encoded_data if ele[2]>7]


In [85]:
print(sorted_all[1])

([1996, 3712, 3504, 2061, 12459, 2157, 2085, 1012], 0)


In [86]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [87]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 1045,  2064,  2102,  2424,  2026,  4950, 11057,  1040],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [88]:
print(all_dataset)

<FlatMapDataset shapes: (<unknown>, <unknown>), types: (tf.int32, tf.int32)>


In [91]:
BATCH_SIZE = 8
all_padded = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None,), ()))
print(all_padded)

<PaddedBatchDataset shapes: ((None, None), (None,)), types: (tf.int32, tf.int32)>


In [94]:
test_size = 20
NB_BATCHES = math.ceil(len(encoded_data)/ BATCH_SIZE)
all_padded.shuffle(NB_BATCHES)
training_data = all_padded.skip(test_size)
test_data = all_padded.take(test_size)


# 4. Building Neural Network

In [95]:
class CNN(tf.keras.Model): 
  def __init__(self, vocab_size, embedding_dim = 64, nb_filters =50, FFN_units = 512, nb_classes=2, dropout_rate = 0.5, training=False, name="cnn"):
    super(CNN, self).__init__(name=name)

    self.embedding = layers.Embedding(vocab_size, embedding_dim)
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding="valid", activation="relu")
    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")

    self.maxPooling = layers.GlobalMaxPooling1D()

    self.dense = layers.Dense(units=FFN_units, activation="relu")
    self.dropout = layers.Dropout(rate=dropout_rate) 
    if nb_classes == 2: 
      self.final_dense = layers.Dense(1, activation="sigmoid")
    else: 
      self.final_dense = layers.Dense(nb_classes, activation="softmax")

  
  def call(self, inputs, training):
      x = self.embedding(inputs)
      x_1 = self.bigram(x) 
      x_1 = self.maxPooling(x_1) 
      x_2 = self.trigram(x) 
      x_3 = self.fourgram(x) 
      x_3 = self.maxPooling(x_3)
      
      merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
      merged = self.dense(merged)
      merged = self.dropout(merged, training)
      output = self.final_dense(merged)
      
      return output


In [96]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 64
NB_FILTERS = 50
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.5
NB_EPOCHS = 5

In [97]:
classifier = CNN(vocab_size=VOCAB_SIZE,
            embedding_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [98]:
classifier.compile(
    loss = "binary_crossentropy",
    optimizer ="adam",
    metrics = ["accuracy"]
)



In [71]:
checkpoint_path = "./drive/My Drive/projects/BERT/ckpt_bert_tok/"

ckpt = tf.train.Checkpoint(classifier=classifier)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [72]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

# 5. Training the model

In [99]:
classifier.fit(training_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa7b7730590>

# 6. Evaluating 

In [101]:
# tokenize the sequence and convert words to id
def tokenize_tweets(tweet): 
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet))


In [115]:
def check_sentiment(sentence): 
  tokens = tokenize_tweets(sentence)
  input = tf.expand_dims(tokens, 0)

  prediction = classifier(input, training=False)
  sentiment = math.floor(prediction*2)

  if sentiment == 0: 
    print("Output of the model: {}. \nPredicted sentiment: negative".format(prediction))

  elif sentiment == 1: 
    print("Output of the model: {}\nPredicted sentiment: positive".format(prediction))

check_sentiment("love you my sugarboo")

Output of the model: [[0.9627555]]
Predicted sentiment: positive


In [116]:
eval_result = classifier.evaluate(test_data)
print(eval_result)

[0.3686465322971344, 0.824999988079071]
