# **Dependencies**

In [2]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [3]:
!pip install bert-for-tf2
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 96 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=dec6789e42b6a4206b6b41eb62d3d8337ae7bab0c203b4d9de28f2f2365619cd
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Building wheel for params-flow (setup.py) ... [?25l[?25hdone
  Created wheel for params-flow: filename=params_flow-0.8.2-py3-none-any.whl size=19472 sha256=a6962a4e5a85a7208d43b42f183e43ab6b164fe

In [4]:
try:
    %tensorflow_version 2.x 
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


# **Data preprocessing**

**Loading files**

In [5]:
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
cols = ["sentiment","id","date","query","user","text"]
data = pd.read_csv( "/content/drive/MyDrive/Colab Notebooks/Final_Project/BERT/sentiment_data/train.csv",
                  header = None,
                  names = cols,
                  engine = "python",
                  encoding = "latin1"
)

In [7]:
data.drop(["id","date","query","user"],axis=1, inplace=True)

In [8]:
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


#    **Preprocessing**

**Cleaning**

In [9]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet,"lxml").get_text()
    #remove @
    tweet = re.sub(r"@[A-Za-z0-9]+",' ',tweet)
    #remove http +https
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ',tweet)
    #remove non letters or punctiations
    tweet = re.sub(r"[^a-zA-Z.!?']",' ',tweet)
    #remove double spaces
    tweet = re.sub(r" +",' ',tweet)
    return tweet

In [10]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [11]:
data_labels = data.sentiment.values
#data labels are 4 and 0 for no reason instead of 1 and 0
data_labels[data_labels==4]=1

In [12]:
data_labels

array([0, 0, 0, ..., 1, 1, 1])

Tokenezation

In [13]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
#https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A12/1
#https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
#trainable=false means we don't train bert on our data we don't change the weights!
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
#all of the above was to create a  tokenizer with all milions of weights that are already calculated by bert
tokenizer = FullTokenizer(vocab_file,do_lower_case)

In [14]:
#splitting words with tokenizer
tokenizer.tokenize("My dog loves strawberries.")

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [15]:
#converting the tokens ID's
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog loves strawberries."))

[2026, 3899, 7459, 13137, 20968, 1012]

In [16]:
#converting the ID's tokens
tokenizer.convert_ids_to_tokens([2026, 3899, 7459, 13137, 20968, 1012])

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [17]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [18]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

In [19]:
data_clean[0]

" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D"

In [20]:
data_inputs[0]

[22091,
 2860,
 2860,
 2008,
 1005,
 1055,
 1037,
 26352,
 5017,
 1012,
 2017,
 2323,
 2050,
 2288,
 2585,
 12385,
 1997,
 2353,
 2154,
 2000,
 2079,
 2009,
 1012,
 1040]

**Dataset creation**

shuffling the data since the input had the first half with positive and second with negative sentement

In [21]:
data_with_len = [[sent, data_labels[i],len(sent)] 
                for i,sent in enumerate(data_inputs)]
random.shuffle(data_with_len)


sort with a function on the fly.
the function is lambda, which takes each element and based element[2] which is len(sentence) we sort.
sort only sentences larger then 7 words

In [22]:
data_with_len.sort(key=lambda x: x[2])
sorted_all=[(sent_lab[0],sent_lab[1])
            for sent_lab in data_with_len if sent_lab[2]>7]

after we sorted the data, we want to split it into batches,if the sentence isn't long as the rest of the batch we will auto fill it with padded_shapes

In [23]:
all_dataset=tf.data.Dataset.from_generator(lambda: sorted_all, output_types=(tf.int32,tf.int32))

shows first element in itreable list (tf stracture)

In [24]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 1045,  2903,  2009,  2001,  2522, 25855,  2686,  2135],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [25]:
BATCH_SIZE=32
all_batched=all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), () ))

all first 32 sentences that are longer then 7 words and have been shuffled if padding was needed then it would be filled with padded_shapes

In [26]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 1045,  2903,  2009,  2001,  2522, 25855,  2686,  2135],
        [ 2307,  3980,  2204,  2111,  8225,  5821,  2005,  2449],
        [ 4986,  2055,  1037,  2261,  2477,  2012,  1996,  2617],
        [ 2168,  3110,  2205, 22861,  2100,  2057,  2097,  1012],
        [ 2833,  2003,  2205,  2204,  2000,  3413,  2039,  1012],
        [ 2572,  1045,  2006,  1996, 17186,  3482,  1029,   999],
        [ 1045,  2288,  1037,  2006,  2026,  2117, 19276,  3231],
        [ 1045,  2053,  2049,  2467, 22308,  2039,  2361,   999],
        [ 2003,  2383,  1037, 10551,  3300,  1012,  1012,  1012],
        [16861,  2064,  2022,  3255,  1999,  1996,  4632,  2823],
        [18168,  2290,  1012,  2026, 20712,  4080,  2003,  2757],
        [ 2047,  4748, 16874,  2015,  2039,  2006,  9130,   999],
        [ 2129,  2146, 13004,  1037,  5236,  6097,  3642,  2202],
        [ 1057, 13871,  2232,  3403,  2000,  2031,  6265,   999],
        [ 2200,  2200, 11471

preapare test data by taken the batches shuffling them and picking 10%

In [27]:
NB_BATCHES= math.ceil(len(sorted_all) /BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES //10
all_batched.shuffle(NB_BATCHES)  # randomize the batches order
test_dataset = all_batched.take(NB_BATCHES_TEST)  #take  the first 10%
train_dataset = all_batched.skip(NB_BATCHES_TEST) # skip the first 10%

# **Model Building**

DCNN =deep convolution neural network

the init function params:
vocab_size==amount of words in our corpous
emb_dim== amount of layers in our NN
nb_filters== amount of filters in each size(meaning 50 filters size 3,50 size 4,50 size 5) the filters are used in the convolution
nb_classes= how many classes for classification
FFN_units == when we create the 2 last layers make thier size smaller
dropout_rate == the amount of neurons we shut off every time to prevent overfiiting


In [99]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # (batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # (batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # (batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

if training is false then there will be no dropout

# **Training**

In [100]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROUPOUT_RATE = 0.2
NB_EPOCHS = 5

In [103]:
Dcnn= DCNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMB_DIM,
            nb_filters = NB_FILTERS,
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROUPOUT_RATE)

In [104]:
if NB_CLASSES == 2 :
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"]
                 )
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"]
                )

In [105]:
from tensorflow.python.training.checkpoint_management import latest_checkpoint
checkpoint_path="/content/drive/MyDrive/Colab Notebooks/Final_Project/BERT/ckpt_bert_model"
print(checkpoint_path)
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
print(ckpt)
ckpt_manager= tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep=1)
print(ckpt_manager)
ckpt.save("/content/drive/MyDrive/Colab Notebooks/Final_Project/BERT/ckpt_bert_model/checkpoint_1")
# ckpt_manager.latest_checkpoint
# print(ckpt_manager.latest_checkpoint)
if ckpt_manager.latest_checkpoint:
    status=ckpt.restore(ckpt_manager.latest_checkpoint)
    print(status)
    print("latest checkpoint restored!")   


/content/drive/MyDrive/Colab Notebooks/Final_Project/BERT/ckpt_bert_model
<tensorflow.python.training.tracking.util.Checkpoint object at 0x7f43ae2dd710>
<tensorflow.python.training.checkpoint_management.CheckpointManager object at 0x7f43ae41e390>
<tensorflow.python.training.tracking.util.CheckpointLoadStatus object at 0x7f43ae2dd310>
latest checkpoint restored!


In [108]:
class MyCustomCallBack(tf.keras.callbacks.Callback):

    def on_epoch_end(self,epoch,logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}".format(checkpoint_path))

# **Results**

In [109]:
Dcnn.fit(
    train_dataset, epochs=NB_EPOCHS, callbacks=[MyCustomCallBack()]
)

Epoch 1/5
  37193/Unknown - 437s 11ms/step - loss: 0.3819 - accuracy: 0.8300Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/Final_Project/BERT/ckpt_bert_model
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f43adcd4a90>

In [111]:
results= Dcnn.evaluate(test_dataset)
print(results)

[0.47465434670448303, 0.8270434737205505]


In [126]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens,0)

    output = Dcnn(inputs,training=False)
    output=output*2
    sentiment=math.floor(output)  #get a number between 0-2, 0-1 is class 1. 1-2 is class 2
    if sentiment==0:
        print("output of the model:{}\nPredicted sentiment negative.".format(output))
    if sentiment==1:
        print("output of the model:{}\nPredicted sentiment positive.".format(output))   

In [127]:
get_prediction("I'd rather not do that again")
get_prediction("this movie was preety intresting.")

output of the model:[[0.14895919]]
Predicted sentiment negative.
output of the model:[[1.9999704]]
Predicted sentiment positive.
