In [1]:
## Loading Packages
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [2]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [3]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')

GPU device not found


In [4]:
## Loading Dataset
print("_______________________________________________________________")
print("Loading Data...........")
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    r"test.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [5]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [6]:
print("_______________________________________________________________")
print("Data Pre-processing...........")
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [7]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [8]:
print("_______________________________________________________________")
print("Tokenization and Data Preparation...........")
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 0]

In [9]:
def bert_input_data(sorted_all):
    all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                                 output_types=(tf.int32, tf.int32))
    BATCH_SIZE = 32
    all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
    return all_batched

In [10]:
print("_______________________________________________________________")
print("Model Building...........")
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

_______________________________________________________________
Model Building...........


In [11]:
VOCAB_SIZE = 30522 # len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [12]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [13]:
checkpoint_path = "final_training/cp.ckpt"
Dcnn.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x15a22fe20f0>

In [14]:
X = sorted_all
def get_test_data(size: int = 1):
    """Generates a test dataset of the specified size""" 
    num_rows = len(X)
    test_df = X.copy()

    while num_rows < size:
        test_df = test_df + test_df
        num_rows = len(test_df)

    return test_df[:size]

In [15]:
def calculate_stats(time_list):
    """Calculate mean and standard deviation of a list"""
    time_array = np.array(time_list)

    median = np.median(time_array)
    mean = np.mean(time_array)
    std_dev = np.std(time_array)
    max_time = np.amax(time_array)
    min_time = np.amin(time_array)
    quantile_10 = np.quantile(time_array, 0.1)
    quantile_90 = np.quantile(time_array, 0.9)

    basic_key = ["median","mean","std_dev","min_time","max_time","quantile_10","quantile_90"]
    basic_value = [median,mean,std_dev,min_time,max_time,quantile_10,quantile_90]

    dict_basic = dict(zip(basic_key, basic_value))
    
    return pd.DataFrame(dict_basic, index = [0])

import argparse
import logging

from pathlib import Path
from timeit import default_timer as timer

NUM_LOOPS = 100
def run_inference(num_observations:int = 1000):
    """Run xgboost for specified number of observations"""
    # Load data
    test_df = get_test_data(num_observations)
    data = bert_input_data(test_df)

    num_rows = len(test_df)
    print(f"running inference for {num_rows} sentence(s)..")
    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):

        start_time = timer()
        Dcnn.predict(data)
        end_time = timer()

        total_time = end_time - start_time
        run_times.append(total_time*10e3)

        inference_time = total_time*(10e6)/num_rows
        inference_times.append(inference_time)

    print(num_observations, ", ", calculate_stats(inference_times))
    return calculate_stats(inference_times)

STATS = '#, median, mean, std_dev, min_time, max_time, quantile_10, quantile_90'

print("_______________________________________________________________")
print("Inferencing Started...........")
if __name__=='__main__':
    ob_ct = 1  # Start with a single observation
    logging.info(STATS)
    temp_df = pd.DataFrame()
    while ob_ct <= 100:
        temp = run_inference(ob_ct)
        temp["No_of_Observation"] = ob_ct
        temp_df = temp_df.append(temp)
        ob_ct *= 10
    print("Summary........")
    print(temp_df)

_______________________________________________________________
Inferencing Started...........
running inference for 1 sentences..
1 ,       median       mean        std_dev  min_time   max_time  quantile_10  \
0  393611.5  495221.31  846624.461136  351657.0  8904595.0     360585.7   

   quantile_90  
0     482723.5  
running inference for 10 sentences..
10 ,      median       mean      std_dev  min_time  max_time  quantile_10  \
0  42911.4  44985.775  7163.502763   36278.5   71930.0     38413.71   

   quantile_90  
0     54580.48  
running inference for 100 sentences..
100 ,      median       mean      std_dev  min_time  max_time  quantile_10  \
0  7268.22  7564.9387  1141.178707   5997.33  11999.49     6611.049   

   quantile_90  
0     9037.028  
Summary........
      median         mean        std_dev   min_time    max_time  quantile_10  \
0  393611.50  495221.3100  846624.461136  351657.00  8904595.00   360585.700   
0   42911.40   44985.7750    7163.502763   36278.50    71930.