## Importing dependencies.

In [None]:
# re provides the regular expressions used to clean the tweets
# bs4 (BeautifulSoup) extracts plain text from the raw tweet strings
import numpy as np
import math
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#google drive is an optional addition 
from google.colab import drive

In [None]:
# The %tensorflow_version magic requests TensorFlow 2.x; any exception it raises is ignored
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

## Data preprocessing

### Loading files

In [None]:
# Mount Google Drive inside the Colab notebook so the files under
# /content/drive can be read by the cells below.
drive.mount("/content/drive")

In [None]:
# Load the Sentiment140 train and test CSV files.
#
# The files are read as latin1: in latin1 every character is exactly one byte,
# whereas a utf8 character can span several bytes, and the characters the two
# encodings share are not necessarily stored as the same byte sequence.
#
# engine="python" was chosen by the author to avoid a parser warning on
# Google Colab.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Both files have no header row and the same column layout, so they share
# one set of read_csv options.
csv_options = dict(header=None, names=cols, engine="python", encoding="latin1")

train_data = pd.read_csv(
    "/content/drive/My Drive/CNN for NLP/Data/train.csv", **csv_options
)
test_data = pd.read_csv(
    "/content/drive/My Drive/CNN for NLP/Data/test.csv", **csv_options
)

In [None]:
# Preview the first three rows to check how the columns were parsed.
train_data.head(3)


The test dataset has three different labels (negative, positive, and neutral), while the train dataset has only two. We therefore do not use the test file; instead, we split the train file ourselves later.


In [None]:
# `data` is a second name for the same DataFrame object as `train_data`;
# no copy is made here.
data = train_data

## Preprocessing

### Cleaning

In [None]:
# The id, date, query and user columns are not used for sentiment analysis,
# so only "sentiment" and "text" are kept.
#
# `data` is rebuilt from `train_data` with a non-mutating drop instead of
# drop(..., inplace=True): `data` was bound to the very same object as
# `train_data`, so the in-place version also stripped the columns from
# `train_data` and raised KeyError when this cell was run a second time.
data = train_data.drop(columns=["id", "date", "query", "user"])

In [None]:
def clean_tweet(tweet):
    """Return `tweet` with markup, mentions, URLs and non-letter noise removed.

    The cleaning steps run in this order (each uses re.sub except the first):

    1. BeautifulSoup with the lxml parser extracts the plain text.
       (Parser choice: https://stackoverflow.com/questions/27790415/set-lxml-as-default-beautifulsoup-parser)
    2. Every @mention is replaced by a space. Underscores are part of the
       pattern so that a handle such as @some_user is removed as a whole
       instead of leaving "user" behind.
    3. Every http:// or https:// link is replaced by a space. The link is
       taken to run up to the next whitespace, so paths containing "-", "_",
       "?", "=" or "%" do not leave fragments in the text.
    4. Every character that is not a letter or one of . ! ? ' is replaced by
       a space.
    5. Runs of spaces are collapsed to a single space.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned tweet as a string. Leading/trailing spaces are not
        stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @mentions (letters, digits and underscores)
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Removing the URL links up to the next whitespace
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keeping only letters and basic punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Run clean_tweet over every entry of the text column and keep the results
# as a plain list, in the original row order.
data_clean = list(map(clean_tweet, data.text))

In [None]:
# The labels must be 0 or 1 for the binary classifier, so every label with
# value 4 is rewritten to 1.
#
# An explicit copy is taken before the assignment: writing into the array
# returned by `.values` either modifies the DataFrame behind the scenes or,
# when pandas hands out a read-only array, fails outright.
data_labels = data.sentiment.to_numpy(copy=True)
data_labels[data_labels == 4] = 1

### Tokenization

In [None]:
# Build a subword vocabulary (target size 2**17) from the cleaned tweets and
# encode every tweet as a list of integer token ids.
#
# TensorFlow Datasets moved the text encoders from `tfds.features.text` to
# `tfds.deprecated.text`; use the new location when it exists and fall back
# to the old one otherwise, so the cell runs on either release.
text_encoders = (
    tfds.deprecated.text if hasattr(tfds, "deprecated") else tfds.features.text
)

tokenizer = text_encoders.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**17
)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

### Padding

In [None]:
# Pad every encoded sentence with trailing zeros ("post" padding) up to the
# length of the longest one, so that all inputs have the same length.
MAX_LEN = max(map(len, data_inputs))

data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    maxlen=MAX_LEN,
    padding="post",
    value=0,
)

### Splitting into training/testing set



In [None]:
# Pick the rows that form the test set: NB_TEST_PER_CLASS row positions from
# the first NB_PER_CLASS rows, plus the same positions shifted by
# NB_PER_CLASS, i.e. 16,000 test rows in total. Everything else stays in the
# training set (the actual split happens in the next cell).
#
# NOTE(review): the shift by NB_PER_CLASS presumes the file holds two blocks
# of 800,000 rows, one per label - confirm against the data.
#
# The positions are drawn without replacement and from a seeded generator:
# drawing with replacement produced repeated indices (duplicate test rows and
# a test set smaller than intended), and an unseeded draw made the split
# differ on every run.
SPLIT_SEED = 42
NB_PER_CLASS = 800000
NB_TEST_PER_CLASS = 8000

rng = np.random.default_rng(SPLIT_SEED)
test_idx = rng.choice(NB_PER_CLASS, size=NB_TEST_PER_CLASS, replace=False)
test_idx = np.concatenate((test_idx, test_idx + NB_PER_CLASS))

In [None]:
# Rows listed in test_idx become the test set; all remaining rows become the
# training set. A boolean mask marks the held-out rows.
held_out = np.zeros(len(data_inputs), dtype=bool)
held_out[test_idx] = True

test_inputs, test_labels = data_inputs[test_idx], data_labels[test_idx]
train_inputs, train_labels = data_inputs[~held_out], data_labels[~held_out]

## Model building

In [None]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier with parallel 2-, 3- and 4-token filters.

    Token ids are turned into embedding vectors, which feed three Conv1D
    branches (kernel sizes 2, 3 and 4, ReLU activation, "valid" padding).
    Each branch is reduced by a global max pooling layer, the three pooled
    vectors are concatenated, and the result goes through a ReLU dense layer,
    dropout and a final dense layer that produces the prediction.

    Background on activation functions:
    https://towardsdatascience.com/activation-functions-and-its-types-which-is-better-a9a5310cc8f

    Args:
        vocab_size: size of the embedding table, i.e. the number of distinct
            token ids the model can receive.
        emb_dim: length of each embedding vector.
        nb_filters: number of filters in each of the three convolutions.
        FFN_units: number of units in the hidden dense layer.
        nb_classes: number of target classes. With 2 classes the output is a
            single sigmoid unit; otherwise it is a softmax over `nb_classes`
            units.
        dropout_rate: fraction of the hidden dense activations that dropout
            zeroes while training (0.1 means roughly 1 in 10).
        training: not used by the constructor; kept so existing callers that
            pass it keep working.
        name: name given to the Keras model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)

        # Attribute names are left exactly as they were so that checkpoints
        # written by earlier runs still match this model's layers.
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.pool_1 = layers.GlobalMaxPool1D()
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.pool_2 = layers.GlobalMaxPool1D()
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # Pooling has no trainable variables, so a single pooling layer could
        # serve all three branches; one instance per branch is kept here.
        self.pool_3 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary case: one unit with a sigmoid activation.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # Multi-class case: one unit per class with a softmax activation.
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences.
            training: forwarded to the dropout layer. Defaults to None so the
                model can also be called without the argument.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        # Passed by keyword so the flag cannot be mistaken for another argument.
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

## Application

### Configuration

In [None]:
# Model hyperparameters.
VOCAB_SIZE = tokenizer.vocab_size      # taken from the fitted tokenizer
NB_CLASSES = len(np.unique(train_labels))  # number of distinct training labels

EMB_DIM = 256
NB_FILTERS = 128
FFN_UNITS = 256
DROPOUT_RATE = 0.2

# Training schedule.
BATCH_SIZE = 64
NB_EPOCHS = 2

### Training

In [None]:
# Collect the configured hyperparameters and build the model from them.
model_config = {
    "vocab_size": VOCAB_SIZE,
    "emb_dim": EMB_DIM,
    "nb_filters": NB_FILTERS,
    "FFN_units": FFN_UNITS,
    "nb_classes": NB_CLASSES,
    "dropout_rate": DROPOUT_RATE,
}
Dcnn = DCNN(**model_config)

In [None]:
# Compiling the model needs three choices:
# 1) Optimizer: https://www.dlology.com/blog/quick-notes-on-how-to-choose-optimizer-in-keras/
#    The algorithm that updates the weights during training; Adam is used here.
# 2) Loss: https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
#    A scalar value that training tries to minimize. The lower the loss, the
#    closer the predictions are to the true labels.
# 3) Metrics: https://machinelearningmastery.com/custom-metrics-deep-learning-keras-python/
#             https://stackoverflow.com/a/47306502
#
# The loss and metric depend on whether the task is binary or multi-class.
loss_name, metric_name = (
    ("binary_crossentropy", "accuracy")
    if NB_CLASSES == 2
    else ("sparse_categorical_crossentropy", "sparse_categorical_accuracy")
)
Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=[metric_name])

In [None]:
# Directory that holds the model checkpoints (relative to the current
# working directory); at most five checkpoints are kept.
checkpoint_path = "./drive/My Drive/projects/CNN_for_NLP/ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(checkpoint=ckpt,
                                          directory=checkpoint_path,
                                          max_to_keep=5)

# Resume from the most recent checkpoint when one is available.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print("Latest checkpoint restored!!")

In [None]:
# Train on the training split, then write a checkpoint of the trained model.
Dcnn.fit(x=train_inputs,
         y=train_labels,
         epochs=NB_EPOCHS,
         batch_size=BATCH_SIZE)
ckpt_manager.save()

### Evaluation

In [None]:
# Evaluate the trained model on the held-out test split and show the
# values returned by evaluate().
results = Dcnn.evaluate(x=test_inputs,
                        y=test_labels,
                        batch_size=BATCH_SIZE)
print(results)

In [None]:
# Predict the sentiment of a single sentence.
#
# The sentence goes through the same preparation as the training data:
# clean_tweet, tokenizer encoding, then zero "post" padding to MAX_LEN.
# Feeding the raw, unpadded encoding gave the model an input unlike anything
# it was trained on, and a very short sentence could be shorter than the
# widest (4-token) convolution window.
sample_text = clean_tweet("you're fat")
sample_ids = tf.keras.preprocessing.sequence.pad_sequences(
    [tokenizer.encode(sample_text)],
    value=0,
    padding="post",
    maxlen=MAX_LEN,
)
Dcnn(sample_ids, training=False).numpy()