# Stage 1: Importing dependencies

In [None]:
Dataset Link: http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

from google.colab import drive

In [2]:

import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Stage 2: Data preprocessing

## Loading files

In [3]:
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
train_data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [6]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

In [7]:
data = train_data

### Cleaning

In [9]:
data.drop(["id", "date", "query", "user"],axis=1, inplace=True)

In [10]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+",' ',tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ',tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']",' ',tweet)
  tweet = re.sub(r" +",' ',tweet)
  return tweet

In [11]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [12]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [13]:
set(data_labels)

{0, 1}

### Tokenization

In [14]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

### Padding

In [16]:
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs, value=0, padding='post', maxlen=MAX_LEN)

### Spliting into training/testing set

In [17]:
test_idx = np.random.randint(0, 800000, 8000)
text_idx = np.concatenate((test_idx, test_idx+800000))

In [18]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

# Stage 3: Model building

In [85]:
class DCNN(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim=128, nb_filters=50, FFN_units=512, nb_classes=2, dropout_rate=0.1, training=False, name="dcnn"):
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size, emb_dim)

        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding='valid', activation='relu')
        self.pool_1 = layers.GlobalMaxPool1D()

        self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding='valid', activation='relu')
        self.pool_2 = layers.GlobalMaxPool1D()

        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding='valid', activation='relu')
        self.pool_3 = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation='relu')
        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 1:
            self.last_dense = layers.Dense(units=1, activation='sigmoid')
        elif nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation='sigmoid')
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation='softmax')

    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x_1 = self.pool_1(self.bigram(x))
        x_2 = self.pool_2(self.trigram(x))
        x_3 = self.pool_3(self.fourgram(x))

        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

# Stage 4: Application

## Config

In [86]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = len(set(train_labels))

DROUPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5


## Training

In [87]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS, FFN_units = FFN_UNITS, nb_classes = NB_CLASSES, dropout_rate = DROPOUT_RATE)

In [88]:
if NB_CLASSES == 2:
  Dcnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
else:
  Dcnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])

In [89]:
print(train_inputs.shape, train_labels.shape)


(1592048, 73) (1592048,)


In [90]:
checkpoint_path = "./drive/My Drive/projects/CNN_for_NLP/ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest checkpoint restored!!")

In [91]:
Dcnn.fit(train_inputs, train_labels, batch_size=BATCH_SIZE, epochs=NB_EPOCHS)
ckpt_manager.save()

Epoch 1/5
[1m49752/49752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 6ms/step - accuracy: 0.8020 - loss: 0.4263
Epoch 2/5
[1m49752/49752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 6ms/step - accuracy: 0.8591 - loss: 0.3292
Epoch 3/5
[1m49752/49752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 6ms/step - accuracy: 0.8872 - loss: 0.2732
Epoch 4/5
[1m49752/49752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 6ms/step - accuracy: 0.9131 - loss: 0.2167
Epoch 5/5
[1m49752/49752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 6ms/step - accuracy: 0.9325 - loss: 0.1714


'./drive/My Drive/projects/CNN_for_NLP/ckpt/ckpt-1'

## Evaluation

In [92]:
Dcnn(np.array([tokenizer.encode("You are so nice")]), training=False).numpy()

array([[0.9571725]], dtype=float32)