# Importing the required Packages

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
import tensorflow as tf
from tensorflow.keras import layers 
import tensorflow_datasets as tfds

# Loading the Data

In [3]:
cols = ["sentiment", "id", "date","query","user","text"]
train_data = pd.read_csv("trainingandtestdata/training.1600000.processed.noemoticon.csv", 
                         header=None, 
                         names=cols, 
                         engine="python", 
                         encoding="latin1")


In [4]:
test_data = pd.read_csv("trainingandtestdata/testdata.manual.2009.06.14.csv",
                       header=None,
                       names=cols,
                       engine="python",
                       encoding="latin1")

In [5]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
train_data.sentiment.unique()

array([0, 4], dtype=int64)

In [7]:
test_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [8]:
test_data.sentiment.unique()

array([4, 0, 2], dtype=int64)

# Data preprocessing

## Cleaning

In [9]:
data = train_data

In [10]:
data.drop(["id", "date", "query", "user"], # don't forget to run data = train_data before!
          axis=1,
          inplace=True)

In [11]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    tweet = re.sub(r"@[A-Za-z0-9]+", " ", tweet)
    tweet = re.sub("https?://[A-Za-z0-9./]+", " ",tweet)
    tweet = re.sub(r"[^a-zA-Z.!?']", " ", tweet)
    tweet = re.sub(" +", " ", tweet)
    return tweet

In [12]:
data_clean = [clean_tweet(tweet) for tweet in data.text]



In [13]:
data_labels = data.sentiment.values

In [14]:
set(data_labels)

{0, 4}

In [15]:
data_labels[data_labels == 4] = 1

In [16]:
set(data_labels)

{0, 1}

## Tokenization

In [17]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

## Padding

In [18]:
max_len = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                           value=0,
                                                           padding="post",
                                                           maxlen = max_len)

## Data split train/test

In [19]:
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))

In [20]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx, axis=0)

# Model Building

In [21]:
class DCNN(tf.keras.Model):
    
    def __init__(self, 
                vocab_size,
                embed_dim=128,
                nb_filters = 50,
                FFN_units = 512,
                nb_classes = 2,
                dropout_rate = 0.1,
                training=False,
                name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size, embed_dim)
        
        self.bigram = layers.Conv1D(filters = nb_filters, kernel_size=2,
                                   padding="valid",
                                   activation = "relu")
        self.pool_1 = layers.GlobalMaxPool1D()
        
        self.trigram = layers.Conv1D(filters = nb_filters, kernel_size=3,
                                   padding="valid",
                                   activation = "relu")
        self.pool_2 = layers.GlobalMaxPool1D()
        self.fourgram = layers.Conv1D(filters = nb_filters, kernel_size=4,
                                   padding="valid",
                                   activation = "relu")
        self.pool_3 = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units = FFN_units, activation='relu')
        self.dropout = layers.Dropout(rate = dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units = 1, activation='sigmoid')
        else:
            self.last_dense = layers.Dense(units = nb_classes, acitvation="softmax")
            
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output
        
            
            
    
        
        

        

# Apply the model!!!!

## configuration of model

In [22]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE=32

NB_EPOCHS = 5

## Training

In [23]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
           embed_dim=EMB_DIM,
           nb_filters=NB_FILTERS,
           FFN_units=FFN_UNITS,
           nb_classes=NB_CLASSES,
           dropout_rate=DROPOUT_RATE)

In [24]:
if NB_CLASSES == 2:
    Dcnn.compile(loss='binary_crossentropy',
                optimizer = 'adam',
                metrics=['accuracy'])
else:
    Dcnn.compile(loss = "sparse_categorical_crossentropy",
                optimizer='adam',
                metrics=['sparse_categorical_accuracy'])

In [25]:
checkpoint_path = "./ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint Restored!!!")
    

Latest Checkpoint Restored!!!


In [28]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  13108000  
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
global_max_pooling1d_1 (Glob multiple                  0         
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d_2 (Glob multiple                  0      

In [26]:
Dcnn.fit(train_inputs, train_labels, batch_size=BATCH_SIZE,
        epochs=NB_EPOCHS)
ckpt_manager.save()

Train on 1584082 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'./ckpt/ckpt-1'

# Evaluation

In [26]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size = BATCH_SIZE)
print(results)

[0.1330859230812639, 0.9506875]


In [32]:
if Dcnn(np.array([tokenizer.encode("You are cute!")]), training=False).numpy()>0.6:
    print("Positive")
else:
    print("Negative")

Positive


In [33]:
Dcnn(np.array([tokenizer.encode("I love you but I do not think this will work out.")]), training=False).numpy()

array([[0.22985332]], dtype=float32)