In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import pickle

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [None]:
# train.csv has no header row, so the column names are supplied explicitly.
cols = ["sentiment", "id", "date", "query", "user", "text"]
# Read the whole file with the Python parsing engine, decoding it as latin1.
train_data = pd.read_csv("train.csv", names=cols, header=None, encoding="latin1", engine="python")

In [None]:
# Work on an independent copy so that in-place edits to `data` never alter
# `train_data`, and re-running this cell really resets `data` to the raw frame.
data = train_data.copy()

In [None]:
# Keep only the columns used downstream ("sentiment" and "text").
# A new frame is derived from train_data instead of dropping in place, so
# train_data stays intact and this cell can be re-run without a KeyError.
data = train_data.drop(columns=["id", "date", "query", "user"])

In [None]:
def clean_tweet(tweet):
    """Normalise one raw tweet into plain text ready for tokenisation.

    Steps, in order: strip markup with BeautifulSoup (lxml parser), replace
    @mentions and http(s) URLs with a space, replace every character other
    than ASCII letters and ``. ! ? '`` with a space, then collapse runs of
    spaces into one.

    Args:
        tweet: Raw tweet text, possibly containing HTML/XML markup.

    Returns:
        The cleaned string. A single leading or trailing space may remain,
        because the result is not stripped.
    """
    # Some tweets carry markup; keep only their text content.
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove @mentions. Underscores are part of the handle, so they are matched
    # too; otherwise "@john_doe" would leave "doe" behind in the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URLs up to the next whitespace, so characters such as "-", "_",
    # "?" or "=" inside a link do not leave URL fragments behind.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keep only letters and basic sentence punctuation.
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Collapse the runs of spaces created by the substitutions above.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Run every raw tweet through clean_tweet, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Extract the sentiment column as the label array.
# Per the dataset note, sentiment is encoded as 0 or 4; every 4 is remapped to 1.
# np.where builds a new array instead of assigning into `.values`: in-place
# assignment silently edits the DataFrame and fails when pandas returns a
# read-only view (Copy-on-Write).
data_labels = np.where(data.sentiment.values == 4, 1, data.sentiment.values)

In [None]:
# Build a subword tokenizer from the cleaned tweets; its encode() later turns
# a sentence into a list of integer ids. target_vocab_size=2**16 is the
# vocabulary size the builder aims for.
# SubwordTextEncoder lives under tfds.deprecated.text in newer
# tensorflow_datasets releases (the tfds.features.text path was removed), so
# try the new location first and fall back to the old one.
try:
    _subword_encoder_cls = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
    _subword_encoder_cls = tfds.features.text.SubwordTextEncoder
tokenizer = _subword_encoder_cls.build_from_corpus(data_clean, target_vocab_size=2**16)



In [None]:
# Persist the fitted tokenizer so a later session can load it instead of rebuilding it.
with open('tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Optional: uncomment to load a previously saved tokenizer instead of rebuilding it.
# with open('tokenizer.pickle', 'rb') as h:
#     tokenizer = pickle.load(h)

In [None]:
# Our CNN network.
class DCNN(tf.keras.Model):
    """CNN text classifier with parallel 2-, 3- and 4-gram convolutions.

    Token ids are embedded and passed through three Conv1D layers (kernel
    sizes 2, 3 and 4). Each result is global-max-pooled, the pooled features
    are concatenated and then fed through a dense layer, dropout and the
    output layer.

    Args:
        vocab_size: Number of distinct token ids (embedding input dimension).
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters in each of the three convolutions.
        FFN_units: Units in the hidden dense layer.
        nb_classes: Number of target classes. With 2 the output is a single
            sigmoid unit; otherwise it is a softmax over nb_classes units.
        dropout_rate: Rate of the dropout layer applied after the dense layer.
        training: Unused; kept so existing callers keep working. The mode is
            chosen per call through the `training` argument of `call`.
        name: Name forwarded to tf.keras.Model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size, emb_dim)

        # Three parallel feature extractors that differ only in window size.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        # NOTE(review): with padding="valid" this layer presumably needs
        # inputs of at least 4 tokens -- confirm for very short sentences.
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")

        # Pooling has no trainable variables, so one layer serves all branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            # Binary case: a single probability.
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences.
            training: Forwarded to the dropout layer. Defaults to False so
                the model can also be called without the argument.

        Returns:
            The output of the final dense layer: one sigmoid unit when the
            model was built with nb_classes == 2, otherwise a softmax over
            nb_classes units.
        """
        embedded = self.embedding(inputs)

        # Convolve and pool each n-gram branch, in bigram/trigram/fourgram order.
        pooled = [self.pool(conv(embedded))
                  for conv in (self.bigram, self.trigram, self.fourgram)]

        merged = tf.concat(pooled, axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)

In [None]:
# Hyperparameters used by the cells below.
VOCAB_SIZE = tokenizer.vocab_size  # taken from the tokenizer

# Model architecture.
EMB_DIM, NB_FILTERS, FFN_UNITS = 200, 100, 256
NB_CLASSES = 2  # len(set(train_labels))
DROPOUT_RATE = 0.2

# Training schedule.
NB_EPOCHS = 5
BATCH_SIZE = 32

In [None]:
# Instantiate the classifier from the hyperparameter constants.
Dcnn = DCNN(
    VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Choose the loss and metric from the number of classes: the binary pair when
# NB_CLASSES is 2 (our case), the sparse categorical pair otherwise.
Dcnn.compile(
    optimizer="adam",
    loss="binary_crossentropy" if NB_CLASSES == 2 else "sparse_categorical_crossentropy",
    metrics=["accuracy" if NB_CLASSES == 2 else "sparse_categorical_accuracy"],
)

In [None]:
# Checkpoint handling: track the model under the key "Dcnn" and, when the
# checkpoint directory already holds a checkpoint, restore the latest one.
checkpoint_path = "./checkpoints/"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
# The manager is configured with max_to_keep=5.
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    directory=checkpoint_path,
    max_to_keep=5,
)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [None]:
# Sanity check: score one hand-written sentence (batch of one) in inference mode.
Dcnn(
    np.asarray([tokenizer.encode("Kausik feels angry")]),
    training=False,
).numpy()

In [5]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import pickle

In [6]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [3]:
# Load the pickled tokenizer from disk.
# NOTE: unpickling executes code from the file; only load a tokenizer.pickle you trust.
with open('tokenizer.pickle', 'rb') as in_file:
    tokenizer = pickle.load(in_file)

In [9]:
# Our CNN network.
class DCNN(tf.keras.Model):
    """CNN text classifier with parallel 2-, 3- and 4-gram convolutions.

    Token ids are embedded and passed through three Conv1D layers (kernel
    sizes 2, 3 and 4). Each result is global-max-pooled, the pooled features
    are concatenated and then fed through a dense layer, dropout and the
    output layer.

    Args:
        vocab_size: Number of distinct token ids (embedding input dimension).
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters in each of the three convolutions.
        FFN_units: Units in the hidden dense layer.
        nb_classes: Number of target classes. With 2 the output is a single
            sigmoid unit; otherwise it is a softmax over nb_classes units.
        dropout_rate: Rate of the dropout layer applied after the dense layer.
        training: Unused; kept so existing callers keep working. The mode is
            chosen per call through the `training` argument of `call`.
        name: Name forwarded to tf.keras.Model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size, emb_dim)

        # Three parallel feature extractors that differ only in window size.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        # NOTE(review): with padding="valid" this layer presumably needs
        # inputs of at least 4 tokens -- confirm for very short sentences.
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")

        # Pooling has no trainable variables, so one layer serves all branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            # Binary case: a single probability.
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences.
            training: Forwarded to the dropout layer. Defaults to False so
                the model can also be called without the argument.

        Returns:
            The output of the final dense layer: one sigmoid unit when the
            model was built with nb_classes == 2, otherwise a softmax over
            nb_classes units.
        """
        embedded = self.embedding(inputs)

        # Convolve and pool each n-gram branch, in bigram/trigram/fourgram order.
        pooled = [self.pool(conv(embedded))
                  for conv in (self.bigram, self.trigram, self.fourgram)]

        merged = tf.concat(pooled, axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)


In [7]:
# Hyperparameters used by the cells below.
VOCAB_SIZE = tokenizer.vocab_size  # taken from the tokenizer

# Model architecture.
EMB_DIM, NB_FILTERS, FFN_UNITS = 200, 100, 256
NB_CLASSES = 2  # len(set(train_labels))
DROPOUT_RATE = 0.2

# Training schedule.
NB_EPOCHS = 5
BATCH_SIZE = 32

In [10]:
# Instantiate the classifier from the hyperparameter constants.
Dcnn = DCNN(
    VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [11]:
# Configure the model with binary cross-entropy, Adam and an accuracy metric.
Dcnn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [12]:
# Checkpoint handling: track the model under the key "Dcnn" and, when the
# checkpoint directory already holds a checkpoint, restore the latest one.
checkpoint_path = "./checkpoints/"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
# The manager is configured with max_to_keep=5.
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    directory=checkpoint_path,
    max_to_keep=5,
)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [13]:
# Sanity check: score one hand-written sentence (batch of one) in inference mode.
Dcnn(
    np.asarray([tokenizer.encode("Kausik feels anger")]),
    training=False,
).numpy()

array([[7.222766e-05]], dtype=float32)

In [15]:
# Score every sentence of the Equity Evaluation Corpus with the model.
eec_data = pd.read_csv("Equity-Evaluation-Corpus.csv", engine="python")

# Pull the needed columns out as plain Python lists.
eec = eec_data['Sentence'].tolist()
race = eec_data['Race'].tolist()
gender = eec_data['Gender'].tolist()

# One forward pass per sentence, each as a batch of one.
# The sigmoid head returns a (1, 1) array per sentence; keep only the scalar so
# the Sentiment column holds plain floats rather than nested arrays, which
# would otherwise be written to CSV as strings like "[[7.2e-05]]".
sentiment = [
    float(Dcnn(np.array([tokenizer.encode(each)]), training=False).numpy()[0, 0])
    for each in eec
]

d = {'Sentence': eec, 'Sentiment': sentiment, 'Race': race, 'Gender': gender}
final_df = pd.DataFrame(d, columns=['Sentence', 'Sentiment', 'Race', 'Gender'])

In [16]:
# Write the scored corpus to disk; the DataFrame index is written as the first column.
final_df.to_csv('SentimentEEC.csv', index=True)