<a href="https://colab.research.google.com/github/bipulsimkhada/Sentiment-Analysis-CNN/blob/main/Sentiment_Analysis_unis_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stage 1: Importing dependencies

In [None]:
#Dataset Link: http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

from google.colab import drive

In [None]:

import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Stage 2: Data preprocessing

## Loading files

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# with ordinary file paths (requires the google.colab runtime).
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# The CSV is read with header=None, so the column names are supplied here.
cols = [
    "sentiment",
    "id",
    "date",
    "query",
    "user",
    "text",
]
train_data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/NLP_train.csv",
    names=cols, header=None,
    encoding="latin1", engine="python",
)


In [None]:
# Preview the first rows to check that the columns were parsed as expected.
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made),
# so in-place changes to `data` are also visible through `train_data`.
data = train_data

### Cleaning

In [None]:
# Remove the metadata columns in place, leaving the remaining columns
# ("sentiment" and "text") for the rest of the pipeline.
data.drop(columns=["id", "date", "query", "user"], inplace=True)

In [None]:
def clean_tweet(tweet):
  """Normalize one raw tweet into plain text ready for tokenization.

  Steps, in order: strip HTML markup via BeautifulSoup, remove @mentions,
  remove http(s) URLs, replace every character other than ASCII letters and
  . ! ? ' with a space, then collapse runs of spaces into a single space.

  Args:
    tweet: Raw tweet text (str).

  Returns:
    The cleaned tweet as a str. A single leading or trailing space may
    remain where a removed token used to be.
  """
  tweet = BeautifulSoup(tweet,"lxml").get_text()
  # Handles may contain underscores; without "_" in the class a mention such
  # as "@_TheSpecialOne_" is not matched and its letters leak into the text.
  tweet = re.sub(r"@[A-Za-z0-9_]+", ' ',tweet)
  # Also consume common URL characters (- _ ~ % ? = & # +) so query strings
  # and slugs are removed with the URL instead of surviving as stray words.
  tweet = re.sub(r"https?://[A-Za-z0-9./_~%?=&#+-]+", ' ',tweet)
  # Keep only letters and basic sentence punctuation.
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ',tweet)
  # Collapse the spaces introduced by the substitutions above.
  tweet = re.sub(r" +", ' ', tweet)
  return tweet

In [None]:
# Show the first raw tweet, before any cleaning is applied.
data.text[0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [None]:
# Apply clean_tweet to every tweet, keeping the original row order.
data_clean = list(map(clean_tweet, data.text))

  tweet = BeautifulSoup(tweet,"lxml").get_text()


In [None]:
# Show the first tweet after cleaning, for comparison with the raw text.
data_clean[0]

" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D"

In [None]:
# Sanity check: number of cleaned tweets.
len(data_clean)

1600000

In [None]:
# Map the raw sentiment codes to binary labels: 4 becomes 1, every other
# value is kept as is. np.where builds a new array instead of writing through
# the array returned by `.values`, so the DataFrame column is left untouched,
# the cell can be re-run safely, and it also works when pandas hands out
# read-only arrays (Copy-on-Write).
data_labels = np.where(data.sentiment.values == 4, 1, data.sentiment.values)

In [None]:
# Confirm which distinct label values are present after the remapping.
set(data_labels)

{0, 1}

### Tokenization

In [None]:
# Fit a subword vocabulary on the cleaned tweets (target size 2**16).
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean,
    target_vocab_size=2**16,
)

# Encode every cleaned tweet into a list of integer token ids.
data_inputs = list(map(tokenizer.encode, data_clean))

In [None]:
# Token ids produced for the first cleaned tweet.
data_inputs[0]

[65316,
 1570,
 113,
 65323,
 10,
 6,
 3553,
 1,
 135,
 5262,
 50,
 1484,
 38165,
 16,
 13337,
 606,
 2,
 49,
 33,
 1,
 65352]

### Padding

In [None]:
# Pad every sequence at the end with 0 up to the length of the longest one.
MAX_LEN = max(map(len, data_inputs))
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    maxlen=MAX_LEN,
    padding="post",
    value=0,
)

### Splitting into training/testing set

In [None]:
# Pick 8000 distinct held-out row indices from the whole dataset.
# Sampling over len(data_inputs) covers every row (a fixed upper bound of
# 800000 would only reach the first half of the 1600000 rows), and
# replace=False guarantees no row is selected twice. The fixed seed makes
# the split reproducible across kernel restarts.
test_idx = np.random.default_rng(42).choice(len(data_inputs), size=8000, replace=False)


In [None]:
# Held-out rows are selected by index ...
test_inputs, test_labels = data_inputs[test_idx], data_labels[test_idx]

# ... and the same indices are removed from the full arrays to form the
# training set.
train_inputs, train_labels = (
    np.delete(data_inputs, test_idx, axis=0),
    np.delete(data_labels, test_idx),
)

# Stage 3: Model building

In [None]:
class DCNN(tf.keras.Model):
  """CNN text classifier with parallel 2-, 3- and 4-token convolutions.

  Token ids are embedded, passed through three independent Conv1D branches
  (kernel sizes 2, 3 and 4), each reduced by global max pooling, then
  concatenated and fed to a dense layer, dropout and the output layer.

  Args:
    vocab_size: Number of entries in the embedding table.
    emb_dim: Embedding dimension.
    nb_filters: Number of filters in each convolution branch.
    FFN_units: Units in the hidden dense layer.
    nb_classes: Number of target classes. With 2 classes the output is a
      single sigmoid unit; otherwise a softmax over `nb_classes` units.
    dropout_rate: Rate of the dropout applied after the hidden dense layer.
    training: Unused; kept so existing callers that pass it keep working.
    name: Model name forwarded to `tf.keras.Model`.
  """

  def __init__ (self,
                vocab_size,
                emb_dim =128,
                nb_filters=50,
                FFN_units = 512,
                nb_classes=2,
                dropout_rate =0.1,
                training=False,
                name="dcnn"):
    super(DCNN, self).__init__(name = name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)

    # One convolution + global max pool per n-gram width.
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding ="valid", activation ="relu")
    self.pool_1 = layers.GlobalMaxPool1D()

    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding ="valid", activation ="relu")
    self.pool_2 = layers.GlobalMaxPool1D()

    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding ="valid", activation ="relu")
    self.pool_3 = layers.GlobalMaxPool1D()

    self.dense_1 = layers.Dense(units=FFN_units, activation ="relu")
    self.dropout =layers.Dropout(rate=dropout_rate)

    # Binary problems use one sigmoid unit; multi-class uses a softmax.
    if nb_classes==2:
      self.last_dense =layers.Dense(units=1, activation ="sigmoid")
    else:
      self.last_dense =layers.Dense(units=nb_classes, activation ="softmax")

  def call(self, inputs, training=None):
    """Run the forward pass.

    Args:
      inputs: Batch of integer token-id sequences fed to the embedding layer.
      training: Forwarded to the dropout layer to switch it on or off.

    Returns:
      The output of the final dense layer (one sigmoid unit, or a softmax
      over the classes, depending on how the model was constructed).
    """
    x = self.embedding(inputs)

    x_1 = self.pool_1(self.bigram(x))
    x_2 = self.pool_2(self.trigram(x))
    # Each branch uses its own pooling layer (the fourgram branch uses pool_3).
    x_3 = self.pool_3(self.fourgram(x))

    merged = tf.concat([x_1,x_2,x_3], axis = 1)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training=training)
    output = self.last_dense(merged)

    return output




# Stage 4: Application

## Config

In [None]:
# Vocabulary size is taken from the fitted subword tokenizer.
VOCAB_SIZE = tokenizer.vocab_size

# Model hyperparameters.
EMB_DIM, NB_FILTERS, FFN_UNITS = 200, 100, 256
NB_CLASSES = len(np.unique(train_labels))  # number of distinct labels
DROPOUT_RATE = 0.2

# Training hyperparameters.
BATCH_SIZE, NB_EPOCHS = 32, 5

## Training

In [None]:
# Instantiate the model with the hyperparameters from the config cell.
DCnn = DCNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMB_DIM,
            nb_filters = NB_FILTERS,
            FFN_units = FFN_UNITS,
            nb_classes= NB_CLASSES,
            dropout_rate =DROPOUT_RATE)

In [None]:
# Pick the loss/metric pair that matches the model's output layer:
# binary cross-entropy for the 2-class case, sparse categorical
# cross-entropy (integer labels) otherwise.
if NB_CLASSES ==2:
  DCnn.compile(loss="binary_crossentropy",
               optimizer = "adam",
               metrics =["accuracy"])
else:
  # The metric identifier must be spelled "sparse_categorical_accuracy";
  # a misspelled name is not a known Keras metric.
  DCnn.compile(loss="sparse_categorical_crossentropy",
               optimizer = "adam",
               metrics =["sparse_categorical_accuracy"])


In [None]:
# Store checkpoints on the mounted Drive. Drive is mounted at /content/drive,
# so the path must be absolute; a relative ".drive/..." path would resolve to
# a hidden directory under the current working directory instead of Drive.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/ckpt"

ckpt = tf.train.Checkpoint(DCnn= DCnn)
# max_to_keep=1: only the most recent checkpoint is retained.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint when one exists in checkpoint_path.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest Checkpoint restored")

In [None]:
# Train on the training split, then write a checkpoint through the manager.
# NOTE(review): no validation data is passed to fit, so the held-out
# test_inputs/test_labels are not used here.
DCnn.fit(train_inputs, train_labels, batch_size= BATCH_SIZE, epochs = NB_EPOCHS)
ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'.drive/MyDrive/Colab Notebooks/ckpt/ckpt-1'

## Evaluation

In [None]:
# Score a hand-written positive sentence. The text is encoded directly
# (no clean_tweet, no padding to MAX_LEN) and run with dropout disabled.
DCnn(np.array([tokenizer.encode("great job. keep it up")]), training=False).numpy()

array([[0.9969422]], dtype=float32)

In [None]:
# Score a hand-written negative sentence. The text is encoded directly
# (no clean_tweet, no padding to MAX_LEN) and run with dropout disabled.
DCnn(np.array([tokenizer.encode("I am not happy with your performance.")]), training=False).numpy()

array([[0.02016165]], dtype=float32)