<a href="https://colab.research.google.com/github/ahussain-ai/Sentiment-Analysis-Using-Incremental-Learning/blob/master/tf_sentiment_analysis_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import subprocess

In [2]:
# Shell steps that install the Kaggle CLI, register the API token, download the
# Amazon reviews archive and unpack both splits into ./dataset.
# The steps depend on each other, so they are run in order and the first
# failure aborts the cell instead of letting later steps run on missing files.
# `mkdir -p` and `rm -f` keep the directory/cleanup steps safe to re-run.
commands = [
    'pip install kaggle',
    'mkdir -p ~/.kaggle',
    'cp kaggle.json ~/.kaggle/',
    'chmod 600 ~/.kaggle/kaggle.json',
    'kaggle datasets download -d bittlingmayer/amazonreviews',
    'unzip amazonreviews.zip',
    'mkdir -p dataset',
    'bunzip2 -c /content/test.ft.txt.bz2 > /content/dataset/test.ft.txt',
    'bunzip2 -c /content/train.ft.txt.bz2 > /content/dataset/train.ft.txt',
    'rm -f test.ft.txt.bz2',
    'rm -f train.ft.txt.bz2',
    'rm -f amazonreviews.zip'
]

# Execute each command; check=True raises CalledProcessError on a non-zero exit
# status so a failed step is reported where it happened.
# NOTE(review): the relative paths above only line up with the absolute
# /content/... paths when the working directory is /content - confirm.
for cmd in commands:
    subprocess.run(cmd, shell=True, check=True)


In [3]:

import os
import numpy as np
import tensorflow as tf
import re

In [4]:
# Path of the training split passed to TextLineDataset below.
# Despite the name, this is a file path, not a directory.
dataset_dir = '/content/dataset/train.ft.txt'

**1. Preprocess Text**

In [5]:
def preprocess_line(line):
    """Split one ``__label__<n> <text>`` line into ``(text, label)``.

    Args:
        line: scalar string tensor holding a single raw line of the dataset.

    Returns:
        A ``(text, label)`` pair: ``text`` is the review text with surrounding
        whitespace stripped, ``label`` is an int32 tensor equal to 1 when the
        numeric label is 2 and 0 otherwise (presumably 2 marks the positive
        class - confirm against the dataset description).
    """
    # Capture the digits directly, so no second split on '__label__' is needed.
    pattern = r'^__label__(\d+)\s+(.*)$'

    # The pattern is anchored on both ends, so each replace rewrites the whole
    # line to the content of one capture group.
    label = tf.strings.regex_replace(line, pattern, r'\1')
    text = tf.strings.regex_replace(line, pattern, r'\2')

    # Remove leading and trailing spaces from text
    text = tf.strings.strip(text)

    # Parse the numeric label and binarise it: 2 -> 1, anything else -> 0.
    label = tf.strings.to_number(label, out_type=tf.int32)
    label = tf.cast(tf.equal(label, 2), tf.int32)

    return text, label

In [6]:

# Read the training file line by line and parse every line into (text, label),
# letting tf.data pick the degree of parallelism for the parsing step.
dataset = (
    tf.data.TextLineDataset(dataset_dir)
    .map(preprocess_line, num_parallel_calls=tf.data.AUTOTUNE)
)


In [7]:
# Same parsing pipeline as the training split, applied to the test file.
test_dataset = (
    tf.data.TextLineDataset('/content/dataset/test.ft.txt')
    .map(preprocess_line, num_parallel_calls=tf.data.AUTOTUNE)
)

In [8]:
# Keep only the leading records of each split.
TRAIN_SAMPLES = 200000
TEST_SAMPLES = 100000

dataset = dataset.take(TRAIN_SAMPLES)
test_dataset = test_dataset.take(TEST_SAMPLES)

**2. Vectorize and Tokenize**

In [9]:
# Text-to-integer layer: vocabulary capped at 20000 tokens, every sequence
# padded or truncated to 128 ids.
tokenizer = tf.keras.layers.TextVectorization(max_tokens=20000, output_sequence_length=128)
# Learn the vocabulary from the training texts only (labels are dropped).
tokenizer.adapt(dataset.map(lambda text, label: text))

# Not read by any code in this cell; kept so the module-level name still exists.
max_length = 0


def vectorize_text(text, label):
    """Convert a raw review string into token ids inside the Dataset pipeline.

    Args:
        text: string tensor with the review text.
        label: label tensor, passed through unchanged.

    Returns:
        ``(token_ids, label)`` where ``token_ids`` is ``tokenizer(text)``.
    """
    # Tokenize once; the previous version called the tokenizer twice and
    # discarded the first result.
    return tokenizer(text), label


vectorized_ds = dataset.map(vectorize_text)

In [10]:
# Vectorize the test data with the same function used for the training split.
test_vectorized_ds = test_dataset.map(vectorize_text)

In [11]:
# for text, label in vectorized_ds.take(7):
#     print(f"text : {text.numpy()}")
#     print(f"label : {label.numpy()}")

In [12]:
import gensim.downloader as api

In [13]:
#check the list of models available
info = api.info()
for model_name, model_info in sorted(info['models'].items()):
    print(
        "%s (%d records): %s" % (
            model_name,
            model_info.get('num_records', -1),
            model_info['description'][:40] +"...",
        )
    )


__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

**Calculate embeddings of words in vocabulary**

In [14]:
# Load the 'glove-twitter-50' vectors through the gensim downloader.
# The embedding matrix built below assumes these vectors have 50 dimensions.
glove_model = api.load('glove-twitter-50')



In [15]:
#find embeddings using glove50
embedding_vector = np.zeros((len(tokenizer.get_vocabulary()), 50))
dummy_embedding = np.zeros((50))
for word in tokenizer.get_vocabulary():
    if word in glove_model:
        embedding_vector[tokenizer.get_vocabulary().index(word)] = glove_model[word]
    else :
        embedding_vector[tokenizer.get_vocabulary().index(word)] = dummy_embedding


In [16]:
# Check which rows are all zeros
zero_rows = np.all(embedding_vector == 0, axis=1)
# Count how many rows are all zeros
num_zero_rows = np.sum(zero_rows)

print(f"There are {num_zero_rows} rows that are all zeros.")

There are 994 rows that are all zeros.


In [17]:
# Positions of the embedding rows that are entirely zero.
all_zero_mask = (embedding_vector == 0).all(axis=1)
zero_rows_indices = np.where(all_zero_mask)[0]

# Only the header is printed; the index listing itself is commented out.
print("Indices of rows that are all zeros:")
# print(zero_rows_indices)

Indices of rows that are all zeros:


**3. Create Tf Dataset**

In [18]:
# Shuffle, batch, and prefetch for performance
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 20000

train_ds = vectorized_ds.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE).repeat()
test_ds = test_vectorized_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

**4. Define Model**

In [19]:
# Pull one batch from each pipeline and report the tensor shapes.
batch = next(iter(train_ds))
text_batch, label_batch = batch

test_batch, label = next(iter(test_ds))

# Training batch first, then the test batch, using the same two-line report.
for texts, labels in ((text_batch, label_batch), (test_batch, label)):
    print("Shape of text batch:", texts.shape)
    print("Shape of label batch:", labels.shape)

Shape of text batch: (64, 128)
Shape of label batch: (64,)
Shape of text batch: (64, 128)
Shape of label batch: (64,)


In [20]:
def ann_model(tokenizer) :
    """Build and compile the averaged-embedding feed-forward classifier.

    Args:
        tokenizer: adapted TextVectorization layer; only the length of its
            vocabulary is used, as the embedding's ``input_dim``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output.

    Reads the module-level ``embedding_vector`` matrix for the initial
    embedding weights; its row count must match the vocabulary size.
    """
    model = tf.keras.Sequential([
        # Seed the embedding through a Constant initializer. The legacy
        # `weights=[...]` constructor argument is not accepted by newer Keras
        # releases, while the initializer form works on both.
        tf.keras.layers.Embedding(
            input_dim=len(tokenizer.get_vocabulary()),
            output_dim=50,
            mask_zero=True,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_vector),
        ),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.004),
                loss='binary_crossentropy',
                metrics=['accuracy'])

    return model

In [21]:
def cnn_model(tokenizer) :
    """Build and compile the 1-D convolutional classifier.

    Args:
        tokenizer: adapted TextVectorization layer; only the length of its
            vocabulary is used, as the embedding's ``input_dim``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output.

    Reads the module-level ``embedding_vector`` matrix for the initial
    embedding weights; its row count must match the vocabulary size.
    """
    model = tf.keras.Sequential([
        # Seed the embedding through a Constant initializer. The legacy
        # `weights=[...]` constructor argument is not accepted by newer Keras
        # releases, while the initializer form works on both.
        # NOTE(review): the mask produced by mask_zero=True is probably not
        # consumed by Conv1D / GlobalMaxPooling1D - confirm whether it is needed.
        tf.keras.layers.Embedding(
            input_dim=len(tokenizer.get_vocabulary()),
            output_dim=50,
            mask_zero=True,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_vector),
        ),
        tf.keras.layers.Conv1D(16, 7, activation='relu', kernel_regularizer='l2'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(0.5),
        # NOTE(review): no activation here, unlike the hidden Dense layers of
        # the other models - left unchanged so results stay comparable; confirm
        # whether a 'relu' was intended.
        tf.keras.layers.Dense(8),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001),
                loss='binary_crossentropy',
                metrics=['accuracy'])

    return model

**5. Training**

In [22]:

# Train the ann model
# Build the dense baseline, show its layers, then fit it.
# train_ds repeats forever, so steps_per_epoch decides where each epoch ends.
model = ann_model(tokenizer)
model.summary()
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=5,
    steps_per_epoch=3000,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          1000000   
                                                                 
 global_average_pooling1d (  (None, 50)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               6528      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1006657 (3.84 MB)
Trainable params: 1006657 (3.84 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
# Build the convolutional model and print its layer summary.
cnn = cnn_model(tokenizer)
cnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 50)          1000000   
                                                                 
 conv1d_1 (Conv1D)           (None, None, 16)          5616      
                                                                 
 global_max_pooling1d_1 (Gl  (None, 16)                0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_4 (Dense)             (None, 8)                 136       
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                      

In [25]:
# Fit the CNN; this rebinds `history`, replacing the value stored by the
# earlier model.fit(...) call.
history = cnn.fit(
    train_ds,
    validation_data=test_ds,
    epochs=10,
    steps_per_epoch=3000,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Evaluate on the test pipeline; with the compile settings used in cnn_model
# the displayed result is [loss, accuracy].
cnn.evaluate(test_ds)



[0.33138883113861084, 0.8881000280380249]

In [29]:
# Free-form review used as a manual smoke test for the trained CNN.
sample = """It's very useful product for me.I use it on daily basis.Till now this has been my favourite product which I have bought online.Blades
r nice n sharp..but after few months string causes problem otherwise it's very handy in kitchen cutting chores"""

# Convert the raw string to token ids with the same layer used for training.
tokenized_text = tokenizer(sample)
# print(tokenized_text)

In [30]:
# Add a leading batch axis so the single sample has the shape predict() expects,
# then show the sigmoid output.
sample_batch = tf.expand_dims(tokenized_text, axis=0)
prediction = cnn.predict(sample_batch)
print(prediction)

[[0.96174]]


**RNN for classification**

In [32]:
def rnn_model(tokenizer) :
    """Build and compile the SimpleRNN classifier.

    Args:
        tokenizer: adapted TextVectorization layer; only the length of its
            vocabulary is used, as the embedding's ``input_dim``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output.

    Reads the module-level ``embedding_vector`` matrix for the initial
    embedding weights; its row count must match the vocabulary size.
    """
    model = tf.keras.models.Sequential([
        # Seed the embedding through a Constant initializer. The legacy
        # `weights=[...]` constructor argument is not accepted by newer Keras
        # releases, while the initializer form works on both.
        tf.keras.layers.Embedding(
            input_dim=len(tokenizer.get_vocabulary()),
            output_dim=50,
            mask_zero=True,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_vector),
        ),
        tf.keras.layers.SimpleRNN(64),  # Simple RNN layer with 64 units
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0009),loss='binary_crossentropy',metrics=['accuracy'])

    return model

In [33]:
# `rnn_model` names the builder function until this cell rebinds it to the
# built model. Stash the builder first, so running the cell a second time does
# not try to call the model object as if it were the builder.
if not isinstance(rnn_model, tf.keras.Model):
    build_rnn_model = rnn_model
rnn_model = build_rnn_model(tokenizer)
rnn_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 50)          1000000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                7360      
                                                                 
 dense_6 (Dense)             (None, 16)                1040      
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1008417 (3.85 MB)
Trainable params: 1008417 (3.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Keep the returned History object under its own name so the RNN's per-epoch
# metrics stay available for inspection or plotting after training.
rnn_history = rnn_model.fit(train_ds, epochs=20, steps_per_epoch=3000, validation_data=test_ds)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

In [None]:
import warnings
# Suppress every Python warning from this point on.
# NOTE(review): this is a blanket filter and also hides deprecation warnings
# from the libraries used below - consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the old names, so prefer the new name when it is
# available and fall back to the old one on older installations.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)

In [None]:


def plot_history(history):
    """Draw training and validation curves for loss and accuracy side by side.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            per-epoch series 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    metrics = history.history

    # Look up all four series before any figure is created, so a missing key
    # fails before anything is drawn.
    panels = [
        ('Loss', metrics['loss'], metrics['val_loss']),
        ('Accuracy', metrics['accuracy'], metrics['val_accuracy']),
    ]

    # One subplot per metric: loss on the left, accuracy on the right.
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    for ax, (name, train_values, val_values) in zip(axes, panels):
        ax.plot(train_values, label=f'Training {name}')
        ax.plot(val_values, label=f'Validation {name}')
        ax.set_title(f'{name} Plot')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(name)
        ax.legend()

    # Show the plot
    plt.tight_layout()
    plt.show()

# NOTE(review): `history` is whatever was last bound to that name. In the cells
# shown here that is the return value of `cnn.fit(...)`; the later
# `rnn_model.fit(...)` call does not store its result in `history`, so these
# curves are the CNN's, not the RNN's.
plot_history(history)