## Problem Statement

Build and implement a multiclass deep neural network that classifies tweets into one of five sentiment classes: Extremely Negative, Negative, Neutral, Positive, or Extremely Positive.

In [None]:
# Importing the data manipulation libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Data Preprocessing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import re
import contractions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Importing the model building libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from wordcloud import WordCloud
# Importing the evaluation libraries
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix

##   **Stage 1**: Data Loading and Exploratory Data Analysis

* Load the Dataset


In [None]:
# Read the tweet dataset; the file is decoded as Latin-1.
# Adjust the path below as needed for your environment.
train_data = pd.read_csv(
    '/content/corona_nlp.csv',
    encoding='latin-1',
)

* Check for Missing Values

In [None]:
# Number of missing entries in every column.
missing_per_column = train_data.isna().sum()
missing_per_column

* Visualize the sentiment column values


In [None]:
# How many tweets carry each sentiment label (most frequent first).
sentiment_counts = train_data['Sentiment'].value_counts()
sentiment_counts

* Visualize the 10 locations with the most tweets using a countplot (Tweet count vs Location)


In [None]:
# Tweet counts for the ten most frequent locations.
top_locations = train_data['Location'].value_counts().head(10)

# Count plot restricted to those ten locations, ordered from most to least
# frequent so the chart lines up with the table displayed below it.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=train_data[train_data['Location'].isin(top_locations.index)],
    y='Location',
    order=top_locations.index,
)
plt.title('Top 10 locations by tweet count')
plt.xlabel('Tweet count')
plt.ylabel('Location')
plt.show()

top_locations

* Plotting Pie Chart for the Sentiments in percentage


In [None]:
# Share of each sentiment label, drawn as a pie chart with whole-number percentages.
sentiment_share = train_data['Sentiment'].value_counts()
sentiment_share.plot(kind='pie', autopct='%1.0f%%')

* WordCloud for the Tweets/Text

    * Visualize the most commonly used words in each sentiment using wordcloud
    * Refer to the following [link](https://medium.com/analytics-vidhya/word-cloud-a-text-visualization-tool-fb7348fbf502) for Word Cloud: A Text Visualization tool




In [None]:
# One word cloud per sentiment label, built from every tweet carrying that label.
for sent in train_data['Sentiment'].unique():
    has_label = train_data['Sentiment'] == sent
    tweets_for_label = train_data.loc[has_label, 'OriginalTweet'].to_list()

    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate(' '.join(tweets_for_label))

    print(f"Word Cloud for Sentiment: {sent}")
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")  # the axes carry no information for an image
    plt.show()

##   **Stage 2**: Data Pre-Processing  
####  Clean and Transform the data into a specified format


In [None]:
# Show complete tweet text when pandas objects are displayed (no truncation).
pd.set_option('display.max_colwidth', None)

# Features are the raw tweets; targets are the sentiment labels.
X, y = train_data['OriginalTweet'], train_data['Sentiment']

In [None]:
# Hold out 20% of the tweets for validation. Stratifying on the sentiment label
# keeps the class proportions the same in both splits, so the validation
# metrics are computed on a class mix that mirrors the training data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Raw tweet text of the held-out test set.
# NOTE(review): `test_data` is not assigned in any cell visible above this one.
# Unless it is created elsewhere, this line raises NameError on a fresh kernel —
# load the test CSV into `test_data` before this cell.
X_test = test_data['OriginalTweet']

In [None]:
plt.rcParams['figure.figsize'] = (15, 10)

# Label frequencies of each split, computed once and reused for sizes and labels.
train_counts = pd.Series(y_train).value_counts()
val_counts = pd.Series(y_val).value_counts()

plt.subplot(1, 3, 1)
plt.pie(train_counts, labels=train_counts.index, autopct='%1.2f%%')
plt.title("TRAIN")

plt.subplot(1, 3, 2)
plt.pie(val_counts, labels=val_counts.index, autopct='%1.2f%%')
plt.title("VALIDATION")

In [None]:
# Map the sentiment strings to integer ids. The encoder is fitted on the
# training labels only and then reused for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# One-hot encode with an explicit width. Without `num_classes`, to_categorical
# sizes each matrix from the largest id present in that array, so a split that
# happens to lack the highest class id would come out narrower than the other.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

In [None]:
def remove_unnecessary_elements(text):
    """Normalise one raw tweet before tokenisation.

    Steps, in order: lowercase; collapse COVID/coronavirus spellings to the
    single token ``covid``; drop URLs and @mentions; strip the ``#`` from
    hashtags (the word itself is kept); turn newlines, tabs and carriage
    returns into spaces; repair or drop a few mis-decoded characters; expand
    contractions with ``contractions.fix``; finally replace every remaining
    character that is neither a word character nor whitespace with a space.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned, lowercased text. Runs of spaces are not collapsed.
    """
    text = text.lower()
    # Longer spellings are listed before the bare "corona"/"covid" alternatives
    # so that a whole variant is replaced in one match.
    text = re.sub(r'(covid[-_]?19|covid2019|covid[-_]?2019|corona[-_]?virus|corona|covid)', 'covid', text)
    text = re.sub(r'http\S+', '', text)  # URLs
    text = re.sub(r'@\w+', '', text)     # @mentions
    text = re.sub(r'#', '', text)        # keep the hashtag word, drop the symbol

    text = re.sub(r'[\n\t\r]', ' ', text)
    # Mis-decoded apostrophes. Regex alternation is ordered, so the two-character
    # form has to come first; with the single character listed first the longer
    # alternative could never match and its second character was left behind.
    text = re.sub(r'â’|â', "'", text)
    # A non-breaking space separates words, so it becomes a normal space rather
    # than being deleted (deleting it would fuse the neighbouring words).
    text = re.sub(r'\xa0', ' ', text)
    text = re.sub(r'\x92|\x85|\x95', '', text)
    text = contractions.fix(text)
    text = re.sub(r'[^\w\s]', ' ', text)

    return text

# Clean every split with the same function; each result is a NumPy array of strings.
X_train = np.array(list(map(remove_unnecessary_elements, X_train)))
X_val = np.array(list(map(remove_unnecessary_elements, X_val)))
X_test = np.array(list(map(remove_unnecessary_elements, X_test)))

##   **Stage 3**: Build the Word Embeddings using pretrained Word2Vec/GloVe (Text Representation)


In [None]:
# Fit the tokenizer on the training text only, with no vocabulary cap and no
# character filtering, so every distinct training token receives an index.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(X_train)

# One more than the number of indexed words: index 0 is reserved.
word_counts = 1 + len(tokenizer.word_index)

print("Numbers of unique words present:", word_counts)

In [None]:
# Integer-encode the training tweets.
train_sequences = tokenizer.texts_to_sequences(X_train)

# Every sequence is padded at the end up to the longest training tweet.
maxlen = max(map(len, train_sequences))
print("Maximum length of all sequences:", maxlen)

padded_train_sequences = pad_sequences(
    train_sequences,
    padding='post',
    maxlen=maxlen,
)
print("Padded TRAINING Sequences Shape:", padded_train_sequences.shape)

In [None]:
# Position of the longest encoded training tweet.
sequence_lengths = list(map(len, train_sequences))
longest_sequence_index = np.argmax(sequence_lengths)

# Keep both the cleaned text and the encoded form of that tweet.
longest_sentence = X_train[longest_sequence_index]
longest_sequence = train_sequences[longest_sequence_index]

print(longest_sentence)

In [None]:
# Validation set: encode with the training-fitted tokenizer and pad to the
# same length as the training sequences.
val_sequences = tokenizer.texts_to_sequences(X_val)
padded_val_sequences = pad_sequences(
    val_sequences,
    padding='post',
    maxlen=maxlen,
)

print("Padded VALIDATION Sequences Shape:", padded_val_sequences.shape)

In [None]:
# Test set: encode with the training-fitted tokenizer and pad to the same
# length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(X_test)
padded_test_sequences = pad_sequences(
    test_sequences,
    padding='post',
    maxlen=maxlen,
)

print("Padded TEST Sequences Shape:", padded_test_sequences.shape)

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a text file of pre-trained word vectors.

    Each non-blank line of the file is expected to hold a word followed by its
    vector components, separated by whitespace. Blank lines are skipped.

    Args:
        filepath: Path to the vectors file; it is read as UTF-8.
        word_index: Mapping from word to its integer row index.
        embedding_dim: Number of vector components to keep per word.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. The row for
        each word found in the file holds the first ``embedding_dim``
        components of its vector; every other row stays all zeros.
    """
    vocab_size = len(word_index) + 1  # one extra row for the reserved index 0
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which differs between operating systems and can fail on non-ASCII tokens.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # blank line: nothing to unpack
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Width of the word vectors; matches the 100-dimensional GloVe file loaded below.
embedding_dim = 100
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [None]:
# A row counts as covered when it has at least one non-zero component.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))

# Divide by the number of real words. `word_counts` also includes the reserved
# index 0, which is not a word, so using it understates the coverage slightly.
vocab_words = max(len(tokenizer.word_index), 1)  # avoid dividing by zero
print(f"Percent of vocabulary covered: {round(nonzero_elements/vocab_words*100, 2)}%")

In [None]:
# Words whose embedding row is entirely zero, i.e. no pre-trained vector was found.
not_covered_words = [
    word
    for word, idx in tokenizer.word_index.items()
    if not embedding_matrix[idx].any()
]

# Report how many there are and show the first fifty.
print(f"Total uncovered words: {len(not_covered_words)}")
print("Sample of uncovered words:", not_covered_words[:50])

In [None]:
# Training features (padded index sequences) followed by the one-hot training targets.
print(padded_train_sequences, y_train, sep='\n')

##   **Stage 4**: Build model

In [None]:
model = Sequential([
    # Embedding layer initialised from the pre-trained matrix and left trainable.
    Embedding(word_counts,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=True),
    SpatialDropout1D(0.3),

    # First recurrent block: emits the full sequence for the next LSTM.
    Bidirectional(LSTM(units=64, return_sequences=True)),
    BatchNormalization(),
    Dropout(0.25),

    # Second recurrent block: emits only the final output per tweet.
    Bidirectional(LSTM(units=32, return_sequences=False)),
    BatchNormalization(),
    Dropout(0.2),

    # Two L2-regularised dense blocks.
    Dense(24, activation='relu', kernel_regularizer=l2(0.05)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(16, activation='relu', kernel_regularizer=l2(0.05)),
    BatchNormalization(),
    Dropout(0.2),

    # One softmax output per sentiment class.
    Dense(5, activation='softmax'),
])

In [None]:
# Build with an unspecified batch dimension. Passing the whole training array's
# shape would declare the number of training rows as the model's batch size.
model.build((None, padded_train_sequences.shape[1]))

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Draw the model as a graph, including tensor shapes and layer names, at 90 dpi.
plot_model(model, show_shapes=True, show_layer_names=True, dpi=90)

In [None]:
# Recover integer class ids from the one-hot encoded training targets.
y_train_original = y_train.argmax(axis=1)

# Per-class weights from scikit-learn's 'balanced' heuristic, keyed by class id
# in the dict form that `fit(class_weight=...)` takes.
class_ids = np.unique(y_train_original)
balanced_weights = compute_class_weight('balanced', classes=class_ids, y=y_train_original)
class_weights = dict(enumerate(balanced_weights))

# Upper bound on epochs; early stopping below normally ends training sooner.
num_epochs = 200

# Multiply the learning rate by 0.2 after 3 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
                              min_lr=1e-6, verbose=1)

# Save the model to disk whenever validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy',
                             save_best_only=True, mode='max', verbose=1)

# Stop training after 10 epochs without val_loss improvement.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
)

In [None]:
# Adam with a small learning rate and gradient-norm clipping; the targets are
# one-hot encoded, hence categorical cross-entropy.
optimizer = Adam(learning_rate=0.0002, clipnorm=1.0)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with class weighting, evaluating on the validation split after each
# epoch; the callbacks adjust the learning rate, stop early and checkpoint.
history = model.fit(
    x=padded_train_sequences,
    y=y_train,
    batch_size=32,
    epochs=num_epochs,
    verbose=1,
    callbacks=[reduce_lr, early_stop, checkpoint],
    validation_data=(padded_val_sequences, y_val),
    class_weight=class_weights,
)