# NLP Sentiment Analysis

- **Author:** Sakthi Santhosh
- **Created on:** 04/02/2023

## To-Do

- Integrate the `TextVectorization` layer into the model.

## Importing Modules

In [None]:
import os

from google.colab import drive
from matplotlib import font_manager, pyplot, rcParams
from numpy import expand_dims
import pandas

from tensorflow.data import Dataset
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    LSTM,
    TextVectorization
)
from tensorflow.keras.metrics import (
    BinaryAccuracy,
    Precision,
    Recall,
    CategoricalAccuracy
)
from tensorflow.keras.models import load_model, Sequential

## Global Declarations

In [None]:
# Vocabulary cap handed to the TextVectorization layer; the Embedding layer
# is sized from it as well.
MAX_TOKENS = 300_000

# Google Drive folder (under the Colab mount point) that holds the dataset,
# the plot font, and the exported model.
DATASET_LOCATION = "/content/gdrive/MyDrive/Sharing/Programming/python/ai/sentiment_analysis/"

## Downloading Datasets

In [None]:
# Mount Google Drive inside the Colab runtime; force_remount=True re-mounts
# even when a previous mount is still present.
mount_point = "/content/gdrive/"
drive.mount(mount_point, force_remount=True)

## Load CSV File to Pandas Dataframe

In [None]:
# Read the dataset file from the Drive folder into a dataframe.
dataset_path = DATASET_LOCATION + "sentiment_analyser_dataset"
training_dataframe = pandas.read_csv(dataset_path)

## Preprocess the Data

In [None]:
# Separate the raw comment text from the label matrix. Every column from
# index 2 onwards is treated as a label column.
training_data = {
    "comments": training_dataframe["comment_text"],
    "classifications": training_dataframe[
        training_dataframe.columns[2:]
    ].values
}

# Learn the vocabulary from the comments, then map every comment to a
# fixed-length (1500) sequence of integer token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=1500,
    output_mode="int"
)
vectorizer.adapt(training_data["comments"].values)
vectorized_text = vectorizer(training_data["comments"].values)

# reshuffle_each_iteration=False is essential here: the splits below are
# carved out with take()/skip(), so the element order must be the same every
# time the pipeline is iterated. With the default (True) the data would be
# reshuffled on every epoch and validation/testing batches would leak into
# the training split.
dataset = (
    Dataset.from_tensor_slices(
        (vectorized_text, training_data["classifications"])
    )
    .cache()
    .shuffle(160000, reshuffle_each_iteration=False)
    .batch(16)
    .prefetch(8)
)
dataset_length = len(dataset)  # number of batches, not samples

# Split boundaries, expressed in batches: 70% training, the next 10%
# testing, and the final 20% validation.
training_batches = int(dataset_length * 0.7)
testing_batches = int(dataset_length * 0.1)
validation_start = int(dataset_length * 0.8)
validation_batches = int(dataset_length * 0.2)

partitioned_dataset = {
    "training": dataset.take(training_batches),
    "testing": dataset.skip(training_batches).take(testing_batches),
    "validation": dataset.skip(validation_start).take(validation_batches)
}

## Sequential Model Generation

In [None]:
# Layer stack: token embedding -> bidirectional LSTM -> three ReLU dense
# layers -> six independent sigmoid outputs (one per label column).
model = Sequential([
    Embedding(MAX_TOKENS + 1, 32),
    Bidirectional(LSTM(32, activation="tanh")),
    Dense(128, activation="relu"),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(6, activation="sigmoid"),
])

# Binary cross-entropy treats each sigmoid output as its own yes/no decision.
model.compile(optimizer="Adam", loss="BinaryCrossentropy")

## Train the Model

In [None]:
# Fit for ten epochs on the training split, evaluating on the validation
# split after each epoch. The returned History object is kept for plotting.
training_split = partitioned_dataset["training"]
validation_split = partitioned_dataset["validation"]

history = model.fit(
    training_split,
    validation_data=validation_split,
    epochs=10,
    verbose=None
)

## Analysis of Epochs

In [None]:
# Register the bundled font file with matplotlib and make it the default.
font_manager.fontManager.addfont(DATASET_LOCATION + "FreeMonoBold.ttf")
rcParams["font.family"] = "FreeMono"

# DataFrame.plot() creates its own figure, so the size has to be passed to
# it directly. A separate pyplot.figure(figsize=...) call would only leave
# an empty extra figure behind while the real plot kept the default size.
pandas.DataFrame(history.history).plot(figsize=(10, 10))
pyplot.title("Loss Metrics")
pyplot.xlabel("Epochs")
pyplot.ylabel("Loss")
pyplot.show()

## Testing the Model

In [None]:
# Streaming metrics accumulated over the testing split.
# The model emits six independent sigmoid outputs (multi-label), so accuracy
# must be measured per label with BinaryAccuracy. CategoricalAccuracy compares
# argmax positions and is meaningless for this kind of output.
metrics_handle = {
    "precision": Precision(),
    "recall": Recall(),
    "accuracy": BinaryAccuracy()
}

for x, y in partitioned_dataset["testing"].as_numpy_iterator():
    y_res = model.predict(x, verbose=None)

    # Labels and predictions keep their (batch, labels) layout; each metric
    # thresholds the predictions and compares them element-wise.
    for metric in metrics_handle.values():
        metric.update_state(y, y_res)

print("\033[30;01mMetrics\033[00m")
for name, metric in metrics_handle.items():
    print(f"  {name}: {metric.result().numpy()}")

## Export the Model

In [None]:
# Write the trained model to the Drive folder as an HDF5 file.
model_path = DATASET_LOCATION + "sentiment_analyser.h5"
model.save(model_path)

## Predict Text

In [None]:
# Turn one sample comment into token ids with the adapted vectorizer.
sample_comment = "I will kill you!"
input_text = vectorizer(sample_comment)

### From Live Model

In [None]:
# Add a leading batch axis so the single sample forms a batch of one, then
# run it through the in-memory model.
batched_input = expand_dims(input_text, 0)
result = model.predict(batched_input, verbose=None)

### From Saved Model

In [None]:
# Reload the exported model from Drive (replacing the in-memory one) and
# score the same sample as a batch of one.
saved_model_path = DATASET_LOCATION + "sentiment_analyser.h5"
model = load_model(saved_model_path)

batched_input = expand_dims(input_text, 0)
result = model.predict(batched_input, verbose=None)

### Print Results

In [None]:
# Label names in the order the scores are read from the model output.
label_names = (
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
)

print("\033[30;01mPrediction Results\033[00m")
# result[0] holds the scores of the single sample that was predicted.
for index, label in enumerate(label_names):
    print(f"  {label}:", result[0][index])