# Comment sentiment analysis

## 1) Import dependencies

In [164]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [165]:
df = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')

## 2)Know the data

In [168]:
df[(df['severe_toxic'] == 1) & (df['toxic'] == 1)]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
181,006e87872c8b370c,you are a stupid fuck \n\nand your mother's cu...,1,1,1,0,1,0
442,01208d2b76624130,Hi \n\nIm a fucking bitch.\n\n50.180.208.181,1,1,1,0,1,0
579,018663f910e0bfe6,What a motherfucking piece of crap those fuckh...,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...
159096,f871b05d4caa6f20,"You filthy, stinking crow! \n\nI am back! Dirt...",1,1,1,0,1,0
159099,f885a3e2d779342e,now\n\nyou fucking pathetic moron. \n\ncc: Ji...,1,1,1,0,1,0
159281,fb726deec64157bd,LoL!! \n\nyou're GAY!! you will never know how...,1,1,1,0,1,1
159312,fbf20e312cd4a78d,"Walter Mercado \n\nAntonio, quite frankly, you...",1,1,1,0,1,0


Observe that when a comment is severe toxic, it is also toxic, to simplify our model we will just remove the severely toxic column. This decision was made because there is no clear boundary of differences between the two categories: toxic and severe_toxic

In [169]:
df = df.drop(columns=['severe_toxic'])

In [170]:
df.head()

Unnamed: 0,id,comment_text,toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0


## 3) Preprocess the data

In [171]:
from tensorflow.keras.layers import TextVectorization

In [172]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [173]:
MAX_FEATURES = 200000

In [174]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [175]:
vectorizer.adapt(X.values)

In [152]:
vectorized_text = vectorizer(X.values)

## 4) Fix imbalance in data by SMOTE

In [176]:
from collections import Counter
print('toxic',sorted(Counter(df['toxic']).items()))
print('obscene',sorted(Counter(df['obscene']).items()))
print('threat',sorted(Counter(df['threat']).items()))
print('insult',sorted(Counter(df['insult']).items()))
print('identity_hate',sorted(Counter(df['identity_hate']).items()))

toxic [(0, 144277), (1, 15294)]
obscene [(0, 151122), (1, 8449)]
threat [(0, 159093), (1, 478)]
insult [(0, 151694), (1, 7877)]
identity_hate [(0, 158166), (1, 1405)]


We can see that many classes are imbalanced, if we do not fix this, our model will be unable to identify true positives or false negatives.

In [154]:
from imblearn.over_sampling import SMOTE

In [155]:
labels = ['toxic', 'obscene', 'threat', 'insult', 'identity_hate']
oversampled_data = {}
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [156]:
for label in labels:
    y = df[label].values  

    # SMOTE 
    X_resampled, y_resampled = smote.fit_resample(vectorized_text, y)

    # oversampled
    oversampled_data[label] = (X_resampled, y_resampled)

    unique, counts = np.unique(y_resampled, return_counts=True)
    class_counts = dict(zip(unique, counts))
    print(f'label : {label}')
    print('0 =>', class_counts[0])
    print('1 =>', class_counts[1])

label : toxic
0 => 144277
1 => 144277
label : obscene
0 => 151122
1 => 151122
label : threat
0 => 159093
1 => 159093
label : insult
0 => 151694
1 => 151694
label : identity_hate
0 => 158166
1 => 158166


## 5) Sequential model

In [157]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint

In [158]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES + 1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(1, activation='sigmoid'))

In [159]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'Precision', 'Recall', 'AUC']
)

## 6) Train each category 

In [160]:
result = {}

In [None]:
for label in oversampled_data:
    
    label_X = oversampled_data[label][0]
    label_y = oversampled_data[label][1]

    # Determine dataset size
    dataset_size = len(label_X)

   # Create tf.data.Dataset
    dataset = tf.data.Dataset.from_tensor_slices((label_X, label_y)).cache().shuffle(dataset_size)

    # Split dataset into training, validation, and test sets
    train_size = int(dataset_size * 0.7)
    val_size = int(dataset_size * 0.2)

   # Split and batch datasets
    train_dataset = dataset.take(train_size).batch(24).prefetch(8)
    val_dataset = dataset.skip(train_size).take(val_size).batch(24).prefetch(8)
    test_dataset = dataset.skip(train_size + val_size).batch(24).prefetch(8)

    # Initialize ModelCheckpoint callback for this label
    checkpoint_callback = ModelCheckpoint(
        filepath=f'models/model_{label}_{{epoch}}.keras',  # Unique file for each label
        save_best_only=False,
        save_weights_only=False,
        verbose=1
    )
    
    # Train the model
    history = model.fit(
        train_dataset,
        epochs=4,
        validation_data=val_dataset,
        callbacks=[checkpoint_callback]
    )

    # Evaluate the model and unpack all returned values
    evaluation_results = model.evaluate(test_dataset, verbose=1)
    # Unpack each metric
    test_loss, test_accuracy, test_precision, test_recall, test_auc = evaluation_results

    result[label] = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_precision" : test_precision,
        "test_recall" : test_recall,
        "test_auc" : test_auc
    }
    

In [185]:
# Printing overall results
results_data = []

# Collect the results in a list of dictionaries
for label, metrics in result.items():
    results_data.append({
        "Label": label,
        "Loss": f"{metrics['test_loss']*100:.2f}%",
        "Accuracy": f"{metrics['test_accuracy']*100:.2f}%",
        "Precision": f"{metrics['test_precision']*100:.2f}%",
        "Recall": f"{metrics['test_recall']*100:.2f}%",
    })

# Create a DataFrame to display the results as a table
df = pd.DataFrame(results_data)

print("Test Results for All Labels:")
print(df)

Test Results for All Labels:
           Label   Loss Accuracy Precision  Recall
0          toxic  5.37%   98.27%    98.19%  98.34%
1        obscene  2.42%   99.20%    99.24%  99.17%
2         threat  0.84%   99.76%    99.91%  99.61%
3         insult  1.72%   99.37%    99.44%  99.29%
4  identity_hate  0.64%   99.77%    99.74%  99.80%
