In [18]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import re 

In [5]:
df = pd.read_csv(os.path.join('r', 'finaldataset.csv'))

In [6]:
X = df['comment_text'].astype(str) 
y = df[df.columns[1:]].values

In [7]:
def remove_emojis(text):
    if isinstance(text, str):  # Check if text is a string
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002500-\U00002BEF"  # Chinese characters
                                   u"\U00002702-\U000027B0"
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u200d"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\u3030"
                                   u"\ufe0f"
                                   "]+", flags=re.UNICODE)
        text = emoji_pattern.sub('', text)
  


    # if isinstance(text, str):  # Check if text is a string
    #     text = demoji.replace_with_desc(text)
    return text

df['comment_text'] = df['comment_text'].apply(remove_emojis)
# Remove the column with empty 'comment_text' values



df = df.dropna(subset=['comment_text'])



column_name = 'comment_text'
columns_to_convert = ['toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Remove commas from the specified column
df[column_name] = df[column_name].str.replace(',', '').str.lower()

df[columns_to_convert] = df[columns_to_convert].fillna(0).astype(int)

In [21]:
MAX_FEATURES = 20000
SEQUENCE_LENGTH = 300
BATCH_SIZE = 30
EPOCHS = 15

In [9]:
# Step 1: Tokenize the text and create a vocabulary
word_counts = {}
for sentence in X.values:
    words = sentence.lower().split()
    for word in words:
        if word not in word_counts:
            word_counts[word] = 1
        else:
            word_counts[word] += 1

# Sort the words by frequency and keep the most frequent ones
sorted_words = sorted(word_counts, key=lambda x: word_counts[x], reverse=True)
sorted_words = sorted_words[:MAX_FEATURES]

# Create a word-to-index mapping
word_to_index = {word: index + 1 for index, word in enumerate(sorted_words)}

# Step 2: Convert the text to sequences of integers
sequences = []
for sentence in  X.values:
    words = sentence.lower().split()
    sequence = []
    for word in words:
        if word in word_to_index:
            sequence.append(word_to_index[word])
    sequences.append(sequence)

# Step 3: Pad sequences to a fixed length
padded_sequences = []
for sequence in sequences:
    if len(sequence) > SEQUENCE_LENGTH:
        sequence = sequence[:SEQUENCE_LENGTH]
    else:
        sequence = sequence + [0] * (SEQUENCE_LENGTH - len(sequence))
    padded_sequences.append(sequence)

# Convert to a numpy array
vectorized_text = np.array(padded_sequences)

In [23]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(20000)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [33]:
# for data, label in dataset:
#     # Access the values of 'data' and 'label'
#     print("Data:", data)
#     print("Label:", label)

Data: tf.Tensor(
[[6816    0    0 ...    0    0    0]
 [  61    6    4 ...    0    0    0]
 [  70 2262  346 ...    0    0    0]
 ...
 [ 154 4872    2 ...    0    0    0]
 [ 527    3  145 ...    0    0    0]
 [ 522   32   54 ...    0    0    0]], shape=(30, 300), dtype=int64)
Label: tf.Tensor(
[[1 0 0 0 0]
 [0 0 0 0 0]
 [1 0 0 1 1]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 1 0]
 [0 0 0 0 0]
 [1 0 0 1 1]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 0 0 1 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 1 0 1 1]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 1 0 1 1]
 [1 0 0 1 0]
 [0 0 0 0 0]
 [1 0 0 0 0]
 [1 0 1 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 0 0 1 0]], shape=(30, 5), dtype=int64)
Data: tf.Tensor(
[[7017    0    0 ...    0    0    0]
 [  49   31 1833 ...    0    0    0]
 [3143    0    0 ...    0    0    0]
 ...
 [3160    0    0 ...    0    0    0]
 [7879    0    0 ...    0    0    0]
 [1761    0    0 ...    0    0    0]], shape=(30, 300), dtype=int64)
Label: tf.

In [24]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [9]:
model = tf.keras.Sequential([
    Embedding(MAX_FEATURES + 1, 128),
    Bidirectional(LSTM(128, activation='tanh', return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64, activation='tanh')),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(5, activation='sigmoid')
])

In [10]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [11]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(train, epochs=EPOCHS, validation_data=val, callbacks=[early_stopping])

In [17]:
model.save('finalromanized1.h5')

In [25]:
model = tf.keras.models.load_model(os.path.join('r', 'finalromanized1.h5'))

In [26]:
test_loss, test_precision, test_recall, test_auc, test_accuracy = model.evaluate(test)

print('Test Loss:', test_loss)
print('Test Precision:', test_precision)
print('Test Recall:', test_recall)
print('Test AUC:', test_auc)
print('Test Accuracy:', test_accuracy)

Test Loss: 0.03167163208127022
Test Precision: 0.9701166152954102
Test Recall: 0.952074408531189
Test AUC: 0.9987849593162537
Test Accuracy: 0.9075471758842468
