In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [2]:
data = pd.read_csv('train.csv')



In [3]:
data 

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
data.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

# Loop through the list of columns
for column in columns:
    counts = data[column].value_counts()
    print(column + ':')
    print(counts)
    print()

toxic:
toxic
0    144277
1     15294
Name: count, dtype: int64

severe_toxic:
severe_toxic
0    157976
1      1595
Name: count, dtype: int64

obscene:
obscene
0    151122
1      8449
Name: count, dtype: int64

threat:
threat
0    159093
1       478
Name: count, dtype: int64

insult:
insult
0    151694
1      7877
Name: count, dtype: int64

identity_hate:
identity_hate
0    158166
1      1405
Name: count, dtype: int64


In [6]:
missing_values = data.isnull().sum()

print("Missing Values Count per Column:")
print(missing_values)

Missing Values Count per Column:
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [7]:
# Preprocess data
data['comment_text'] = data['comment_text'].apply(lambda x: x.lower())  # Lowercase text
data['comment_text'] = data['comment_text'].str.replace('[^\w\s]', '', regex=False)  # Remove punctuation

In [8]:
num_words_in_dataset = data['comment_text'].str.split().explode().nunique()

print(f"Number of unique words in the dataset: {num_words_in_dataset}")

Number of unique words in the dataset: 470340


In [9]:
data = data.drop('id', axis=1)

In [10]:
from gensim.models import Word2Vec


In [11]:
# Train the Word2Vec model
corpus = [doc.split() for doc in data['comment_text']]
Word2Vecmodel = Word2Vec(sentences=corpus, vector_size=100, window=10, min_count=3, workers=6)

In [12]:
# Tokenize text data
tokenizer = Tokenizer(num_words=num_words_in_dataset, oov_token='<OOV>')
tokenizer.fit_on_texts(data['comment_text'])

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Split set
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)


In [15]:
# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data['comment_text'])
val_sequences = tokenizer.texts_to_sequences(val_data['comment_text'])
test_sequences = tokenizer.texts_to_sequences(test_data['comment_text'])

In [16]:
# Pad sequences
train_padded = pad_sequences(train_sequences, maxlen=256, truncating='post', padding='post')
val_padded = pad_sequences(val_sequences, maxlen=256, truncating='post', padding='post')
test_padded = pad_sequences(test_sequences, maxlen=256, truncating='post', padding='post')

In [17]:
# Define the vocabulary size and embedding matrix
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))  
for word, i in word_index.items():
    if word in Word2Vecmodel.wv.key_to_index:
        embedding_matrix[i] = Word2Vecmodel.wv[word]

In [18]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization, Activation


In [19]:
# Define the model 
input_layer = Input(shape=(256,))  

# Embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], input_length=256, trainable=False)(input_layer)

# Bidirectional LSTM layer 
lstm_layer = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
lstm_layer = Bidirectional(LSTM(64))(lstm_layer)

output_layers = []
for column in columns:
    dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(lstm_layer)
    batch_norm_layer = BatchNormalization()(dense_layer)
    activation_layer = Activation('relu')(batch_norm_layer)
    dropout_layer = Dropout(0.2)(activation_layer)
    output = Dense(6, activation='sigmoid')(dropout_layer)  
    output_layers.append(output)



In [20]:

# Create the model
model = Model(inputs=input_layer, outputs=output_layers)

In [21]:
from tensorflow.keras.optimizers import Adam
# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.0005), 
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [22]:

# Display the model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 256, 100)             2103390   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 bidirectional (Bidirection  (None, 256, 256)             234496    ['embedding[0][0]']           
 al)                                                                                              
                                                                                              

In [23]:
columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

In [24]:
# Train the model
history = model.fit(train_padded, [train_data[columns] for column in columns], epochs=15, batch_size=32, validation_data=(val_padded, [val_data[columns] for column in columns]))


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [25]:
# Evaluate the model on the test set
results = model.evaluate(test_padded, [test_data[column] for column in columns])

# Display the evaluation results
for i, metric in enumerate(model.metrics_names):
    print(f"{metric}: {results[i]}")

loss: 1.5046818256378174
dense_1_loss: 0.545199453830719
dense_3_loss: 0.14503583312034607
dense_5_loss: 0.23562633991241455
dense_7_loss: 0.1505155861377716
dense_9_loss: 0.2505280375480652
dense_11_loss: 0.16302724182605743
dense_1_accuracy: 0.9434139728546143
dense_3_accuracy: 0.9183481931686401
dense_5_accuracy: 0.9547562599182129
dense_7_accuracy: 0.9206667542457581
dense_9_accuracy: 0.8506078720092773
dense_11_accuracy: 0.9189121723175049


In [32]:
model.save('Comment_toxicity.keras')

In [27]:
text = "you fucking fool" 
sequence = tokenizer.texts_to_sequences([text])
padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')
predictions = model.predict(padded_sequence)


# Display the predictions with confidence levels
for i, column in enumerate(columns):
    predicted_probabilities = predictions[i][0]  
    target_class_index = i  
    confidence_level = f"{predicted_probabilities[target_class_index] * 100:.2f}%"
    prediction_label = "Positive" if predicted_probabilities[target_class_index] > 0.5 else "Negative"
    print(f"{column}: Confidence Level = {confidence_level}, Prediction = {prediction_label}")


binary_predictions = [[1 if prob[i] > 0.5 else 0 for i in range(len(columns))] for prob in predictions[0]]
print("Binary Predictions:", binary_predictions)


toxic: Confidence Level = 99.98%, Prediction = Positive
severe_toxic: Confidence Level = 16.16%, Prediction = Negative
obscene: Confidence Level = 99.96%, Prediction = Positive
threat: Confidence Level = 0.00%, Prediction = Negative
insult: Confidence Level = 99.03%, Prediction = Positive
identity_hate: Confidence Level = 0.02%, Prediction = Negative
Binary Predictions: [[1, 0, 1, 0, 1, 0]]


In [28]:
text = "I like talking about things that make me happy" 
sequence = tokenizer.texts_to_sequences([text])
padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')
predictions = model.predict(padded_sequence)


# Display the predictions with confidence levels
for i, column in enumerate(columns):
    predicted_probabilities = predictions[i][0]  
    target_class_index = i  
    confidence_level = f"{predicted_probabilities[target_class_index] * 100:.2f}%"
    prediction_label = "Positive" if predicted_probabilities[target_class_index] > 0.5 else "Negative"
    print(f"{column}: Confidence Level = {confidence_level}, Prediction = {prediction_label}")


binary_predictions = [[1 if prob[i] > 0.5 else 0 for i in range(len(columns))] for prob in predictions[0]]
print("Binary Predictions:", binary_predictions)


toxic: Confidence Level = 0.00%, Prediction = Negative
severe_toxic: Confidence Level = 0.00%, Prediction = Negative
obscene: Confidence Level = 0.00%, Prediction = Negative
threat: Confidence Level = 0.00%, Prediction = Negative
insult: Confidence Level = 0.00%, Prediction = Negative
identity_hate: Confidence Level = 0.00%, Prediction = Negative
Binary Predictions: [[0, 0, 0, 0, 0, 0]]


In [29]:
text = "I will hurt and kill all your family members you worthless piece of shit" 
sequence = tokenizer.texts_to_sequences([text])
padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')
predictions = model.predict(padded_sequence)


# Display the predictions with confidence levels
for i, column in enumerate(columns):
    predicted_probabilities = predictions[i][0]  
    target_class_index = i  
    confidence_level = f"{predicted_probabilities[target_class_index] * 100:.2f}%"
    prediction_label = "Positive" if predicted_probabilities[target_class_index] > 0.5 else "Negative"
    print(f"{column}: Confidence Level = {confidence_level}, Prediction = {prediction_label}")


binary_predictions = [[1 if prob[i] > 0.5 else 0 for i in range(len(columns))] for prob in predictions[0]]
print("Binary Predictions:", binary_predictions)


toxic: Confidence Level = 99.87%, Prediction = Positive
severe_toxic: Confidence Level = 41.90%, Prediction = Negative
obscene: Confidence Level = 94.10%, Prediction = Positive
threat: Confidence Level = 89.01%, Prediction = Positive
insult: Confidence Level = 73.40%, Prediction = Positive
identity_hate: Confidence Level = 7.18%, Prediction = Negative
Binary Predictions: [[1, 1, 1, 1, 1, 0]]


In [30]:
text = "I hate you and your black ass, get the fuck out of here" 
sequence = tokenizer.texts_to_sequences([text])
padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')
predictions = model.predict(padded_sequence)


# Display the predictions with confidence levels
for i, column in enumerate(columns):
    predicted_probabilities = predictions[i][0]  
    target_class_index = i  
    confidence_level = f"{predicted_probabilities[target_class_index] * 100:.2f}%"
    prediction_label = "Positive" if predicted_probabilities[target_class_index] > 0.5 else "Negative"
    print(f"{column}: Confidence Level = {confidence_level}, Prediction = {prediction_label}")


binary_predictions = [[1 if prob[i] > 0.5 else 0 for i in range(len(columns))] for prob in predictions[0]]
print("Binary Predictions:", binary_predictions)


toxic: Confidence Level = 100.00%, Prediction = Positive
severe_toxic: Confidence Level = 63.42%, Prediction = Positive
obscene: Confidence Level = 99.64%, Prediction = Positive
threat: Confidence Level = 91.54%, Prediction = Positive
insult: Confidence Level = 84.16%, Prediction = Positive
identity_hate: Confidence Level = 15.38%, Prediction = Negative
Binary Predictions: [[1, 1, 1, 1, 1, 0]]


In [31]:
text = "I will find you and make you pay dearly for this, I suggest you run" 
sequence = tokenizer.texts_to_sequences([text])
padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')
predictions = model.predict(padded_sequence)


# Display the predictions with confidence levels
for i, column in enumerate(columns):
    predicted_probabilities = predictions[i][0]  
    target_class_index = i  
    confidence_level = f"{predicted_probabilities[target_class_index] * 100:.2f}%"
    prediction_label = "Positive" if predicted_probabilities[target_class_index] > 0.5 else "Negative"
    print(f"{column}: Confidence Level = {confidence_level}, Prediction = {prediction_label}")


binary_predictions = [[1 if prob[i] > 0.5 else 0 for i in range(len(columns))] for prob in predictions[0]]
print("Binary Predictions:", binary_predictions)


toxic: Confidence Level = 0.24%, Prediction = Negative
severe_toxic: Confidence Level = 0.01%, Prediction = Negative
obscene: Confidence Level = 0.04%, Prediction = Negative
threat: Confidence Level = 0.00%, Prediction = Negative
insult: Confidence Level = 0.04%, Prediction = Negative
identity_hate: Confidence Level = 0.01%, Prediction = Negative
Binary Predictions: [[0, 0, 0, 0, 0, 0]]
