In [1]:
import tensorflow as tf

In [2]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  




REPLICAS:  8


In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('train.csv')

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
df.drop('id', axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
def remove_link(text):
  pattern = re.compile('https?://\S+|www\.\S+')
  cleaned = pattern.sub(r'', text)
  return cleaned

In [10]:
df['comment_text'] = df['comment_text'].apply(remove_link)

In [11]:
def remove_punct(text):
  cleaned = re.sub(r'[^\w\s]', '', text)
  return cleaned

In [12]:
df['comment_text'] = df['comment_text'].apply(remove_punct)

In [13]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,\nMore\nI cant make any real suggestions on im...,0,0,0,0,0,0
4,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0


In [19]:
def remove_newline_characters(text):
    return text.replace('\n', ' ')

# Example usage:
input_text = "This is a string\nwith new line characters\nthat need to be removed."
cleaned_text = remove_newline_characters(input_text)
print(cleaned_text)


This is a string with new line characters that need to be removed.


In [20]:
df['comment_text'] = df['comment_text'].apply(remove_newline_characters)

In [21]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,More I cant make any real suggestions on impr...,0,0,0,0,0,0
4,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0


In [22]:
def remove_numeric(text):
  pattern = re.compile(r'[^a-zA-Z\s]')
  cleaned = pattern.sub('', text)
  return cleaned

In [23]:
df['comment_text'] = df['comment_text'].apply(remove_numeric)

In [24]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,More I cant make any real suggestions on impr...,0,0,0,0,0,0
4,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0


In [26]:
nltk.download('stopwords')
Stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [31]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [33]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [27]:
lemmatizer = WordNetLemmatizer()

In [29]:
def aply_nlp(text):
  words = word_tokenize(text)
  review = [lemmatizer.lemmatize(word) for word in words if word not in Stop_words]
  review = ' '.join(review)
  return review

In [34]:
df['comment_text'] = df['comment_text'].apply(aply_nlp)

In [35]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why edits made username Hardcore M...,0,0,0,0,0,0
1,Daww He match background colour Im seemingly s...,0,0,0,0,0,0
2,Hey man Im really trying edit war Its guy cons...,0,0,0,0,0,0
3,More I cant make real suggestion improvement I...,0,0,0,0,0,0
4,You sir hero Any chance remember page thats,0,0,0,0,0,0


In [36]:
df['comment_text'] = df['comment_text'].str.lower()

In [37]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why edits made username hardcore m...,0,0,0,0,0,0
1,daww he match background colour im seemingly s...,0,0,0,0,0,0
2,hey man im really trying edit war its guy cons...,0,0,0,0,0,0
3,more i cant make real suggestion improvement i...,0,0,0,0,0,0
4,you sir hero any chance remember page thats,0,0,0,0,0,0


In [38]:
corpus = df['comment_text']

In [39]:
corpus

0         explanation why edits made username hardcore m...
1         daww he match background colour im seemingly s...
2         hey man im really trying edit war its guy cons...
3         more i cant make real suggestion improvement i...
4               you sir hero any chance remember page thats
                                ...                        
223544    jerome i see never got around im surprised i l...
223545        lucky bastard heh famous i kida envy congrats
223546                    shame you want speak gay romanian
223547    mel gibson is a nazi bitch who makes shitty mo...
223548    unicorn lair discovery supposedly unicorn lair...
Name: comment_text, Length: 223549, dtype: object

In [40]:
import tensorflow
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

In [41]:
vocab_size = 5000

In [42]:
one_hot = [one_hot(word, vocab_size) for word in corpus]

In [54]:
sent_length = 200

In [55]:
embedded_doc = pad_sequences(one_hot, padding = 'pre', maxlen = sent_length)

In [58]:
embedded_doc[100]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [59]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

In [60]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 lstm (LSTM)                 (None, None, 64)          33024     
                                                                 
 dropout (Dropout)           (None, None, 64)          0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 6)                 198       
                                                                 
Total params: 365638 (1.39 MB)
Trainable params: 365638 

In [62]:
X = np.array(embedded_doc)

In [64]:
y = df.drop('comment_text', axis = 1)

In [66]:
y_final = np.array(y)

In [69]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y_final, test_size = 0.2, random_state = 2022)

In [74]:
model.fit(X_train, y_train, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x795c5cabbdc0>

In [75]:
y_pred = model.predict(X_test)



In [77]:
y_pred_classes = np.argmax(y_pred, axis=1)

In [81]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [84]:
threshold = 0.5
y_pred = (y_pred>= threshold).astype(int)

# Now you can use the classification metrics
accuracy = accuracy_score(y_test, y_pred)


print(f"Accuracy: {accuracy}")

Accuracy: 0.8974055021248043


In [87]:
from keras.models import load_model

# Save model
model.save("multilabel_comment_classifier.h5")