# Toxic comment classification

In [1]:
from itertools import product
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob 

In [57]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
train = pd.read_csv("./drive/MyDrive/datasets/toxic_comments/train.csv")
test = pd.read_csv("./drive/MyDrive/datasets/toxic_comments/test.csv")

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Converting from multiclass to binary class

In [5]:
classSum = train.iloc[:,2:].sum(axis=1)
nonToxic =(classSum==0)
print("Total of comments, ",len(train))
print("Total number of non toxic comments: ", nonToxic.sum())
print("Total number of toxic comments: ", classSum.sum())

Total of comments,  159571
Total number of non toxic comments:  143346
Total number of toxic comments:  35098


In [6]:
print("Number of comments with more than one label: ", (classSum > 1).sum())

Number of comments with more than one label:  9865


In [7]:
labelScore = list(train.sum(axis = 1, skipna = True))
newLabelScore = [1 if (x>0) else 0 for x in labelScore]

In [8]:
train['Toxic'] = newLabelScore
labelColumns=['toxic','severe_toxic','obscene','threat','insult','identity_hate']
data = train.drop(columns=labelColumns)

In [9]:
data.head(10)

Unnamed: 0,id,comment_text,Toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0
9,00040093b2687caa,alignment on this subject and which are contra...,0


In [109]:
class PreProcessor:
    
    def __init__(self, noise_chars):
        self.noise_chars = noise_chars #declare the noisy characters, we do not use @ or # as we need them for regex extraction
    
    def removeNoise(self, string):
        """
        This function removes noise from the text by lowering cases, removing character entities
        references, removing links, and removing given noisy characters.
        """
        string = string.lower() #converts the text to lower case 
        string = re.sub('&[a-zA-Z]+;', '', string) #remove character entities reference
        string = re.sub('http\S+', '', string)#remove links
        string = re.sub('www\S+', '', string)
        string = re.sub("\\xa0·", " ", string)
        string = re.sub("(UTC)", " ", string)
        string = re.sub('^b\s+', '', string)
        string = re.sub('@[A-Za-z0-9]+', 'user', string)
        string = re.sub("â€\x9d&lt;", "", string)
        string = re.sub("â€œ:", "", string)
        string = re.sub('pleas', 'please', string)
        string = re.sub('dont', 'do not', string)
        for char in self.noise_chars:
            string = string.replace(char, '') #removes any noisy character from the list noise_chars
        cleaned = string #the cleaned string is passed and returned
        return cleaned
    
    def removeStopwords(self, string):
        """
        This function removes stopwords from the text. It uses a corpus of stopwords from NLTK and they 
        can be extended.
        """
        words = []
        for i in string.split(" "):#split the text into spaces so it is divided into words
            words.append(i) #each word is appended in the list of words
        stop_words = stopwords.words('english') #the stopwords corpus provided by NLTK is used for remove them
        stop_words.extend(['that','thats','oh', 'aww', 'mr', 'r', 'what', 'etc', 'hey', 'within', 'foi', 'yeah', 'www', 'wa', 'em', 'am', 'i', 'me', 'dialmformurderjpg' ]) #we can extend the list of stopwords in this line
        cleaned = [w for w in words if w not in stop_words] #each word in the list of words is checked for stopwords in the corpus
        cleaned_stopwords = " ".join(cleaned)
        return cleaned_stopwords
    
    def textNormalization(self, string):
        """
        This function normalizes text by reducing length of letters in words and correcting spelling of words.
        """
        normalized = []
        tokenizer = nltk.tokenize.TweetTokenizer() #we use the TweetTokenizer to reduce length of letters in the words
        len_reduced = tokenizer.tokenize(string) #the function is applied to the text returning the length reduced
        for word in len_reduced:
            check_spell = TextBlob(word) #we use TextBlob spelling checking as the minimum lenght applied by TweetTokenizer is three letters and there can be some mispelled words
            normalized.append(str(check_spell.correct())) #it is returned the correct spell of the word and appended to normalized list of words
        normalized = " ".join(normalized)
        return normalized
    
    def stemWords(self, string):
        """
        This function performs stemming of words by chopping off inlfections of words using Port Stemmer algorithm.
        Also it corrects mispelling of words.
        """
        words = []
        stemmed = []
        for i in string.split(" "):
            words.append(i)
        stemmer = PorterStemmer() #it is used PorterStemmer to reducing inflection of words to their original word form
        stemmed_words = [stemmer.stem(w) for w in words] #it is applied to each word
        for word in stemmed_words:
            check_spell = TextBlob(word) #again, we use spelling checking as some words might result mispelled or not complete
            stemmed.append(str(check_spell.correct()))
        stemmed = " ".join(stemmed)
        return stemmed
    
    def lemmatizeWords(self, string):
        """
        This function performs word lemmatization by transforming inflections of words in their root form using
        WordNetLemmatization from NLTK. Also it checks mispelling of words.
        """
        words = []
        lemmatized = []
        for i in string.split(" "):
            words.append(i)
        lemmatizer = WordNetLemmatizer() #it is used WordNetLemmatizer to transform word inflections to root form. It is similar to stemming but it does not just chop off words.
        lemmatized_words = [lemmatizer.lemmatize(w) for w in words] #it is applied to each word
        for word in lemmatized_words:
            check_spell = TextBlob(word) #again, we use spelling checking as some words might result mispelled or not complete
            lemmatized.append(str(check_spell.correct()))
        lemmatized = " ".join(lemmatized)
        return lemmatized
    
    def wordTokenize(self, string):
        """
        This function performs word tokenization using regular expressions.
        """
        regex = "[a-zA-Z]+" #this is another form to split the sentences into words by using regular expressions
        tokenized = re.findall(regex, string) #it finds all the matching cases of the regex in the string text and it return a list of words
        return tokenized

    
    def process(self, string):
        """
        This function summarizes and apply all the preprocessing tasks.
        """
        cleaned = self.removeNoise(string)
        cleaned_stopwords = self.removeStopwords(cleaned)
        normalized = self.textNormalization(cleaned_stopwords)
        stemmed = self.stemWords(normalized)
        #lemmatized = self.lemmatizeWords(normalized)
        tokenized = " ".join(self.wordTokenize(stemmed))
        cleaned = self.removeNoise(tokenized)
        preprocessed = self.removeStopwords(cleaned)
        return preprocessed

In [37]:
sample = data.sample(frac=0.0001)

In [38]:
tweets = sample["comment_text"].fillna("NODATA").values

In [110]:
preprocessor = PreProcessor("#@,.?!¬-\''=()")

In [111]:
tweets_cleaned = []
for tweet in tweets:
    tweets_cleaned.append(preprocessor.process(tweet))

In [112]:
tweets_cleaned

['sorry go go sorry admit sir worth please block boot lick perform polite expect circuit got progress less polite face reason ignore made mistake response fix biggest lower',
 'also post comment one discuss place second part please take care meet',
 'actual comment meant bad intent made response confuse',
 'image grace rag image grace proved fair use ration believe image accept fair use accord wikipedia policy please proved ration explain much accord fair use ration guideline image rescript page please also consider use one tag list wikipedia image copyright tagsfair us thank last',
 'note robert suggest matter tone include avoid note good idea return project april may ill thank',
 'quit watch watch benefit genet outweigh detriment carrot',
 'anyone remember pressure hull old submarine rest near southern tunnel entrance late early rumor german submarine probably wrong never identify got',
 'finer come one pass grave',
 'read article wiki meddle persian enclopendia little bite wiki medd

In [None]:
#train = train.sample(frac=1)

In [None]:
X = train["comment_text"].fillna("NODATA").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

In [None]:
print(X.shape, y.shape)

(159571,) (159571, 6)


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(127656,) (31915,) (127656, 6) (31915, 6)


In [None]:
max_features = 20000
maxlen = 100

In [None]:
list_sentences_train = X_train
list_sentences_test = X_test

In [None]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train) #receive text and return a sequence (indexes of the words)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
V_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen) #pads with zeros to fulfill the maxlen if not reached
V_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [None]:
print(V_train.shape,V_test.shape)

(127656, 100) (31915, 100)


In [None]:
def get_model():
    #define the architecture of the neural network
    embed_size = 128
    inp = Input(shape=(maxlen, )) #input layer
    x = Embedding(max_features, embed_size)(inp) #useful for NLP tasks
    x = Bidirectional(LSTM(50, return_sequences=True))(x) #50 neurons; bidirectional used for give information backwards
    x = GlobalMaxPool1D()(x) #to get highest activation of the previous layer
    x = Dropout(0.1)(x) #Regularization strategy
    x = Dense(50, activation="relu")(x) #fully connected  
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x) #6 for 6 classes, change according to the number of classes 
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
model = get_model()
batch_size = 32 #number of samples of the network in order to estimate the gradient
epochs = 2

#Defining the callbacks
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min') #after each epoch, saves the best weights of the best neural network

early = EarlyStopping(monitor="val_loss", mode="min", patience=20) #monitors if validation loss is getting worse, it stops


callbacks_list = [checkpoint, early]
model.fit(V_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

model.load_weights(file_path)

Epoch 1/2
Epoch 00001: val_loss improved from inf to 0.04921, saving model to weights_base.best.hdf5
Epoch 2/2
Epoch 00002: val_loss improved from 0.04921 to 0.04897, saving model to weights_base.best.hdf5


In [None]:
y_pred = model.predict(V_test)

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [None]:
precision, recall, f1score, _ = precision_recall_fscore_support(y_test, y_pred.round(), average='macro', zero_division=0)

In [None]:
print('Precision: ', round(precision, 6))
print('Recall: ', round(recall, 6))
print('F1-score: ', round(f1score, 6))

Precision:  0.575797
Recall:  0.344494
F1-score:  0.391629


In [None]:
print(classification_report(y_test, y_pred.round(), digits=6, zero_division=0))

              precision    recall  f1-score   support

           0   0.913004  0.650571  0.759764      3065
           1   0.882353  0.049020  0.092879       306
           2   0.871935  0.770620  0.818153      1661
           3   0.000000  0.000000  0.000000        87
           4   0.787489  0.596753  0.678980      1540
           5   0.000000  0.000000  0.000000       289

   micro avg   0.870141  0.605642  0.714189      6948
   macro avg   0.575797  0.344494  0.391629      6948
weighted avg   0.824608  0.605642  0.685331      6948
 samples avg   0.058841  0.053096  0.053671      6948

