In [1]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import pandas as pd
import numpy as np

# Below option allows us to see the entire comment_text column
pd.set_option('display.max_colwidth', None)
# Read in the dataset
train = pd.read_csv("../../data/kaggle_train.csv")
train = train.drop(columns=['id'])

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

print("Stats of training set: ", train.shape)
print("Labels:", labels)

Stats of training set:  (159571, 7)
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [3]:
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


# Text Preprocessing

Below I have noticed some inconsistencies in the data and by preprocessing it, we can ensure a clean dataset.

In [4]:
# Convert comment to lowercase
def to_lowercase(text):
    return text.lower()

train['comment_text'] = train['comment_text'].apply(to_lowercase)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [5]:
import re
# Remove HTML tags from the comments
def remove_html(text):
    return re.sub(r"<.*>", "", text, flags=re.MULTILINE)
    
train['comment_text'] = train['comment_text'].apply(remove_html)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [6]:
# Remove links from the comments
def remove_links(text):
    text= re.sub(r"http\S+"," ",text, flags=re.MULTILINE)
    return re.sub(r"www\S+"," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_links)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [7]:
import string
# Remove punctuation marks 
def remove_punctuation(text):
    for i in string.punctuation:
        text = text.replace(i, "")
    return text

train['comment_text'] = train['comment_text'].apply(remove_punctuation)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827,0,0,0,0,0,0
1,daww he matches this background colour im seemingly stuck with thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0,0,0,0,0,0
3,\r\nmore\r\ni cant make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up its listed in the relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,you sir are my hero any chance you remember what page thats on,0,0,0,0,0,0


In [8]:
# Remove special characters such as: \n \r \t
def remove_special(text):
    return re.sub(r"[\n\t\\\/\r]"," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_special)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827,0,0,0,0,0,0
1,daww he matches this background colour im seemingly stuck with thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0,0,0,0,0,0
3,more i cant make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up its listed in the relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,you sir are my hero any chance you remember what page thats on,0,0,0,0,0,0


In [9]:
# Remove stopwords using nltk's stopwords package
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

train['comment_text'] = train['comment_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
train.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now892053827,0,0,0,0,0,0
1,daww matches background colour im seemingly stuck thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info,0,0,0,0,0,0
3,cant make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


In [10]:
# As you can see above, there are numbers and/or dates
# I will remove those as they are not helpful

def remove_numbers(text):
    return re.sub(r'\d'," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_numbers)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now,0,0,0,0,0,0
1,daww matches background colour im seemingly stuck thanks talk january utc,0,0,0,0,0,0
2,hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info,0,0,0,0,0,0
3,cant make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


# Train, Test, and Validation Split

Below I need to split the dataset into train and test datasets.
However, sklearn's `train_test_split` function does not work for
multi-class classification.

Therefore, I will be creating a train, test, and validation split for each label in the classification.

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train[["comment_text"]], train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]], test_size=0.20)
X_train

Unnamed: 0,comment_text
94096,note dots names hong kong football clubs also omitted thus page needs renamed consistence
24139,hey asked question still havent responded yet suck fucking balls fucking assholes mrivera
98697,days sail boys sent different kinds oil port starboard lights glad sometimes wikipedia entirely serious
37047,yes point id contacted today logging find blocked would none wiser eventually discover block notice could another days away ps blocked could least extend day two long
111796,appreciate question however qualified answer live michigan spend winters myrtle beach area years often go atalaya castle brookgreen gardens basically enjoyment taking pictures know little history places brochures given state pictures brookgreen gardens mine including bottom slide show presentation additional pictures pictures commons also mine pictures atalaya castle also mine except b w one also recently added slide show presentation bottom group pictures additional pictures took slide show presentations work checked various computers various locations appear work properly especially high speed connections sorry couldnt help think probably best bet others edit articles especially ones started still around article huntington beach state park edited however basically formating clean standpoint removing peacock terms notice requesting additional photos hbsp go take enter article maybe even make slide show think idea slide show pictures narrated classical music alright amateur home movies places like bty atalya castle directly across highway brookgreen gardens brookgreen gardens mile directly inland slide show atalaya first slide shows original road going directly residence atalaya castle brookgreen gardens huntingtons would used s move large sculptures since highway built cut half property owned huntingtons talk
...,...
25145,new tim smith biography completely revamped tim smith biography page added detail discography aware work done
88293,wikiproject bangladeshbengalwest bengal wikiproject bengal wikiproject west bengal wikiproject bangladesh looking contributors like join wp bangladesh also use awesome userbox bakatalk
125886,repeatedly expressed opinion idiotic opposed merely lazy thing tyson revealed done listen bushs biblical quotation write note event lazy locate effect apparently bush clowned taking credit star names actually arabic since wrote words complain obvious intended make tyson seem way correct would favor making clearer quoting someone effect attributing isaiah intent take credit star charts drawn millennia isaiahs death profoundly stupid hurry replace vacancy page appropriate source didnt spring mind case adamantly opposed change propose since removes specifics leaving tasteless gruel dont see anything particularly encyclopedic bad writing bit yahweh directly tysons acknowledgement error facebook btw still shows inkling moronic interpretation bushs biblical quotation
129098,spend hours night contributing article space whereas


In [12]:
print("Train shape:",X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (127656, 1)
Test shape: (31915, 1)


# Vectorizing the Comment Text

*Logistic Regression can't take text values as input*

Since the independent variable I have is only text, we will need to use a vectorizer to convert the text into usable data for Logistic Regression.

```

# Max_features = Build a vocabulary that only consider the top max_features ordered by term frequency

# Analyzer = Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.

# ngram_range = (1,1) means only unigrams, (1,2) means unigrams and bigrams, (1,3) means unigrams, bigrams, and trigrams

# Further ngrams knowledge = bigrams means it will learn the occurence of every two words, trigrams would be every 3, etc.

# dtype = type of the matrix returned, default is float64
```

We will use a word and char n-grams as some people like to obfuscate words by using multiple characters, by using both we can hope to catch these.
The idea from this came from [here](https://www.kaggle.com/code/tunguz/logistic-regression-with-words-and-char-n-grams/comments) which has one of the best results for Logistic Regression. This user optimized the ngram_range.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

tfidf_word = TfidfVectorizer(max_features=1000, analyzer="word", ngram_range=(1,3), dtype=np.float32)
tfidf_char = TfidfVectorizer(max_features=1000, analyzer="char", ngram_range=(3,6), dtype=np.float32)

In [21]:
from scipy.sparse import hstack

X_train_word = tfidf_word.fit_transform(X_train["comment_text"])
X_test_word = tfidf_word.fit_transform(X_test["comment_text"])

X_train_char = tfidf_char.fit_transform(X_train["comment_text"])
X_test_char = tfidf_char.fit_transform(X_test["comment_text"])

In [22]:
X_train = hstack([X_train_char, X_train_word])
X_test = hstack([X_test_char, X_test_word])

# Importing Keras Libraries

In [23]:
# Importing libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers import Bidirectional
from keras.layers import InputLayer
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.layers import Embedding
from keras.utils import pad_sequences

# Building the Model

In [28]:
import tensorflow as tf

embedding_dim = 64

model = Sequential()
model.add(Embedding(X_train.shape[0], embedding_dim, input_length=2000))
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dense(6, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 2000, 64)          8169984   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 6)                 390       
                                                                 
Total params: 8,244,678
Trainable params: 8,244,678
Non-trainable params: 0
_________________________________________________________________


In [29]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [30]:
np.asarray(X_train.toarray()).itemsize * np.asarray(X_train.toarray()).size

1021248000

In [31]:
from keras.callbacks import EarlyStopping

results = {}

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True, patience=5)
history = model.fit(np.asarray(X_train.toarray()), y_train.values,
                    epochs=50,
                    validation_data=(np.asarray(X_train.toarray()), y_train.values),
                    batch_size=32,
                    callbacks=[es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 16: early stopping


In [32]:
loss, training_accuracy = model.evaluate(np.asarray(X_train.toarray()), y_train.values)
print(f"Training Accuracy: {training_accuracy}")

loss, testing_accuracy = model.evaluate(np.asarray(X_test.toarray()), y_test.values)
print(f" Testing Accuracy:  {testing_accuracy}")

Training Accuracy: 0.9941405057907104
 Testing Accuracy:  0.9942660331726074


In [33]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix

predict = model.predict(np.asarray(X_test.toarray()))



In [34]:
predict

array([[0.09475025, 0.00834456, 0.05137648, 0.00300421, 0.05115644,
        0.00766138],
       [0.09475025, 0.00834456, 0.05137648, 0.00300421, 0.05115644,
        0.00766138],
       [0.09475025, 0.00834456, 0.05137648, 0.00300421, 0.05115644,
        0.00766138],
       ...,
       [0.09475025, 0.00834456, 0.05137648, 0.00300421, 0.05115644,
        0.00766138],
       [0.09475025, 0.00834456, 0.05137648, 0.00300421, 0.05115644,
        0.00766138],
       [0.09475025, 0.00834456, 0.05137648, 0.00300421, 0.05115644,
        0.00766138]], dtype=float32)

In [35]:
y_predicted = []
for i in predict:
    test = []
    for j in i:
        if j > 0.5:
            test.append(1)
        else:
            test.append(0)
    y_predicted.append(test)

# y_predicted = np.argmax(predict, axis=1)
# y_predicted

In [36]:
for i in y_predicted:
    print(i)

[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0,

In [37]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(y_test, y_predicted))

[[[28864     0]
  [ 3042     9]]

 [[31588     0]
  [  327     0]]

 [[30201     0]
  [ 1714     0]]

 [[31827     0]
  [   88     0]]

 [[30319     0]
  [ 1596     0]]

 [[31659     0]
  [  256     0]]]


# Saving the Best Model

In [38]:
model.save(filepath="F:/Thesis/models/rnn/rnn")



INFO:tensorflow:Assets written to: F:/Thesis/models/rnn/rnn\assets


INFO:tensorflow:Assets written to: F:/Thesis/models/rnn/rnn\assets


# Loading the Saved Models

In [None]:
# keras.models.load_model("F:/Thesis/models/cnn/cnn")