In [1]:
import matplotlib.pyplot as plt
import os
import numpy as np
import re 
import pandas as pd
import string

from tensorflow import keras

from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, GlobalMaxPool1D, SpatialDropout1D, Conv1D, MaxPooling1D
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam, SGD

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [2]:
# nltk.download('stopwords')
# nltk.download('punkt')

In [3]:
embeddings_index = {}
embedding_dim = 300
GLOVE_DIR = "C:\\Users\\Abhinav\\Desktop\\Project Final"
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [4]:
train1 = pd.read_csv('toxic.csv')
train1 = pd.concat([train1[5000:16227],train1[16227+5000:]],axis=0)
train1.shape

(22287, 2)

In [5]:
train = pd.read_csv('dataset.csv')
train0 = train[train['toxic'] == 0][0:35000]
# train0 = train[train['toxic'] == 0]
# train0 = train0.sample(n=32287)
train0.shape
# 64574

(35000, 3)

In [6]:
# train0 = pd.read_csv('nontoxic.csv')
# train0.shape

In [7]:
train = pd.concat([train1,train0],axis=0)
train = train.sample(frac=1)
train.head()

Unnamed: 0,comment_text,toxic,id
29795,I can't find any source in English that says m...,0,4f17b9997721d774
2996,and asking top stop involving me,0,081166fea250a5af
18313,Pic of the day Wednesday,0,305b3ee7e7771b7d
26032,"OK, Sarfatti called me worried about it. Is th...",0,44f0dc4275bae7f1
35557,@ and @ What you say to it?,0,5f0301f2ee850b98


In [8]:
train.shape

(57287, 3)

In [9]:
def preprocess(text):
    text = str(text)
    text = re.sub("[^a-zA-Z ]+", "", text)
    text = text.lower()
    text_p = "".join([char for char in text if char not in string.punctuation])
    words = word_tokenize(text_p)
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in filtered_words]   
    return stemmed

#     return filtered_words

In [10]:
X_train = train['comment_text']
# X_test = test['comment_text']
y_train = train['toxic'].values
# y_test = test['toxic'].values

In [None]:
X_train = X_train.map(preprocess)
# X_test = X_test.map(preprocess)

In [None]:
max_length = 100

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(sequences, maxlen=max_length, padding='post')

In [None]:
word_index_train = tokenizer.word_index 
vocab_train = len(tokenizer.word_index) + 1

In [None]:
word_index_train

In [None]:
len(word_index_train)

In [None]:
embedding_matrix_train = np.zeros((len(word_index_train) + 1, embedding_dim))
for word, i in word_index_train.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_train[i] = embedding_vector

In [None]:
embedding_layer = Embedding(len(word_index_train) + 1,
                            embedding_dim,
                            weights=[embedding_matrix_train],
                            input_length=max_length,
                            trainable=False)

In [None]:
model = Sequential()
model.add(embedding_layer)
# model.add(Bidirectional(LSTM(60, return_sequences = True)))
model.add(GRU(60, return_sequences = True))
model.add(SpatialDropout1D(0.1))
model.add(GlobalMaxPool1D())
# model.add(Dropout(0.1))
model.add(Dense(50,activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

opt = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
print(model.summary())

In [None]:
history = model.fit(X_train, y_train, batch_size=64, epochs=6, validation_split=0.2, verbose=1)

In [None]:
test = pd.read_csv('labeled_data.csv')

# test = pd.read_csv('dataset.csv')
# test = test[test['toxic'] == 0][84268:84268+10000]

test.head()

In [None]:
X_test = test['tweet']
# X_test = test['comment_text']
y_test = test['toxic'].values

In [None]:
X_test = X_test.map(preprocess)

In [None]:
sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences, maxlen=max_length, padding='post')

In [1076]:
results = model.evaluate(X_test, y_test, batch_size=64)
print("test loss, test acc:", results)

test loss, test acc: [0.37957093119621277, 0.8317394852638245]


In [1077]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

y_pred = model.predict(X_test, batch_size=64, verbose=1)



In [1078]:
tx_indices = y_pred>=0.5
ntx_indices = y_pred<0.5
y_pred[tx_indices] = 1
y_pred[ntx_indices] = 0

print(classification_report(y_test, y_pred))
print(f1_score(y_test,y_pred,average='weighted'))

              precision    recall  f1-score   support

           0       0.50      0.51      0.51      4163
           1       0.90      0.90      0.90     20620

    accuracy                           0.83     24783
   macro avg       0.70      0.70      0.70     24783
weighted avg       0.83      0.83      0.83     24783

0.8325647066421723


In [1079]:
from sklearn.metrics import confusion_matrix

In [1080]:
print(confusion_matrix(y_test,y_pred))

[[ 2130  2033]
 [ 2137 18483]]


In [1081]:
[tn, fp], [fn, tp] = confusion_matrix(y_test,y_pred)

In [1082]:
print('Non-toxic correctly recognised:',tn)

Non-toxic correctly recognised: 2130


In [1083]:
print('Toxic correctly recognised:',tp)

Toxic correctly recognised: 18483


In [1084]:
print('Toxic misclassified as Non-toxic:',fn)

Toxic misclassified as Non-toxic: 2137


In [1085]:
print('Non-Toxic misclassified as Toxic:',fp)

Non-Toxic misclassified as Toxic: 2033


In [1086]:
test2t = pd.read_csv('toxic.csv')
test2t = test2t[:5000]
test2t.shape

(5000, 2)

In [1087]:
test2n = pd.read_csv('dataset.csv')
# train0 = train[train['toxic'] == 0][0:64574]
test2nt = test2n[test2n['toxic'] == 0][30000:30000+15000]
test2nt.shape

(15000, 3)

In [1088]:
test2 = pd.concat([test2t,test2nt],axis=0)
test2.head()

Unnamed: 0,comment_text,toxic,id
0,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,
1,Hey... what is it..\r\n@ | talk .\r\nWhat is i...,1,
2,"Bye! \r\n\r\nDon't look, come or think of comm...",1,
3,You are gay or antisemmitian? \r\n\r\nArchange...,1,
4,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,


In [1089]:
# test2 = pd.read_csv('gab.csv')
# test2.head()

In [1090]:
X_test2 = test2['comment_text']
y_test2 = test2['toxic'].values

In [1091]:
X_test2 = X_test2.map(preprocess)

In [1092]:
sequences = tokenizer.texts_to_sequences(X_test2)
X_test2 = pad_sequences(sequences, maxlen=max_length, padding='post')

In [1093]:
results2 = model.evaluate(X_test2, y_test2, batch_size=64)
print("test loss, test acc:", results2)

test loss, test acc: [0.2578614354133606, 0.8964999914169312]


In [1094]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

y_pred2 = model.predict(X_test2, batch_size=64, verbose=1)



In [1095]:
tx_indices2 = y_pred2>=0.5
ntx_indices2 = y_pred2<0.5
y_pred2[tx_indices2] = 1
y_pred2[ntx_indices2] = 0

print(classification_report(y_test2, y_pred2))
print(f1_score(y_test2,y_pred2,average='weighted'))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93     15000
           1       0.75      0.87      0.81      5000

    accuracy                           0.90     20000
   macro avg       0.85      0.89      0.87     20000
weighted avg       0.90      0.90      0.90     20000

0.8988893186312785
