In [28]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional,Embedding, Flatten, Dropout
import pandas as pd
import numpy as np
import cleantext
import re


In [29]:
# Load the two labelled corpora. Later cells read the per-category toxicity
# flags and `comment_text` from train.csv, and `class` and `tweet` from hate.csv.
df1, df2 = (pd.read_csv(csv_path) for csv_path in ("train.csv", "hate.csv"))


In [30]:
# Collapse each dataset's annotations into one binary target, `is_bad`
# (1 = abusive, 0 = clean). Column-wise operations are used instead of
# `DataFrame.apply(..., axis=1)`, which runs a Python lambda once per row.

# df1: a comment is bad when any of its six toxicity flags is not equal to 0.
toxicity_flags = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df1['is_bad'] = df1[toxicity_flags].ne(0).any(axis=1).astype('int64')

# df2: class values 0 and 1 count as bad; any other class value is clean.
df2['is_bad'] = df2['class'].isin([0, 1]).astype('int64')



In [31]:
def cleaning(string):
    """Normalise one raw comment or tweet before tokenisation.

    Steps:
      1. Replace with a space every ``@mention``, every character that is not
         an ASCII letter, digit, space or tab, and every ``scheme://...`` URL.
      2. Collapse runs of whitespace into single spaces and trim the ends.
      3. Pass the result to ``cleantext.clean`` with the ``extra_spaces``,
         ``lowercase``, ``numbers`` and ``punct`` options enabled.

    Args:
        string: The raw text (must be a ``str``).

    Returns:
        The cleaned text as returned by ``cleantext.clean``.
    """
    # Raw string literal: `\w`, `\/` and `\S` are regex escapes, not Python
    # string escapes. A non-raw literal triggers invalid-escape warnings.
    stripped = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", string)
    string = ' '.join(stripped.split())
    string = cleantext.clean(string, extra_spaces=True, lowercase=True, numbers=True, punct=True)
    return string
    

In [32]:
# Derive a normalised `clean_text` column in each frame from its raw text column.
for frame, raw_column in ((df1, 'comment_text'), (df2, 'tweet')):
    frame['clean_text'] = frame[raw_column].apply(cleaning)

In [33]:
# Remove, in place, the ids, raw text and original annotation columns that are
# no longer needed now that `is_bad` and `clean_text` exist.
unused_columns = (
    (df1, ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']),
    (df2, ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet']),
)
for frame, columns in unused_columns:
    frame.drop(columns=columns, inplace=True)

In [34]:
# Show the first rows of each prepared frame, df1 first and then df2.
for frame in (df1, df2):
    print(frame.head())

   is_bad                                         clean_text
0       0  explanation why the edits made under my userna...
1       0  d aww he matches this background colour i m se...
2       0  hey man i m really not trying to edit war it s...
3       0  more i can t make any real suggestions on impr...
4       0  you sir are my hero any chance you remember wh...
   is_bad                                         clean_text
0       0  rt as a woman you shouldn t complain about cle...
1       1  rt boy dats cold tyga dwn bad for cuffin dat h...
2       1  rt dawg rt you ever fuck a bitch and she start...
3       1         rt g anderson based she look like a tranny
4       1  rt the shit you hear about me might be true or...


In [35]:
# Stack both corpora into one frame (all df1 rows, then all df2 rows).
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of both inputs are kept, so one label would refer to two different rows.
df = pd.concat([df1, df2], ignore_index=True)
df.head()


Unnamed: 0,is_bad,clean_text
0,0,explanation why the edits made under my userna...
1,0,d aww he matches this background colour i m se...
2,0,hey man i m really not trying to edit war it s...
3,0,more i can t make any real suggestions on impr...
4,0,you sir are my hero any chance you remember wh...


In [36]:
# Number of rows in the combined frame.
len(df)

184354

In [37]:
# Hyper-parameters reused by the tokenizer and the Embedding layer:
# num_words is passed to both, embedding_dim is the embedding width.
num_words, embedding_dim = 50000, 16

# Words that cannot be encoded are represented by the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>', num_words=num_words)

# Build the vocabulary from every cleaned text in the combined frame.
tokenizer.fit_on_texts(df['clean_text'].tolist())

In [38]:
# word_index is the tokenizer's word-to-index mapping built by fit_on_texts.
# NOTE(review): its size shown below (170077) exceeds num_words (50000), so the
# num_words cap evidently is not applied to this mapping itself.
word_index  = tokenizer.word_index
word_count = len(word_index)
word_count

170077

In [39]:
# Encode each cleaned text as a list of integer token ids, then remove the text
# column in place so only the label and its encoded sequence remain.
df['sequences'] = tokenizer.texts_to_sequences(df['clean_text'])
df.drop(columns=['clean_text'], inplace=True)
df.head()

Unnamed: 0,is_bad,sequences
0,0,"[677, 78, 2, 133, 135, 183, 30, 658, 4286, 112..."
1,0,"[154, 13939, 51, 2564, 14, 552, 3698, 4, 68, 4..."
2,0,"[400, 365, 4, 68, 138, 15, 248, 3, 80, 319, 11..."
3,0,"[62, 4, 36, 22, 100, 59, 299, 1402, 16, 2061, ..."
4,0,"[8, 1609, 21, 30, 3281, 59, 1036, 8, 556, 43, ..."


In [40]:
# Class balance of the target: number of rows per `is_bad` value.
df['is_bad'].value_counts()

0    147509
1     36845
Name: is_bad, dtype: int64

In [41]:
# Length of the longest encoded sequence, used below as the padded width and
# as the model's input length.
max_len = max(map(len, df['sequences'].tolist()))
max_len

1403

In [42]:
# Pull the encoded sequences and their labels out of the frame as NumPy arrays.
sequences, labels = (np.array(df[column]) for column in ('sequences', 'is_bad'))

In [43]:
# Inspect the encoded data: an object-dtype array holding one list of token ids per row.
sequences

array([list([677, 78, 2, 133, 135, 183, 30, 658, 4286, 11238, 985, 89, 340, 48, 2164, 22, 11239, 55, 6712, 16, 63, 2546, 153, 4, 2839, 38, 120, 1164, 15391, 2718, 5, 52, 60, 22, 247, 2, 369, 34, 2, 42, 31, 149, 4, 68, 3266, 90]),
       list([154, 13939, 51, 2564, 14, 552, 3698, 4, 68, 4383, 2440, 24, 98, 42, 967, 186]),
       list([400, 365, 4, 68, 138, 15, 248, 3, 80, 319, 11, 19, 55, 10, 14, 564, 9, 2172, 488, 497, 108, 5, 543, 3, 39, 327, 133, 360, 6, 30, 42, 31, 51, 219, 3, 418, 62, 40, 2, 2333, 96, 2, 716, 471]),
       ...,
       list([1377, 7683, 1608, 809, 2558, 6333, 49, 4, 2572, 1645, 4410, 75, 132]),
       list([41920, 234, 2936, 366, 16823, 8, 1187]),
       list([41756, 1, 35043, 19792, 2732, 546, 4387, 6, 3847, 3148, 2064, 1175, 560, 7, 22935])],
      dtype=object)

In [44]:
# Bring every sequence to the same length (max_len). Because max_len is the
# longest sequence present, truncating='post' never removes tokens here.
# The padding side is left at the pad_sequences default.
padded_data = pad_sequences(
    sequences,
    truncating='post',
    maxlen=max_len,
)

In [45]:
# Sanity check: expect (number of rows, max_len).
padded_data.shape

(184354, 1403)

In [50]:
# Binary text classifier assembled layer by layer:
# token ids -> embeddings -> LSTM -> two dense layers with dropout -> sigmoid.
model = Sequential()
model.add(Embedding(num_words, embedding_dim, input_length=max_len))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
# Single sigmoid unit: the output is the predicted probability of `is_bad == 1`.
model.add(Dense(1, activation='sigmoid'))

In [51]:
# Configure training: Adam optimiser, binary cross-entropy loss for the single
# sigmoid output, and accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [52]:
# Keras takes the validation_split fraction from the *last* samples of the
# arrays, before any shuffling. The rows are ordered as all of df1 followed by
# all of df2, so an unshuffled split would draw validation data from the tail
# of the data, where the df2 rows sit, and train on a different mix of sources.
# Shuffle rows and labels together first, with a fixed seed so the split is
# reproducible.
# Note: indexing with the permutation creates shuffled copies of both arrays.
shuffled_rows = np.random.RandomState(42).permutation(len(padded_data))

history = model.fit(
    padded_data[shuffled_rows],
    labels[shuffled_rows],
    validation_split=0.3,
    epochs=6,
    verbose=1,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x22996ebca88>

In [55]:
# Persist the trained model under the 'abusive-text-filter-model' path
# (the log below shows assets being written inside that directory).
model.save('abusive-text-filter-model')



INFO:tensorflow:Assets written to: abusive-text-filter-model\assets


INFO:tensorflow:Assets written to: abusive-text-filter-model\assets


In [56]:
# Persist the fitted tokenizer next to the model: inference must encode text
# with the same vocabulary that was used for training.
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))



In [58]:
# Demonstration: restore the saved model and tokenizer from disk, as a separate
# inference script would.
imported_model = tf.keras.models.load_model('abusive-text-filter-model')

# Only unpickle files you created yourself: loading a pickle from an untrusted
# source can execute arbitrary code.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.loads(handle.read())

In [71]:
# Classify one piece of user-supplied text with the reloaded tokenizer and model.
text = input("Please insert your text")

# Apply the same preprocessing as the training data: clean, encode to token
# ids, then pad to the fixed width (max_len) the model was trained on.
# Padding also keeps the input non-empty if every word is stripped by cleaning.
cleaned_text = cleaning(text)
encoded = tokenizer.texts_to_sequences([cleaned_text])
model_input = pad_sequences(encoded, maxlen=max_len, truncating='post')

# Sequential.predict_classes is not available in current TensorFlow releases.
# For a single sigmoid output, the equivalent is thresholding predict() at 0.5.
probability = float(imported_model.predict(model_input)[0][0])
prediction = int(probability > 0.5)

if prediction == 1:
    print("This is abusive")
else:
    print("This is clean")
 

Please insert your textI always dream about your beautiful face
This is clean
