In [26]:
import pickle
import math
import pandas as pd
import numpy as np
from numpy import array

# Neural Net Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Neural Net Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

# Neural Net Training
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

from pickle import load
import re
import sklearn.utils 
import math


# --- Clean up the tweet strings --- 
def regex_to_pattern_objects(regex_list):
    #regex_list = array of strings to be interpreted as regex
    pattern_objs = []
    for regex in regex_list:
        pattern_objs.append(re.compile(regex)) 
    return pattern_objs 

def remove_regex(tweet, *bad_patterns):
    #tweet = string
    #bad_patterns = a list of pattern objects to remove
    for pattern in bad_patterns:
        tweet = re.sub(pattern, "", tweet)
    return tweet
        
def to_lowercase(tweet): #is this necessary lol
    return tweet.lower()

#TODO: modify the regex list if needed
regex_list = ['\n', 'RT', '&amp', '&#\d*;', '@\S*:', '@\S*', '!+', '"+', 'https?:\/\/t\.co\/\w*', '#', '&\S*;']
pattern_list = regex_to_pattern_objects(regex_list)

In [27]:
# Import the data
train_df = pd.read_csv('labeled_data.csv')
train_df["tweet"] = train_df["tweet"].apply(remove_regex, args = (pattern_list))
train_df["tweet"] = train_df["tweet"].apply(to_lowercase)
hate_speech = train_df[train_df['class'] == 0]["tweet"] 

In [28]:
hate_speech

85                                            queer gaywad
89         alsarabsss hes a beaner smh you can tell hes...
110        you're fucking gay, blacklisted hoe holding ...
184       lmfaoooo i hate black people  this is why the...
202                             at least i'm not a nigger 
                               ...                        
24576                  this guy is the biggest faggot omfg
24685    which one of these names is more offensive kik...
24751           you a pussy ass nigga and i know it nigga.
24776                                   you're all niggers
24777    you're such a retard i hope you get type 2 dia...
Name: tweet, Length: 1430, dtype: object

In [29]:

max_words =5000# Max size of the dictionary
tokens = Tokenizer(num_words=max_words)
tokens.fit_on_texts(hate_speech.values)
sequences = tokens.texts_to_sequences(hate_speech.values)
print(sequences[:3])

[[98, 1386], [1387, 429, 1, 230, 127, 2, 63, 156, 429, 1, 430], [57, 30, 95, 1388, 72, 652, 71, 28, 1389, 528]]


In [30]:
words = [item for sublist in sequences for item in sublist]
num_words = len(tokens.word_index)
print('Number of words in the document: ', num_words)

Number of words in the document:  3686


In [31]:
sentence_len = 10
pred_len = 1
train_len = sentence_len - pred_len
seq = []
# Sliding window to generate train data
for i in range(len(words)-sentence_len):
    seq.append(words[i:i+sentence_len])
# Reverse dictionary to decode tokenized sequences back to words
reverse_word_map = dict(map(reversed, tokens.word_index.items()))

In [32]:
trainX = []
trainy = []
for i in seq:
    trainX.append(i[:train_len])
    trainy.append(i[-1])
print(len(trainX),len(trainy))

17946 17946


In [33]:
num_words

3686

In [34]:
# define model
model = Sequential([
    Embedding(num_words+1, 50, input_length=train_len),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(num_words-2, activation='softmax')
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 9, 50)             184350    
_________________________________________________________________
lstm_8 (LSTM)                (None, 9, 100)            60400     
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 3684)              372084    
Total params: 707,334
Trainable params: 707,334
Non-trainable params: 0
_________________________________________________________________


In [11]:


# Train model with checkpoints
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
filepath = "./hate_speech.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history = model.fit(np.asarray(trainX),
         pd.get_dummies(np.asarray(trainy)),
         epochs = 60,
         batch_size = 128,
         callbacks = callbacks_list,
         verbose = 1)

Epoch 1/60
Epoch 00001: loss improved from inf to 7.14863, saving model to .\hate_speech.hdf5
Epoch 2/60
Epoch 00002: loss improved from 7.14863 to 6.71247, saving model to .\hate_speech.hdf5
Epoch 3/60
Epoch 00003: loss improved from 6.71247 to 6.66649, saving model to .\hate_speech.hdf5
Epoch 4/60
Epoch 00004: loss improved from 6.66649 to 6.63310, saving model to .\hate_speech.hdf5
Epoch 5/60
Epoch 00005: loss improved from 6.63310 to 6.58860, saving model to .\hate_speech.hdf5
Epoch 6/60
Epoch 00006: loss improved from 6.58860 to 6.51183, saving model to .\hate_speech.hdf5
Epoch 7/60
Epoch 00007: loss improved from 6.51183 to 6.42149, saving model to .\hate_speech.hdf5
Epoch 8/60
Epoch 00008: loss improved from 6.42149 to 6.31962, saving model to .\hate_speech.hdf5
Epoch 9/60
Epoch 00009: loss improved from 6.31962 to 6.20392, saving model to .\hate_speech.hdf5
Epoch 10/60
Epoch 00010: loss improved from 6.20392 to 6.09270, saving model to .\hate_speech.hdf5
Epoch 11/60
Epoch 00011

In [7]:
num_words

0

In [35]:

model.load_weights('hate_speech.hdf5')

In [36]:
def generate_hate(model,text,length):

    # Tokenize the input string
    passing_tokens = tokens.texts_to_sequences([text])
    length = length+len(passing_tokens[0])
    # If sentence is not as long as the desired sentence length, we need to 'pad sequence' so that
    # the array input shape is correct going into our LSTM. the `pad_sequences` function adds 
    # zeroes to the left side of our sequence until it becomes 19 long, the number of input features.
    while len(passing_tokens[0]) < length:
        padded_sentence = pad_sequences(passing_tokens[-19:],maxlen=19)
        op = model.predict(np.asarray(padded_sentence).reshape(1,-1))
        passing_tokens[0].append(op.argmax()+1)
        
    return " ".join(map(lambda x : reverse_word_map[x],passing_tokens[0]))

In [37]:
hate_speech

85                                            queer gaywad
89         alsarabsss hes a beaner smh you can tell hes...
110        you're fucking gay, blacklisted hoe holding ...
184       lmfaoooo i hate black people  this is why the...
202                             at least i'm not a nigger 
                               ...                        
24576                  this guy is the biggest faggot omfg
24685    which one of these names is more offensive kik...
24751           you a pussy ass nigga and i know it nigga.
24776                                   you're all niggers
24777    you're such a retard i hope you get type 2 dia...
Name: tweet, Length: 1430, dtype: object

In [38]:
for i in range(10):
    test_words = hate_speech.iloc[i].split()
    test_string = hate_speech.iloc[i]
    new_speech = generate_hate(model,test_string,len(test_words))
    new_words = new_speech.split()
    print('New hate speech: ',' '.join([j for j in new_words[len(test_words):]]))

New hate speech:  he's white
New hate speech:  he's he's a faggot faggot if he stupid bitch don't bitch
New hate speech:  faggot bitch he's a faggot a hoe u don't bitch
New hate speech:  he's he's a faggot for the school im was trash me trash trash
New hate speech:  when he's trash trash trash queer
New hate speech:  is a faggot bitch he pussy if the faggot is im you say it your can unfollow he retarded
New hate speech:  fag this trash and trash if you look this a faggot and because he trash if he give you
New hate speech:  niggas don't bitch he's if the
New hate speech:  y was the faggot pussy sit it as the
New hate speech:  bitch you can and because a hoes and you can can fuckin to fuckin feminist a


#### Reference - 'Simple Text Generation' https://towardsdatascience.com/simple-text-generation-d1c93f43f340