In [82]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

# save for later
#from sklearn.decomposition import PCA
#import seaborn as sns
#import matplotlib.pyplot as plt
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
#from sklearn.linear_model import LogisticRegression
#from sklearn.datasets import make_classification
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import classification_report

#os.chdir(r"C:\Users\raned\Documents\GitHub\PostModeration")


This notebook will start off with preprocessing the two csv files to train different supervised learning models. 
- Removal of usernames, URLs, and special characters
- Lowercasing text
- Tokenization (nltk or spaCy): breaking text into smaller units 
- Stopword removal: remove common words that become index terms ("and", "or", "the", "in")
- Lemmatization: reduces words to their base or dictionary form
- TF-IDF vectorization for feature extraction: a technique that converts text data into numerical vectors, representing the importance of words in a document relative to a collection of documents, by combining term frequency with inverse document frequency

In [3]:
df = pd.read_csv("TrainingData/labeled_data.csv")
print(df.describe())
print(df.shape)
print(df.head())
print(df.info())


         Unnamed: 0         count   hate_speech  offensive_language  \
count  24783.000000  24783.000000  24783.000000        24783.000000   
mean   12681.192027      3.243473      0.280515            2.413711   
std     7299.553863      0.883060      0.631851            1.399459   
min        0.000000      3.000000      0.000000            0.000000   
25%     6372.500000      3.000000      0.000000            2.000000   
50%    12703.000000      3.000000      0.000000            3.000000   
75%    18995.500000      3.000000      0.000000            3.000000   
max    25296.000000      9.000000      7.000000            9.000000   

            neither         class  
count  24783.000000  24783.000000  
mean       0.549247      1.110277  
std        1.113299      0.462089  
min        0.000000      0.000000  
25%        0.000000      1.000000  
50%        0.000000      1.000000  
75%        0.000000      1.000000  
max        9.000000      2.000000  
(24783, 7)
   Unnamed: 0  count  hat

**count**: number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were

**hate_speech**: number of CF users who judged the tweet to be hate speech

**offensive_language**: number of CF users who judged the tweet to be offensive

**neither**: number of CF users who judged the tweet to be neither offensive nor non-offensive

**class**: class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither


In [4]:
# scrubbing text: removing usernames, URLs, special characters and ensuring all text is lowercase
tweet_column = df['tweet'].astype(str).str.casefold()  # lowercase
tweet_column.head()


0    !!! rt @mayasolovely: as a woman you shouldn't...
1    !!!!! rt @mleew17: boy dats cold...tyga dwn ba...
2    !!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...
3    !!!!!!!!! rt @c_g_anderson: @viva_based she lo...
4    !!!!!!!!!!!!! rt @shenikaroberts: the shit you...
Name: tweet, dtype: object

In [5]:
#removes usernames first, urls, then any special characters
clean_tweet = tweet_column.str.replace(r'(rt)?\s?@\w+:?', ' ', regex=True).str.replace(r'http.+', ' ', regex=True).str.replace(r'\W+', ' ', regex=True)
clean_tweet.head()


0     as a woman you shouldn t complain about clean...
1     boy dats cold tyga dwn bad for cuffin dat hoe...
2     dawg you ever fuck a bitch and she start to c...
3                               she look like a tranny
4     the shit you hear about me might be true or i...
Name: tweet, dtype: object

In [6]:
#tokenization, stop words, and lemmatization
from nltk.corpus import stopwords

nltk.download('punkt_tab')
#nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) #stopwords

def clean_tokenize(text): 
    tokens = word_tokenize(text)  # Keeps contractions like "don't"; tokenization
    tokens = [t.lower() for t in tokens if t.isalpha() or "'" in t]  # keep letters + contractions
    tokens = [t for t in tokens if t != "rt" and t not in stop_words]  # remove 'rt' and stopwords
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens] #lemmatization
    return lemmatized

cleaned_tokens = clean_tweet.apply(clean_tokenize)
print(cleaned_tokens.head())




[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0    [woman, complain, cleaning, house, amp, man, a...
1    [boy, dat, cold, tyga, dwn, bad, cuffin, dat, ...
2    [dawg, ever, fuck, bitch, start, cry, confused...
3                                 [look, like, tranny]
4    [shit, hear, might, true, might, faker, bitch,...
Name: tweet, dtype: object


In [7]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

df1['cleaned_text'] = cleaned_tokens.apply(lambda tokens: ' '.join(tokens))


tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df1['cleaned_text'])

feature_names = tfidf.get_feature_names_out()
print(feature_names[:100])  
print(df1)


['aa' 'aaaaaaaaand' 'aaahhhhh' 'aahahah' 'aaliyah' 'aan' 'aap' 'aaron'
 'aaronmacgruder' 'aaryn' 'ab' 'abandonado' 'abbey' 'abby' 'abc' 'abdelka'
 'abduction' 'abdullah' 'abdurahman' 'abed' 'abel' 'aberdeen' 'ability'
 'able' 'abo' 'aborted' 'abortion' 'abou' 'abound' 'abouta' 'abouttime'
 'abraham' 'absent' 'absolute' 'absolutely' 'absoluteyvile' 'absolved'
 'abstract' 'absurd' 'abt' 'abu' 'abundance' 'abus' 'abuse' 'abused'
 'abuser' 'abusive' 'ac' 'aca' 'acab' 'academic' 'accelerated' 'accent'
 'accept' 'acceptable' 'acceptance' 'accepted' 'access' 'accessible'
 'accessorize' 'accessory' 'accident' 'accidentally' 'accipiter'
 'accipitridae' 'accnt' 'accolade' 'accompanied' 'accord' 'according'
 'accordingly' 'account' 'accountable' 'accountant' 'acct' 'accuracy'
 'accurate' 'accurately' 'accused' 'accuses' 'accustomed' 'acdc' 'ace'
 'aceptar' 'aceves' 'ach' 'achieve' 'achilles' 'aching' 'acid' 'ackin'
 'acknowledge' 'acknowledged' 'acknowledging' 'acl' 'acne' 'acoustic'
 'acquire' '

In [8]:
#PREPROCESSING FOR HateSpeechDatasetBalanced.csv

#Load dataset and take a 27,000-row sample; easier to have random 27,000 samples due to how big the actual dataset is 
df = pd.read_csv("TrainingData/HateSpeechDatasetBalanced.csv")
df_subset = df.sample(n=27000, random_state=42).copy()

df_subset['Content'] = df_subset['Content'].astype(str).str.casefold()

def clean_tokenize(text):
    tokens = word_tokenize(text)  # splits into words and keeps contractions
    tokens = [t for t in tokens if t.isalpha()]  # keep only alphabetic tokens
    tokens = [t for t in tokens if t not in stop_words]
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens]
    return lemmatized


df_subset['cleaned_tokens'] = df_subset['Content'].apply(clean_tokenize)
df_subset['cleaned_text'] = df_subset['cleaned_tokens'].apply(lambda x: ' '.join(x))


df_subset['cleaned_text'] = df_subset['cleaned_text'].fillna('')


tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df_subset['cleaned_text'])  # Features
y = df_subset['Label']  # Target labels

print("TF-IDF shape:", X_tfidf.shape)
print(tfidf.get_feature_names_out()[:100])

TF-IDF shape: (27000, 33004)
['aa' 'aaa' 'aaaa' 'aaaaa' 'aaaaaaaaaaaaaaaaa' 'aaaaaaacopyrighta'
 'aaaaaaareaareaaaaaaaaaaaaaa' 'aaaaarrrrrggggghhhhh' 'aaaacg'
 'aaadonaaat' 'aaah' 'aaand' 'aachen' 'aaeyou' 'aag' 'aah' 'aaib' 'aaj'
 'aak' 'aalukkoru' 'aand' 'aanti' 'aap' 'aardvark' 'aaron' 'aaroncrick'
 'aarp' 'aau' 'ab' 'aba' 'aback' 'abacus' 'abandon' 'abandoned' 'abash'
 'abated' 'abaxial' 'abb' 'abba' 'abbey' 'abbott' 'abbreviated'
 'abbreviation' 'abc' 'abd' 'abdf' 'abdomen' 'abduce' 'abdul' 'abdullah'
 'abe' 'abecedary' 'abeh' 'abel' 'abelson' 'aberdeen' 'abet' 'abeyance'
 'abf' 'abhishek' 'abhorrent' 'abidance' 'abide' 'abiding' 'abigail'
 'ability' 'abiogenic' 'abject' 'abk' 'abkhazia' 'able' 'abm' 'abnegation'
 'abner' 'abnormal' 'abnormality' 'aboard' 'abode' 'abolish' 'abolished'
 'abolishment' 'abolitionist' 'abominable' 'abominably' 'abominate'
 'abomination' 'aboridzinima' 'aboriginal' 'aborigine' 'abort' 'aborted'
 'abortion' 'abortive' 'abortively' 'abound' 'abp' 'abraha

---
---
---


## Start Here
The code below combines the two datasets into one dataframe before any thing gets preprocessed. Since SVMs and LSTMs require different formats for their text input data, I created new columns 'tokenized_clean_text' and 'cleaned_text' that contain different formats of text so that training_data_df['Content'] doesn't have to be re-preprocessed all over again wheneveer we switch between models. 

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

# since both datasets have "Content/Tweet" and "Label/class" columns, the following code will merge the two datasets into one dataframe while maintaining balance

#============================================ labeled_data.csv ============================================
df1 = pd.read_csv("TrainingData/labeled_data.csv")
#reduced_df1['class'].value_counts()    #1430 tweets marked at hate-speech, so we will extract 1430 marked for hate speech and 1430 that aren't

# get rid of all the extra columns that aren't relevant
reduced_df1 = df1[['tweet','class']]

# extracts 1430 marked for hate speech and 1430 that aren't and combine into one df
hatespeech = reduced_df1[reduced_df1['class']==0].sample(n=1430, random_state=42).copy() # hate speech
nonHateful = reduced_df1[reduced_df1['class']==2].sample(n=1430, random_state=42).copy() # not hate speech
sampled_hatespeech_df = pd.concat([hatespeech,nonHateful])

# edit the values in 'Class' so that they match the values for HateSpeechDatasetBalanced.csv 
# Clean: 0, Hate speech: 1
sampled_hatespeech_df['Content'] = sampled_hatespeech_df['tweet']
sampled_hatespeech_df['Label'] = sampled_hatespeech_df['class'].replace(to_replace=[0,2], value = [1,0])
sampled_hatespeech_df = sampled_hatespeech_df.drop(columns=['tweet','class'])


#============================================ HateSpeechDatasetBalanced.csv ============================================
df = pd.read_csv("TrainingData/HateSpeechDatasetBalanced.csv")
df_subset = df.sample(n=27000, random_state=42).copy()

temp_df = df_subset[['Content','Label']]

# combine both datasets into one:
training_data_df = pd.concat([sampled_hatespeech_df,temp_df])



training_data_df['Content']= training_data_df['Content'].astype(str).str.casefold()
clean_content = training_data_df['Content'].str.replace(r'(rt)?\s?@\w+:?', ' ', regex=True).str.replace(r'http.+', ' ', regex=True)
training_data_df['Content'] = clean_content




In [87]:
training_data_df.head()

Unnamed: 0,Content,Label
2499,i do not like talking to you faggot and i di...,1
19697,what straight guys take a picture of themsel...,1
5749,"america today, the rule of thumb is: when i...",1
4615,"you tell me, coon.",1
15140,this nigguh chris paul,1


### SVM

In [88]:
# Cleaning for models that require sparse vectors as text  (SVMs,Logistic Regression, etc)

#tokenization, stop words, and lemmatization
from nltk.corpus import stopwords

nltk.download('punkt_tab')
#nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) #stopwords

def clean_tokenize(text):
    tokens = word_tokenize(text)  # splits into words and keeps contractions
    tokens = [t for t in tokens if t.isalpha()]  # keep only alphabetic tokens
    tokens = [t for t in tokens if t != "rt" and t not in stop_words]  # remove 'rt' and stopwords
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens]
    return lemmatized


training_data_df['cleaned_tokens'] = training_data_df['Content'].apply(clean_tokenize)
training_data_df['tokenized_cleaned_text'] = training_data_df['cleaned_tokens'].apply(lambda x: ' '.join(x))
training_data_df['tokenized_cleaned_text'] = training_data_df['tokenized_cleaned_text'].fillna('')

print(training_data_df.head())

tfidf = TfidfVectorizer(ngram_range=(1,3))      # playing around with ngram_range, might go up to 3

X = tfidf.fit_transform(training_data_df['tokenized_cleaned_text'])  # Features
y = training_data_df['Label']  # Target labels

print("TF-IDF shape:", X.shape)
print(tfidf.get_feature_names_out()[:100])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                                 Content  Label  \
2499     i do not like talking to you faggot and i di...      1   
19697    what straight guys take a picture of themsel...      1   
5749      america today, the rule of thumb is: when i...      1   
4615                                  you tell me, coon.      1   
15140                             this nigguh chris paul      1   

                                          cleaned_tokens  \
2499           [like, talking, faggot, nicely, way, fag]   
19697    [straight, guy, take, picture, naked, hot, fag]   
5749   [america, today, rule, thumb, doubt, blame, wh...   
4615                                        [tell, coon]   
15140                              [nigguh, chris, paul]   

                            tokenized_cleaned_text  
2499            like talking faggot nicely way fag  
19697      straight guy take picture naked hot fag  
5749   america today rule thumb doubt blame whitey  
4615                    

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC  # for text, linear SVM usually works best
from sklearn.metrics import classification_report, confusion_matrix


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearSVC(C=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

legend = {0: "' is permitted", 1: "' has been flagged for hate speech."}


def moderate(tweet):
    tweet_clean = " ".join(clean_tokenize(tweet))
    tweet_vec = tfidf.transform([tweet_clean])
    pred = model.predict(tweet_vec)[0]
    print("The tweet: '",tweet,legend[pred],". Value: ",pred)
    



moderate("stfu you piece of trash")  # 1
moderate("Hope you're having a good day you cute piece of shit")  # 0
moderate("hello there")
moderate("today was fine")



[[2305  604]
 [ 555 2508]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      2909
           1       0.81      0.82      0.81      3063

    accuracy                           0.81      5972
   macro avg       0.81      0.81      0.81      5972
weighted avg       0.81      0.81      0.81      5972

The tweet: ' stfu you piece of trash ' has been flagged for hate speech. . Value:  1
The tweet: ' Hope you're having a good day you cute piece of shit ' is permitted . Value:  0
The tweet: ' hello there ' is permitted . Value:  0
The tweet: ' today was fine ' is permitted . Value:  0


In [70]:
# hyperparamter tuning
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LinearSVC(), param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)



Best Params: {'C': 1}




### LSTM

In [75]:
# Cleaning data for models that require sequences of integers as text input  (LSTM/Deep Nets)
import re

def clean_for_lstm(text):
    text = re.sub(r'(rt)?\s?@\w+:?', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text.lower().strip()

# clean_tweet = tweet_column.str.replace(r'(rt)?\s?@\w+:?', ' ', regex=True).str.replace(r'http.+', ' ', regex=True).str.replace(r'\W+', ' ', regex=True)
training_data_df['cleaned_text'] = training_data_df['Content'].apply(clean_for_lstm)
training_data_df.head()


Unnamed: 0,Content,Label,cleaned_tokens,tokenized_cleaned_text,cleaned_text
2499,@AustinG1135 I do not like talking to you fagg...,1,"[I, like, talking, faggot, I, nicely, way, fag]",I like talking faggot I nicely way fag,i do not like talking to you faggot and i did ...
19697,RT @mitchmancuso: @BrantPrintup:What straight ...,1,"[RT, mitchmancuso, BrantPrintup, What, straigh...",RT mitchmancuso BrantPrintup What straight guy...,rtwhat straight guys take a picture of themsel...
5749,"@clinchmtn316 @sixonesixband AMERICA today, th...",1,"[sixonesixband, AMERICA, today, rule, thumb, d...",sixonesixband AMERICA today rule thumb doubt b...,america today the rule of thumb is when in dou...
4615,"@STACCS_WNT_FOLD you tell me, coon.",1,"[tell, coon]",tell coon,you tell me coon
15140,RT @FAAMMoverALL: This nigguh Chris Paul,1,"[RT, FAAMMoverALL, This, nigguh, Chris, Paul]",RT FAAMMoverALL This nigguh Chris Paul,rt this nigguh chris paul


In [79]:
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from keras._tf_keras.keras.models import Sequential
from keras._tf_keras.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# LSTM requires sequences of integers, NOT vectors
texts = training_data_df['cleaned_text'].values
labels = training_data_df['Label'].values

# Tokenize
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=50)  # 50 = max length

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels)
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=50),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(4, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

history = model.fit(X_train, y_train,
                    epochs=5, # 7 epochs: 77% , 5 epochs had 77% accuracy, 3 epochs has 79%
                    batch_size=64,
                    validation_split=0.1)

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))




Epoch 1/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 34ms/step - accuracy: 0.6169 - loss: 0.6548 - val_accuracy: 0.7422 - val_loss: 0.5879
Epoch 2/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 39ms/step - accuracy: 0.8186 - loss: 0.4985 - val_accuracy: 0.7890 - val_loss: 0.5251
Epoch 3/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - accuracy: 0.8477 - loss: 0.4271 - val_accuracy: 0.7781 - val_loss: 0.5217
Epoch 4/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 39ms/step - accuracy: 0.8678 - loss: 0.3755 - val_accuracy: 0.7815 - val_loss: 0.5507
Epoch 5/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 39ms/step - accuracy: 0.8922 - loss: 0.3195 - val_accuracy: 0.7769 - val_loss: 0.5904
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
              precision    recall  f1-score   support

           0       0.78      0.77      0.78      2968
 

In [80]:
def moderate_lstm(tweet):
    cleaned = clean_for_lstm(tweet)
    print(cleaned)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, padding='post', maxlen=50)
    pred = model.predict(padded)[0][0]
    xx = lambda x : 1 if x > 0.5 else 0
    print('The tweet: "',tweet,legend[xx(pred)], ". Value: ",pred )
    return 1 if pred > 0.5 else 0


moderate_lstm("Hope you're having a good day you cute piece of shit")  # 0
moderate_lstm("hello there")
moderate_lstm("today was fine")
moderate_lstm("stfu you piece of trash")

hope youre having a good day you cute piece of shit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
The tweet: " Hope you're having a good day you cute piece of shit ' is permitted . Value:  0.23419806
hello there
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
The tweet: " hello there ' is permitted . Value:  0.23175243
today was fine
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
The tweet: " today was fine ' has been flagged for hate speech. . Value:  0.9286559
stfu you piece of trash
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
The tweet: " stfu you piece of trash ' has been flagged for hate speech. . Value:  0.9852149


1

In [81]:
# 64,32,4,1 and 5 epochs, batch size 32
moderate_lstm("Hope you're having a good day you cute piece of shit")  # 0
moderate_lstm("hello there")
moderate_lstm("today was fine")
moderate_lstm("stfu you piece of trash")

hope youre having a good day you cute piece of shit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
The tweet: " Hope you're having a good day you cute piece of shit ' is permitted . Value:  0.23419806
hello there
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
The tweet: " hello there ' is permitted . Value:  0.23175243
today was fine
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
The tweet: " today was fine ' has been flagged for hate speech. . Value:  0.9286559
stfu you piece of trash
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
The tweet: " stfu you piece of trash ' has been flagged for hate speech. . Value:  0.9852149


1