Melakukan import libraries yang dibutuhkan

In [2]:
import pandas as pd
import tensorflow as tf
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import codecs
# Build the Sastrawi stemmer once at module level; stemWord() reuses it.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

Melakukan reading data hate speech tweet

In [None]:
# Raw tweet dataset.
data = pd.read_csv('data/data.csv', encoding='latin-1')

# Header-less stopword list; its single column (position 0) is named 'stopwords'.
stopword = (
    pd.read_csv('data/stopwordbahasa.csv', header=None)
    .rename(columns={0: 'stopwords'})
)

# Header-less two-column dictionary; column 0 is named 'asli', column 1 'baru'.
kamus_alay = (
    pd.read_csv('data/new_kamusalay.csv', header=None, encoding='latin-1')
    .rename(columns={0: 'asli', 1: 'baru'})
)

data

Menggunakan data HS dan Abusive

In [None]:
# Keep only the tweet text and the two label columns.
# .copy() makes this an independent frame, so the later assignment to
# data['Tweet'] does not write into a slice of the original frame (which
# triggers pandas' SettingWithCopyWarning).
data = data[['Tweet', 'HS', 'Abusive']].copy()
data

Preprocessing Text

In [None]:
# Stem text
def stemWord(x):
    """Return ``x`` after passing it through the module-level ``stemmer``."""
    stemmed = stemmer.stem(x)
    return stemmed

In [None]:
# Lowercase text
def lowercase(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
# Remove unnecessary characters
def removeUnnecessary(text):
    """Remove URLs, the standalone tokens 'rt' and 'user', and extra spaces.

    Parameters
    ----------
    text : str
        Tweet text. The matching is case-sensitive, so callers are expected
        to lowercase the text first (preProcessing does).

    Returns
    -------
    str
        The text with the unwanted fragments removed. Leading and trailing
        spaces are not stripped here.
    """
    # Newlines become a space so the words on either side are not glued together.
    text = re.sub(r'\n', ' ', text)
    # URLs are dropped first so the token removals below never split one.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)
    # Word boundaries keep 'rt' / 'user' inside ordinary words intact
    # (e.g. 'partai' must not become 'paai').
    text = re.sub(r'\brt\b', '', text)    # retweet marker
    text = re.sub(r'\buser\b', '', text)  # user placeholder
    text = re.sub(r'  +', ' ', text)      # collapse runs of spaces
    # NOTE(review): presumably targets leftover hex-escape fragments of emoji
    # (e.g. 'xf0'); the pattern also matches ordinary text containing 'x' --
    # confirm the intent before tightening it. Kept unchanged.
    text = re.sub('x.{3} | x.{2}', '', text)
    return text

In [None]:
# Remove non-alphanumeric characters
def removeNonAlphanumeric(text):
    """Replace each run of non ASCII-alphanumeric characters with one space, then drop digits.

    Spaces left around removed digits are not collapsed.
    """
    spaced = re.sub('[^0-9a-zA-Z]+', ' ', text)
    # Only ASCII letters, ASCII digits and spaces remain at this point,
    # so removing [0-9] removes every digit character.
    return re.sub('[0-9]', '', spaced)

In [None]:
# Remove Indonesian stopwords
def removeStopWord(text):
    """Drop every space-separated token found in ``stopword.stopwords``.

    The remaining tokens are joined with single spaces and the result is
    stripped of leading/trailing whitespace.
    """
    stop_values = stopword.stopwords.values
    kept_tokens = [
        token
        for token in text.split(' ')
        if token and token not in stop_values  # empty tokens come from repeated spaces
    ]
    return ' '.join(kept_tokens).strip()


In [None]:
# Normalize "alay" spellings
# Lookup table built from the dictionary frame: 'asli' values -> 'baru' values.
alay = dict(zip(kamus_alay['asli'], kamus_alay['baru']))


def normalizeAlay(text):
    """Replace each space-separated token that is a key of ``alay`` with its mapped value.

    Tokens without an entry are kept unchanged.
    """
    tokens = text.split(' ')
    return ' '.join(alay.get(token, token) for token in tokens)

In [None]:
def preProcessing(sentences):
    """Run the full cleaning pipeline on one tweet and return the cleaned text.

    The steps are applied in this fixed order: lowercase, strip
    non-alphanumerics and digits, remove unnecessary tokens, normalize alay
    words, stem, remove stopwords.
    """
    pipeline = (
        lowercase,
        removeNonAlphanumeric,
        removeUnnecessary,
        normalizeAlay,
        stemWord,
        removeStopWord,
    )
    for step in pipeline:
        sentences = step(sentences)
    return sentences

In [None]:
# Replace the raw tweet text with its cleaned form, one tweet at a time.
data['Tweet'] = data['Tweet'].apply(func=preProcessing)
data

In [None]:
# Persist the cleaned tweets and both label columns, without the index.
data.to_csv(
    path_or_buf='data_clean.csv',
    index=False,
    columns=['Tweet', 'HS', 'Abusive'],
)

Import Clean Data

In [3]:
# Reload the cleaned dataset from disk.
data = pd.read_csv('data_clean.csv')
# Empty CSV fields are read back as NaN. Fill them with '' before casting,
# otherwise astype(str) turns them into the literal word "nan".
data['Tweet'] = data['Tweet'].fillna('').astype(str)
data

Unnamed: 0,Tweet,HS,Abusive
0,cowok usaha lacak perhati gue lantas remeh per...,1,1
1,telat tau edan sarap gue gaul cigax jifla cal ...,0,1
2,kadang pikir percaya tuhan jatuh kali kali kad...,0,0
3,ku tau mata sipit lihat,0,0
4,kaum cebong kafir lihat dongok dungu haha,1,1
...,...,...,...
13164,bicara ndasmu congor kate anjing,1,1
13165,kasur enak kunyuk,0,1
13166,hati hati bisu bosan huftxaa,0,0
13167,bom real mudah deteksi bom kubur dahsyat ledak...,0,0


In [4]:
# Also export the cleaned frame as an Excel workbook.
data.to_excel(excel_writer='data_clean.xlsx')

Split Data Into Train, Validation and Test

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
# Hold out 20% of the rows as the validation set (labels: the 'HS' column).
x_train, x_val, y_train, y_val = train_test_split(
    data['Tweet'],
    data['HS'],
    test_size=0.2,
    random_state=1,
)

# Hold out 25% of the remaining training rows as the test set
# (0.25 * 0.8 = 0.2 of the full data).
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

Tokenize the train, val and test data

In [10]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
vocab_size, max_len = 1000, 100
# Both truncation and padding are applied at the end of each sequence.
trunc_type = padding_type = 'post'
# Token used for out-of-vocabulary words.
oov_tok = "<OOV>"

In [11]:
# Fit the vocabulary on the training tweets only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(x_train)

In [12]:
# Training tweets -> integer sequences -> fixed-length arrays.
sequences = tokenizer.texts_to_sequences(x_train)
padded = pad_sequences(
    sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

# Validation tweets go through the same tokenizer and padding settings.
sequences_val = tokenizer.texts_to_sequences(x_val)
padded_val = pad_sequences(
    sequences_val,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

Create the model architecture

In [14]:
# Embedding -> two stacked bidirectional LSTMs -> dense head with a sigmoid output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64, input_length=max_len))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           64000     
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 245,505
Trainable params: 245,505
Non-trai

Setting the loss function, optimizer and metrics for the model training

In [15]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Train the model

In [16]:
# Train for 20 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    x=padded,
    y=y_train,
    validation_data=(padded_val, y_val),
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
