In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import csv
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
from keras import utils
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional
from keras.models import Sequential
from numpy import array
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Tweets together with their labels.
data = pd.read_csv('/content/drive/MyDrive/capstone/dataset_safe/data.csv', encoding='latin-1')

# Slang ("alay") dictionary. It is read with header=None, so pandas labels the two
# columns 0 and 1; give them meaningful names straight away.
alay_dict = pd.read_csv(
    '/content/drive/MyDrive/capstone/dataset_safe/new_kamusalay.csv',
    encoding='latin-1',
    header=None,
).rename(columns={0: 'original', 1: 'replacement'})

# Stopword list.
# NOTE(review): this file is read WITH header inference, so there is no column labelled 0
# and the rename below looks like a no-op. Later code reads `stopwords.stopword`, which
# presumably works because the CSV's own header is "stopword" -- confirm against the file.
stopwords = pd.read_csv(
    '/content/drive/MyDrive/capstone/dataset_safe/stopword.csv',
    encoding='latin-1',
).rename(columns={0: 'stopword'})

In [None]:
# Tokenisation / padding settings shared by the cells below.
OOV_TOKEN = "<OOV>"     # token the Tokenizer substitutes for words outside its vocabulary
MAXLEN = 300            # every sequence is padded or truncated to this many token ids
PADDING = "post"        # `padding` argument handed to pad_sequences
TRUNCATING = "post"     # `truncating` argument handed to pad_sequences

# Width of each word vector in the Embedding layer.
EMBEDDING_DIM = 64

In [None]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip Twitter-specific noise from a tweet and return the cleaned string.

    The patterns are written in lower case, so ``text`` is expected to have been
    lower-cased already.

    Removed, in order: newlines, the standalone tokens ``rt`` (retweet marker) and
    ``user`` (anonymised username), URLs, digits, non-ASCII characters, and
    ``@mention`` / ``#hashtag`` tokens. Runs of whitespace are collapsed to a single
    space and leading/trailing whitespace is dropped.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned tweet as a ``str``.
    """
    text = re.sub(r'\n', ' ', text)  # newlines become spaces
    # Word boundaries matter here: a bare 'rt' / 'user' pattern also eats the middle
    # of ordinary words ("partai" -> "pa ai", "username" -> "name").
    text = re.sub(r'\brt\b', ' ', text)    # retweet marker
    text = re.sub(r'\buser\b', ' ', text)  # anonymised username token
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)  # URLs
    text = re.sub(r'\d+', '', text)  # digits
    # 'ignore' drops non-ASCII characters; 'replace' would leave a '?' for each one.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Drop hashtags, mentions and any remaining scheme://... link, then let
    # split()/join collapse every run of whitespace and trim both ends.
    text = ' '.join(re.sub(r'([@#][A-Za-z0-9]+)|(\w+://\S+)', ' ', text).split())
    return text

def remove_nonaplhanumeric(text):
    """Replace every run of characters other than ASCII letters/digits with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

def normalize_alay(text):
    """Swap each space-separated word found in ``alay_dict_map`` for its replacement.

    Words without an entry in the map are kept unchanged.
    """
    normalized = []
    for token in text.split(' '):
        normalized.append(alay_dict_map.get(token, token))
    return ' '.join(normalized)

def remove_stopword(text):
    """Drop every word listed in ``stopwords.stopword`` from ``text``.

    Words are taken to be separated by single spaces. The result has no empty
    tokens (so no doubled spaces) and no leading/trailing whitespace.

    Args:
        text: The input string.

    Returns:
        ``text`` without stopwords, as a ``str``.
    """
    # Build the lookup once per call. Testing membership against the NumPy array
    # directly rescans the whole stopword list for every single word.
    stopword_set = set(stopwords.stopword.values)
    kept = [word for word in text.split(' ') if word and word not in stopword_set]
    # strip() also trims non-space whitespace that may sit at either end.
    return ' '.join(kept).strip()

def preprocess(text):
    """Run the whole cleaning pipeline on one tweet and return the cleaned string.

    Steps: lower-case, strip Twitter noise (URLs, mentions, hashtags, retweet/user
    markers, digits), replace remaining non-alphanumeric characters with spaces,
    normalise slang via the alay dictionary, and remove stopwords.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned tweet as a ``str``.
    """
    text = lowercase(text)  # 1: the patterns in the next step are lower case
    # 2: must run BEFORE punctuation is stripped. The URL, @mention and #hashtag
    #    patterns need '.', '://', '@' and '#' to still be present to match.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)  # 3
    text = normalize_alay(text)  # 4
    text = remove_stopword(text)  # 5
    return text

In [None]:
# Lookup table used by normalize_alay: slang spelling -> replacement.
alay_dict_map = {
    slang: replacement
    for slang, replacement in zip(alay_dict['original'], alay_dict['replacement'])
}

In [None]:
# Keep only the tweet text and the HS label, cleaning the text on the way.
# `.assign` returns a new frame, so no column is written into a slice of another
# DataFrame (plain column assignment on the sliced frame raises pandas'
# SettingWithCopyWarning).
data = data[['Tweet', 'HS']].assign(
    Tweet=lambda frame: frame['Tweet'].apply(preprocess)
)

In [None]:
train_size = 10000  # first `train_size` rows train the model; the rest validate it

sentences, labels = data['Tweet'], data['HS']

# Split by position, in the order the rows appear in `data` (no shuffling).
train_sentences, val_sentences = sentences[:train_size], sentences[train_size:]
train_labels, val_labels = labels[:train_size], labels[train_size:]

In [None]:
def fit_tokenizer(sentences, oov_token):
    """Create a Keras ``Tokenizer`` and fit its vocabulary on ``sentences``.

    Args:
        sentences: The texts to learn the vocabulary from.
        oov_token: Token passed to the ``Tokenizer`` as its ``oov_token``.

    Returns:
        The fitted ``Tokenizer``.
    """
    fitted = Tokenizer(oov_token=oov_token)
    fitted.fit_on_texts(sentences)
    return fitted

In [None]:
# The vocabulary is learned from the training split only.
tokenizer = fit_tokenizer(train_sentences, oov_token=OOV_TOKEN)
word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)  # number of entries in the tokenizer's word index

In [None]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """Turn texts into integer sequences of uniform length.

    Args:
        sentences: The texts to convert.
        tokenizer: A fitted tokenizer providing ``texts_to_sequences``.
        padding: ``padding`` argument forwarded to ``pad_sequences``.
        truncating: ``truncating`` argument forwarded to ``pad_sequences``.
        maxlen: ``maxlen`` argument forwarded to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the tokenised texts.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=maxlen,
        padding=padding,
        truncating=truncating,
    )

In [None]:
# Both splits go through the same tokenizer and the same padding settings.
sentences_train_pad_trunc_seq = seq_pad_and_trunc(
    train_sentences, tokenizer, padding=PADDING, truncating=TRUNCATING, maxlen=MAXLEN
)
sentences_val_pad_trunc_seq = seq_pad_and_trunc(
    val_sentences, tokenizer, padding=PADDING, truncating=TRUNCATING, maxlen=MAXLEN
)

In [None]:
# Inputs and targets as NumPy arrays, as consumed by model.fit / model.evaluate below.
training_padded, training_labels = np.array(sentences_train_pad_trunc_seq), np.array(train_labels)
testing_padded, testing_labels = np.array(sentences_val_pad_trunc_seq), np.array(val_labels)

In [None]:
# Report how many entries the tokenizer's word index holds.
print(f"Total words {VOCAB_SIZE}")

Total words 16574


In [None]:
# All-zero matrix with VOCAB_SIZE + 1 rows and EMBEDDING_DIM columns.
# NOTE(review): nothing visible in this notebook fills it with pretrained vectors, so
# any layer initialised from it starts with identical (zero) vectors for every token.
EMBEDDINGS_MATRIX = np.zeros(shape=(VOCAB_SIZE + 1, EMBEDDING_DIM))

In [None]:
# Architecture: Embedding -> two stacked bidirectional LSTMs -> one sigmoid unit
# (binary output for the HS label).
#
# The embedding keeps Keras' default random initialiser. Do not seed it with
# EMBEDDINGS_MATRIX: that matrix is all zeros, so every token would start from the same
# vector and the recurrent layers would receive identical input for every tweet.
embedding_layer = Embedding(VOCAB_SIZE+1, EMBEDDING_DIM, input_length=MAXLEN)
model = Sequential()
model.add(embedding_layer)
# return_sequences=True so the second recurrent layer receives the whole sequence.
model.add(Bidirectional(LSTM(units=64, recurrent_dropout = 0.3, dropout = 0.3, return_sequences = True)))
model.add(Bidirectional(LSTM(units=32, recurrent_dropout = 0.1, dropout = 0.1)))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 300, 64)           1060800   
                                                                 
 bidirectional (Bidirectiona  (None, 300, 128)         66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,168,129
Trainable params: 1,168,129
Non-trainable params: 0
_________________________________________________________________


In [None]:
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated and
# makes the optimizer emit a warning when it is constructed.
# NOTE(review): 1e-5 is a very small step for SGD -- the captured log shows the loss
# still at 0.6922 in epoch 2. Consider a larger value; left unchanged here.
opt = SGD(learning_rate=0.00001, momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Train the model and save the training history
history = model.fit(training_padded, training_labels, epochs=50, validation_data=(testing_padded, testing_labels))

Epoch 1/50


  super(SGD, self).__init__(name, **kwargs)


Epoch 2/50
 14/313 [>.............................] - ETA: 7:35 - loss: 0.6922 - accuracy: 0.6049

In [None]:
# Evaluate once on the validation split; keep the raw result and its two components.
prediksi = model.evaluate(testing_padded, testing_labels)
loss, accuracy = prediksi



In [None]:
#Predict
def decode(score, threshold=0.5):
    """Map a model score to a text label.

    The model ends in a sigmoid, so its scores lie between 0 and 1. Comparing them
    against 0.0 can never yield anything but "POSITIF", so the pivot is a
    configurable ``threshold`` instead.

    Args:
        score: A single score, either a Python number or a one-element array such
            as the ``[[value]]`` returned by ``model.predict`` for one input.
        threshold: Pivot between the labels. Defaults to 0.5.

    Returns:
        "NEGATIVE" if the score is below ``threshold``, "NETRAL" if it equals
        ``threshold``, otherwise "POSITIF".
    """
    # Collapse a one-element array to a plain float before comparing.
    value = float(np.squeeze(score))
    if value < threshold:
        return "NEGATIVE"
    if value == threshold:
        return "NETRAL"
    return "POSITIF"

def predict(text):
    """Score one raw text with the trained model.

    The text goes through the same steps as the training data: ``preprocess``,
    the fitted ``tokenizer``, then padding/truncation with PADDING, TRUNCATING and
    MAXLEN.

    Args:
        text: The raw text as a ``str``.

    Returns:
        A dict with "label" (the result of ``decode``) and "score" (the array
        returned by ``model.predict``).
    """
    cleaned = preprocess(text)
    # Use the tokenizer fitted above and the training-time padding settings;
    # pad_sequences would otherwise fall back to its own defaults.
    test = seq_pad_and_trunc([cleaned], tokenizer, PADDING, TRUNCATING, MAXLEN)
    score = model.predict(test)
    label = decode(score)
    return {"label": label, "score": score}

In [None]:
# Try the classifier on a multi-line sample text.
sample_text = """Cabe-cabean, orang bilang cewek kampungan
Cabe-cabean, sukanya kelayapan
Cabe-cabean, orang bilang cewek kampungan
Cabe-cabean, nongkrong-nongkrong di jalan"""
predict(sample_text)

{'label': 'POSITIF', 'score': array([[0.37247175]], dtype=float32)}

In [None]:
def encode(x):
    """Convert one text or several texts into padded integer sequences for the model.

    Args:
        x: A single ``str`` or an iterable of ``str``. A single string is treated
            as ONE text; handing a bare string to the tokenizer would tokenise it
            character by character and produce one sequence per character.

    Returns:
        The padded sequences, one row per input text.
    """
    texts = [x] if isinstance(x, str) else list(x)
    # Same cleaning, tokenizer and padding/truncation settings as the training data.
    cleaned = [preprocess(text) for text in texts]
    return seq_pad_and_trunc(cleaned, tokenizer, PADDING, TRUNCATING, MAXLEN)

In [None]:
# Encode a single word as model input.
x = encode("kasar")

In [None]:
# Score the encoded input `x` with the trained model; the result is displayed as the cell output.
model.predict(x)

array([[0.37113756],
       [0.37113756],
       [0.37113753],
       [0.37113756],
       [0.37113756]], dtype=float32)