In [1]:
import keras.optimizers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences v 2.9 kebawah
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM, GRU, Conv1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
import matplotlib.pyplot as plt

## BIGU-CNN_TFIDF_UNIGRAM_BIGRAM_TRIGRAM_DROPOUT0.5
Copyright @I Gde Bagus Janardana Abasan

In [2]:
def process_data(test_size):
  """Load the cleaned dataset and return TF-IDF train/test features and labels.

  Reads the `preprocess_final` text column and the `label_fase_1` label column
  from the CSV, encodes the label as 1 for 'HS' and 0 for anything else, splits
  the rows with a fixed seed (42), and vectorizes the text with TF-IDF over
  1- to 3-grams, capped at 10000 features.

  The vectorizer is fitted on the training rows only and then applied to the
  held-out rows, so the vocabulary and IDF weights carry no information from
  the test split.

  Args:
    test_size: forwarded to `train_test_split` as `test_size`.

  Returns:
    (x_train, x_test, y_train, y_test). The x values are dense DataFrames with
    one column per TF-IDF term, indexed like the matching y Series.
  """
  dataset = pd.read_csv('..//..//../data/data-for-test/dataset_kepolisian_clean_test.csv', usecols=['preprocess_final', 'label_fase_1'])
  texts = dataset['preprocess_final']
  # The sigmoid/binary-crossentropy model needs labels in {0, 1}.
  labels = dataset['label_fase_1'].apply(lambda x: 1 if x == 'HS' else 0)

  # Split the raw text BEFORE fitting the vectorizer to avoid test-set leakage.
  text_train, text_test, y_train, y_test = train_test_split(texts, labels, test_size=test_size, random_state=42)

  tf_idf_vec = TfidfVectorizer(use_idf=True, smooth_idf=False, ngram_range=(1,3), max_features=10000)
  train_matrix = tf_idf_vec.fit_transform(text_train)
  test_matrix = tf_idf_vec.transform(text_test)

  feature_names = tf_idf_vec.get_feature_names_out()
  x_train = pd.DataFrame(train_matrix.toarray(), columns=feature_names, index=text_train.index)
  x_test = pd.DataFrame(test_matrix.toarray(), columns=feature_names, index=text_test.index)

  return x_train, x_test, y_train, y_test

In [3]:
def create_bigru_cnn_model(x_train, x_test, y_train, y_test, epochs, batch_size):
  """Build, train and evaluate the BiGRU + CNN binary classifier.

  Architecture: Bidirectional GRU (32 units, full sequence output) ->
  Conv1D (32 filters, kernel 3) -> MaxPooling1D -> Flatten -> Dense(32, relu)
  -> Dropout(0.5) -> Dense(1, sigmoid), compiled with Adam and binary
  cross-entropy.

  Note that the test split is also passed to `fit` as validation data, so the
  printed evaluation is computed on the same rows as the per-epoch `val_loss`.

  Args:
    x_train, x_test: 2-D feature tables (rows x features).
    y_train, y_test: binary labels for the matching rows.
    epochs: number of training epochs.
    batch_size: batch size used for both training and evaluation.

  Returns:
    (model, loss, val_loss): the fitted model plus the per-epoch training and
    validation loss lists recorded by `fit`.
  """
  # Add a trailing channel axis: every feature column becomes one step with a single channel.
  X_train = np.asarray(x_train).reshape(x_train.shape[0], x_train.shape[1], 1)
  X_test = np.asarray(x_test).reshape(x_test.shape[0], x_test.shape[1], 1)
  n_steps = X_train.shape[1]

  model = Sequential()
  # No input_shape on the layers: the shape is declared once, without the
  # sample count, in model.build() below.
  model.add(Bidirectional(GRU(units=32, return_sequences=True)))
  model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
  model.add(MaxPooling1D(pool_size=2, strides=2, padding='same'))
  model.add(Flatten())
  model.add(Dense(units=32, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(units=1, activation='sigmoid'))
  model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
  model.build(input_shape=(None, n_steps, 1))
  model.summary()

  # Use the History object returned by fit rather than the model.history attribute.
  history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)
  loss = history.history['loss']
  val_loss = history.history['val_loss']

  print("=== MODEL EVALUATE TEST DATA ===")
  # evaluate returns [loss, accuracy], in the order given to compile.
  score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=0)
  print()
  print('Validation Accuracy:', score[1])
  print('Validation Loss:', score[0])
  print()

  return model, loss, val_loss

In [4]:
def call():
  """Run one complete experiment and return its metrics and loss curves.

  Prepares the data with a 20% test split, trains the BiGRU + CNN model for
  10 epochs with batch size 64, predicts on the test split and prints a
  classification report.

  Returns:
    [accuracy, precision, recall, f1, loss, val_loss], where the last two are
    the per-epoch loss lists returned by `create_bigru_cnn_model`.
  """
  x_train, x_test, y_train, y_test = process_data(test_size=0.2)
  model, loss, val_loss = create_bigru_cnn_model(x_train, x_test, y_train, y_test, epochs=10, batch_size=64)

  # The model was fitted on (samples, features, 1) tensors; give predict the
  # same rank explicitly instead of relying on Keras to adjust the 2-D table.
  x_test_3d = np.asarray(x_test).reshape(x_test.shape[0], x_test.shape[1], 1)
  y_prob = model.predict(x_test_3d)
  print()

  # Turn the sigmoid outputs into 0/1 labels once and flatten the (n, 1) column.
  y_pred = y_prob.round().ravel().astype(int)

  # classification report
  classreport = classification_report(y_test, y_pred, digits=4)
  print(classreport)
  print()
  accscore = accuracy_score(y_test, y_pred)
  precscore = precision_score(y_test, y_pred)
  recscore = recall_score(y_test, y_pred)
  f1score = f1_score(y_test, y_pred)

  return [accscore, precscore, recscore, f1score, loss, val_loss]

In [None]:
# Five independent runs; each row is the 1-based run number followed by
# everything call() returns. Kept as a loop so that rows already collected
# remain in `result` if a later run fails.
result = []
for i in range(5):
  result.append([i + 1, *call()])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 10000, 64)        6720      
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 10000, 32)         6176      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 5000, 32)         0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 5000, 32)          0         
                                                                 
 flatten (Flatten)           (None, 160000)            0         
                                                                 
 dense (Dense)               (None, 32)                5

In [None]:
def loss_func(df):
  """Plot, save and display the train vs. validation loss curve of every run.

  For each row of `df` (one per entry of its 'No' column) the values stored in
  the 'Loss' and 'Val Loss' cells are drawn on a fresh figure, written to
  `hasil/grafik/` as a PNG numbered from 1, shown, and then closed.

  Args:
    df: DataFrame with 'No', 'Loss' and 'Val Loss' columns; each 'Loss' /
      'Val Loss' cell holds the sequence of per-epoch values for that run.
  """
  # savefig does not create missing folders.
  os.makedirs('hasil/grafik', exist_ok=True)

  for idx in range(len(df['No'])):
    fig, ax = plt.subplots()
    ax.plot(df['Loss'].iloc[idx])
    ax.plot(df['Val Loss'].iloc[idx])
    ax.set_title('Model loss')
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper right')
    fig.savefig(f'hasil/grafik/lossfunctionGraphics_BIGRU_CNN_UNIGRAM_BIGRAM_TRIGRAM_DROPOUT0.5-{idx+1}.png')
    plt.show()
    # Close rather than clf(): clearing after show() leaves an empty figure
    # open, which notebooks render as a stray blank output.
    plt.close(fig)

In [None]:
# One row per run: run number, the four test metrics, and the two per-epoch loss lists.
df = pd.DataFrame(result, columns=['No', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Loss', 'Val Loss'])

# to_csv does not create missing folders.
os.makedirs('hasil', exist_ok=True)
df.to_csv('hasil/hasil_BIGRU_CNN_UNIGRAM_BIGRAM_TRIGRAM_DROPOUT0.5.csv', index=False)

# Average of each metric across the runs
for metric in ('Accuracy', 'Precision', 'Recall', 'F1 Score'):
  print(f'Average {metric} :', df[metric].mean())
loss_func(df)