# Indonesian Text Classification

This is a text classification  of indonesian corpus using several different technique such as Naive Bayes, SVM, Random Forest, Convolition Neural Network (CNN), LSTM or GRU. An indonesian [pre-trained word vectors](https://fasttext.cc/docs/en/pretrained-vectors.html) from FastText has ben also used in our Neural Network models.
We use [Word Bahasa Indonesia Corpus and Parallel English Translation](https://www.panl10n.net/english/outputs/Indonesia/BPPT/0902/BPPTIndToEngCorpusHalfM.zip) dataset from PAN Localization.
It contains 500,000 words from various online sources translated into English.
For our text classification, we use only the indonesian part.
The corpus has 4 classes:
  - 0: Economy
  - 1: International
  - 2: Science
  - 3: Sport
 
Originally each class is in separate file, we combine, randomize and split it to train and test file with 90:10.  



In [1]:
!pip install xgboost

[33mYou are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
from sklearn import model_selection, preprocessing
from sklearn import linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import tensorflow as tf
#import pandas as pd, xgboost, numpy, textblob, string
import pandas as pd, xgboost, numpy, string
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import layers, models, optimizers
from keras.preprocessing.sequence import pad_sequences
from pathlib import Path

import os
import numpy as np
#import ntlk

Using TensorFlow backend.


In [3]:
# Parameters
#LMDATA = Path('/content/drive/My Drive/lmdata')
LMDATA = Path('/mnt/mldata/data/LM/id/dataset')
params = {'batch_size': 1024,
          'n_classes': 2,
          'max_len': 100,
          'n_words': 50000,
          'shuffle': True}
try:
  TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  strategy=tf.contrib.tpu.TPUDistributionStrategy(
      tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER))
except KeyError:
  TPU_WORKER = None

np.random.seed(seed=10)

In [3]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pwd
print(LMDATA)
!ls -lh "$LMDATA"
!ls -lh "$LMDATA/BPPTIndToEngCorpus"

/home/cahya/Work/Machine Learning/LM/language-modeling/indonesia
/mnt/mldata/data/LM/id/dataset
total 445M
drwx------ 2 cahya cahya 4.0K Nov  6 06:56 BPPTIndToEngCorpus
-rw-rw-r-- 1 cahya cahya 2.3M Mar 26  2012 BPPTIndToEngCorpusHalfM.zip
drwx------ 7 cahya cahya   76 Jan 29  2010 Parallel Corpus
-rw-rw-r-- 1 cahya cahya 1.2M Mar 26  2012 Parallel Corpus.zip
-rw-rw-r-- 1 cahya cahya 8.4M Oct 27  2009 UI-1M-tagged.txt
-rw-rw-r-- 1 cahya cahya 2.3M Mar 26  2012 UI-1M-tagged.zip
-rw-rw-r-- 1 cahya cahya  31M Oct 16 11:09 bard.h5
drwxrwxr-x 2 cahya cahya   98 Jan 20 19:56 clickbait
-rw-rw-r-- 1 cahya cahya  31M Oct 16 11:09 cnn.h5
-rw-rw-r-- 1 cahya cahya  35M Oct 16 14:01 cnn_kimyoon.h5
-rw-rw-r-- 1 cahya cahya  31M Oct 22 17:49 rcnn.h5
-rw-rw-r-- 1 cahya cahya  32M Oct 22 17:46 rnn_bidirectional.h5
-rw-rw-r-- 1 cahya cahya  31M Oct 22 17:43 rnn_gru.h5
-rw-rw-r-- 1 cahya cahya  32M Oct 22 17:41 rnn_lstm.h5
-rw-r--r-- 1 cahya cahya  25M Oct 15 17:30 wiki.id.10K.vec
-rw-r--r-- 1 cahya cahy

In [5]:
train_df = pd.read_csv(LMDATA/'BPPTIndToEngCorpus/bppt_panl_train.csv')
train_df.columns = ['label', 'text']
test_df = pd.read_csv(LMDATA/'BPPTIndToEngCorpus/bppt_panl_test.csv')
test_df.columns = ['label', 'text']

In [6]:
print(train_df.head())
#print(train_df['label'][:10].values)
#print(train_df['text'][:10].values)

   label                                               text
0      0  Pertumbuhan ekonomi 2007 yang diproyeksikan me...
1      3  Pelatih Real Bernd Schuster harus mengeluarkan...
2      2  Laporan itu adalah pengumuman kedua dari badan...
3      0  Lonjakan laba bersih tersebut, selain didorong...
4      3  LeBron James menyumbang 24 poin, 11 assist dan...


In [7]:
!set |grep -i tpu|grep -v grep

In [8]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_df['text'], train_df['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [9]:
print(train_x[:5])
print(train_y[:5])
print(valid_x[:5])
print(valid_y[:5])


19555    Layanan fantastis ini membutuhkan waktu lama u...
9774     Gasparotto memimpin lomba keseluruhan disusul ...
10459    Pereli Prancis itu unggul dua menit 33,2 detik...
19916    Karena itu Antam juga melakukan diversifikasi ...
15343    Para kapitalis ini menginginkan pemerintah Ind...
Name: text, dtype: object
[2 3 3 0 0]
3081     Menurut Shahab, peralatan Rig pada BJP-1R1 sud...
1601     Anak pepsis memakan daging tarantula dan berli...
20221    Dalam hal kepemilikan asset, maka sektor-sekto...
2118     Ilmuwan telah menemukan informasi genetik yang...
17469    Anda menyesuaikan dengan itu dan lawan juga sama.
Name: text, dtype: object
[0 2 0 2 3]


In [10]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_df['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             max_features=5000)
tfidf_vect.fit(train_df['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                                   ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_df['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}',
                                         ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_df['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [13]:
# load the pre-trained word-embedding vectors
max_words = params['n_words']
embeddings_index = {}
for i, line in enumerate(open(LMDATA/'wiki.id.300K.vec', encoding='utf8')):
  if i%50000 == 0:
    print(i)
  values = line.split()
  try:
    embeddings_index[" ".join(values[0:-300])] = numpy.asarray(values[-300:], dtype='float32')
  except ValueError:
    print("Values: {}: {}".format(i, values))

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(train_df['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), 
                                     maxlen=params['max_len'])
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), 
                                     maxlen=params['max_len'])

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if i>=max_words:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

0
50000
Values: 73989: ['pembinaannya', '-0.087859', '0.17738', '-0.25976', '0.24112', '-0.26144', '0.10555', '-0.37918', '-0.11241', '-0.15916', '-0.31183', '0.037231', '-0.42092', '0.18129', '-0.12284', '0.11765', '0.09406', '0.26735', '-0.21884', '-0.11559', '-0.22417', '-0.25178', '-0.13234', '-0.17492', '-0.2665', '0.060694', '-0.085088', '0.080088', '0.057243', '0.18208', '-0.29722', '0.28678', '-0.21725', '-0.25867', '-0.40978', '-0.021054', '-0.16561', '0.15877', '-0.276', '0.24313', '-0.31692', '-0.096804', '0.012354', '-0.010719', '0.40314', '0.22857', '-0.089445', '-0.083771', '0.31009', '-0.0004702', '-0.044815', '0.25317', '-0.14032', '0.0075435', '-0.082932', '-0.09254', '-0.40893', '0.30927', '0.21751', '0.26585', '0.098622', '0.12569', '-0.092163', '-0.22007', '0.22404', '0.12774', '-0.47525', '-0.076955', '0.038016', '0.032717', '-0.020418', '-0.13611', '-0.13123', '0.38105', '-0.20649', '0.088231', '-0.23918', '-0.22516', '-0.20772', '-0.016647', '-0.21131', '-0.07584

In [14]:
print(LMDATA)

/mnt/mldata/data/LM/id/dataset


## Conventional Machine Learning

In [15]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, 
                is_neural_net=False, epochs=1):
    # fit the training dataset on the classifier
    if is_neural_net:
      classifier.fit(feature_vector_train, label, epochs=epochs)
    else:
      classifier.fit(feature_vector_train, label)   
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
      predictions = [int(round(p[0])) for p in predictions]
      #predictions = predictions.argmax(axis=-1)

    print(" predictions:", predictions[:20])
    print("ground truth:", valid_y[:20])
    
    return metrics.accuracy_score(predictions, valid_y)

### Naive Bayes

In [17]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

 predictions: [0 2 0 2 3 0 0 2 0 1 0 2 2 0 1 1 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
NB, Count Vectors:  0.9269195189639223
 predictions: [0 2 0 2 3 0 1 2 0 1 0 2 2 0 1 1 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
NB, WordLevel TF-IDF:  0.9161887141535615
 predictions: [0 2 0 2 1 0 1 2 0 1 0 2 2 0 1 0 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
NB, N-Gram Vectors:  0.7822386679000926
 predictions: [0 2 0 2 2 0 0 2 2 1 2 2 0 0 1 2 3 2 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
NB, CharLevel Vectors:  0.8432932469935245


### Linear Classifier

In [18]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print( "LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

 predictions: [0 2 0 2 2 0 0 2 2 1 0 2 2 0 0 1 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
LR, Count Vectors:  0.9265494912118409
 predictions: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 1 0 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
LR, WordLevel TF-IDF:  0.9178538390379278
 predictions: [0 2 2 2 1 0 1 2 2 1 0 2 2 0 1 0 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
LR, N-Gram Vectors:  0.8085106382978723
 predictions: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 1 2 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
LR, CharLevel Vectors:  0.8888066604995375


### SVM

In [20]:
# SVM on Ngram Level TF IDF Vectors
# We have to use the option kernel='linear', the default kernel (rbf) doesn't work 
# properly (thanks to Leksono Nanto for the hint)
accuracy = train_model(svm.SVC(kernel='linear'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)


 predictions: [0 2 2 2 1 0 1 2 2 1 0 2 2 0 1 0 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
SVM, N-Gram Vectors:  0.7970397779833488


### Random Forest

In [21]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

 predictions: [0 2 0 2 1 0 0 2 2 1 0 2 0 0 0 0 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
RF, Count Vectors:  0.8392229417206291
 predictions: [0 2 0 2 1 0 0 2 0 2 0 2 2 0 1 0 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
RF, WordLevel TF-IDF:  0.8296022201665125


###  Extreme Gradient Boosting

In [18]:
# Extreme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

# Extreme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

# Extreme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

  if diff:


 predictions: [0 2 2 2 2 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
Xgb, Count Vectors:  0.808695652173913


  if diff:


 predictions: [0 2 2 2 2 0 0 2 2 1 0 2 2 0 0 2 3 2 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
Xgb, WordLevel TF-IDF:  0.8070305272895467
 predictions: [0 2 0 2 2 0 1 2 2 2 0 2 2 0 1 2 3 2 1 3]
ground truth: [0 2 0 2 3 0 0 2 2 1 0 2 2 0 0 2 3 0 1 3]
Xgb, CharLevel Vectors:  0.8201665124884366


  if diff:


## Neural Network

In [13]:
from tensorflow.keras.utils import Sequence, to_categorical

def tokenize(texts, n_words=1000):
    tokenizer = Tokenizer(num_words=n_words)
    tokenizer.fit_on_texts(texts)
    return tokenizer
  
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, texts, labels, tokenizer, batch_size=32, max_len=100,
                 n_classes=2, n_words=1000, shuffle=True):
        'Initialization'
        self.max_len = max_len
        self.batch_size = batch_size
        self.texts = texts
        self.labels = labels
        self.n_classes = n_classes
        self.shuffle = shuffle

        self.tokenizer = tokenizer
        self.steps_per_epoch = int(np.floor(self.texts.size / self.batch_size))
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return self.steps_per_epoch

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        texts = np.array([self.texts[k] for k in indexes])
        sequences = self.tokenizer.texts_to_sequences(texts)
        X = pad_sequences(sequences, maxlen=self.max_len)
        y = np.array([to_categorical(self.labels[k], 4) for k in indexes])

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.texts.size)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

In [14]:
# Create data generator
training_generator = DataGenerator(train_x.values, train_y, token, **params)
valid_generator = DataGenerator(valid_x.values, valid_y, token,  **params)

In [15]:
def tpu_wrapper(func):
  if TPU_WORKER is not None:
    tpu_model = tf.contrib.tpu.keras_to_tpu_model(func, strategy)
    return tpu_model
  else:
    return func

In [0]:
# This model doesn't work with current dataset
def create_simple_model(input_length=100):
    # create input layer 
    input_layer = layers.Input((input_length, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu", name="D1")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(4, activation="softmax")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy')
    return classifier 

#classifier = create_simple_model(xtrain_tfidf_ngram.shape[1])
#accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, 
#                       is_neural_net=True, epochs=1)
#print("NN, Ngram Level TF IDF Vectors",  accuracy)

In [0]:
%%time

#NN, Ngram Level TF IDF Vectors
tf.keras.backend.clear_session()
classifier = create_simple_model(input_length=params['max_len'])
classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'bard.h5'), overwrite=True)

In [25]:
xtrain_tfidf_ngram.shape

(16214, 5000)

In [0]:
"""
def to_tpu(func):
    def wrapper(*args, **kwargs):
      print("TPU Wrapper start")
      print(func)
      if TPU_WORKER is not None:
        tpu_model = tf.contrib.tpu.keras_to_tpu_model(func, strategy)
        print("TPU exist")
        return tpu_model(*args, **kwargs)
      else:
        print("TPU not exist")
        return func(*args, **kwargs)
    print("TO_TPU")
    return wrapper
"""

### CNN

In [16]:
def create_cnn(input_length=100):
    # Add an Input Layer
    input_layer = layers.Input((input_length, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, 
                                       weights=[embedding_matrix], 
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(100, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.3)(output_layer1)
    output_layer2 = layers.Dense(4, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [19]:
%%time

tf.keras.backend.clear_session()
classifier = create_cnn(input_length=params['max_len'])
classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'bard.h5'), overwrite=True)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 27.4 s, sys: 2.78 s, total: 30.1 s
Wall time: 24.1 s


In [20]:

classifier.save_weights(str(LMDATA/'cnn.h5'), overwrite=True)

In [21]:
# read test dataset
test_x = test_df['text'].values
sequences = token.texts_to_sequences(test_x)
test_x_seq = pad_sequences(sequences, maxlen=params['max_len'])
#print(valid_x_seq[:5])

In [22]:
# predict the labels on test dataset

classifier = create_cnn(input_length=params['max_len'])
classifier.load_weights(str(LMDATA/'bard.h5'))
# TPU can be enabled here if we need  
# classifier = tpu_wrapper(classifier)
labels = np.array([list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values])
score = classifier.evaluate(test_x_seq, labels, verbose=1)
print('Loss for final step: {}, accuracy: {}'.format(score[0], score[1]))

Loss for final step: 0.21042589038039722, accuracy: 0.9263114071606994


In [0]:
"""
# Just for testing
labels = [list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values]
#print(labels[:5])
print(np.array(labels)[:5])
print(test_df['label'].values)
print(to_categorical(3, 4))
"""

In [18]:
prediction = classifier.predict(test_x_seq, verbose=1)
#print(prediction)
print('Predictions for final step: {}'.format(np.argmax(prediction, axis=1)[:10]))
print(test_x[:10])

Predictions for final step: [0 0 2 3 1 0 0 1 2 2]
['Paradoksnya di sisi lain, sinyal akan diakuinya keberadaan lembaga keuangan mikro non formal juga membuat kegelisahan bagi pelakunya.'
 'Menurut dia, harga minyak mentah dunia saat ini berada di posisi 126 dolar AS per barel sedikit melemah dibanding hari sebelumnya yang mencapai 127 dolar AS lebih.'
 'Pengkajian ini memberikan konfirmasi bahwa kita menghadapi sejenis materi yang berbeda sama sekali, tak seperti yang kita bayangkan.'
 'Gol akhir Marco Borriello membuat Genoa mendapat satu angka setelah striker asal Honduras, David Suazo, mencetak angka pada menit ke-12 untuk Inter, yang pemain tengahnya dari Portugal, Pele, dikeluarkan dari lapangan karena mendapat kartu kuning kedua sebelum turun minum.'
 'Namun, bandar udara itu ditutup sebagai langkah pencegahan.'
 'Departemen Keuangan telah menetapkan 16 calon agen penjual obligasi negara ritel ORI pada 2007 melalui SK Dirjen Pengelolaan Utang Depkeu No KEP-07/PU/2007 tertanggal 1

In [37]:
!free

              total        used        free      shared  buff/cache   available
Mem:       32931128     7942452    12817856      139512    12170820    24099684
Swap:      16777212           0    16777212


In [0]:

#print('Accuracy ', score[1])
"""
# First, run the seed forward to prime the state of the model.
#prediction_model.reset_states()
strategy = tf.contrib.tpu.TPUDistributionStrategy(
    tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER))
prediction_model = tf.contrib.tpu.keras_to_tpu_model(
    prediction_model, strategy=strategy)

predictions = prediction_model.predict(valid_x_seq)

print("predictions", predictions[:20])

predictions = tpu_model.predict(valid_x_seq)

predictions = [int(round(p[0])) for p in predictions]
#predictions = predictions.argmax(axis=-1)

print("predictions", predictions[:20])
print("valid_y", valid_y[:20])

return metrics.accuracy_score(predictions, valid_y)
"""

In [19]:
print(prediction)
print(np.argmax(prediction, axis=1))


[[9.9131995e-01 2.5257450e-03 6.1324406e-03 2.1910730e-05]
 [9.9915075e-01 4.9755513e-04 3.4053557e-04 1.1170353e-05]
 [1.5233310e-01 4.2050965e-02 8.0292422e-01 2.6917094e-03]
 ...
 [8.2090375e-04 9.9912328e-01 3.7728907e-05 1.8159501e-05]
 [1.7484943e-04 7.3114820e-02 3.3964191e-03 9.2331380e-01]
 [1.0749870e-04 3.1766740e-05 9.9985719e-01 3.5256546e-06]]
[0 0 2 ... 1 3 2]


### Kim Yoon’s CNN

In [20]:
# The following model is similar to 
# Kim Yoon’s Convolutional Neural Networks for Sentence Classification
# (https://arxiv.org/abs/1408.5882)
#
# Model Hyperparameters
embedding_dim = 300
filter_sizes = (3, 5, 7)
num_filters = 100
dropout_prob = (0.5, 0.5)
hidden_dims = 50

def create_cnn_kimyoon(input_length=100):  
    # Add an Input Layer
    input_layer = layers.Input((input_length, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, embedding_dim, 
                                       weights=[embedding_matrix], 
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(dropout_prob[0])(embedding_layer)
    
    conv_array = []
    for sz in filter_sizes:
        conv = layers.Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1)(embedding_layer)
        conv = layers.MaxPooling1D(pool_size=2)(conv)
        conv = layers.Flatten()(conv)
        conv_array.append(conv)
    
    layer = layers.Concatenate()(conv_array) if len(conv_array) > 1 else conv_array[0]
    
    layer = layers.Dropout(dropout_prob[1])(layer)
    layer = layers.Dense(hidden_dims, activation="relu")(layer)
    output_layer = layers.Dense(4, activation="softmax")(layer)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [34]:
%%time

tf.keras.backend.clear_session()
classifier = create_cnn_kimyoon(input_length=params['max_len'])
classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'cnn_kimyoon.h5'), overwrite=True)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 34 s, sys: 5.09 s, total: 39.1 s
Wall time: 34 s


In [23]:
# predict the labels on test dataset

classifier = create_cnn_kimyoon(input_length=params['max_len'])
classifier.load_weights(str(LMDATA/'cnn_kimyoon.h5'))
# TPU can be enabled here if we need  
# classifier = tpu_wrapper(classifier)
labels = np.array([list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values])
score = classifier.evaluate(test_x_seq, labels, verbose=1)
print('Loss for final step: {}, accuracy: {}'.format(score[0], score[1]))

Loss for final step: 0.2389291900704048, accuracy: 0.9163197335553706


### RNN-LSTM

In [24]:
def create_rnn_lstm(input_length=100):
    # Add an Input Layer
    input_layer = layers.Input((input_length, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)
    #lstm_layer = layers.LSTM(100)(lstm_layer)
    #lstm_layer = layers.LSTM(100, return_sequences=True)(lstm_layer)
    #lstm_layer = layers.TimeDistributed(layers.Dense(100))(lstm_layer)
    # Add the output Layers
    output_layer1 = layers.Dense(100, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(4, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [29]:
%%time

# RNN-LSTM, Word Embeddings
# Speed comparison between CPU vs TPU
# First we test CPU with 2 epochs
tf.keras.backend.clear_session()
classifier = create_rnn_lstm(input_length=params['max_len'])
#classifier = tf.contrib.tpu.keras_to_tpu_model(classifier, strategy=strategy)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=2
)

Epoch 1/2
Epoch 2/2
CPU times: user 20.4 s, sys: 2.24 s, total: 22.6 s
Wall time: 16.4 s


In [25]:
%%time

# RNN-LSTM, Word Embeddings
# We test now the TPU
# The result is:
# CPU: 65s/epoch
# TPU: 3.15s/epoch
tf.keras.backend.clear_session()
classifier = create_rnn_lstm(input_length=params['max_len'])
classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'rnn_lstm.h5'), overwrite=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1min 46s, sys: 6.95 s, total: 1min 53s
Wall time: 56.7 s


In [26]:
# predict the labels on test dataset

classifier = create_rnn_lstm(input_length=params['max_len'])
classifier.load_weights(str(LMDATA/'rnn_lstm.h5'))
# TPU can be enabled here if we need  
# classifier = tpu_wrapper(classifier)
labels = np.array([list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values])
score = classifier.evaluate(test_x_seq, labels, verbose=1)
print('Loss for final step: {}, accuracy: {}'.format(score[0], score[1]))

Loss for final step: 0.21542491647821976, accuracy: 0.9304746044962531


### RNN-GRU

In [27]:
def create_rnn_gru(input_length=100):
    # Add an Input Layer
    input_layer = layers.Input((input_length, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(4, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [28]:
%%time

# RNN-GRU, Word Embeddings
tf.keras.backend.clear_session()
classifier = create_rnn_gru(input_length=params['max_len'])
classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'rnn_gru.h5'), overwrite=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1min 33s, sys: 6.27 s, total: 1min 39s
Wall time: 49.2 s


In [30]:
# predict the labels on test dataset

classifier = create_rnn_gru(input_length=params['max_len'])
classifier.load_weights(str(LMDATA/'rnn_gru.h5'))
# TPU can be enabled here if we need  
# classifier = tpu_wrapper(classifier)
labels = np.array([list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values])
score = classifier.evaluate(test_x_seq, labels, verbose=1)
print('Loss for final step: {}, accuracy: {}'.format(score[0], score[1]))

Loss for final step: 0.21358667354996655, accuracy: 0.9296419650291424


### Biderectional RNN

In [31]:
# RNN-Bidirectional, Word Embeddings
# It doesn't work with TPU, but it works on CPU/GPU
def create_bidirectional_rnn(input_length=100):
    # Add an Input Layer
    input_layer = layers.Input((input_length, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(4, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [32]:
%%time

# RNN-Bidirectional, Word Embeddings
# It doesn't work with TPU, but it works on CPU/GPU
tf.keras.backend.clear_session()
classifier = create_bidirectional_rnn(input_length=params['max_len'])
#classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'rnn_bidirectional.h5'), overwrite=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 3min 1s, sys: 14.4 s, total: 3min 15s
Wall time: 1min 18s


In [33]:
# predict the labels on test dataset

classifier = create_bidirectional_rnn(input_length=params['max_len'])
classifier.load_weights(str(LMDATA/'rnn_bidirectional.h5'))
# TPU can be enabled here if we need  
# classifier = tpu_wrapper(classifier)
labels = np.array([list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values])
score = classifier.evaluate(test_x_seq, labels, verbose=1)
print('Loss for final step: {}, accuracy: {}'.format(score[0], score[1]))

Loss for final step: 0.21609353537761997, accuracy: 0.9267277268942548


### RCNN

In [34]:
def create_rcnn(input_length=100):
    # Add an Input Layer
    input_layer = layers.Input((input_length, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(4, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [35]:
%%time

# RCNN, Word Embeddings
tf.keras.backend.clear_session()
classifier = create_rcnn(input_length=params['max_len'])
classifier = tpu_wrapper(classifier)
classifier.fit_generator(
    generator=training_generator,
    validation_data=valid_generator,
    #use_multiprocessing=True,
    #workers=6,
    epochs=20
)

classifier.save_weights(str(LMDATA/'rcnn.h5'), overwrite=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 27.3 s, sys: 2.7 s, total: 30 s
Wall time: 23.8 s


In [36]:
# predict the labels on test dataset

classifier = create_rcnn(input_length=params['max_len'])
classifier.load_weights(str(LMDATA/'rcnn.h5'))
# TPU can be enabled here if we need  
# classifier = tpu_wrapper(classifier)
labels = np.array([list(to_categorical(label, 4).astype(int)) for label in test_df['label'].values])
score = classifier.evaluate(test_x_seq, labels, verbose=1)
print('Loss for final step: {}, accuracy: {}'.format(score[0], score[1]))

Loss for final step: 0.21368380979038495, accuracy: 0.9221482098251457
