In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import string
import gensim
from gensim.models import Word2Vec, KeyedVectors
import keras
from keras.layers.core import Reshape, Flatten
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, LSTM, MaxPooling1D, concatenate
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.models import Model
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, hamming_loss
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
def get_all_label(x_train, y_train):
    y_train_set = []
    sent = x_train[0]
    X_train_set = [sent]
    labels = []
    for i in range (len(x_train)):
        if sent != x_train[i]:
            if (len(labels)>0):
                y_train_set.append(labels)
            sent = x_train[i]
            X_train_set.append(sent)
            labels = []
        labels.append(y_train[i])
    return X_train_set, y_train_set

def tokenize(msg):
    clean = [char for char in msg if char not in string.punctuation]
    clean = ''.join(clean)
    return clean.lower().split()

def find_maxlen(reviews):
    longest = 0
    strlong = "test"
    for review in reviews:
        if len(tokenize(review)) > longest:
            longest = len(tokenize(review))
            strlong = review
    return longest

def train_w2v(corpus, size, min_count):
    docs = [tokenize(doc) for doc in corpus]
    model = gensim.models.Word2Vec(docs,
                                   size=size,
                                   window=10,
                                   min_count=min_count,
                                   workers=10)
    model.train(docs, total_examples=len(docs), epochs=10)
    return model

def tokenize_text(vocab_size, reviews,maxlen):
    tokenizer = Tokenizer(num_words=vocab_size, lower=True)
    tokenizer.fit_on_texts(reviews)
    sequences = tokenizer.texts_to_sequences(reviews)
    x = pad_sequences(sequences, maxlen=maxlen)
    word_index = tokenizer.word_index
    return x, word_index

def createEmbeddingMatrix(word_index, vocab_size, dim, word_vectors):
    EMBEDDING_DIM=dim
    vocabulary_size=min(len(word_index)+1,vocab_size)
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i>=vocab_size:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)
    return embedding_matrix

def createCNNModel(filter_sizes, num_filters, embedding_matrix, embedding_dim, vocabulary_size, maxlen, num_classes):
    filter_sizes = filter_sizes
    num_filters = num_filters
    drop = 0.5
    
    inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocabulary_size,
                            embedding_dim,
                            weights=[embedding_matrix],
                            trainable=True)
    embedding = embedding_layer(inputs)
    
    conv_0 = Conv1D(num_filters, filter_sizes[0],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    conv_1 = Conv1D(num_filters, filter_sizes[1],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    conv_2 = Conv1D(num_filters, filter_sizes[2],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    
    maxpool_0 = MaxPooling1D(maxlen - filter_sizes[0] + 1, strides=1)(conv_0)
    maxpool_1 = MaxPooling1D(maxlen - filter_sizes[1] + 1, strides=1)(conv_1)
    maxpool_2 = MaxPooling1D(maxlen - filter_sizes[2] + 1, strides=1)(conv_2)
    
    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
    flatten = Flatten()(merged_tensor)
    
    dense1 = Dense(256)(flatten)
    dropout = Dropout(drop)(dense1)
    
    output = Dense(units=num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    
    # this creates a model that includes
    model = Model(inputs, output)
    print(model.summary())
    return model

def createCNNLSTMModel(filter_sizes, num_filters, embedding_matrix, embedding_dim, vocabulary_size, maxlen, num_classes):
    filter_sizes = filter_sizes
    num_filters = num_filters
    drop = 0.5
    
    inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocabulary_size,
                            embedding_dim,
                            weights=[embedding_matrix],
                            trainable=True)
    embedding = embedding_layer(inputs)
    
    conv_0 = Conv1D(num_filters, filter_sizes[0],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    conv_1 = Conv1D(num_filters, filter_sizes[1],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    conv_2 = Conv1D(num_filters, filter_sizes[2],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    
    maxpool_0 = MaxPooling1D(maxlen - filter_sizes[0] + 1, strides=1)(conv_0)
    maxpool_1 = MaxPooling1D(maxlen - filter_sizes[1] + 1, strides=1)(conv_1)
    maxpool_2 = MaxPooling1D(maxlen - filter_sizes[2] + 1, strides=1)(conv_2)
    
    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
    flatten = Flatten()(merged_tensor)
    
    dense1 = Dense(256)(merged_tensor)
    dropout = Dropout(drop)(dense1)
    lstm_1 = LSTM(128)(dropout)
    output = Dense(units=num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(lstm_1)
    
    # this creates a model that includes
    model = Model(inputs, output)
    print(model.summary())
    return model

def trainCNN(x_train, y_train, model, epochs, filepath):
    adam = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy', metrics=['categorical_accuracy'],
              optimizer=adam)
    callbacks = [EarlyStopping(patience=4),
            ModelCheckpoint(filepath=filepath, save_best_only=True)]
    model.fit(x_train, y_train, batch_size=32, epochs=epochs, validation_split=0.1,
         callbacks=callbacks)
    
def get_output_cnn(model, x_train, x_test):
    total_layers = len(model.layers)
    fl_index = total_layers-1
    feature_layer_model = Model(
                     inputs=model.input,
                     outputs=model.get_layer(index=fl_index).output)
    x_train_xg = feature_layer_model.predict(x_train)
    x_test_xg = feature_layer_model.predict(x_test)
    return x_train_xg, x_test_xg

## Preproses Data

In [4]:
data = pd.read_csv('dataset/priority_3k_labelled.csv', sep=',')

In [5]:
data.head()

Unnamed: 0,review,category_sentiment,category,sentiment
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,wifi_P1-neg,wifi_P1,neg
1,Kotor berdebu. Saya tdk berhenti bersin ketika...,kebersihan-neg,kebersihan,neg
2,kamar ada semutnya. kamar mandi bermasalah. bu...,kebersihan-neg,kebersihan,neg
3,"Kamar mandi bau, airnya bau",bau_P1-neg,bau_P1,neg
4,"tak sesuai espektasi, kamar sempit, pintu kama...",service-neg,service,neg


In [6]:
data = data.drop(columns='category_sentiment')
data.head()

Unnamed: 0,review,category,sentiment
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,wifi_P1,neg
1,Kotor berdebu. Saya tdk berhenti bersin ketika...,kebersihan,neg
2,kamar ada semutnya. kamar mandi bermasalah. bu...,kebersihan,neg
3,"Kamar mandi bau, airnya bau",bau_P1,neg
4,"tak sesuai espektasi, kamar sempit, pintu kama...",service,neg


In [7]:
X_train, y_train = get_all_label(data['review'], data['category'])

In [8]:
data_train = list(zip(X_train, y_train))
df_train = pd.DataFrame(data_train, columns=['review', 'labels'])
df_train.head()

Unnamed: 0,review,labels
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,"[wifi_P1, kebersihan]"
1,kamar ada semutnya. kamar mandi bermasalah. bu...,[kebersihan]
2,"Kamar mandi bau, airnya bau",[bau_P1]
3,"tak sesuai espektasi, kamar sempit, pintu kama...",[service]
4,buruk. kasur ada bekas sperma seprai jg air ba...,"[linen_P1, wifi_P1]"


In [9]:
mlb = MultiLabelBinarizer()
y = df_train.labels
y = mlb.fit_transform(y)
mlb.classes_

array(['ac_P1', 'air_panas_P1', 'bau_P1', 'general', 'kebersihan',
       'linen_P1', 'service', 'sunrise_meal_P1', 'tv_P1', 'wifi_P1'],
      dtype=object)

In [10]:
find_maxlen(df_train.review)

138

## Train w2v model

In [11]:
model = train_w2v(df_train.review, 400, 2)
model.wv.save_word2vec_format('model/w2v_model.bin', binary=True)

In [12]:
word_vectors = KeyedVectors.load_word2vec_format('model/w2v_model.bin', binary=True)

## Tokenize & create embedding matrix

In [13]:
x, word_index = tokenize_text(5000, df_train.review, 180)
embedding_matrix = createEmbeddingMatrix(word_index, 5000, 400, word_vectors)

## Train test split

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Build CNN Model

In [15]:
cnn_model = createCNNModel([1,3,5], 128, embedding_matrix, 400, 5000, 180, 10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 180)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 180, 400)     2000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 180, 128)     51328       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 178, 128)     153728      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [16]:
trainCNN(x_train, y_train, cnn_model, 100, 'model/model-cnn.h5')

Train on 2073 samples, validate on 231 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


In [17]:
cnn_model = keras.models.load_model('model/model-cnn.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(cnn_model.metrics_names[0], metrics[0]))
print("{}: {}".format(cnn_model.metrics_names[1], metrics[1]))

loss: 0.2900873389508989
categorical_accuracy: 0.6284722222222222


## Train XGBoost model

In [18]:
x_train_xg, x_test_xg = get_output_cnn(cnn_model, x_train, x_test)

In [19]:
clf = BinaryRelevance(XGBClassifier())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

## Result CNN-XGBoost

In [20]:
category = mlb.classes_.tolist()
print(classification_report(y_test,y_pred, target_names=category, digits=4))

                 precision    recall  f1-score   support

          ac_P1     0.9593    0.9752    0.9672       121
   air_panas_P1     0.9306    0.9437    0.9371        71
         bau_P1     0.8864    0.9176    0.9017        85
        general     0.4400    0.3729    0.4037        59
     kebersihan     0.8672    0.9367    0.9006       237
       linen_P1     0.8418    0.8817    0.8613       169
        service     0.8639    0.8581    0.8610       148
sunrise_meal_P1     0.7826    0.7660    0.7742        47
          tv_P1     0.8913    0.8039    0.8454        51
        wifi_P1     0.9694    0.9596    0.9645        99

      micro avg     0.8658    0.8786    0.8721      1087
      macro avg     0.8433    0.8415    0.8417      1087
   weighted avg     0.8623    0.8786    0.8698      1087
    samples avg     0.9000    0.9038    0.8821      1087



In [21]:
hammloss = hamming_loss(y_test,y_pred)
print(hammloss)

0.04861111111111111


In [132]:
clf = ClassifierChain(XGBClassifier(), order=[0,1,9,8,5,4,2,3,7,6])
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

In [133]:
y_pred = y_pred.todense().tolist()

In [134]:
for p in y_pred:
    p[0], p[1], p[9], p[8], p[5], p[4], p[2], p[3], p[7], p[6] =  p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9]

In [135]:
y_pred = np.array(y_pred)

In [136]:
print(classification_report(y_test,y_pred, target_names=category, digits=4))

                 precision    recall  f1-score   support

          ac_P1     0.9593    0.9752    0.9672       121
   air_panas_P1     0.9306    0.9437    0.9371        71
         bau_P1     0.8876    0.9294    0.9080        85
        general     0.4902    0.4237    0.4545        59
     kebersihan     0.8770    0.9325    0.9039       237
       linen_P1     0.8466    0.8817    0.8638       169
        service     0.8759    0.8581    0.8669       148
sunrise_meal_P1     0.7955    0.7447    0.7692        47
          tv_P1     0.8913    0.8039    0.8454        51
        wifi_P1     0.9794    0.9596    0.9694        99

      micro avg     0.8740    0.8804    0.8772      1087
      macro avg     0.8533    0.8452    0.8485      1087
   weighted avg     0.8711    0.8804    0.8752      1087
    samples avg     0.9056    0.9069    0.8869      1087



In [137]:
hammloss = hamming_loss(y_test,y_pred)
print(hammloss)

0.04652777777777778


## XGBoost without CNN

In [138]:
clf = BinaryRelevance(XGBClassifier())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
category = mlb.classes_.tolist()
print(classification_report(y_test, y_pred, target_names=category, digits=4))

                 precision    recall  f1-score   support

          ac_P1     0.8163    0.3306    0.4706       121
   air_panas_P1     0.7333    0.1549    0.2558        71
         bau_P1     0.4286    0.0353    0.0652        85
        general     0.3333    0.0169    0.0323        59
     kebersihan     0.5315    0.3207    0.4000       237
       linen_P1     0.5493    0.2308    0.3250       169
        service     0.6087    0.1892    0.2887       148
sunrise_meal_P1     0.0000    0.0000    0.0000        47
          tv_P1     0.6667    0.0392    0.0741        51
        wifi_P1     1.0000    0.0909    0.1667        99

      micro avg     0.6040    0.1923    0.2917      1087
      macro avg     0.5668    0.1409    0.2078      1087
   weighted avg     0.5969    0.1923    0.2716      1087
    samples avg     0.2718    0.1998    0.2166      1087



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Baseline: CNN-SVM

In [139]:
svm_clf = BinaryRelevance(LinearSVC())
svm_clf.fit(x_train_xg, y_train)
y_pred = svm_clf.predict(x_test_xg)
print(classification_report(y_test,y_pred, target_names=category, digits=4))

                 precision    recall  f1-score   support

          ac_P1     0.9661    0.9421    0.9540       121
   air_panas_P1     0.9394    0.8732    0.9051        71
         bau_P1     0.8902    0.8588    0.8743        85
        general     0.4615    0.2034    0.2824        59
     kebersihan     0.8645    0.9156    0.8893       237
       linen_P1     0.8683    0.8580    0.8631       169
        service     0.8947    0.8041    0.8470       148
sunrise_meal_P1     0.8214    0.4894    0.6133        47
          tv_P1     0.8667    0.7647    0.8125        51
        wifi_P1     0.9892    0.9293    0.9583        99

      micro avg     0.8880    0.8243    0.8550      1087
      macro avg     0.8562    0.7639    0.7999      1087
   weighted avg     0.8752    0.8243    0.8443      1087
    samples avg     0.9093    0.8640    0.8672      1087



In [140]:
hammloss = hamming_loss(y_test,y_pred)
print(hammloss)

0.05277777777777778


## Baseline CNN-LSTM

In [141]:
cnn_lstm_model = createCNNLSTMModel([1,3,5], 128, embedding_matrix, 400, 5000, 180, 10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 180)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 180, 400)     2000000     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 180, 128)     51328       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 178, 128)     153728      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (

In [142]:
trainCNN(x_train, y_train, cnn_lstm_model, 100, 'model/model-cnn-lstm.h5')

Train on 2073 samples, validate on 231 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [143]:
cnn_lstm_model = keras.models.load_model('model/model-cnn-lstm.h5')
metrics = cnn_lstm_model.evaluate(x_test, y_test)
print("{}: {}".format(cnn_lstm_model.metrics_names[0], metrics[0]))
print("{}: {}".format(cnn_lstm_model.metrics_names[1], metrics[1]))

loss: 0.322007159392039
categorical_accuracy: 0.6701388888888888


In [144]:
y_pred = cnn_lstm_model.predict(x_test)

In [145]:
y_pred_bool = (y_pred > 0.1)

In [146]:
y_pred = np.array([[0 if x==False else 1 for x in arr] for arr in y_pred_bool])

In [147]:
y_pred[4]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [148]:
print(classification_report(y_test,y_pred, target_names=category, digits=4))

                 precision    recall  f1-score   support

          ac_P1     0.9435    0.9669    0.9551       121
   air_panas_P1     0.9079    0.9718    0.9388        71
         bau_P1     0.8444    0.8941    0.8686        85
        general     0.0952    0.0339    0.0500        59
     kebersihan     0.8129    0.9536    0.8777       237
       linen_P1     0.8045    0.8521    0.8276       169
        service     0.7975    0.8514    0.8235       148
sunrise_meal_P1     0.0000    0.0000    0.0000        47
          tv_P1     0.7576    0.9804    0.8547        51
        wifi_P1     0.8505    0.9192    0.8835        99

      micro avg     0.8191    0.8289    0.8240      1087
      macro avg     0.6814    0.7423    0.7079      1087
   weighted avg     0.7594    0.8289    0.7910      1087
    samples avg     0.8578    0.8627    0.8383      1087



In [149]:
hammloss = hamming_loss(y_test,y_pred)
print(hammloss)

0.06684027777777778
