In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

## Praproses Data

In [2]:
data = pd.read_csv('priority_1k_labelled.csv', sep=',')

In [3]:
data.head()

Unnamed: 0,review,category-sentiment,category,sentiment,Unnamed: 4
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,kebersihan-neg,kebersihan,neg,
1,Kotor berdebu. Saya tdk berhenti bersin ketika...,wifi_p1-neg,wifi_p1,neg,
2,kamar ada semutnya. kamar mandi bermasalah. bu...,kebersihan-neg,kebersihan,neg,
3,"Kamar mandi bau, airnya bau",bau_P1-neg,bau_P1,neg,
4,"tak sesuai espektasi, kamar sempit, pintu kama...",service-neg,service,neg,


In [4]:
data = data.drop(columns='category-sentiment')

In [5]:
data.isnull().any()

review        False
category      False
sentiment     False
Unnamed: 4     True
dtype: bool

In [6]:
def get_all_label(x_train, y_train):
    y_train_set = []
    sent = x_train[0]
    X_train_set = [sent]
    labels = []
    for i in range (len(x_train)):
        if sent != x_train[i]:
            if (len(labels)>0):
                y_train_set.append(labels)
            sent = x_train[i]
            X_train_set.append(sent)
            labels = []
        labels.append(y_train[i])
    return X_train_set, y_train_set

In [7]:
X_train, y_train = get_all_label(data['review'], data['category'])

In [8]:
data_train = list(zip(X_train, y_train))

In [9]:
df_train = pd.DataFrame(data_train, columns=['review', 'labels'])
df_train.head()

Unnamed: 0,review,labels
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,"[kebersihan, wifi_p1]"
1,kamar ada semutnya. kamar mandi bermasalah. bu...,[kebersihan]
2,"Kamar mandi bau, airnya bau",[bau_P1]
3,"tak sesuai espektasi, kamar sempit, pintu kama...",[service]
4,buruk. kasur ada bekas sperma seprai jg air ba...,"[linen_P1, wifi_p1]"


In [10]:
mlb = MultiLabelBinarizer()
y = df_train.labels
y = mlb.fit_transform(y)
mlb.classes_b


array(['ac_P1', 'air_panas_P1', 'bau_P1', 'general', 'kebersihan',
       'linen_P1', 'service', 'sunrise_meal_P1', 'tv_P1', 'wifi_P1',
       'wifi_p1'], dtype=object)

In [24]:
y_list = y.tolist()
print(y)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 1 ... 0 0 0]]


In [11]:
num_classes = len(mlb.classes_)

In [12]:
df_y = pd.DataFrame(y, columns=mlb.classes_)
df_train = df_train.join(df_y)
df_train = df_train.drop(columns=['labels'])

In [13]:
df_train.head()

Unnamed: 0,review,ac_P1,air_panas_P1,bau_P1,general,kebersihan,linen_P1,service,sunrise_meal_P1,tv_P1,wifi_P1,wifi_p1
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,0,0,0,0,1,0,0,0,0,0,1
1,kamar ada semutnya. kamar mandi bermasalah. bu...,0,0,0,0,1,0,0,0,0,0,0
2,"Kamar mandi bau, airnya bau",0,0,1,0,0,0,0,0,0,0,0
3,"tak sesuai espektasi, kamar sempit, pintu kama...",0,0,0,0,0,0,1,0,0,0,0
4,buruk. kasur ada bekas sperma seprai jg air ba...,0,0,0,0,0,1,0,0,0,0,1


In [14]:
df_train.to_csv('priority_1k_labelled_onehot.csv', sep=',')

In [15]:
import string
def tokenize(msg):
    clean = [char for char in msg if char not in string.punctuation]
    clean = ''.join(clean)
    return clean.lower().split()

In [16]:
longest = 0
for review in df_train.review:
    if len(tokenize(review)) > longest:
        longest = len(review)
print(longest)

118


In [83]:
docs = [tokenize(doc) for doc in df_train.review]

In [86]:
import gensim 
model = gensim.models.Word2Vec(
        docs,
        size=400,
        window=10,
        min_count=2,
        workers=10)

model.train(docs, total_examples=len(docs), epochs=10)

(110034, 170070)

In [87]:
model.wv.most_similar("bau")

  if np.issubdtype(vec.dtype, np.int):


[('kotor', 0.9999662637710571),
 ('selimut', 0.9999473690986633),
 ('agak', 0.9999450445175171),
 ('banyak', 0.9999439120292664),
 ('kecil', 0.9999438524246216),
 ('masih', 0.9999436140060425),
 ('bekas', 0.9999414682388306),
 ('seperti', 0.9999414682388306),
 ('jg', 0.9999403953552246),
 ('dan', 0.9999392032623291)]

In [91]:
from gensim.models import Word2Vec, KeyedVectors

model.wv.save_word2vec_format('model.bin', binary=True)

In [92]:
word_vectors = KeyedVectors.load_word2vec_format('model.bin', binary=True)

## Tokenisasi

In [88]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(df_train.review)
sequences = tokenizer.texts_to_sequences(df_train.review)
x = pad_sequences(sequences, maxlen=180)
word_index = tokenizer.word_index

In [95]:
EMBEDDING_DIM=400
vocabulary_size=min(len(word_index)+1,5000)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=5000:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

In [25]:
model = Sequential()
model.add(Embedding(5000, 20, input_length=180))
model.add(Dropout(0.15))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes, activation='sigmoid'))

model.compile(optimizer=Adam(0.015), loss='binary_crossentropy', metrics=['categorical_accuracy'])
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 703 samples, validate on 79 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


In [27]:
import keras
simple_model = keras.models.load_model('model-simple.h5')
metrics = simple_model.evaluate(x_test, y_test)
print("{}: {}".format(simple_model.metrics_names[0], metrics[0]))
print("{}: {}".format(simple_model.metrics_names[1], metrics[1]))

loss: 0.251509889047973
categorical_accuracy: 0.5663265306122449


## CNN

In [70]:
filter_length = 128

model = Sequential()
model.add(Embedding(5000, 20, input_length=180))
model.add(Dropout(0.5))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 180, 20)           100000    
_________________________________________________________________
dropout_18 (Dropout)         (None, 180, 20)           0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 178, 128)          7808      
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 11)                1419      
_________________________________________________________________
activation_9 (Activation)    (None, 11)                0         
Total params: 109,227
Trainable params: 109,227
Non-trainable params: 0
_________________________________________________________________
Trai

Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50


In [71]:
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.20297585701455875
categorical_accuracy: 0.6326530612244898


## CNN-XGBoost

In [72]:
from keras.models import Model
total_layers = len(cnn_model.layers)

fl_index = total_layers-1

feature_layer_model = Model(
                     inputs=cnn_model.input,
                     outputs=cnn_model.get_layer(index=fl_index).output)

x_train_xg = feature_layer_model.predict(x_train)
x_test_xg = feature_layer_model.predict(x_test)

In [73]:
from skmultilearn.problem_transform import BinaryRelevance
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

clf = BinaryRelevance(XGBClassifier())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7598790307819695


In [74]:
from skmultilearn.problem_transform import ClassifierChain

clf = ClassifierChain(XGBClassifier())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7592212078185165


In [52]:
print(y_pred[0])

  (0, 1)	1.0
  (0, 3)	1.0


## CNN-SVM

In [75]:
from sklearn.svm import LinearSVC

clf = BinaryRelevance(LinearSVC())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7597110781217628


## CNN-LSTM

In [82]:
from keras.layers import LSTM
from keras.layers import MaxPooling1D

filter_length = 128

model = Sequential()
model.add(Embedding(5000, 20, input_length=180))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=2))
model.add(Dense(256))
model.add(LSTM(128))
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d-lstm.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 180, 20)           100000    
_________________________________________________________________
dropout_23 (Dropout)         (None, 180, 20)           0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 178, 128)          7808      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 89, 128)           0         
_________________________________________________________________
dense_19 (Dense)             (None, 89, 256)           33024     
_________________________________________________________________
lstm_14 (LSTM)               (None, 128)               197120    
_________________________________________________________________
dense_20 (Dense)             (None, 11)                1419      
__________

In [81]:
cnn_lstm_model = keras.models.load_model('model-conv1d-lstm.h5')
metrics = cnn_lstm_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.40627565919136516
categorical_accuracy: 0.45408163265306123


In [104]:
filter_length = 128

model = Sequential()
model.add(Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True, input_length=180))
model.add(Dropout(0.5))
model.add(Conv1D(filter_length, 3, padding='same', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 180, 400)          1092000   
_________________________________________________________________
dropout_26 (Dropout)         (None, 180, 400)          0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 180, 128)          153728    
_________________________________________________________________
global_max_pooling1d_16 (Glo (None, 128)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 11)                1419      
_________________________________________________________________
activation_16 (Activation)   (None, 11)                0         
Total params: 1,247,147
Trainable params: 1,247,147
Non-trainable params: 0
_________________________________________________________________


In [199]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras import regularizers

filter_sizes = [3,4,5]
num_filters = 128
drop = 0.5

inputs = Input(shape=(180,))
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)
embedding = embedding_layer(inputs)

conv_0 = Conv1D(num_filters, filter_sizes[0],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
conv_1 = Conv1D(num_filters, filter_sizes[1],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
conv_2 = Conv1D(num_filters, filter_sizes[2],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)

maxpool_0 = MaxPooling1D(180 - filter_sizes[0] + 1, strides=1)(conv_0)
maxpool_1 = MaxPooling1D(180 - filter_sizes[1] + 1, strides=1)(conv_1)
maxpool_2 = MaxPooling1D(180 - filter_sizes[2] + 1, strides=1)(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)

dense1 = Dense(256)(flatten)
dropout = Dropout(drop)(dense1)

output = Dense(units=11, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# this creates a model that includes
model = Model(inputs, output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_50 (InputLayer)           (None, 180)          0                                            
__________________________________________________________________________________________________
embedding_76 (Embedding)        (None, 180, 400)     1092000     input_50[0][0]                   
__________________________________________________________________________________________________
conv1d_61 (Conv1D)              (None, 178, 128)     153728      embedding_76[0][0]               
__________________________________________________________________________________________________
conv1d_62 (Conv1D)              (None, 177, 128)     204928      embedding_76[0][0]               
__________________________________________________________________________________________________
conv1d_63 

In [200]:
adam = Adam(lr=1e-3)

model.compile(loss='binary_crossentropy', metrics=['categorical_accuracy'],
              optimizer=adam)
model.summary()
callbacks = [EarlyStopping(patience=4),
            ModelCheckpoint(filepath='model-conv2d.h5', save_best_only=True)]
model.fit(x_train, y_train, batch_size=32, epochs=100, validation_split=0.1,
         callbacks=callbacks)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_50 (InputLayer)           (None, 180)          0                                            
__________________________________________________________________________________________________
embedding_76 (Embedding)        (None, 180, 400)     1092000     input_50[0][0]                   
__________________________________________________________________________________________________
conv1d_61 (Conv1D)              (None, 178, 128)     153728      embedding_76[0][0]               
__________________________________________________________________________________________________
conv1d_62 (Conv1D)              (None, 177, 128)     204928      embedding_76[0][0]               
__________________________________________________________________________________________________
conv1d_63 

Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100


<keras.callbacks.History at 0x2a2ea55cba8>

In [201]:
cnn_model = keras.models.load_model('model-conv2d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.3565950600468383
categorical_accuracy: 0.5357142857142857


In [202]:
from keras.models import Model
total_layers = len(cnn_model.layers)

fl_index = total_layers-1

feature_layer_model = Model(
                     inputs=cnn_model.input,
                     outputs=cnn_model.get_layer(index=fl_index).output)

x_train_xg = feature_layer_model.predict(x_train)
x_test_xg = feature_layer_model.predict(x_test)

In [128]:
x_train_xg.shape()

TypeError: 'tuple' object is not callable

In [203]:
from skmultilearn.problem_transform import BinaryRelevance
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

clf = BinaryRelevance(XGBClassifier())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7425836082881597


  'precision', 'predicted', average, warn_for)


In [171]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras import regularizers

filter_sizes = [3,4,5]
num_filters = 128
drop = 0.5

inputs = Input(shape=(180,))
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)
embedding = embedding_layer(inputs)
reshape = Reshape((180,EMBEDDING_DIM,1))(embedding)

conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

maxpool_0 = MaxPooling2D((180 - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
maxpool_1 = MaxPooling2D((180 - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
maxpool_2 = MaxPooling2D((180 - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)

dense1 = Dense(256)(merged_tensor)
dropout = Dropout(drop)(dense1)

lstm_1 = LSTM(128)(dropout)

output = Dense(units=11, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# this creates a model that includes
model = Model(inputs, output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           (None, 180)          0                                            
__________________________________________________________________________________________________
embedding_63 (Embedding)        (None, 180, 400)     1092000     input_37[0][0]                   
__________________________________________________________________________________________________
reshape_56 (Reshape)            (None, 180, 400, 1)  0           embedding_63[0][0]               
__________________________________________________________________________________________________
conv2d_103 (Conv2D)             (None, 178, 1, 128)  153728      reshape_56[0][0]                 
__________________________________________________________________________________________________
conv2d_104