In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('priority_1k_labelled.csv', sep=',')

In [3]:
data.head()

Unnamed: 0,review,category-sentiment,category,sentiment,Unnamed: 4
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,kebersihan-neg,kebersihan,neg,
1,Kotor berdebu. Saya tdk berhenti bersin ketika...,wifi_p1-neg,wifi_p1,neg,
2,kamar ada semutnya. kamar mandi bermasalah. bu...,kebersihan-neg,kebersihan,neg,
3,"Kamar mandi bau, airnya bau",bau_P1-neg,bau_P1,neg,
4,"tak sesuai espektasi, kamar sempit, pintu kama...",service-neg,service,neg,


In [4]:
data = data.drop(columns='category-sentiment')

In [5]:
data.isnull().any()

review        False
category      False
sentiment     False
Unnamed: 4     True
dtype: bool

In [6]:
def get_all_label(x_train, y_train):
    y_train_set = []
    sent = x_train[0]
    X_train_set = [sent]
    labels = []
    for i in range (len(x_train)):
        if sent != x_train[i]:
            if (len(labels)>0):
                y_train_set.append(labels)
            sent = x_train[i]
            X_train_set.append(sent)
            labels = []
        labels.append(y_train[i])
    return X_train_set, y_train_set

In [7]:
X_train, y_train = get_all_label(data['review'], data['category'])

In [8]:
data_train = list(zip(X_train, y_train))

In [9]:
df_train = pd.DataFrame(data_train, columns=['review', 'labels'])
df_train.head()

Unnamed: 0,review,labels
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,"[kebersihan, wifi_p1]"
1,kamar ada semutnya. kamar mandi bermasalah. bu...,[kebersihan]
2,"Kamar mandi bau, airnya bau",[bau_P1]
3,"tak sesuai espektasi, kamar sempit, pintu kama...",[service]
4,buruk. kasur ada bekas sperma seprai jg air ba...,"[linen_P1, wifi_p1]"


In [10]:
mlb = MultiLabelBinarizer()
y = df_train.labels
y = mlb.fit_transform(y)
mlb.classes_b


array(['ac_P1', 'air_panas_P1', 'bau_P1', 'general', 'kebersihan',
       'linen_P1', 'service', 'sunrise_meal_P1', 'tv_P1', 'wifi_P1',
       'wifi_p1'], dtype=object)

In [24]:
y_list = y.tolist()
print(y)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 1 ... 0 0 0]]


In [11]:
num_classes = len(mlb.classes_)

In [12]:
df_y = pd.DataFrame(y, columns=mlb.classes_)
df_train = df_train.join(df_y)
df_train = df_train.drop(columns=['labels'])

In [13]:
df_train.head()

Unnamed: 0,review,ac_P1,air_panas_P1,bau_P1,general,kebersihan,linen_P1,service,sunrise_meal_P1,tv_P1,wifi_P1,wifi_p1
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,0,0,0,0,1,0,0,0,0,0,1
1,kamar ada semutnya. kamar mandi bermasalah. bu...,0,0,0,0,1,0,0,0,0,0,0
2,"Kamar mandi bau, airnya bau",0,0,1,0,0,0,0,0,0,0,0
3,"tak sesuai espektasi, kamar sempit, pintu kama...",0,0,0,0,0,0,1,0,0,0,0
4,buruk. kasur ada bekas sperma seprai jg air ba...,0,0,0,0,0,1,0,0,0,0,1


In [14]:
df_train.to_csv('priority_1k_labelled_onehot.csv', sep=',')

In [15]:
import string
def tokenize(msg):
    clean = [char for char in msg if char not in string.punctuation]
    clean = ''.join(clean)
    return clean.lower().split()

In [16]:
longest = 0
for review in df_train.review:
    if len(tokenize(review)) > longest:
        longest = len(review)
print(longest)

118


In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(df_train.review)
sequences = tokenizer.texts_to_sequences(df_train.review)
x = pad_sequences(sequences, maxlen=180)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

In [25]:
model = Sequential()
model.add(Embedding(5000, 20, input_length=180))
model.add(Dropout(0.15))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes, activation='sigmoid'))

model.compile(optimizer=Adam(0.015), loss='binary_crossentropy', metrics=['categorical_accuracy'])
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 703 samples, validate on 79 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


In [27]:
import keras
simple_model = keras.models.load_model('model-simple.h5')
metrics = simple_model.evaluate(x_test, y_test)
print("{}: {}".format(simple_model.metrics_names[0], metrics[0]))
print("{}: {}".format(simple_model.metrics_names[1], metrics[1]))

loss: 0.251509889047973
categorical_accuracy: 0.5663265306122449


In [29]:
filter_length = 300

model = Sequential()
model.add(Embedding(5000, 20, input_length=180))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_4 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 300)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 11)                3311      
_________________________________________________________________
activation_2 (Activation)    (None, 11)                0         
Total params: 121,611
Trainable params: 121,611
Non-trainable params: 0
_________________________________________________________________
Trai

In [30]:
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.20392031572302993
categorical_accuracy: 0.6173469387755102


In [40]:
from keras.models import Model
total_layers = len(cnn_model.layers)

fl_index = total_layers-1

feature_layer_model = Model(
                     inputs=cnn_model.input,
                     outputs=cnn_model.get_layer(index=fl_index).output)

x_train_xg = feature_layer_model.predict(x_train)
x_test_xg = feature_layer_model.predict(x_test)

In [41]:
from skmultilearn.problem_transform import BinaryRelevance
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

clf = BinaryRelevance(XGBClassifier())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7545092226254437


  'precision', 'predicted', average, warn_for)


In [42]:
from skmultilearn.problem_transform import ClassifierChain

clf = ClassifierChain(XGBClassifier())
clf.fit(x_train_xg, y_train)
y_pred = clf.predict(x_test_xg)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7527145510221926


  'precision', 'predicted', average, warn_for)
