In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import string
import gensim
from gensim.models import Word2Vec, KeyedVectors
import keras
from keras.layers.core import Reshape, Flatten
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, LSTM, MaxPooling1D, concatenate
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.models import Model
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [83]:
def get_all_label(x_train, y_train):
    """Group the labels of consecutive identical sentences.

    ``x_train`` and ``y_train`` are parallel, index-addressable sequences in
    which a sentence carrying several labels appears on several consecutive
    rows. Each run of equal sentences is collapsed to a single entry and its
    labels are collected into one list.

    Args:
        x_train: sequence of sentences (supports ``len`` and ``[i]``).
        y_train: sequence of labels aligned with ``x_train``.

    Returns:
        ``(X_train_set, y_train_set)`` where ``X_train_set[k]`` is the k-th
        distinct consecutive sentence and ``y_train_set[k]`` is the list of
        its labels. Both lists are empty when ``x_train`` is empty.
    """
    X_train_set = []
    y_train_set = []
    # Guard: the run-detection below needs a first element to start from.
    if len(x_train) == 0:
        return X_train_set, y_train_set

    sent = x_train[0]
    X_train_set.append(sent)
    labels = []
    for i in range(len(x_train)):
        if sent != x_train[i]:
            # A new sentence starts: flush the labels of the previous run.
            if labels:
                y_train_set.append(labels)
            sent = x_train[i]
            X_train_set.append(sent)
            labels = []
        labels.append(y_train[i])
    # Flush the final run; without this the last sentence has no label list
    # and the two returned lists end up with different lengths.
    if labels:
        y_train_set.append(labels)
    return X_train_set, y_train_set

def tokenize(msg):
    """Split ``msg`` into lowercase whitespace-separated tokens.

    Every character found in ``string.punctuation`` is removed (not replaced
    by a space) before lowercasing and splitting, so ``"a,b"`` becomes the
    single token ``"ab"``.
    """
    without_punct = ''.join(ch for ch in msg if ch not in string.punctuation)
    return without_punct.lower().split()

def find_maxlen(reviews):
    """Return the largest token count among ``reviews``.

    Each review is tokenized once with ``tokenize``; the result is the
    maximum number of tokens seen, or 0 when ``reviews`` is empty.
    """
    # default=0 keeps the original result for an empty iterable.
    return max((len(tokenize(review)) for review in reviews), default=0)

def train_w2v(corpus, size, min_count):
    """Fit a gensim Word2Vec model on ``corpus``.

    Args:
        corpus: iterable of raw text documents; each is split with ``tokenize``.
        size: embedding dimensionality passed to Word2Vec as ``size``.
        min_count: minimum word frequency passed to Word2Vec.

    Returns:
        The Word2Vec model after the explicit 10-epoch ``train`` call.
    """
    tokenized_docs = list(map(tokenize, corpus))
    w2v = gensim.models.Word2Vec(
        tokenized_docs,
        size=size,
        window=10,
        min_count=min_count,
        workers=10,
    )
    # NOTE(review): gensim's constructor appears to already train when it is
    # given sentences, so this call would add 10 further epochs on top of
    # that -- confirm this double pass is intended.
    w2v.train(tokenized_docs, total_examples=len(tokenized_docs), epochs=10)
    return w2v

def tokenize_text(vocab_size, reviews,maxlen):
    """Turn raw reviews into a padded matrix of word indices.

    A Keras ``Tokenizer`` limited to ``vocab_size`` words is fitted on
    ``reviews``; the resulting index sequences are passed through
    ``pad_sequences`` with ``maxlen``.

    Returns:
        ``(x, word_index)``: the padded sequences and the tokenizer's
        word -> index mapping.
    """
    keras_tokenizer = Tokenizer(num_words=vocab_size, lower=True)
    keras_tokenizer.fit_on_texts(reviews)
    index_sequences = keras_tokenizer.texts_to_sequences(reviews)
    padded = pad_sequences(index_sequences, maxlen=maxlen)
    return padded, keras_tokenizer.word_index

def createEmbeddingMatrix(word_index, vocab_size, dim, word_vectors):
    """Build the initial weight matrix for an embedding layer.

    Args:
        word_index: mapping word -> integer row index.
        vocab_size: upper bound on the number of rows; words whose index is
            ``>= vocab_size`` are skipped.
        dim: embedding dimensionality (number of columns).
        word_vectors: object supporting ``word_vectors[word]`` that raises
            ``KeyError`` for unknown words.

    Returns:
        Array of shape ``(min(len(word_index) + 1, vocab_size), dim)``. Rows
        never assigned stay zero; words missing from ``word_vectors`` get a
        random normal vector (mean 0, std ``sqrt(0.25)``).
    """
    n_rows = min(len(word_index) + 1, vocab_size)
    matrix = np.zeros((n_rows, dim))
    for word, row in word_index.items():
        if row < vocab_size:
            try:
                vector = word_vectors[word]
            except KeyError:
                # Out-of-vocabulary word: fall back to random initialisation.
                vector = np.random.normal(0, np.sqrt(0.25), dim)
            matrix[row] = vector
    return matrix

def createCNNModel(filter_sizes, num_filters, embedding_matrix, embedding_dim, vocabulary_size, maxlen, num_classes):
    """Build (but do not compile) a three-branch 1-D CNN text classifier.

    Architecture: trainable embedding initialised from ``embedding_matrix``
    -> three parallel ReLU ``Conv1D`` branches (kernel widths
    ``filter_sizes[0..2]``, L2-regularised) -> max-pooling over each branch
    -> concatenate -> flatten -> ``Dense(256)`` -> dropout 0.5 -> sigmoid
    output with ``num_classes`` units.

    The model summary is printed as a side effect.

    Returns:
        The uncompiled Keras ``Model``.
    """
    dropout_rate = 0.5
    # Only the first three kernel widths are used, one per branch.
    kernel_widths = (filter_sizes[0], filter_sizes[1], filter_sizes[2])

    inputs = Input(shape=(maxlen,))
    embedded = Embedding(vocabulary_size,
                         embedding_dim,
                         weights=[embedding_matrix],
                         trainable=True)(inputs)

    # All convolutions are created before any pooling layer so that layer
    # creation order matches the original implementation.
    conv_branches = [
        Conv1D(num_filters, width, activation='relu',
               kernel_regularizer=regularizers.l2(0.01))(embedded)
        for width in kernel_widths
    ]
    # Pool size equals the convolution's output length (maxlen - width + 1),
    # so each branch is reduced to a single time step.
    pooled_branches = [
        MaxPooling1D(maxlen - width + 1, strides=1)(branch)
        for width, branch in zip(kernel_widths, conv_branches)
    ]

    merged = concatenate(pooled_branches, axis=1)
    flat = Flatten()(merged)

    hidden = Dense(256)(flat)
    hidden = Dropout(dropout_rate)(hidden)

    output = Dense(units=num_classes, activation='sigmoid',
                   kernel_regularizer=regularizers.l2(0.01))(hidden)

    model = Model(inputs, output)
    print(model.summary())
    return model

def createCNNLSTMModel(filter_sizes, num_filters, embedding_matrix, embedding_dim, vocabulary_size, maxlen, num_classes):
    """Build (but do not compile) a three-branch CNN followed by an LSTM.

    Architecture: trainable embedding initialised from ``embedding_matrix``
    -> three parallel ReLU ``Conv1D`` branches (kernel widths
    ``filter_sizes[0..2]``, L2-regularised) -> max-pooling over each branch
    -> concatenate along the time axis -> ``Dense(256)`` applied per step
    -> dropout 0.5 -> ``LSTM(128)`` -> sigmoid output with ``num_classes``
    units.

    The output activation is ``sigmoid`` so that it agrees with
    ``createCNNModel`` and with the ``binary_crossentropy`` loss used by
    ``trainCNN``; a softmax over a single unit would always output 1.0.

    The model summary is printed as a side effect.

    Returns:
        The uncompiled Keras ``Model``.
    """
    drop = 0.5

    inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocabulary_size,
                            embedding_dim,
                            weights=[embedding_matrix],
                            trainable=True)
    embedding = embedding_layer(inputs)

    conv_0 = Conv1D(num_filters, filter_sizes[0],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    conv_1 = Conv1D(num_filters, filter_sizes[1],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
    conv_2 = Conv1D(num_filters, filter_sizes[2],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)

    # Pool size equals each convolution's output length, so every branch is
    # reduced to a single time step.
    maxpool_0 = MaxPooling1D(maxlen - filter_sizes[0] + 1, strides=1)(conv_0)
    maxpool_1 = MaxPooling1D(maxlen - filter_sizes[1] + 1, strides=1)(conv_1)
    maxpool_2 = MaxPooling1D(maxlen - filter_sizes[2] + 1, strides=1)(conv_2)

    # The merged tensor is kept 3-D (one step per branch) because the LSTM
    # below consumes a sequence; no Flatten layer is needed on this path.
    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)

    dense1 = Dense(256)(merged_tensor)
    dropout = Dropout(drop)(dense1)
    lstm_1 = LSTM(128)(dropout)
    output = Dense(units=num_classes, activation='sigmoid',kernel_regularizer=regularizers.l2(0.01))(lstm_1)

    model = Model(inputs, output)
    print(model.summary())
    return model

def trainCNN(x_train, y_train, model, epochs, filepath):
    """Compile ``model`` and fit it in place; returns nothing.

    Uses Adam (learning rate 1e-3), binary cross-entropy loss and accuracy
    as the metric. Training runs with batch size 32 for at most ``epochs``
    epochs, holding out 10% of the data for validation, with early stopping
    (patience 4) and a checkpoint written to ``filepath`` that keeps only
    the best model.
    """
    optimizer = Adam(lr=1e-3)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    stop_early = EarlyStopping(patience=4)
    keep_best = ModelCheckpoint(filepath=filepath, save_best_only=True)

    # NOTE(review): Keras documents validation_split as taking the *last*
    # fraction of the arrays before shuffling -- confirm the input order
    # (e.g. after oversampling) gives a representative validation set.
    model.fit(
        x_train,
        y_train,
        batch_size=32,
        epochs=epochs,
        validation_split=0.1,
        callbacks=[stop_early, keep_best],
    )
    
def get_output_cnn(model, x_train, x_test):
    """Run both data splits through ``model`` up to its last layer.

    A sub-model from ``model.input`` to the output of the layer at index
    ``len(model.layers) - 1`` is built and used to predict on ``x_train``
    and then ``x_test``.

    Returns:
        ``(x_train_xg, x_test_xg)``: the predictions for each split.
    """
    # NOTE(review): index len(layers) - 1 is the final layer, so this yields
    # the model's own outputs; if intermediate features were intended, an
    # earlier layer index is needed -- confirm.
    last_layer = model.get_layer(index=len(model.layers) - 1)
    extractor = Model(inputs=model.input, outputs=last_layer.output)
    train_features = extractor.predict(x_train)
    test_features = extractor.predict(x_test)
    return train_features, test_features

In [16]:
# Load the labelled review CSV; the path is relative to the notebook's
# working directory.
data = pd.read_csv('dataset/priority_3k_labelled.csv', sep=',')

In [17]:
# Preview the first rows: one row per (review, category) pair with its sentiment.
data.head()

Unnamed: 0,review,category_sentiment,category,sentiment
0,Kotor berdebu. Saya tdk berhenti bersin ketika...,wifi_P1-neg,wifi_P1,neg
1,Kotor berdebu. Saya tdk berhenti bersin ketika...,kebersihan-neg,kebersihan,neg
2,kamar ada semutnya. kamar mandi bermasalah. bu...,kebersihan-neg,kebersihan,neg
3,"Kamar mandi bau, airnya bau",bau_P1-neg,bau_P1,neg
4,"tak sesuai espektasi, kamar sempit, pintu kama...",service-neg,service,neg


In [23]:
# Split reviews and their sentiment labels by category.
# The position of a category in CATEGORIES is its slot in data_list /
# target_list (e.g. slot 0 is 'ac_P1', slot 9 is 'wifi_P1').
CATEGORIES = [
    'ac_P1',
    'air_panas_P1',
    'bau_P1',
    'general',
    'kebersihan',
    'linen_P1',
    'service',
    'sunrise_meal_P1',
    'tv_P1',
    'wifi_P1',
]
category_slot = {name: slot for slot, name in enumerate(CATEGORIES)}

data_list = [[] for _ in CATEGORIES]
target_list = [[] for _ in CATEGORIES]

# Iterate positionally over the three columns so the loop does not depend on
# the DataFrame having a default 0..n-1 index.
for cat_name, review_text, sentiment_label in zip(data.category, data.review, data.sentiment):
    slot = category_slot.get(cat_name)
    if slot is None:
        # Rows whose category is not in CATEGORIES are ignored.
        continue
    data_list[slot].append(review_text)
    target_list[slot].append(sentiment_label)

print(data_list[0])
print(target_list[0])

['kamarnya nyaman, acnya berbunyi,lantai kamarnya kurang bersih tapi makanan unuk sarapannya enak terima kasih airy', 'sangat memuaskan. Sprei dan handuknya wangi, kamarnya cukup bersih dan nyaman. AC dan air panasnya berfungsi dengan baik. Saluran TV sedikit bermasalah tapi cukup memuaskan. ini pengalaman kedua saya di airy gunung sahari. Terima.kasih', 'AC nya parah. Berisik bersurara. Bikin gak bisa tidur sama sekali. Lokasi nya susah minta ampun ditemukan.', 'free snack + sabun / shampoo airy gak ada... AC nya juga bocor air nya keluar di dlm ruangan... semoga di perbaiki tempat dan kinerjanya dimana hak buat konsumen...', 'kamar sudah cukup bagus dan luas. ac cukup lama baru dingin dan air kamar mandi agak bau pas dibuka pertama kali', 'kamar nya oke bagus rapi.ac dingin. ramah.tapi wifinya parah gk bisa dipake..', 'bantalnya enak.. kamarnya dingin.. keset ga ada lagi nha..', 'Kamarnya tdk ada jendela, ac kurang dingin, seprei/bantal agak berdebu..semalaman hidung jg gatel2..kamar

In [29]:
# Binary targets per category: 1 for 'pos', 0 for anything else.
y = [[int(sentiment == 'pos') for sentiment in category_targets]
     for category_targets in target_list]

In [27]:
# Load pre-trained word vectors stored in binary word2vec format; the path is
# relative to the notebook's working directory.
word_vectors = KeyedVectors.load_word2vec_format('model/w2v_model.bin', binary=True)

In [101]:
# Encode the 'ac_P1' reviews (slot 0) as padded index sequences and build the
# matching embedding matrix: vocabulary capped at 5000 words, sequences of
# length 180, 400-dimensional vectors.
x_ac_P1, word_index = tokenize_text(vocab_size=5000, reviews=data_list[0], maxlen=180)
embedding_matrix_ac_P1 = createEmbeddingMatrix(
    word_index=word_index,
    vocab_size=5000,
    dim=400,
    word_vectors=word_vectors,
)

In [117]:
from imblearn.over_sampling import RandomOverSampler

In [103]:
# Hold out 20% of the 'ac_P1' samples for testing, with a fixed seed.
# NOTE(review): the split is not stratified although the classes are heavily
# imbalanced -- consider whether stratification is wanted.
x_train_ac_P1, x_test_ac_P1, y_train_ac_P1, y_test_ac_P1 = train_test_split(x_ac_P1, y[0], test_size=0.2, random_state=9000)

In [105]:
# Class balance of the training split. Despite their names, pos_weight and
# neg_weight hold plain sample counts.
pos_weight = sum(1 for label in y_train_ac_P1 if label == 1)
neg_weight = sum(1 for label in y_train_ac_P1 if label == 0)
print(pos_weight)
print(neg_weight)

50
425


In [118]:
# Randomly oversample the training split (fixed seed) so that both classes
# end up with the same number of samples; the test split is left untouched.
ros = RandomOverSampler(random_state=42)
x_res, y_res = ros.fit_resample(x_train_ac_P1, y_train_ac_P1)

In [119]:
# Class balance after oversampling. Despite their names, pos_weight and
# neg_weight hold plain sample counts.
pos_weight = sum(1 for label in y_res if label == 1)
neg_weight = sum(1 for label in y_res if label == 0)
print(pos_weight)
print(neg_weight)

425
425


In [57]:
from sklearn.utils.class_weight import compute_class_weight
# 'balanced' class weights for the (imbalanced) training labels.
# classes/y are passed by keyword: newer scikit-learn releases reject them as
# positional arguments, while the keyword form works on old and new versions.
# NOTE(review): class_weights is not used by any later cell shown here.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_ac_P1), y=y_train_ac_P1)

In [120]:
# Build the 'ac_P1' sentiment CNN: kernel widths 1/3/5 with 128 filters each,
# 400-d embeddings, inputs of length 180 and a single sigmoid output unit.
# vocabulary_size mirrors the row count used when the embedding matrix was built.
cnn_model_ac_P1 = createCNNModel(
    filter_sizes=[1,3,5],
    num_filters=128,
    embedding_matrix=embedding_matrix_ac_P1,
    embedding_dim=400,
    vocabulary_size=min(len(word_index)+1,5000),
    maxlen=180,
    num_classes=1,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 180)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 180, 400)     885200      input_9[0][0]                    
__________________________________________________________________________________________________
conv1d_22 (Conv1D)              (None, 180, 128)     51328       embedding_9[0][0]                
__________________________________________________________________________________________________
conv1d_23 (Conv1D)              (None, 178, 128)     153728      embedding_9[0][0]                
__________________________________________________________________________________________________
conv1d_24 

In [121]:
# Train on the oversampled data for at most 100 epochs; the best checkpoint
# is written to the given .h5 path.
# NOTE(review): trainCNN holds out validation data via validation_split on
# the oversampled arrays -- confirm the validation samples are not just
# duplicates of training samples.
trainCNN(x_res, y_res, cnn_model_ac_P1, 100, 'model/model-cnn-sentiment-ac_P1.h5')

Train on 765 samples, validate on 85 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [122]:
# Reload the checkpointed model from disk (replacing the in-memory one) and
# report its first two metrics on the held-out test split.
cnn_model_ac_P1 = keras.models.load_model('model/model-cnn-sentiment-ac_P1.h5')
metrics = cnn_model_ac_P1.evaluate(x_test_ac_P1, y_test_ac_P1)
for metric_idx in (0, 1):
    print("{}: {}".format(cnn_model_ac_P1.metrics_names[metric_idx], metrics[metric_idx]))

loss: 0.3181004226207733
acc: 0.9075630272136015


In [123]:
# Run the original (non-oversampled) training split and the test split
# through the trained CNN; get_output_cnn taps the model's last layer.
x_train_xg, x_test_xg = get_output_cnn(cnn_model_ac_P1, x_train_ac_P1, x_test_ac_P1)

In [124]:
# Fit an XGBoost classifier with default settings on the CNN outputs of the
# training split, then predict labels for the test split.
clf = XGBClassifier()
clf.fit(x_train_xg, y_train_ac_P1)
y_pred = clf.predict(x_test_xg)

In [125]:
# Per-class precision / recall / F1 of the XGBoost predictions on the test
# split, reported with 4 decimal places.
print(classification_report(y_test_ac_P1,y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9083    0.9802    0.9429       101
           1     0.8000    0.4444    0.5714        18

   micro avg     0.8992    0.8992    0.8992       119
   macro avg     0.8541    0.7123    0.7571       119
weighted avg     0.8919    0.8992    0.8867       119

