In [1]:
# Run setup code
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import collections
from tqdm import tqdm
%matplotlib inline

In [32]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv1D, Dropout, GRU, Bidirectional, Conv2D
from keras.layers import Reshape, Activation, Flatten, TimeDistributed,MaxPooling1D, MaxPooling2D
from keras.preprocessing import sequence
from keras.layers.merge import Dot
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau

In [7]:
# Root folder of the review corpus; the trailing slash matters because file
# names are appended to it by plain string concatenation.
path = '/data/wongnai-review/'
# The training file is ';'-separated and has no header row, so the columns
# come back labelled 0 and 1.
data_raw = pd.read_csv(
    path + 'w_review_train.csv',
    sep=';',
    header=None,
)

In [8]:
# Give the two positional columns readable names. ``index=str`` additionally
# converts every row label to a string.
data_raw = pd.DataFrame(data=data_raw).rename(
    index=str,
    columns={0: "sent", 1: "rating"},
)
data_raw.head()

Unnamed: 0,sent,rating
0,ร้านอาหารใหญ่มากกกกกกก \nเลี้ยวเข้ามาเจอห้องน้...,3
1,อาหารที่นี่เป็นอาหารจีนแคะที่หากินยากในบ้านเรา...,4
2,ปอเปี๊ยะสด ทุกวันนี้รู้สึกว่าหากินยาก (ร้านที่...,3
3,รัานคัพเค้กในเมืองไทยมีไม่มาก หลายๆคนอาจจะสงสั...,5
4,อร่อย!!! เดินผ่านDigital gatewayทุกวัน ไม่ยักร...,5


In [9]:
import re

# Characters deleted from every review before tokenization: quotes, the pipe,
# the en dash, ASCII punctuation, newline/tab, brackets and the digits 0-9.
# This is the exact set the previous pattern matched, written as a plain
# character class (inside [...] the '|' is a literal, not an alternation).
# NOTE(review): the ASCII hyphen '-' and the comma are NOT removed. In the old
# pattern each '-' sat between two '|' and was parsed as the range '|'-'|'.
# The behaviour is kept so text stays consistent with the cached tokenization;
# add '\-' to the class if hyphens should be stripped as well.
_STRIP_CHARS_RE = re.compile(r'["|–\':;?$!~\n\t#+<>/\\{}\[\]`0-9*.%@^&=()_]')

data_raw['sent'] = data_raw['sent'].apply(lambda text: _STRIP_CHARS_RE.sub('', text))

In [10]:
# Character-level view of the corpus: same rows and ratings as data_raw, but
# every review becomes a list of single characters.
data_set = data_raw.assign(sent=data_raw['sent'].apply(list))
data_set.head()

Unnamed: 0,sent,rating
0,"[ร, ้, า, น, อ, า, ห, า, ร, ใ, ห, ญ, ่, ม, า, ...",3
1,"[อ, า, ห, า, ร, ท, ี, ่, น, ี, ่, เ, ป, ็, น, ...",4
2,"[ป, อ, เ, ป, ี, ๊, ย, ะ, ส, ด, , ท, ุ, ก, ว, ...",3
3,"[ร, ั, า, น, ค, ั, พ, เ, ค, ้, ก, ใ, น, เ, ม, ...",5
4,"[อ, ร, ่, อ, ย, , เ, ด, ิ, น, ผ, ่, า, น, D, ...",5


In [11]:
# Create a character map
# CHARS is the fixed character vocabulary used to encode text for the
# tokenizer network: ASCII punctuation/digits/letters, Thai consonants, vowels,
# tone marks and digits, curly quotes and the BOM. The list has 178 entries.
# The multi-character entry 'other' (position 80) is the fallback code that
# prepare_feature uses for any character not listed here.
# The order is significant: a character's position is its integer code.
CHARS = [
  '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
  ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
  '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
  'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
  'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
  'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
  'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ',
  'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า',
  'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ',
  'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓',
  '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff'
]
# Reverse lookup: character -> integer code (its position in CHARS).
CHARS_MAP = {v: k for k, v in enumerate(CHARS)}

In [12]:
def create_n_gram_df(df, n_pad):
    """
    Add shifted copies of the 'char' column so every row carries its context.

    Input:
    df: dataframe with a 'char' column holding one character code per row.
        The shifted columns are added to this dataframe in place.
    n_pad: size of the context window. With n_pad_2 = (n_pad - 1) // 2, the
        n_pad_2 characters before and after each position become features.

    Output:
    The rows of df that have a complete window, i.e. df without its first and
    last n_pad_2 rows. Besides 'char' it contains the columns 'char-1' ..
    'char-<n_pad_2>' (characters to the left) and 'char1' .. 'char<n_pad_2>'
    (characters to the right).
    """
    n_pad_2 = int((n_pad - 1)/2)
    for offset in range(1, n_pad_2 + 1):
        df['char-{}'.format(offset)] = df['char'].shift(offset)
        df['char{}'.format(offset)] = df['char'].shift(-offset)
    # An explicit end position is required: a slice ending at -0 is empty, so
    # the previous df[n_pad_2:-n_pad_2] dropped every row when n_pad_2 == 0.
    return df.iloc[n_pad_2: len(df) - n_pad_2]

In [13]:
def prepare_feature(input_string):
    """
    Encode a sequence of characters as context-window features.

    Input:
    input_string: sequence of single characters (a list of characters or a
        plain string).

    Output:
    2-D numpy array with one row per input character and 21 columns of
    character codes: the 10 characters to the right ('char1'..'char10'), the
    10 characters to the left ('char-1'..'char-10'), then the character itself.
    """
    # we use padding equals 21 here to consider 10 characters to the left
    # and 10 characters to the right as features for the character in the middle
    n_pad = 21
    n_pad_2 = int((n_pad - 1)/2)
    df_pad = pd.DataFrame([{'char': ' '}] * n_pad_2)

    df = pd.DataFrame(data=list(input_string), columns=['char'])
    # pad both ends with spaces so the first and last characters get a full window
    df = pd.concat((df_pad, df, df_pad))

    # map characters to numbers, use 'other' if not in the predefined character set.
    other_code = CHARS_MAP['other']
    df['char'] = df['char'].map(lambda x: CHARS_MAP.get(x, other_code))
    # Use nearby characters as features
    df_with_context = create_n_gram_df(df, n_pad=n_pad)

    char_row = ['char' + str(i + 1) for i in range(n_pad_2)] + \
             ['char-' + str(i + 1) for i in range(n_pad_2)] + ['char']

    # convert pandas dataframe to numpy array to feed to the model
    # (.values replaces DataFrame.as_matrix(), which no longer exists in pandas >= 1.0)
    x_char = df_with_context[char_row].values

    return x_char

In [14]:
#print char of feature 1
char = np.array(CHARS)

#A function for displaying our features in text
def print_features(tfeature,label,index):
    """
    Print one feature row as readable text together with its label.

    tfeature: array of feature rows with 21 character codes each, laid out as
        columns 0-9 right context, 10-19 left context, 20 the centre character.
    label: sequence of labels aligned with tfeature.
    index: which row to display.
    """
    codes = np.array(tfeature[index], dtype=int).reshape(21, 1)
    # Turn the integer codes back into characters
    symbols = char[codes]
    left_context = symbols[10:20].reshape(10)
    right_context = symbols[0:10].reshape(10)
    # The left context is stored nearest-first, so flip it for reading order
    left = ''.join(left_context[::-1]).replace(" ", "")
    center = ''.join(symbols[20])
    right = ''.join(right_context).replace(" ", "")
    word = left + ' ' + center + ' ' + right
    print(center + ': ' + word + "\tpred = " + str(label[index]))

In [15]:
# Tokenize model
def get_my_tokenize_model():
    """
    Build and compile the character-level word-boundary classifier.

    The network takes 21 character codes, embeds them (178 codes -> 8 dims),
    applies one Conv1D and a per-position Dense layer, flattens, passes three
    100-unit ReLU layers and ends in a single sigmoid unit. It is compiled with
    Adam, binary cross-entropy and accuracy.
    """
    window = Input(shape=(21,))
    hidden = Embedding(input_dim=178, output_dim=8)(window)
    hidden = Conv1D(filters=100, kernel_size=5, strides=1,
                    activation='relu', padding='same')(hidden)
    hidden = TimeDistributed(Dense(5))(hidden)
    hidden = Flatten()(hidden)
    # three identical fully connected layers
    for _ in range(3):
        hidden = Dense(100, activation='relu')(hidden)
    boundary_prob = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=window, outputs=boundary_prob)
    network.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    return network

In [16]:
##progress bar
import progressbar
def set_progressbar(l):
    """Return a ProgressBar for `l` steps, drawn as a '[====]' bar plus a percentage."""
    bar_widgets = [
        progressbar.Bar('=', '[', ']'),
        ' ',
        progressbar.Percentage(),
    ]
    return progressbar.ProgressBar(maxval=l, widgets=bar_widgets)

In [59]:
## LOAD Tokenize model
# Path of the saved tokenizer weights.
# NOTE(review): the name weight_path_model_best is assigned again further down
# for the rating model checkpoint, so its value depends on which cell ran last.
weight_path_model_best='/data/model_best.h5'

# Rebuild the architecture, then load the saved weights into it.
tokenize_model = get_my_tokenize_model()
tokenize_model.load_weights(weight_path_model_best)

In [17]:
def map_pred_to_word(y_pred,sent):
    """
    Cut a character sequence into words using per-character boundary flags.

    y_pred: sequence of 0/1 flags, one per character; 1 means the character
        starts a new word.
    sent: the characters (string or list) the flags refer to; must be at least
        as long as y_pred.

    Returns the list of words in order. Only the last word is stripped of
    surrounding whitespace. An empty y_pred gives an empty list.
    """
    words = []
    current = ''
    started = False
    for i, flag in enumerate(y_pred):
        if flag == 1:
            # Close the word collected so far. Nothing has been collected
            # before the very first character, so nothing is emitted then.
            if started:
                words.append(current)
            current = sent[i]
        else:
            current += sent[i]
        started = True
    # The previous version always discarded the first list entry, which lost
    # the first real word whenever the first flag was 0.
    if started:
        words.append(current.strip())
    return words

In [19]:
%%time
from tqdm import tqdm
# predict
# bar = set_progressbar(len(data_set))
# bar.start()

tokenized_sent = []
for i in tqdm(range(len(data_set))):
    chars_array = prepare_feature(data_set['sent'][i])
    y_pred = tokenize_model.predict(chars_array)
    #map probability to class
    prob_to_class = lambda p: 1 if p[0]>=0.5 else 0
    y_pred = np.apply_along_axis(prob_to_class,1,y_pred)
    tokenized_sent.append(map_pred_to_word(y_pred,data_raw['sent'][i]))
#     bar.update(i+1)

100%|██████████| 40000/40000 [25:21<00:00, 26.29it/s]

CPU times: user 31min 39s, sys: 3min 3s, total: 34min 43s
Wall time: 25min 21s





In [None]:
# Pair each tokenized review with its rating.
# The token lists have different lengths, so they are stored as an object
# column instead of going through np.array(), which rejects ragged nested
# lists on current NumPy. .values replaces the removed Series.as_matrix().
y_train = data_set['rating'].values
prepared_data = pd.DataFrame(
    data={'sent': pd.Series(tokenized_sent, dtype=object).values, 'rating': y_train},
    columns=['sent', 'rating'],
)
X_train = prepared_data['sent'].values

In [37]:
# save tokenize
# Cache the tokenized reviews and ratings (the prepared_data frame) on disk so
# the tokenization loop does not have to be repeated.
with open('/data/tokenized-review-clean', 'wb') as f:
    pickle.dump(prepared_data, f)

In [18]:
# load tokenize
# NOTE(review): this rebinds tokenized_sent from the list built by the
# tokenization loop to the pickled object (the prepared_data frame written by
# the save cell); later cells index it as tokenized_sent['sent'].
# pickle.load executes arbitrary code, so only load files you created yourself.
with open('/data/tokenized-review-clean', 'rb') as f:
    tokenized_sent = pickle.load(f)

In [381]:
# tokenized_sent['sent'][10]

In [19]:
## read fasttext
# word -> float32 vector, read from the fastText text-format file.
# The file is streamed line by line rather than loaded whole with readlines().
ftext_w = {}
with open('/data/fasttext/wiki.th.vec', 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        values = line.rstrip().rsplit(' ')
        # A two-field first line is the "<count> <dim>" header, not a word.
        if line_no == 0 and len(values) == 2:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        ftext_w[word] = coefs
    
# word_to_idx = {}
# idx_to_word = ['for_keras_zero_padding']

# for w in ftext_w.keys():
#     word_to_idx[w] = len(idx_to_word)
#     idx_to_word.append(w)

In [17]:
## read thai2vec
# word -> list of floats, read from the thai2vec text-format file.
thai2vec = dict()
with open('/data/thai2vec.vec', 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f), ncols=10):
        # the first line is skipped (header)
        if i == 0:
            continue
        fields = line.split(' ')
        # builtin float: the np.float alias no longer exists in current NumPy
        thai2vec[fields[0]] = [float(x) for x in fields[1:]]

51359it [00:06, 7590.56it/s]


In [71]:
# word -> integer id, read from the thai2vec vocabulary file
# (each line holds "<word> <id>" separated by a single space).
word_to_idx = {}
with open('/data/thai2vec.vocab', 'r') as f:
    for i, line in tqdm(enumerate(f), ncols=10):
        fields = line.split(' ')
        word_to_idx[fields[0]] = int(fields[1])

51358it [00:00, 541723.83it/s]


In [None]:
# Sanity check: compare the vocabulary size with the number of vectors and
# confirm that an 'UNK' entry exists in thai2vec (raises KeyError otherwise).
print(len(word_to_idx), len(thai2vec))
print(thai2vec['UNK'])

In [53]:
max_len = 1000
def create_index(input_text):
    """
    Build a word index from tokenized sentences and encode the sentences.

    input_text: iterable of sentences, each a list of word strings. It is
        traversed twice, so it must not be a one-shot generator.

    Words are counted after stripping newline characters. Words that occur at
    most `threshold` (5) times are treated as rare and share the "UNK" id.

    Returns (dataset, dictionary, reverse_dictionary):
    dataset: one list of word ids per sentence, limited to the first
        `max_len` (module-level) words.
    dictionary: word -> id. Id 0 is reserved for "for_keras_zero_padding";
        the remaining ids follow descending frequency, with "UNK" ranked by
        the total count of rare words.
    reverse_dictionary: id -> word.
    """
    threshold = 5

    counts = collections.Counter(
        w.strip('\n') for sent in input_text for w in sent
    )
    count_word = sum(counts.values())
    ranked = counts.most_common()

    # Split by frequency directly. The previous backwards scan ran past the
    # start of the list when every word was rare.
    rare_word = {w for w, c in ranked if c <= threshold}
    num_UNK = sum(c for _, c in ranked if c <= threshold)

    #include a token for unknown word
    word_count = [(w, c) for w, c in ranked if c > threshold]
    word_count.append(("UNK", num_UNK))
    word_count.sort(key=lambda x: -x[1])

    print(num_UNK, num_UNK/count_word if count_word else 0.0)

    #print out 10 most frequent words
    print(word_count[:10])

    dictionary = dict()
    dictionary["for_keras_zero_padding"] = 0
    for word in word_count:
        dictionary[word[0]] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    dataset = list()
    for sent in input_text:
        encoded = []
        for word in sent[:max_len]:
            # strip exactly as during counting so the lookup cannot miss
            word = word.strip('\n')
            if word in rare_word:
                encoded.append(dictionary["UNK"])
            else:
                encoded.append(dictionary[word])
        dataset.append(encoded)

    return dataset, dictionary, reverse_dictionary

# Encode the 'sent' column; tokenized_sent must be the frame loaded from the pickle cache here.
dataset ,dictionary,reverse_dictionary = create_index(tokenized_sent['sent'])

172055 0.02664951523454532
[('', 1115292), ('UNK', 172055), ('ๆ', 112143), ('ร้าน', 110083), ('ที่', 108221), ('ไม่', 101023), ('มา', 92063), ('มี', 81201), ('นี้', 72266), ('ได้', 70890)]


In [None]:
max_len = 500
def create_index(input_text):
    """
    Variant of create_index that reuses the ids of the pretrained vocabulary.

    NOTE(review): this definition replaces the earlier create_index when the
    cell is run.

    input_text: iterable of sentences, each a list of word strings. It is
        traversed twice, so it must not be a one-shot generator.

    Only the first `max_len` (module-level) words of each sentence are counted
    and encoded. Words that occur at most `threshold` (10) times are treated
    as rare and share the "UNK" id.

    Side effect: every kept word that is missing from the module-level
    word_to_idx is added to it with a fresh id.

    Returns (dataset, dictionary, reverse_dictionary):
    dataset: one list of word ids per sentence.
    dictionary: word -> id, a copy of word_to_idx plus the corpus words and
        "for_keras_zero_padding" mapped to 0.
    reverse_dictionary: id -> word.
    """
    threshold = 10

    counts = collections.Counter(
        w.strip('\n') for sent in input_text for w in sent[:max_len]
    )
    count_word = sum(counts.values())
    ranked = counts.most_common()

    # Split by frequency directly. The previous backwards scan ran past the
    # start of the list when every word was rare.
    rare_word = {w for w, c in ranked if c <= threshold}
    num_UNK = sum(c for _, c in ranked if c <= threshold)

    #include a token for unknown word
    word_count = [(w, c) for w, c in ranked if c > threshold]
    word_count.append(("UNK", num_UNK))
    word_count.sort(key=lambda x: -x[1])

    print(num_UNK, num_UNK/count_word if count_word else 0.0)

    #print out 10 most frequent words
    print(word_count[:10])

    dictionary = word_to_idx.copy()
    # NOTE(review): if the pretrained vocabulary already uses id 0, that word
    # now shares the id with the padding token - confirm against the vocab file.
    dictionary["for_keras_zero_padding"] = 0

    # New words get ids above everything already in use. The previous code
    # took them from len(idx_to_word), a list that is never defined in the
    # live code of this notebook.
    next_idx = max(dictionary.values()) + 1
    for word, _ in word_count:
        word = word.strip('\n')
        if word not in word_to_idx:
            word_to_idx[word] = next_idx
            next_idx += 1
        dictionary[word] = word_to_idx[word]

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    dataset = list()
    for sent in input_text:
        encoded = []
        for word in sent[:max_len]:
            word = word.strip('\n')
            if word in rare_word:
                encoded.append(dictionary["UNK"])
            else:
                encoded.append(dictionary[word])
        dataset.append(encoded)

    return dataset, dictionary, reverse_dictionary

dataset ,dictionary,reverse_dictionary = create_index(tokenized_sent['sent'])

In [34]:
# Right-pad every encoded review with zeros up to max_len.
X_train = sequence.pad_sequences(dataset, maxlen=max_len, padding='post', truncating='pre') #padding
# .values replaces the removed Series/DataFrame.as_matrix()
y_train = data_set['rating'].values


# y_test = y_train[int(len(y_train)*0.9):]
# y_train = y_train[:int(len(y_train)*0.9)]
# One-hot encode the ratings. The cast keeps the targets numeric on pandas
# versions where get_dummies returns boolean columns.
y_train = pd.get_dummies(y_train).values.astype('float32')

# X_test = X_train[int(len(X_train)*0.9):]
# X_train = X_train[:int(len(X_train)*0.9)]
# X_dev = X_train[int(len(X_train)*0.7):int(len(X_train)*0.9)]


# print(len(X_train), len(y_train), len(X_test), len(y_test))

In [54]:
# Share of dictionary words that have a fastText vector.
match = sum(1 for w in dictionary.keys() if w.strip('\n') in ftext_w.keys())

print('match', match, match/len(dictionary))

match 7013 0.4218345864661654


In [55]:
### Prepare embed layer
def prepare_embed(pretrain, vocab=None, dim=300):
    """
    Build the initial embedding rows for the word index.

    pretrain: mapping word -> vector (list or array).
    vocab: mapping word -> integer id; defaults to the module-level
        `dictionary`.
    dim: expected vector length. Vectors of any other length are ignored.

    Returns a list of arrays of length `dim` in which row i is the pretrained
    vector of the word whose id is i, or zeros when there is none. The list
    has len(vocab) + 1 rows (more if an id exceeds that), matching
    Embedding(len(dictionary) + 1, ...) in the model builders.

    The previous version appended rows in key order after an extra leading
    zero row, which placed every vector one row below its word id.
    """
    if vocab is None:
        vocab = dictionary

    n_rows = len(vocab) + 1
    if vocab:
        n_rows = max(n_rows, max(vocab.values()) + 1)

    matrix = np.zeros((n_rows, dim))
    for word, idx in vocab.items():
        vector = pretrain.get(word)
        if vector is not None and len(vector) == dim:
            matrix[idx] = vector
    return list(matrix)

# Initial embedding rows taken from the fastText vectors (ftext_w).
pre_emb = prepare_embed(ftext_w)

In [56]:
# Stack the embedding rows into one array and report every row whose length
# is not 300.
pre_emb_w = np.array(pre_emb.copy())
has_no_300 = 0
for row_idx, row in enumerate(pre_emb_w):
    if len(row) == 300:
        continue
    has_no_300 += 1
    print(row_idx, len(row), row)

In [68]:
## Predict Model
def get_predict_model():
    """
    Build and compile the first rating classifier.

    Word ids (length max_len) go through an embedding initialised from
    pre_emb_w, two 'same'-padded Conv1D layers, stride-1 max pooling, dropout,
    a per-position Dense(5), two 100-unit ReLU layers, dropout and a 5-way
    softmax. Compiled with Adam and categorical cross-entropy.
    """
    tokens = Input(shape=(max_len,))
    hidden = Embedding(input_dim=len(dictionary)+1, output_dim=300,
                       weights=[pre_emb_w], trainable=True)(tokens)
    hidden = Conv1D(filters=64, kernel_size=10, strides=1,
                    activation='relu', padding='same')(hidden)
    hidden = Conv1D(filters=64, kernel_size=5, strides=1,
                    activation='relu', padding='same')(hidden)
    hidden = MaxPooling1D(pool_size=5, strides=1, padding='same')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = TimeDistributed(Dense(5))(hidden)
    hidden = Flatten()(hidden)
    # two identical fully connected layers
    for _ in range(2):
        hidden = Dense(100, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    rating_probs = Dense(5, activation='softmax')(hidden)

    network = Model(inputs=tokens, outputs=rating_probs)
    network.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    return network

model = get_predict_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 1000, 300)         4987800   
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 1000, 64)          192064    
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 1000, 64)          20544     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 1000, 64)          0         
_________________________________________________________________
dropout_15 (Dropout)         (None, 1000, 64)          0         
_________________________________________________________________
time_distributed_8 (TimeDist (None, 1000, 5)           325       
__________

In [69]:
%%time

weight_path_model_best='/data/midterm-1.h5'

callbacks_list = [
#    TensorBoard(log_dir='/data/Graph/midterm', histogram_freq=1, write_grads=True),
    ModelCheckpoint(
        weight_path_model_best,
        monitor = "val_loss",
        mode = 'min',
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
    ),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                    patience=2, min_lr=0.001)
]

model.fit(X_train,y_train,batch_size=256,epochs=3,verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

Train on 32000 samples, validate on 8000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 1min 5s, sys: 25.1 s, total: 1min 30s
Wall time: 2min 1s


In [70]:
# Continue training the same model for up to 3 more epochs (weights carry over from the previous fit).
model.fit(X_train,y_train,batch_size=256,epochs=3,verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

Train on 32000 samples, validate on 8000 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [71]:
# Restore the weights stored at weight_path_model_best (the ModelCheckpoint target, as currently assigned).
model.load_weights(weight_path_model_best)

In [419]:
## Predict Model
def get_predict_model2():
    """
    Build and compile the second rating classifier.

    Same idea as the first model but with two 'valid'-padded Conv1D +
    MaxPooling1D stages (64 then 32 filters), dropout 0.25, a per-position
    Dense(5), two 100-unit ReLU layers and a 5-way softmax. Compiled with
    Adam(lr=0.001) and categorical cross-entropy.
    """
    tokens = Input(shape=(max_len,))
    hidden = Embedding(input_dim=len(dictionary)+1, output_dim=300,
                       weights=[pre_emb_w], trainable=True)(tokens)
    # conv + pool stages, in order: 64 filters, then 32 filters
    for n_filters in (64, 32):
        hidden = Conv1D(filters=n_filters, kernel_size=5, strides=1,
                        activation='relu', padding='valid')(hidden)
        hidden = MaxPooling1D(pool_size=5, strides=1, padding='valid')(hidden)
    hidden = Dropout(0.25)(hidden)
    hidden = TimeDistributed(Dense(5))(hidden)
    hidden = Flatten()(hidden)
    for _ in range(2):
        hidden = Dense(100, activation='relu')(hidden)
    rating_probs = Dense(5, activation='softmax')(hidden)

    network = Model(inputs=tokens, outputs=rating_probs)
    network.compile(
        optimizer=Adam(lr=0.001),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    return network

model2 = get_predict_model2()
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_58 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_58 (Embedding)     (None, 1000, 300)         6978300   
_________________________________________________________________
conv1d_87 (Conv1D)           (None, 996, 64)           96064     
_________________________________________________________________
max_pooling1d_79 (MaxPooling (None, 992, 64)           0         
_________________________________________________________________
conv1d_88 (Conv1D)           (None, 988, 32)           10272     
_________________________________________________________________
max_pooling1d_80 (MaxPooling (None, 984, 32)           0         
_________________________________________________________________
time_distributed_41 (TimeDis (None, 984, 5)            165       
__________

In [358]:
%%time
# use 1000, 
weight_path_model_best2='/data/midterm-2.h5'

callbacks_list = [
#    TensorBoard(log_dir='/data/Graph/midterm', histogram_freq=1, write_grads=True),
    ModelCheckpoint(
        weight_path_model_best2,
        monitor = "val_loss",
        mode = 'min',
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
    ),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                    patience=2, min_lr=0.001)
]

model2.fit(X_train,y_train,batch_size=512,epochs=5,verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

Train on 28800 samples, validate on 7200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 1s, sys: 24.6 s, total: 1min 26s
Wall time: 1min 58s


In [44]:
## Predict Model
### BEST NOW
### first 1000, threshold 3, thai2vec
def get_predict_model3():
    """
    Build and compile the third rating classifier.

    A single 'valid'-padded Conv1D (64 filters) + MaxPooling1D stage on top of
    the pretrained embedding, followed by dropout 0.25, a per-position
    Dense(5), two 100-unit ReLU layers, dropout 0.5 and a 5-way softmax.
    Compiled with Adam(lr=0.001) and categorical cross-entropy.
    """
    tokens = Input(shape=(max_len,))
    hidden = Embedding(input_dim=len(dictionary)+1, output_dim=300,
                       weights=[pre_emb_w], trainable=True)(tokens)
    hidden = Conv1D(filters=64, kernel_size=5, strides=1,
                    activation='relu', padding='valid')(hidden)
    hidden = MaxPooling1D(pool_size=5, strides=1, padding='valid')(hidden)
    hidden = Dropout(0.25)(hidden)
    hidden = TimeDistributed(Dense(5))(hidden)
    hidden = Flatten()(hidden)
    for _ in range(2):
        hidden = Dense(100, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    rating_probs = Dense(5, activation='softmax')(hidden)

    network = Model(inputs=tokens, outputs=rating_probs)
    network.compile(
        optimizer=Adam(lr=0.001),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    return network

model3 = get_predict_model3()
model3.summary()
# Checkpoint file for the third rating model.
weight_path_model_best3='/data/midterm-3.h5'

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 1000, 300)         6978300   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 996, 64)           96064     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 992, 64)           0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 992, 64)           0         
_________________________________________________________________
time_distributed_5 (TimeDist (None, 992, 5)            325       
_________________________________________________________________
flatten_5 (Flatten)          (None, 4960)              0         
__________

In [452]:
%%time

callbacks_list = [
#    TensorBoard(log_dir='/data/Graph/midterm', histogram_freq=1, write_grads=True),
    ModelCheckpoint(
        weight_path_model_best3,
        monitor = "val_loss",
        mode = 'min',
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
    ),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                    patience=2, min_lr=0.001)
]

# model3.fit(X_train,y_train,batch_size=256,epochs=5,verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 7s, sys: 26.1 s, total: 1min 33s
Wall time: 2min 5s


In [45]:
# Restore the checkpointed weights of the third model from weight_path_model_best3.
model3.load_weights(weight_path_model_best3)

In [28]:
from sklearn.metrics import f1_score,precision_score,recall_score
def evaluate(x_test, y_test, model):
    """
    Score a model on held-out data.

    x_test: model input.
    y_test: true ratings, compared against predicted classes numbered from 1.
    model: object whose predict(x_test) yields one row of class scores per
        sample.

    Returns (f1, precision, recall), each computed with weighted averaging.
    """
    def to_rating(scores):
        # position of the highest score, shifted so that classes start at 1
        scores = list(scores)
        return scores.index(max(scores)) + 1

    y_pred_mapped = [to_rating(row) for row in model.predict(x_test)]

    f1score, precision, recall = (
        metric(y_test, y_pred_mapped, average='weighted')
        for metric in (f1_score, precision_score, recall_score)
    )
    return f1score, precision, recall

In [29]:
# NOTE(review): X_test and y_test are not assigned by any live code in this
# notebook; the only visible definitions are the commented-out 90/10 split
# lines in the padding cell, so this fails on a fresh kernel.
print(len(X_test), len(y_test))

4000 4000


In [63]:
# Weighted F1, precision and recall of `model` on the held-out split.
evaluate(X_test, y_test, model)

(0.4794448217436239, 0.5883356096010524, 0.5465)

In [49]:
## PREDICT
# The test file has a header row; keep only the id and text columns.
test_raw = pd.read_csv(path + 'test_file.csv', delimiter=';', header=0)
test_raw = pd.DataFrame(data=test_raw, columns=['reviewID','review'])

# Same character removal as applied to the training text: quotes, the pipe,
# the en dash, ASCII punctuation, newline/tab, brackets and the digits 0-9.
# This is the exact set the previous pattern matched, written as a plain
# character class (inside [...] the '|' is a literal, not an alternation).
# NOTE(review): the ASCII hyphen '-' and the comma are NOT removed. In the old
# pattern each '-' sat between two '|' and was parsed as the range '|'-'|'.
# The behaviour is kept so train and test text are cleaned identically.
_STRIP_CHARS_RE = re.compile(r'["|–\':;?$!~\n\t#+<>/\\{}\[\]`0-9*.%@^&=()_]')

test_raw['review'] = test_raw['review'].apply(lambda text: _STRIP_CHARS_RE.sub('', text))
test_raw.head()

Unnamed: 0,reviewID,review
0,1,ร้านนี้จะอยู่เส้นสันกำแพง-แม่ออน เลยแยกบ่...
1,2,สั่งไป เมนู คือมัชฉะลาเต้ร้อน กับ ไอศครีมชาเขี...
2,3,ครัววงเดือน หิวดึกๆ ตระเวนหาร้านทาน มาเจอร้าน...
3,4,จะว่าเป็นเจ้าประจำก็คงไม่ผิด แต่ก็ไม่กล้า...
4,5,ถ้าคิดถึงสลัดผมคิดถึงร้านนี้เป็นร้านแรกๆเลยครั...


In [50]:
import re
# Character-level view of the test set: same rows as test_raw, but every
# review becomes a list of single characters.
test_list = test_raw.assign(review=test_raw['review'].apply(list))
test_list.head()

Unnamed: 0,reviewID,review
0,1,"[ , , , , , ร, ้, า, น, น, ี, ้, จ, ะ, อ, ..."
1,2,"[ส, ั, ่, ง, ไ, ป, , เ, ม, น, ู, , ค, ื, อ, ..."
2,3,"[ค, ร, ั, ว, ว, ง, เ, ด, ื, อ, น, , , ห, ิ, ..."
3,4,"[ , , , , , จ, ะ, ว, ่, า, เ, ป, ็, น, เ, ..."
4,5,"[ถ, ้, า, ค, ิ, ด, ถ, ึ, ง, ส, ล, ั, ด, ผ, ม, ..."


In [None]:
from tqdm import tqdm

# Split every test review into words with the character-level tokenizer.
tokenized_test = []

for i in tqdm(range(len(test_list)), ncols=100):
    chars = test_list['review'].iloc[i]
    if len(chars) == 0:
        # nothing to predict on; an empty review has no words
        tokenized_test.append([])
        continue
    chars_array = prepare_feature(chars)
    probs = tokenize_model.predict(chars_array)
    #map probability to class (1 = character starts a new word)
    y_pred = (probs[:, 0] >= 0.5).astype(int)
    tokenized_test.append(map_pred_to_word(y_pred, test_raw['review'].iloc[i]))

In [395]:
import pickle
# Cache the tokenized test reviews so the tokenization loop need not be repeated.
with open ('/data/test_tokened','wb') as f:
    pickle.dump(tokenized_test,f)

In [44]:
import pickle
# Reload the cached test tokens.
# pickle.load executes arbitrary code, so only load files you created yourself.
with open ('/data/test_tokened','rb') as f:
    tokenized_test = pickle.load(f)

In [60]:
from tqdm import tqdm

# Encode the test reviews with the training dictionary: the first max_len
# words of each review, with words missing from the dictionary mapped to "UNK".
dataset_test = []
for sent in tqdm(tokenized_test, ncols=100):
    encoded = []
    for raw_word in sent[:max_len]:
        word = raw_word.strip('\n')
        encoded.append(dictionary[word] if word in dictionary else dictionary["UNK"])
    dataset_test.append(encoded)

100%|████████████████████████████████████████████████████████| 6203/6203 [00:00<00:00, 10375.63it/s]


In [72]:
%%time
test_prep = sequence.pad_sequences(dataset_test, maxlen=max_len, padding='post', truncating='pre')
test_pred = model.predict(test_prep)
ans = []
for i,pred in enumerate(test_pred):
    pred = list(pred)
    mapped = pred.index(max(pred))+1
    ans.append(mapped)

CPU times: user 1.06 s, sys: 210 ms, total: 1.27 s
Wall time: 2.76 s


In [73]:
# Submission table: one predicted rating per review id.
output = pd.DataFrame({'reviewID':test_raw['reviewID'],'rating':ans})
output.head()

Unnamed: 0,rating,reviewID
0,4,1
1,3,2
2,4,3
3,4,4
4,4,5


In [74]:
# Write the submission as comma-separated text, reviewID first, without the row index.
output.to_csv('/data/sub4.csv', sep=',', index=False, columns=['reviewID','rating'])

In [64]:
### TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Trim newline, space, tab, backslash, '!' and '#' from both ends of every
# token. A single strip() with the whole character set reaches the result the
# previous repeated per-character passes were working towards.
_EDGE_CHARS = '\n \t\\!#'
tokenized_sent['sent'] = tokenized_sent['sent'].apply(
    lambda tokens: [tok.strip(_EDGE_CHARS) for tok in tokens]
)

# TODO(review): the original cell ended in a truncated statement that fused an
# assignment to tokenized_sent[...] with the print below. `vectorizer` and
# `idf` are never defined in this notebook, so the print stays disabled until
# the TF-IDF fitting step is written.
# print(dict(zip(vectorizer.get_feature_names(), idf)))

In [65]:
##TF
# # tokenize=['']
# tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
# tfs = tfidf.fit_transform(token_dict.values())

In [None]:
## Predict Model
# import keras.backend as K
# K.clear_session()
def get_predict_model3():
    """
    Build and compile a smaller rating classifier with a randomly initialised
    embedding (no pretrained weights are passed).

    NOTE(review): this definition replaces the earlier get_predict_model3 when
    the cell is run.

    Word ids (length max_len) -> Embedding(300) -> Conv1D(32, 5, 'valid') ->
    MaxPooling1D -> per-position Dense(5) -> Flatten -> Dropout(0.4) ->
    Dense(50, relu) -> 5-way softmax. Compiled with Adam and categorical
    cross-entropy.
    """
    input1 = Input(shape=(max_len,))
    x = Embedding(len(dictionary)+1, 300, trainable=True)(input1)
    x = Conv1D(32,5,strides=1,activation='relu',padding="valid")(x)
    x = MaxPooling1D(pool_size=5, strides=1, padding='valid')(x)
    x = TimeDistributed(Dense(5))(x)
    x = Flatten()(x)
#     x = GRU(50, activation='relu')(x)
#     x = Bidirectional(GRU(50 ,activation='relu'))(x)
    x = Dropout(0.4)(x)
    x = Dense(50, activation='relu')(x)
#     x = Dropout(0.25)(x)
#     x = Dense(100, activation='relu')(x)
    out = Dense(5, activation='softmax')(x)
    model = Model(inputs=input1, outputs=out)
    model.compile(optimizer=Adam(),
                 loss='categorical_crossentropy',
                 metrics=['categorical_accuracy'])
    return model
# Build the model defined in this cell. The previous call used
# get_predict_model2(), so the architecture above was never instantiated.
model3 = get_predict_model3()
model3.summary()