In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location and CSV dialect of the labelled training reviews.
# NOTE(review): path is relative to the user's home directory; consider a configurable DATA_DIR.
trained_csv = '~/Midterm/data/w_review_train.csv'
sep = ';'
# header=None makes pandas label the columns 0, 1, ...; column 1 is read as the label further down.
header = None

df = pd.read_csv(trained_csv, sep=sep, header=header)

In [3]:
# wordX, wordY = df[0], df[1]

In [4]:
%matplotlib inline
import math
import nltk
import io
import os
import random
from random import shuffle
random.seed(999)

In [5]:
# Character vocabulary used to turn text into integer ids: ASCII punctuation,
# digits and letters, Thai characters and digits, curly quotes and the BOM.
# The literal entry 'other' (position 80) is the catch-all: `tokenize` maps
# every character that is not listed here to 80.
CHARS = [
  '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
  ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
  '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
  'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
  'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
  'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
  'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ',
  'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า',
  'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ',
  'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓',
  '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff'
]
# Reverse lookup: character -> its position in CHARS.
CHARS_MAP = {v: k for k, v in enumerate(CHARS)}

In [6]:
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Embedding, Conv1D, TimeDistributed, Activation, Dense, Flatten, GRU, Bidirectional, Dropout, LSTM, MaxPooling1D
from keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from keras.preprocessing import sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
def get_tokenizer_model():
    """Build and compile the character-window binary classifier.

    The network takes 21 integer character ids, embeds them (178 ids -> 8
    dims), applies one same-padded Conv1D and a per-timestep Dense(5), then
    flattens into three Dense(100, relu) layers and a single sigmoid unit.
    It is compiled with Adam, binary cross-entropy and accuracy.

    Returns:
        The compiled keras ``Model``.
    """
    char_window = Input(shape=(21,))

    hidden = Embedding(178, 8)(char_window)
    hidden = Conv1D(100, 5, strides=1, activation='relu', padding="same")(hidden)
    hidden = TimeDistributed(Dense(5))(hidden)
    hidden = Flatten()(hidden)

    # Three identical fully connected layers, created in the same order as before
    # so that previously saved weights still line up layer by layer.
    for _ in range(3):
        hidden = Dense(100, activation='relu')(hidden)

    boundary_prob = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=char_window, outputs=boundary_prob)
    network.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    return network

In [8]:
# Instantiate the character-level network and print its layer-by-layer summary.
tokenizer_model = get_tokenizer_model()
tokenizer_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 21)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 21, 8)             1424      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 21, 100)           4100      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 21, 5)             505       
_________________________________________________________________
flatten_1 (Flatten)          (None, 105)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10600     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
__________

In [9]:
# Restore previously saved weights into the tokenizer network.
# NOTE(review): hard-coded absolute path; the file must exist on the machine running the notebook.
tokenizer_path_model_best='/data/model_best_2.h5'
tokenizer_model.load_weights(tokenizer_path_model_best)

In [10]:
def create_n_gram_df(df, n_pad):
    """Add left/right context columns to a one-character-per-row frame.

    For every offset ``k`` in ``1 .. (n_pad - 1) // 2`` two columns are added
    to ``df`` (in place): ``'char-k'`` holds column ``0`` shifted down by
    ``k`` rows (the character ``k`` positions before) and ``'chark'`` holds
    column ``0`` shifted up by ``k`` rows (the character ``k`` positions after).

    Args:
        df: DataFrame whose column ``0`` holds the sequence. It is modified
            in place by the added columns.
        n_pad: total window width (centre + both sides).

    Returns:
        ``df`` without its first and last ``(n_pad - 1) // 2`` rows, i.e. only
        the rows that have a complete window. With a window of width 1 (or
        less) there is nothing to trim and the whole frame is returned.
    """
    half_window = (n_pad - 1) // 2
    for i in range(half_window):
        df['char-{}'.format(i + 1)] = df[0].shift(i + 1)
        df['char{}'.format(i + 1)] = df[0].shift(-i - 1)
    if half_window <= 0:
        # A slice like [0:-0] would be empty, so skip the trimming entirely.
        return df
    # Positional trim of the rows whose context window is incomplete.
    return df.iloc[half_window:-half_window]

In [11]:
def tokenize(sentence, model):
    """Split ``sentence`` into words using a character-level boundary model.

    Every character is mapped to its id in ``CHARS_MAP`` (unknown characters
    use the id of the ``'other'`` entry), the sequence is padded with spaces
    on both sides and ``create_n_gram_df`` builds a 21-wide context window
    per character. ``model.predict`` is called on those windows; a value
    >= 0.5 in its first output column marks the character as the start of a
    new word.

    Args:
        sentence: the text to segment (any iterable of single characters).
        model: object with a ``predict`` method returning one score per row.

    Returns:
        List of word strings in order. An empty sentence gives ``[]``. The
        leading characters are always kept as the first word, even when the
        model does not flag position 0 as a word start.
    """
    character = list(sentence)
    if not character:
        # Nothing to segment; also avoids calling the model on an empty batch.
        return []

    n_pad = 21
    n_pad_2 = (n_pad - 1) // 2
    other_id = CHARS_MAP['other']  # fallback id for characters outside CHARS

    # Pad both ends with spaces so the first and last characters get a full window.
    df_pad = pd.DataFrame([{0: ' '}] * n_pad_2)
    character_df = pd.concat((df_pad, pd.DataFrame(character), df_pad))
    character_df[0] = character_df[0].map(lambda ch: CHARS_MAP.get(ch, other_id))

    df_with_context = create_n_gram_df(character_df, n_pad=n_pad)

    # Column order fed to the model: following chars, preceding chars, centre char.
    char_row = ['char' + str(i + 1) for i in range(n_pad_2)] + \
               ['char-' + str(i + 1) for i in range(n_pad_2)] + [0]

    # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
    x_char = df_with_context[char_row].values
    y_pred = np.asarray(model.predict(x_char)).reshape(len(character), -1)
    starts_word = y_pred[:, 0] >= 0.5

    tokens = []
    accumulator = ''
    for ch, is_start in zip(character, starts_word):
        if is_start and accumulator:
            # A boundary closes the word collected so far.
            tokens.append(accumulator)
            accumulator = ch
        else:
            accumulator += ch
    tokens.append(accumulator)
    return tokens

In [13]:
import re
from collections import Counter, defaultdict
from tqdm import tqdm

In [120]:
word_threshold = 3

# Characters removed from a review before segmentation: ASCII punctuation and
# symbols, newline/tab, ASCII and Thai digits, and Latin letters.
# The previous pattern wrapped the alternatives in one [...] class that was
# closed early by an escaped backslash followed by ']', so '-', '[', ']' and
# Latin letters were not actually stripped; this is the same list written as a
# proper character class.
_STRIP_CHARS_RE = re.compile(r'[$!~\n\-#+<>0-9*.%@^&=:()_\t?;/\\|{}\[\]`"\'๐-๙a-zA-Z]')

def tokenize_corpus(df, threshold = word_threshold, model = None, stop_words = None):
    """Segment every document into words and split the vocabulary by frequency.

    Args:
        df: indexable collection of strings addressed as ``df[0] .. df[len-1]``.
        threshold: words occurring ``<= threshold`` times are treated as rare.
        model: boundary model handed to ``tokenize``. Defaults to the
            notebook-level ``tokenizer_model``.
        stop_words: container of tokens to drop. Defaults to the notebook-level
            ``conjunction_word`` when that name exists, otherwise nothing is
            dropped.

    Returns:
        A 3-tuple ``(sentences, corpus, rare_word)``: the token lists with rare
        words removed, the set of kept words and the set of rare words.
    """
    if model is None:
        model = tokenizer_model
    if stop_words is None:
        # NOTE(review): `conjunction_word` is not defined in the visible cells;
        # fall back to an empty set so a fresh kernel can still run this.
        stop_words = globals().get('conjunction_word', frozenset())

    word_freq = defaultdict(int)
    corpus_list = []

    for idx in tqdm(range(len(df))):
        target = _STRIP_CHARS_RE.sub('', df[idx])
        # A review can become empty after stripping; skip the model call then.
        raw_tokens = tokenize(target, model) if target else []
        tokenize_list = [
            re.sub(r'\s+', '', tok)
            for tok in raw_tokens
            if len(tok.strip()) != 0 and tok not in stop_words
        ]
        corpus_list.append(tokenize_list)
        for x in tokenize_list:
            word_freq[x] += 1

    corpus = set()
    rare_word = set()
    for word, count in word_freq.items():
        if count <= threshold:
            rare_word.add(word)
        else:
            corpus.add(word)

    vocab_size = len(corpus) + len(rare_word)
    # Guard the percentage against an empty vocabulary (no documents / no tokens).
    kept_pct = 100 * len(corpus) / vocab_size if vocab_size else 0.0
    print("Approximate {} % are normal words ({} of {})".format(kept_pct, len(corpus), vocab_size))

    kept_sentences = [[w for w in sent if w in corpus] for sent in corpus_list]
    return kept_sentences, corpus, rare_word

In [121]:
import functools

#Threshold processing
def process_threshold(lis, threshold = word_threshold):
    """Replace rare words by the token ``"UNK"`` and report word frequencies.

    Args:
        lis: list of sentences, each a list of word strings.
        threshold: a word occurring ``<= threshold`` times in ``lis`` is rare.

    Returns:
        A 3-tuple ``(sentences, word_freq, rare_word)``:
        * ``sentences`` - copy of ``lis`` with every rare word replaced by ``"UNK"``;
        * ``word_freq`` - defaultdict of the original counts (rare words keep
          their entry) whose ``"UNK"`` entry equals the number of ``"UNK"``
          tokens in ``sentences``;
        * ``rare_word`` - set of the words that were replaced.
    """
    # defaultdict(int) behaves like the lambda version but can be pickled.
    word_freq = defaultdict(int)
    for sentence in lis:
        for word in sentence:
            word_freq[word] += 1

    rare_word = {word for word, count in word_freq.items() if count <= threshold}
    num_UNK = sum(word_freq[word] for word in rare_word)

    # Literal "UNK" tokens that are already frequent stay in the output, so
    # their count has to be added instead of being overwritten.
    if "UNK" in word_freq and "UNK" not in rare_word:
        num_UNK += word_freq["UNK"]
    word_freq["UNK"] = num_UNK

    replaced = [
        ["UNK" if word in rare_word else word for word in sentence]
        for sentence in lis
    ]
    return replaced, word_freq, rare_word

In [122]:
# content_list_0 = tokenize_corpus(wordX, threshold=0)
# save_object(content_list_0, '/data/tokenizeWordAll-2.pkl')

In [123]:
# NOTE(review): `load_object` is not defined in any cell visible here (presumably a
# pickle loader from a removed cell) - confirm it exists before Restart & Run All.
content_list_0 = load_object('/data/tokenizeWordAll-2.pkl')
# Re-threshold the first element of the cached object; from here on content_list_0 is
# the (sentences, word_freq, rare_word) tuple returned by process_threshold.
content_list_0 = process_threshold(content_list_0[0], threshold = word_threshold)

In [124]:
def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    An empty file yields 0 (the loop-variable version raised
    UnboundLocalError because nothing was ever assigned).
    """
    with open(fname) as f:
        return sum(1 for _ in f)

# Path of the cached word-vector object that is read back with load_object(fileSave) below.
fileSave = "/data/personalWord2Vec.pkl"

In [125]:
# word2vec = dict()
# filePath = "/data/fastText/thai2vec.vec"
# fileLen = file_len(filePath)
# with open(filePath, 'r') as f:
#     for idx, line in tqdm(enumerate(f), total = fileLen):
#         if idx == 0:
#             continue
#         word2vec[line.split(' ')[0]] = [float(x) for x in line.split(' ')[1:]]
# save_object(word2vec, fileSave)

In [126]:
# Load the cached word vectors; presumably a dict word -> list of floats, as built by the
# commented-out cell above. NOTE(review): `load_object` is not defined in the visible cells.
word2vec = load_object(fileSave)

In [127]:
# Two-way lookup between every word that has a pretrained vector and its
# position in the iteration order of word2vec.
word_to_index = {word: position for position, word in enumerate(word2vec.keys())}
index_to_word = {position: word for word, position in word_to_index.items()}

In [128]:
# Fixed sequence length (in tokens) used by pad_sequences and as the classifier input width.
max_dim = 1000

In [129]:
#Prepare word embedding for unknown word

# Frequencies returned by process_threshold (rare words and "UNK" included).
word_freq = content_list_0[1]

# word -> integer id and its inverse; id 0 is reserved for the zero padding
# that pad_sequences adds.
dictionary = dict()
dictionary_r = dict()

dictionary["for_keras_zero_padding"] = 0
dictionary_r[0] = "for_keras_zero_padding"

rare_word = set()

# enumerate provides the preview counter; the previous `c += 1` relied on a
# `c` left over in the kernel and failed with NameError on a fresh run.
for c, word in enumerate(word_freq, start=1):
    if c <= 10:
        print(word, end='|')
    t = len(dictionary)
    dictionary[word] = t
    dictionary_r[t] = word

    if word_freq[word] <= word_threshold:
        rare_word.add(word)

# Encode every training sentence as a list of ids; rare words share the "UNK" id.
data_feature = []
for sentence in content_list_0[0]:
    data_feature.append([
        dictionary[word] if word not in rare_word else dictionary["UNK"]
        for word in sentence
    ])
data = []

# Decode the first two sentences back to words as a sanity check.
print(list(map(lambda lis: list(map(lambda y: dictionary_r[y], lis)), data_feature[:2])))

[['ร้าน', 'อาหาร', 'ใหญ่', 'มากกกกกกก', 'เลี้ยว', 'เข้า', 'มา', 'เจอ', 'ห้อง', 'น้ำ', 'ก่อน', 'เลย', 'เออ', 'แปลก', 'ดี', 'UNK', 'หลัก', 'ๆ', 'อยู่', 'ชั้น', 'มี', 'กาแฟ', 'น้ำ', 'ผึ้ง', 'ซึ่ง', 'ก็', 'แค่', 'เอา', 'น้ำ', 'ผึ้ง', 'มา', 'ราด', 'แพงเวอร์', 'อย่า', 'สั่ง', 'เลย', 'ลาบ', 'ไข่', 'ต้ม', 'ไข่', 'มัน', 'คาว', 'อะ', 'เลย', 'ไม่', 'ประทับใจ', 'เท่า', 'ไหร่', 'ทอด', 'มัน', 'หัว', 'UNK', 'อร่อย', 'ต้อง', 'เบิ้ล', 'พะแนง', 'ห่อ', 'ไข่', 'อร่อย', 'ดี', 'เห้ย', 'แต่', 'ราคา', 'บาท', 'มัน', 'เกิน', 'ไป', 'นะ', 'รับ', 'ไม่', 'UNK', 'เลิก', 'กิน', 'แล้ว', 'มี', 'ขนม', 'หวาน', 'ให้', 'กิน', 'ฟรี', 'เล็กน้อย', 'ขนม', 'ไทย', 'คง', 'ไม่', 'ไปซ้ำ', 'แพงเกิน'], ['อาหาร', 'ที่', 'นี่', 'เป็น', 'อาหารจีน', 'แคะ', 'ที่', 'หากิน', 'ยาก', 'ใน', 'บ้าน', 'เรา', 'ตัว', 'ร้าน', 'ตั้ง', 'อยู่', 'ที่', 'ถนนพุทธมณฑล', 'สาย', 'ไป', 'ตาม', 'UNK', '-', 'นครชัยศรี', 'เมื่อ', 'ถึง', 'พุทธมณฑลสาย', 'ก็', 'เลี้ยว', 'เข้า', 'ไป', 'ประมาณ', 'เมตร', 'ร้าน', 'อยู่', 'ทาง', 'ซ้าย', 'มือ', 'ค่ะ', 'มี', 'คน', 'บอก', '

In [130]:
#Create embedding Main train_data
pre_embedding = []
pre_embedding.append(np.zeros(300))
for l in dictionary.keys():
    if l in word_to_index:
        pre_embedding.append(word2vec[l])
    else:
        pre_embedding.append(np.zeros(300)) 
weight_em = np.array(pre_embedding)

In [131]:
# Sanity check: (number of embedding rows, embedding dimension).
print(weight_em.shape)

(119458, 300)


In [132]:
# Bring every training sequence to exactly max_dim ids: shorter ones get zeros appended
# (padding='post'), longer ones lose their beginning (truncating='pre').
x_train = sequence.pad_sequences(data_feature, maxlen=max_dim, padding='post', truncating='pre')
print(x_train.shape)

(40000, 1000)


In [133]:
# Location and separator of the unlabelled test reviews.
test_review_file_path = '~/Midterm/data/test_file.csv'
sep = ';'
# NOTE(review): `header` is assigned but not passed to read_csv below, so the first
# row of the file is used as the column names here.
header = None

df_test = pd.read_csv(test_review_file_path, sep=sep)

In [134]:
# Path of the cached tokenized test reviews (written by the commented-out cell below).
testTokenizeLocation = "/data/testTokenize.pkl"

In [135]:
# tokenize_test, kwn_wds, unk_wds = tokenize_corpus(df_test["review"], 0)
# tokenize_test, word_freq_test, unk_wds = process_threshold(tokenize_test, threshold = word_threshold)
# save_object(tokenize_test, testTokenizeLocation)

In [136]:
# Load the cached tokenized test reviews; iterated below as a list of word lists.
# NOTE(review): `load_object` is not defined in the visible cells.
test_review = load_object(testTokenizeLocation)

In [137]:
#Make input feature for test data
# Every test word that has an id in `dictionary` keeps that id; anything else
# falls back to the id of "UNK".
# NOTE(review): the training cell additionally maps words in `rare_word` to
# "UNK"; here such words keep their own id - confirm this asymmetry is intended.
test_feature = [
    [dictionary[word] if word in dictionary.keys() else dictionary["UNK"] for word in review]
    for review in test_review
]
data = []

# Same padding/truncation settings as for the training sequences.
test = sequence.pad_sequences(test_feature, maxlen=max_dim, padding='post', truncating='pre')
print(test.shape)
# Decode the first padded review back to words as a sanity check.
print([dictionary_r[token_id] for token_id in test[0]])

(6203, 1000)
['ร้าน', 'นี้', 'จะ', 'อยู่', 'เส้น', 'สัน', 'กำแพง', '-', 'แม่ออน', 'เลย', 'แยก', 'บ่อ', 'สร้าง', 'ร้าน', 'จะ', 'อยู่', 'ด้าน', 'ซ้าย', 'ติด', 'ริม', 'ถนน', 'มี', 'ป้าย', 'ติด', 'ไว้', 'เห็น', 'ชัดเจน', 'ปู', 'ทอง', 'UNK', 'เด็ด', 'ตาม', 'หา', 'ข้าว', 'แกงรสชาติ', 'นี้', 'มา', 'ตลอด', 'ใน', 'ที่สุด', 'ก็', 'ได้', 'เจอ', 'เพราะ', 'ส่วน', 'ใหญ่', 'จะ', 'เจอรสชาติ', 'กลาง', 'ๆ', 'ถ้า', 'คน', 'ชอบรส', 'จัด', 'จ้าน', 'แต่', 'ไม่', 'ถึง', 'กับ', 'เผ็ด', 'เว่อร์', 'แนะนำ', 'ร้าน', 'นี้', 'เลย', 'ค่ะ', 'ที่', 'นั่ง', 'ก็', 'สะอาดสะอ้าน', 'มี', 'ทั้ง', 'น้ำ', 'ซุป', 'และ', 'น้ำ', 'พริก', 'กะปิฟรี', 'และ', 'น้ำ', 'ดื่ม', 'ฟรีบริการ', 'ตัว', 'เอง', 'คหสต', 'ชอบแบบ', 'นี้', 'ที่', 'ไม่', 'บังคับ', 'ให้', 'ลูกค้า', 'ต้อง', 'ซื้อ', 'น้ำ', 'กิน', 'เพราะ', 'ทาน', 'คน', 'เดียว', 'แค่', 'ขวด', 'เหลือ', 'ทิ้ง', 'ตลอด', 'เสียดาย', 'เงิน', 'สั่ง', 'กับ', 'ข้าว', 'ราด', 'มา', 'อย่าง', 'อร่อย', 'ทั้ง', 'สอง', 'อย่าง', 'เสียดาย', 'ไข่', 'ดาว', 'หมด', 'ซะก่อน', 'แต่', 'แค่', 'อย่าง', 'กับ', 'ข้าว

In [138]:
##Prepare data for training
# First 80 % of the rows are used for training, the remaining 20 % for validation
# (no shuffling: the split follows the file order).
input_data = x_train
splitter_idx = int(len(input_data)*0.8)
train_data = input_data[:splitter_idx]
val_data = input_data[splitter_idx:]

# One-hot encode column 1 of the training frame; get_dummies orders the columns
# by sorted label value.
label = list(df[1])
# `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
y_train = pd.get_dummies(pd.Series(label)).values

# Features come from a cached pickle, labels from the CSV: make sure they line up.
assert len(y_train) == len(input_data), "labels and padded sequences differ in length"

target = y_train
train_target = target[:splitter_idx]
val_target = target[splitter_idx:]

In [139]:
def get_my_best_model():
    """Build and compile the word-level 5-class review classifier.

    Input is a sequence of ``max_dim`` word ids. The embedding layer has
    ``weight_em.shape[0]`` rows of 300 dims and is initialised from
    ``weight_em``. Two Conv1D/MaxPooling1D stages (with per-timestep Dense
    layers and dropout in between) are followed by a flattened dense head
    ending in a 5-way softmax. Compiled with Adam, categorical cross-entropy
    and categorical accuracy.

    Returns:
        The compiled keras ``Model``.
    """
    tokens_in = Input(shape=(max_dim,))

    # Layers are listed in the exact order in which they are applied.
    stack = [
        Embedding(weight_em.shape[0], 300, weights=[weight_em]),
        Conv1D(64, 10, strides=2, activation='relu', padding="valid"),
        MaxPooling1D(pool_size=5, strides=1, padding='valid'),
        Dense(100),
        Dropout(0.25),
        Dense(100),
        Conv1D(32, 5, strides=2, activation='relu', padding="valid"),
        MaxPooling1D(pool_size=5, strides=1, padding='valid'),
        Dropout(0.25),
        TimeDistributed(Dense(10)),
        Flatten(),
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dropout(0.2),
        Dense(100, activation='relu'),
        Dense(5, activation='softmax'),
    ]

    features = tokens_in
    for layer in stack:
        features = layer(features)

    classifier = Model(inputs=tokens_in, outputs=features)
    classifier.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    return classifier

In [140]:
# Instantiate the review classifier and print its layer-by-layer summary.
model = get_my_best_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 1000, 300)         35837400  
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 496, 64)           192064    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 492, 64)           0         
_________________________________________________________________
dense_24 (Dense)             (None, 492, 100)          6500      
_________________________________________________________________
dropout_13 (Dropout)         (None, 492, 100)          0         
_________________________________________________________________
dense_25 (Dense)             (None, 492, 100)          10100     
__________

In [142]:
# File the ModelCheckpoint callback writes to and that is reloaded before predicting.
weight_path_model='/data/WongNai_Review_New_3.h5'

In [143]:
%%time

# Checkpoint callback: after each epoch, write the weights (weights only) to
# weight_path_model, but only when val_loss is lower than the best seen so far.
callbacks_list_model = [
    ModelCheckpoint(
        weight_path_model,
        monitor = "val_loss",
        mode = 'min',
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
    )   
]

# Train for 5 epochs with batches of 256, validating on the held-out 20 % split.
model.fit(train_data,train_target,
          batch_size=256,epochs=5,
          verbose=1,callbacks=callbacks_list_model,
          validation_data=(val_data, val_target))

Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 38s, sys: 44.5 s, total: 2min 23s
Wall time: 3min 8s


In [117]:
# Restore the checkpoint with the lowest validation loss and score the test set.
model.load_weights(weight_path_model)
y_pred = model.predict(test)

# argmax gives the 0-based class column; +1 turns it into a rating.
# NOTE(review): assumes the training labels are the consecutive ratings 1..5 - confirm.
ans = (np.argmax(y_pred, axis=1) + 1).tolist()

d = dict()
# One sequential id per prediction, derived from the data instead of a hard-coded count.
d['reviewID'] = list(range(1, len(ans) + 1))
d['rating'] = ans
sub = pd.DataFrame(d)
# `sep` is passed by keyword: newer pandas only accepts the path positionally.
sub.to_csv('/data/Out.csv', sep=',', index=False, columns=['reviewID','rating'])