In [10]:
%matplotlib inline
import pandas as pd
import sklearn
import numpy as np
from IPython.display import display
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from matplotlib import rcParams
import scipy.stats as stats

In [7]:
from pythainlp.tag.named_entity import ThaiNameTagger

In [66]:
data = pd.read_csv('../../data/Thaipbs-tokenize_include_stop.csv')

In [317]:
ner_tagger = ThaiNameTagger()

In [372]:
boundary = 1000
data_bounded = data[data['view']<boundary]
x = data_bounded['view']
std = np.std(x)
mean = np.mean(x)
groups = [mean-3*std, mean-std,mean,mean+std,mean+3*std]
print('STD : ',std)
print('MEAN : ',mean)
print(groups)

STD :  243.60813759873662
MEAN :  384.19986908138776
[-346.62454371482204, 140.59173148265114, 384.19986908138776, 627.8080066801244, 1115.0242818775976]


In [373]:
pop_level = []
for i in x:
    for j in range(len(groups)-1):
        if i >= groups[j] and i <= groups[j+1]:
            pop_level+=[j]

In [374]:
print(np.unique(pop_level,return_counts=True))

(array([0, 1, 2, 3]), array([2720, 8060, 4120, 3432]))


## Prepare Data

In [354]:
headlines = data_bounded['headline']
views = data_bounded['view']

In [324]:
ner = set()
pos = set()
for hl in [ner_tagger.get_ner(i) for i in headlines]:
    for word in hl:
        ner.add(word[1])
        pos.add(word[2])

In [328]:
ner = list(ner)
pos = list(pos)
ner_pos_to_idx = dict()
count = 0
for i in ner:
    for j in pos:
        ner_pos_to_idx[i+'-'+j] = count
        count+=1

In [329]:
idx_to_ner_pos = {v: k for k, v in ner_pos_to_idx.items()}

In [325]:
def prepare_x(data):
    data_ner = [ner_tagger.get_ner(i) for i in data]
    x = []
    for hl in data_ner:
        temp = []
        for word in hl:
            temp+=[ner_pos_to_idx[word[1]+'-'+word[2]]]
        x+=[temp]
    return x

In [326]:
def prepare_y(data):
    y = []
    for i in data:
        for j in range(len(groups)-1):
            if i >= groups[j] and i <= groups[j+1]:
                y+=[j]
                break
    return y

In [375]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

x_train = prepare_x(headlines)
y_train = prepare_y(views)

x_train = pad_sequences(x_train,maxlen=50)
y_train = to_categorical(y_train)

print('x_train shape : ',x_train.shape)
print('y_train shape : ',y_train.shape)

x_train shape :  (18332, 50)
y_train shape :  (18332, 4)


## Model

In [376]:
from keras.models import Model, Input
from keras.layers import Dense, Dropout, BatchNormalization, GRU, Conv1D, Dropout
from keras.optimizers import Adam
def get_model():    
    input1 = Input(shape=(x_train.shape[1],))
#     x = Conv1D(16,3)(input1)
#     x = GRU(32)(x)
#     x = Flatten()(x)
    x = Dense(1024, activation='relu')(input1)
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(y_train.shape[1],activation='softmax')(x)
    model = Model(inputs=input1, outputs=x)
    adam  = Adam(lr=0.00001)
    model.compile(optimizer=adam,  loss='categorical_crossentropy' ,metrics=['accuracy'])
    
    
    return model

In [377]:
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 50)                0         
_________________________________________________________________
dense_75 (Dense)             (None, 1024)              52224     
_________________________________________________________________
dropout_58 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_76 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_59 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_77 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_60 (Dropout)         (None, 256)               0         
__________

In [378]:
model.fit(x_train, y_train, batch_size=32 ,epochs=50, verbose=2)

Epoch 1/50
 - 4s - loss: 9.5113 - acc: 0.3642
Epoch 2/50
 - 3s - loss: 8.9940 - acc: 0.4133
Epoch 3/50
 - 3s - loss: 8.9463 - acc: 0.4240
Epoch 4/50
 - 3s - loss: 8.9355 - acc: 0.4174
Epoch 5/50
 - 3s - loss: 8.8837 - acc: 0.4170
Epoch 6/50
 - 4s - loss: 8.8883 - acc: 0.4161
Epoch 7/50
 - 4s - loss: 8.8358 - acc: 0.4148
Epoch 8/50
 - 4s - loss: 8.8169 - acc: 0.4126
Epoch 9/50
 - 3s - loss: 8.7511 - acc: 0.4087
Epoch 10/50
 - 3s - loss: 8.7137 - acc: 0.3947
Epoch 11/50
 - 5s - loss: 8.6512 - acc: 0.3990
Epoch 12/50
 - 4s - loss: 8.6123 - acc: 0.3914
Epoch 13/50
 - 4s - loss: 8.4996 - acc: 0.3866
Epoch 14/50
 - 4s - loss: 8.3441 - acc: 0.3704
Epoch 15/50
 - 4s - loss: 8.1757 - acc: 0.3711
Epoch 16/50
 - 4s - loss: 7.9490 - acc: 0.3595
Epoch 17/50
 - 4s - loss: 7.6029 - acc: 0.3512
Epoch 18/50
 - 4s - loss: 7.1861 - acc: 0.3421
Epoch 19/50
 - 4s - loss: 6.7032 - acc: 0.3290
Epoch 20/50
 - 5s - loss: 6.1915 - acc: 0.3192
Epoch 21/50
 - 9s - loss: 5.5579 - acc: 0.3140
Epoch 22/50
 - 5s - lo

KeyboardInterrupt: 

## Result

In [380]:
test = ['ไฟไหม้มหาวิหารน็อทร์-ดามอายุ 850 ปี กลางกรุงปารีส ล่าสุดควบคุมเพลิงได้แล้ว คาดสาเหตุจากการซ่อม']
x_test = prepare_x(test)
x_test = pad_sequences(x_test,maxlen=50)
y_pred = model.predict(x_test)

In [385]:
p_level = np.argmax(y_pred[0])
print('Pop Level : ',p_level)
print('Pred Range : %f - %f' % (groups[p_level], groups[p_level+1]))
print('Real View : %d' % (202))


Pop Level :  1
Pred Range : 140.591731 - 384.199869
Real View : 202


## Try GRU

In [402]:
from keras.models import Model, Input
from keras.layers import Dense, Dropout, BatchNormalization, GRU, Conv1D, Dropout, TimeDistributed, LSTM
from keras.optimizers import Adam
def get_model_gru():    
    input1 = Input(shape=(x_train.shape[1],))
    x = LSTM(32)(input1)
    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(y_train.shape[1],activation='softmax')(x)
    model = Model(inputs=input1, outputs=x)
    adam  = Adam(lr=0.00001)
    model.compile(optimizer=adam,  loss='categorical_crossentropy' ,metrics=['accuracy'])
    
    
    return model

In [403]:
model = get_model_gru()
model.summary()

ValueError: Input 0 is incompatible with layer lstm_1: expected ndim=3, found ndim=2