In [1]:
%matplotlib inline
import pandas as pd
import sklearn
import numpy as np
from IPython.display import display
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from matplotlib import rcParams
import scipy.stats as stats

In [2]:
from pythainlp.tag.named_entity import ThaiNameTagger

In [3]:
data = pd.read_csv('../../data/the_standard_data.csv')

In [4]:
ner_tagger = ThaiNameTagger()

In [27]:
boundary = 3000
data_bounded = data[data['view']<boundary]
x = data_bounded['view']
std = np.std(x)
mean = np.mean(x)
groups = [mean-3*std, mean-std,mean,mean+1.5*std,mean+3*std]
print('STD : ',std)
print('MEAN : ',mean)
print(groups)

STD :  863.8879788248664
MEAN :  1042.4462392108508
[-1549.2176972637485, 178.55826038598434, 1042.4462392108508, 2338.27820744815, 3634.11017568545]


In [28]:
pop_level = []
for i in x:
    for j in range(len(groups)-1):
        if i >= groups[j] and i <= groups[j+1]:
            pop_level+=[j]

In [29]:
print(np.unique(pop_level,return_counts=True))

(array([0, 1, 2, 3]), array([ 998, 3908, 2299,  905]))


## Prepare Data

In [30]:
headlines = data_bounded['headline']
views = data_bounded['view']

In [31]:
ner = set()
pos = set()
for hl in [ner_tagger.get_ner(i) for i in headlines]:
    for word in hl:
        ner.add(word[1])
        pos.add(word[2])

In [32]:
ner = list(ner)
pos = list(pos)
ner_pos_to_idx = dict()
count = 0
for i in ner:
    for j in pos:
        ner_pos_to_idx[i+'-'+j] = count
        count+=1

In [33]:
idx_to_ner_pos = {v: k for k, v in ner_pos_to_idx.items()}

In [34]:
def prepare_x(data):
    data_ner = [ner_tagger.get_ner(i) for i in data]
    x = []
    for hl in data_ner:
        temp = []
        for word in hl:
            temp+=[ner_pos_to_idx[word[1]+'-'+word[2]]]
        x+=[temp]
    return x

In [35]:
def prepare_y(data):
    y = []
    for i in data:
        for j in range(len(groups)-1):
            if i >= groups[j] and i <= groups[j+1]:
                y+=[j]
                break
    return y

In [36]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

x_train = prepare_x(headlines)
y_train = prepare_y(views)

x_train = pad_sequences(x_train,maxlen=50)
y_train = to_categorical(y_train)

print('x_train shape : ',x_train.shape)
print('y_train shape : ',y_train.shape)

Using TensorFlow backend.


x_train shape :  (8110, 50)
y_train shape :  (8110, 4)


## Model

In [37]:
from keras.models import Model, Input
from keras.layers import Dense, Dropout, BatchNormalization, GRU, Conv1D, Dropout
from keras.optimizers import Adam
def get_model():    
    input1 = Input(shape=(x_train.shape[1],))
#     x = Conv1D(16,3)(input1)
#     x = GRU(32)(x)
#     x = Flatten()(x)
    x = Dense(1024, activation='relu')(input1)
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(y_train.shape[1],activation='softmax')(x)
    model = Model(inputs=input1, outputs=x)
    adam  = Adam(lr=0.00001)
    model.compile(optimizer=adam,  loss='categorical_crossentropy' ,metrics=['accuracy'])
    
    
    return model

In [38]:
model = get_model()
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              52224     
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)      

In [None]:
model.fit(x_train, y_train, batch_size=16 ,epochs=50, verbose=2)

Epoch 1/50
 - 2s - loss: 7.9110 - acc: 0.4330
Epoch 2/50
 - 2s - loss: 7.8869 - acc: 0.4353
Epoch 3/50
 - 2s - loss: 7.7555 - acc: 0.4312
Epoch 4/50
 - 2s - loss: 7.7094 - acc: 0.4284
Epoch 5/50
 - 2s - loss: 7.5938 - acc: 0.4317
Epoch 6/50
 - 2s - loss: 7.4124 - acc: 0.4300
Epoch 7/50
 - 2s - loss: 7.4168 - acc: 0.4178
Epoch 8/50
 - 2s - loss: 7.1616 - acc: 0.4206
Epoch 9/50
 - 2s - loss: 7.0770 - acc: 0.4132
Epoch 10/50
 - 2s - loss: 6.9224 - acc: 0.3940
Epoch 11/50
 - 2s - loss: 6.6747 - acc: 0.4001
Epoch 12/50
 - 2s - loss: 6.4271 - acc: 0.4009
Epoch 13/50
 - 2s - loss: 6.2026 - acc: 0.3957
Epoch 14/50
 - 2s - loss: 5.9991 - acc: 0.3905
Epoch 15/50
 - 2s - loss: 5.7642 - acc: 0.3925
Epoch 16/50
 - 2s - loss: 5.5186 - acc: 0.3888
Epoch 17/50
 - 2s - loss: 5.3116 - acc: 0.3793
Epoch 18/50
 - 2s - loss: 5.0253 - acc: 0.3710
Epoch 19/50
 - 2s - loss: 4.8208 - acc: 0.3631
Epoch 20/50


## Result

In [380]:
test = ['ไฟไหม้มหาวิหารน็อทร์-ดามอายุ 850 ปี กลางกรุงปารีส ล่าสุดควบคุมเพลิงได้แล้ว คาดสาเหตุจากการซ่อม']
x_test = prepare_x(test)
x_test = pad_sequences(x_test,maxlen=50)
y_pred = model.predict(x_test)

In [385]:
p_level = np.argmax(y_pred[0])
print('Pop Level : ',p_level)
print('Pred Range : %f - %f' % (groups[p_level], groups[p_level+1]))
print('Real View : %d' % (202))


Pop Level :  1
Pred Range : 140.591731 - 384.199869
Real View : 202


## Try GRU

In [402]:
from keras.models import Model, Input
from keras.layers import Dense, Dropout, BatchNormalization, GRU, Conv1D, Dropout, TimeDistributed, LSTM
from keras.optimizers import Adam
def get_model_gru():    
    input1 = Input(shape=(x_train.shape[1],))
    x = LSTM(32)(input1)
    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(y_train.shape[1],activation='softmax')(x)
    model = Model(inputs=input1, outputs=x)
    adam  = Adam(lr=0.00001)
    model.compile(optimizer=adam,  loss='categorical_crossentropy' ,metrics=['accuracy'])
    
    
    return model

In [403]:
model = get_model_gru()
model.summary()

ValueError: Input 0 is incompatible with layer lstm_1: expected ndim=3, found ndim=2