In [6]:
import tensorflow as tf 
from tensorflow import keras
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

In [3]:
data_path = 'data/weibo_senti_100k.csv'
data_df = pd.read_csv(data_path, sep=',', error_bad_lines=False)

reviews, labels = list(), list()
max_length = 0
for index, (label, review) in tqdm(data_df.iterrows(), total=data_df.shape[0], desc='to token :'):
    if len(review) > max_length:
        max_length = len(review)
    reviews.append([r for r in review])
    labels.append(int(label))

to token :: 100%|██████████| 119988/119988 [00:08<00:00, 13671.04it/s]


In [4]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, char_level=True, oov_token='UNK')
tokenizer.fit_on_texts(reviews)
vocab_size = len(tokenizer.word_index.keys())
X = tokenizer.texts_to_sequences(reviews)
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post')
y = np.array(labels)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(5000).batch(64)
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).shuffle(5000).batch(64)

In [5]:
checkpoint_path = 'text_cnn_checkpoints/checkpoint.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1
)

In [7]:
# Model
input = keras.Input(shape=(max_length, ), dtype='float64')

In [8]:
input

<tf.Tensor 'input_1:0' shape=(None, 260) dtype=float64>

In [9]:
embedder = keras.layers.Embedding(vocab_size, 256)
embedder

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7f7b493e82b0>

In [10]:
embed = embedder(input)
embed

<tf.Tensor 'embedding/Identity:0' shape=(None, 260, 256) dtype=float32>

In [11]:
conv1 = keras.layers.Conv1D(filters=100, kernel_size=3, activation='relu')(embed)

In [12]:
conv1

<tf.Tensor 'conv1d/Identity:0' shape=(None, 258, 100) dtype=float32>

In [13]:
pool1 = keras.layers.MaxPool1D(max_length - 3 + 1)(conv1)

In [14]:
pool1

<tf.Tensor 'max_pooling1d/Identity:0' shape=(None, 1, 100) dtype=float32>

In [15]:
flatten1 = keras.layers.Flatten()(pool1)
flatten1

<tf.Tensor 'flatten/Identity:0' shape=(None, 100) dtype=float32>

In [16]:
conv2 = keras.layers.Conv1D(filters=100, kernel_size=4, activation='relu')(embed)
pool2 = keras.layers.MaxPool1D(max_length - 4 + 1)(conv2)
flatten2 = keras.layers.Flatten()(pool2)

In [18]:
pool2

<tf.Tensor 'max_pooling1d_1/Identity:0' shape=(None, 1, 100) dtype=float32>

In [17]:
flatten2

<tf.Tensor 'flatten_1/Identity:0' shape=(None, 100) dtype=float32>

In [19]:
conv3 = keras.layers.Conv1D(filters=100, kernel_size=4, activation='relu')(embed)
pool3 = keras.layers.MaxPool1D(max_length - 4 + 1)(conv3)
flatten3 = keras.layers.Flatten()(pool3)

In [20]:
flatten3

<tf.Tensor 'flatten_2/Identity:0' shape=(None, 100) dtype=float32>

In [23]:
merge = keras.layers.concatenate([flatten1, flatten2, flatten3], axis=-1)

In [24]:
merge

<tf.Tensor 'concatenate_1/Identity:0' shape=(None, 300) dtype=float32>

In [25]:
output = keras.layers.Dropout(0.5)(merge)

In [26]:
output

<tf.Tensor 'dropout/Identity:0' shape=(None, 300) dtype=float32>

In [27]:
output = keras.layers.Dense(32, activation='relu')(output)
output

<tf.Tensor 'dense/Identity:0' shape=(None, 32) dtype=float32>

In [28]:
output = keras.layers.Dense(1, activation='sigmoid')(output)

In [30]:
model = keras.Model(inputs=input, outputs=output)

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 260)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 260, 256)     1508352     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 258, 100)     76900       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 257, 100)     102500      embedding[0][0]                  
______________________________________________________________________________________________

In [32]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=512,
    validation_data=(x_test, y_test),
    callbacks=[cp_callback]
)

Train on 95990 samples, validate on 23998 samples
Epoch 1/5
Epoch 00001: saving model to text_cnn_checkpoints/checkpoint.ckpt
Epoch 2/5
Epoch 00002: saving model to text_cnn_checkpoints/checkpoint.ckpt
Epoch 3/5
Epoch 00003: saving model to text_cnn_checkpoints/checkpoint.ckpt
Epoch 4/5
Epoch 00004: saving model to text_cnn_checkpoints/checkpoint.ckpt
Epoch 5/5
Epoch 00005: saving model to text_cnn_checkpoints/checkpoint.ckpt
