In [1]:
import tensorflow as tf 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

In [2]:
data_path = 'data/weibo_senti_100k.csv'

In [3]:
data_df = pd.read_csv(data_path, sep=',', error_bad_lines=False)

In [4]:
data_df.head(5)

Unnamed: 0,label,review
0,1,﻿更博了，爆照了，帅的呀，就是越来越爱你！生快傻缺[爱你][爱你][爱你]
1,1,@张晓鹏jonathan 土耳其的事要认真对待[哈哈]，否则直接开除。@丁丁看世界 很是细心...
2,1,姑娘都羡慕你呢…还有招财猫高兴……//@爱在蔓延-JC:[哈哈]小学徒一枚，等着明天见您呢/...
3,1,美~~~~~[爱你]
4,1,梦想有多大，舞台就有多大![鼓掌]


In [5]:
data_df.shape

(119988, 2)

In [6]:
reviews, labels = list(), list()
max_length = 0

In [7]:
for index, (label, review) in tqdm(data_df.iterrows(), total=data_df.shape[0], desc='to token :'):
    if len(review) > max_length:
        max_length = len(review)
    reviews.append([r for r in review])
    labels.append(int(label))

to token :: 100%|██████████| 119988/119988 [00:08<00:00, 13934.45it/s]


In [8]:
reviews[2]

['姑',
 '娘',
 '都',
 '羡',
 '慕',
 '你',
 '呢',
 '…',
 '还',
 '有',
 '招',
 '财',
 '猫',
 '高',
 '兴',
 '…',
 '…',
 '/',
 '/',
 '@',
 '爱',
 '在',
 '蔓',
 '延',
 '-',
 'J',
 'C',
 ':',
 '[',
 '哈',
 '哈',
 ']',
 '小',
 '学',
 '徒',
 '一',
 '枚',
 '，',
 '等',
 '着',
 '明',
 '天',
 '见',
 '您',
 '呢',
 '/',
 '/',
 '@',
 '李',
 '欣',
 '芸',
 'S',
 'h',
 'a',
 'r',
 'o',
 'n',
 'L',
 'e',
 'e',
 ':',
 '大',
 '佬',
 '范',
 '儿',
 '[',
 '书',
 '呆',
 '子',
 ']']

In [9]:
max_length

260

In [10]:
len(reviews)

119988

In [11]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, char_level=True, oov_token='UNK')

In [12]:
tokenizer.fit_on_texts(reviews)

In [13]:
tokenizer.word_index

{'UNK': 1,
 '[': 2,
 ']': 3,
 '/': 4,
 ' ': 5,
 '@': 6,
 '，': 7,
 '的': 8,
 ':': 9,
 '哈': 10,
 '！': 11,
 '。': 12,
 '了': 13,
 '我': 14,
 '是': 15,
 '不': 16,
 '一': 17,
 '?': 18,
 '泪': 19,
 '嘻': 20,
 '你': 21,
 'a': 22,
 '有': 23,
 'n': 24,
 '这': 25,
 '人': 26,
 'e': 27,
 't': 28,
 'i': 29,
 'o': 30,
 '大': 31,
 '小': 32,
 '好': 33,
 '.': 34,
 '在': 35,
 '爱': 36,
 '个': 37,
 '天': 38,
 '来': 39,
 '？': 40,
 '心': 41,
 '们': 42,
 '~': 43,
 'l': 44,
 '子': 45,
 'c': 46,
 's': 47,
 '啊': 48,
 '#': 49,
 '就': 50,
 '上': 51,
 'h': 52,
 '0': 53,
 '1': 54,
 '回': 55,
 '可': 56,
 '看': 57,
 '到': 58,
 '家': 59,
 '都': 60,
 '要': 61,
 '么': 62,
 'r': 63,
 '还': 64,
 '-': 65,
 '狂': 66,
 '抓': 67,
 '吃': 68,
 'y': 69,
 '去': 70,
 'm': 71,
 '鼓': 72,
 'p': 73,
 '掌': 74,
 'd': 75,
 '下': 76,
 '没': 77,
 '中': 78,
 '2': 79,
 '老': 80,
 '太': 81,
 'g': 82,
 '也': 83,
 '会': 84,
 '时': 85,
 '开': 86,
 '多': 87,
 '复': 88,
 '…': 89,
 '得': 90,
 '生': 91,
 '笑': 92,
 '真': 93,
 '说': 94,
 '美': 95,
 '怒': 96,
 'u': 97,
 '：': 98,
 '发': 99,
 '儿': 100,
 '_': 

In [14]:
vocab_size = len(tokenizer.word_index.keys())

In [15]:
vocab_size

5892

In [16]:
len(reviews)

119988

In [17]:
X = tokenizer.texts_to_sequences(reviews)

In [18]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post')

In [19]:
X.shape

(119988, 260)

In [20]:
y = np.array(labels)

In [21]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [22]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
x_train

array([[ 196,   51,   57, ...,    0,    0,    0],
       [ 144,   35,   35, ...,    0,    0,    0],
       [  55,   88,    6, ...,    0,    0,    0],
       ...,
       [  23,   17,  258, ...,    0,    0,    0],
       [  55,   88,    6, ...,    0,    0,    0],
       [2750,  665,  292, ...,    0,    0,    0]], dtype=int32)

In [24]:
x_train.shape

(95990, 260)

In [25]:
x_test.shape

(23998, 260)

In [26]:
y_train[:10]

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1])

In [27]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(5000).batch(64)

In [28]:
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).shuffle(5000).batch(64)

In [29]:
embedding_dim = 256
hidden_dim = 128

In [30]:
checkpoint_path = 'rnn_classify_checkpoints/checkpoint.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)
checkpoint_dir

'rnn_classify_checkpoints'

In [31]:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1
)

In [32]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(hidden_dim, return_sequences=True,recurrent_initializer='glorot_uniform')),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(hidden_dim, return_sequences=False,recurrent_initializer='glorot_uniform')),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
    ]
)

In [33]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         1508352   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 256)         296448    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               296448    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,134,273
Trainable params: 2,134,273
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [35]:
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=512,
    validation_data=(x_test, y_test),
    callbacks=[cp_callback]
)

Train on 95990 samples, validate on 23998 samples
Epoch 1/5
Epoch 00001: saving model to rnn_classify_checkpoints/checkpoint.ckpt
Epoch 2/5
 9728/95990 [==>...........................] - ETA: 7:17 - loss: 0.0606 - accuracy: 0.9843