## 一，准备数据

In [1]:
import re,string
import numpy as np 
import pandas as pd 
import tensorflow as tf 
import matplotlib.pyplot as plt
from tensorflow.keras import models,layers,preprocessing,optimizers,losses,metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Tab-separated data files; each is read line by line by the dataset cells below.
train_data_path = "../../../data/imdb/train.csv"
test_data_path = "../../../data/imdb/test.csv"

# Vocabulary budget: the embedding table has MAX_WORDS rows and the
# TextVectorization layer is capped at MAX_WORDS-1 tokens.
MAX_WORDS = 10000
# Passed as TextVectorization's output_sequence_length and as the model's input length.
MAX_LEN = 200
# Number of parsed lines grouped into one dataset batch.
BATCH_SIZE = 20

In [3]:
def split_line(line):
    """Parse one `label<TAB>text` line into a `(text, label)` pair.

    Args:
        line: scalar string tensor holding one raw line of the data file.

    Returns:
        A `(text, label)` tuple, each with a leading axis of size 1:
        `text` is a string tensor and `label` is an int32 tensor.
    """
    # Split on the first tab only: a tab inside the review body must stay
    # part of the text instead of truncating it to its first fragment.
    arr = tf.strings.split(line, "\t", maxsplit=1)
    # to_number parses the label as a float; tf.cast converts it to int32 and
    # tf.expand_dims adds the leading axis.
    label = tf.expand_dims(tf.cast(tf.strings.to_number(arr[0]), tf.int32), axis=0)
    text = tf.expand_dims(arr[1], axis=0)
    return text, label

# Training pipeline: parse every line, shuffle through a 1000-element buffer,
# batch, and prefetch.
ds_train_raw = (
    tf.data.TextLineDataset(filenames=train_data_path)
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test pipeline: same parsing and batching, without the shuffle step.
ds_test_raw = (
    tf.data.TextLineDataset(filenames=test_data_path)
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


In [4]:
def clearn_text(text):
    """Standardize raw review text for the vectorization layer.

    Lower-cases the input, replaces each literal `<br />` with a space, and
    deletes every character listed in `string.punctuation`.

    Args:
        text: string tensor of raw text.

    Returns:
        String tensor with the three normalization steps applied.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    result = tf.strings.lower(text)
    result = tf.strings.regex_replace(result, '<br />', ' ')
    result = tf.strings.regex_replace(result, punctuation_class, '')
    return result

# Text -> integer-id layer: custom standardization, whitespace tokenization,
# and a fixed output length of MAX_LEN ids per example.
# NOTE(review): max_tokens is MAX_WORDS-1, presumably to keep ids inside the
# embedding's MAX_WORDS rows — confirm against the installed TF version.
vectorize_layer = TextVectorization(
    max_tokens=MAX_WORDS-1,
    standardize=clearn_text,
    split='whitespace',
    output_mode='int',
    output_sequence_length=MAX_LEN,
)

# Fit the vocabulary on the training text only (labels are dropped).
ds_text = ds_train_raw.map(lambda text, label: text)
vectorize_layer.adapt(ds_text)
print(vectorize_layer.get_vocabulary()[:20])

[b'the', b'and', b'a', b'of', b'to', b'is', b'in', b'it', b'i', b'this', b'that', b'was', b'as', b'for', b'with', b'movie', b'but', b'film', b'on', b'not']


In [5]:
# Replace the raw text in each batch with its integer token ids; labels pass through.
ds_train = (
    ds_train_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)
ds_test = (
    ds_test_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## 二，定义模型

In [7]:
# Reset Keras' global state before the model is defined.
tf.keras.backend.clear_session()

In [8]:
class CnnModel(models.Model):
    """Conv1D binary classifier over sequences of integer token ids.

    Layer order: Embedding -> Conv1D -> MaxPool1D -> Conv1D -> MaxPool1D
    -> Flatten -> Dense(1, sigmoid).
    """

    def __init__(self):
        super(CnnModel, self).__init__()

    def build(self, input_shape):
        """Create the sub-layers, then delegate to the base class `build`.

        Args:
            input_shape: shape forwarded unchanged to `Model.build`.
        """
        self.embedding = layers.Embedding(MAX_WORDS, 7, input_length=MAX_LEN)
        self.conv_1 = layers.Conv1D(16, kernel_size=5, name="conv_1", activation="relu")
        self.pool_1 = layers.MaxPool1D(name="pool_1")
        self.conv_2 = layers.Conv1D(128, kernel_size=2, name="conv_2", activation="relu")
        self.pool_2 = layers.MaxPool1D(name="pool_2")
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(1, activation="sigmoid")
        super(CnnModel, self).build(input_shape)

    def call(self, x):
        """Apply the layers in order and return the sigmoid output.

        Args:
            x: batch of token-id sequences.

        Returns:
            Output of the final `Dense(1, activation="sigmoid")` layer.
        """
        x = self.embedding(x)
        x = self.conv_1(x)
        x = self.pool_1(x)
        x = self.conv_2(x)
        x = self.pool_2(x)
        x = self.flatten(x)
        x = self.dense(x)
        return x

    def summary(self, *args, **kwargs):
        """Print a layer summary that includes each layer's output shape.

        Wraps `call` in a functional `tf.keras.Model` built on a symbolic
        input and prints that wrapper's summary.

        Args:
            *args, **kwargs: forwarded to the wrapper's `summary`, so the
                options accepted by the base `Model.summary` still work.
        """
        # `shape` is a tuple that excludes the batch axis; a bare int is not
        # a documented shape value.
        x_input = layers.Input(shape=(MAX_LEN,))
        output = self.call(x_input)
        model = tf.keras.Model(inputs=x_input, outputs=output)
        model.summary(*args, **kwargs)

In [9]:
model = CnnModel()
# Batch axis is left as None; each example is MAX_LEN token ids long.
model.build(input_shape =(None,MAX_LEN))
# Calls the CnnModel.summary override, which prints output shapes.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 7)            70000     
_________________________________________________________________
conv_1 (Conv1D)              (None, 196, 16)           576       
_________________________________________________________________
pool_1 (MaxPooling1D)        (None, 98, 16)            0         
_________________________________________________________________
conv_2 (Conv1D)              (None, 97, 128)           4224      
_________________________________________________________________
pool_2 (MaxPooling1D)        (None, 48, 128)           0         
_________________________________________________________________
flatten (Flatten)            (None, 6144)              0     

## 三，训练模型

In [10]:
# Print a time-stamped separator line.
@tf.function
def printbar(utc_offset_hours=8):
    """Print 80 '=' characters followed by the current time as HH:MM:SS.

    Args:
        utc_offset_hours: whole number of hours added to the hour derived
            from `tf.timestamp()` before formatting. Defaults to 8, the
            offset that was previously hard-coded.
    """
    # Seconds elapsed within the current day.
    today_ts = tf.timestamp() % (24*60*60)

    # The modulo by 24 wraps the shifted hour back into a single day.
    hour = tf.cast(today_ts//3600 + utc_offset_hours, tf.int32) % tf.constant(24)
    minute = tf.cast((today_ts % 3600)//60, tf.int32)
    second = tf.cast(tf.floor(today_ts % 60), tf.int32)

    def timeformat(m):
        # Left-pad single-digit values with a zero.
        if tf.strings.length(tf.strings.format("{}", m)) == 1:
            return tf.strings.format("0{}", m)
        else:
            return tf.strings.format("{}", m)

    timestring = tf.strings.join(
        [timeformat(hour), timeformat(minute), timeformat(second)],
        separator=":")
    tf.print("=========="*8 + timestring)

In [11]:
# Module-level training objects shared by train_step / valid_step below.
optimizer = optimizers.Nadam()
loss_func = losses.BinaryCrossentropy()

# Running mean of the per-batch training loss, and training accuracy.
train_loss = metrics.Mean(name='train_loss')
train_metric = metrics.BinaryAccuracy(name='train_accuracy')

# Same pair of accumulators for the validation pass.
valid_loss = metrics.Mean(name='valid_loss')
valid_metric = metrics.BinaryAccuracy(name='valid_accuracy')


@tf.function
def train_step(model, features, labels):
    """Run one optimization step on a batch and record training metrics.

    Args:
        model: the model to train; called with `training=True`.
        features: batch of model inputs.
        labels: batch of targets passed to the loss and accuracy metric.
    """
    with tf.GradientTape() as tape:
        batch_preds = model(features, training=True)
        batch_loss = loss_func(labels, batch_preds)

    # Gradients of the batch loss w.r.t. every trainable variable.
    grads = tape.gradient(batch_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_loss.update_state(batch_loss)
    train_metric.update_state(labels, batch_preds)
    

@tf.function
def valid_step(model, features, labels):
    """Evaluate one batch and record validation metrics (no weight update).

    Args:
        model: the model to evaluate; called with `training=False`.
        features: batch of model inputs.
        labels: batch of targets passed to the loss and accuracy metric.
    """
    preds = model(features, training=False)
    valid_loss.update_state(loss_func(labels, preds))
    valid_metric.update_state(labels, preds)


def train_model(model, ds_train, ds_valid, epochs):
    """Train for `epochs` epochs, printing metrics after each one.

    Each epoch runs `train_step` over every batch of `ds_train`, then
    `valid_step` over every batch of `ds_valid`, prints a log line, and
    resets the four module-level metric accumulators.

    Args:
        model: model handed to `train_step` / `valid_step`.
        ds_train: iterable of `(features, labels)` training batches.
        ds_valid: iterable of `(features, labels)` validation batches.
        epochs: number of passes over `ds_train`.
    """
    # This template has to be adapted if the tracked metrics change.
    log_template = 'Epoch={},Loss:{},Accuracy:{},Valid Loss:{},Valid Accuracy:{}'
    # Report every `log_interval` epochs (1 = every epoch).
    log_interval = 1
    # Reset order: train_loss, valid_loss, train_metric, valid_metric.
    accumulators = (train_loss, valid_loss, train_metric, valid_metric)

    for epoch in tf.range(1, epochs + 1):

        for batch_x, batch_y in ds_train:
            train_step(model, batch_x, batch_y)

        for batch_x, batch_y in ds_valid:
            valid_step(model, batch_x, batch_y)

        if epoch % log_interval == 0:
            printbar()
            epoch_stats = (
                epoch,
                train_loss.result(),
                train_metric.result(),
                valid_loss.result(),
                valid_metric.result(),
            )
            tf.print(tf.strings.format(log_template, epoch_stats))
            tf.print("")

        for accumulator in accumulators:
            accumulator.reset_states()

# Train for 6 epochs; the test dataset doubles as the validation set here.
train_model(model,ds_train,ds_test,epochs = 6)

Epoch=1,Loss:0.451597869,Accuracy:0.758,Valid Loss:0.382119209,Valid Accuracy:0.8308

Epoch=2,Loss:0.243866518,Accuracy:0.904,Valid Loss:0.322016329,Valid Accuracy:0.866

Epoch=3,Loss:0.167434573,Accuracy:0.93755,Valid Loss:0.357636,Valid Accuracy:0.868

Epoch=4,Loss:0.105727315,Accuracy:0.964,Valid Loss:0.473848879,Valid Accuracy:0.8572

Epoch=5,Loss:0.0603662916,Accuracy:0.98125,Valid Loss:0.599012673,Valid Accuracy:0.8568

Epoch=6,Loss:0.0302702207,Accuracy:0.99075,Valid Loss:0.805248082,Valid Accuracy:0.8578



## 四，评估模型

In [12]:
def evaluate_model(model, ds_valid):
    """Run the model over `ds_valid` and print validation loss and accuracy.

    Uses the module-level `valid_loss` / `valid_metric` accumulators and
    leaves both cleared on return. Training metrics are not touched.

    Args:
        model: model handed to `valid_step`.
        ds_valid: iterable of `(features, labels)` batches.
    """
    # Start from clean accumulators so state left behind by an interrupted
    # training or evaluation run cannot leak into the reported numbers.
    valid_loss.reset_states()
    valid_metric.reset_states()

    for features, labels in ds_valid:
        valid_step(model, features, labels)
    logs = 'Valid Loss:{},Valid Accuracy:{}'
    tf.print(tf.strings.format(logs, (valid_loss.result(), valid_metric.result())))

    # Reset only what this function used; evaluation has no reason to wipe
    # the training accuracy metric.
    valid_loss.reset_states()
    valid_metric.reset_states()

In [13]:
# Report loss and accuracy of the trained model on the test dataset.
evaluate_model(model,ds_test)

Valid Loss:0.805248082,Valid Accuracy:0.8578


## 五，使用模型

In [14]:
# Predict over the whole test dataset; the cell displays the returned array.
model.predict(ds_test)

array([[0.99278486],
       [0.9999759 ],
       [0.99926895],
       ...,
       [0.10119107],
       [0.19066869],
       [1.        ]], dtype=float32)

In [15]:
# Call the model directly on the first batch of the test dataset.
for x_test,_ in ds_test.take(1):
    print(model(x_test))
    # The following calls are equivalent:
    #print(model.call(x_test))
    #print(model.predict_on_batch(x_test))

tf.Tensor(
[[9.92784858e-01]
 [9.99975920e-01]
 [9.99268949e-01]
 [5.72908993e-06]
 [7.66376317e-01]
 [1.68699025e-05]
 [2.09160822e-09]
 [1.17711592e-04]
 [9.99951839e-01]
 [9.94080007e-01]
 [9.99999642e-01]
 [9.99793112e-01]
 [5.23628478e-06]
 [6.86631203e-01]
 [1.19694805e-08]
 [9.73942876e-01]
 [1.27842053e-04]
 [5.36034822e-01]
 [5.98479701e-08]
 [9.33194220e-01]], shape=(20, 1), dtype=float32)


## 六，保存模型

In [19]:
# Export the trained model with save_format="tf".
model.save('./data/tf_model_savedmodel', save_format="tf")
print('export saved model.')

# Reload the export and predict on the test dataset so the output can be
# compared with the in-memory model's predictions.
model_loaded = tf.keras.models.load_model('./data/tf_model_savedmodel')
model_loaded.predict(ds_test)

INFO:tensorflow:Assets written to: ./data/tf_model_savedmodel\assets
export saved model.


array([[0.99278486],
       [0.9999759 ],
       [0.99926895],
       ...,
       [0.10119107],
       [0.19066869],
       [1.        ]], dtype=float32)