## Sentiment analysis

> Reference tutorial: https://www.tensorflow.org/tutorials/keras/text_classification

In [1]:
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd 

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Download Data

In [3]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the IMDB archive into the current directory and unpack it there.
dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_dir = os.path.join(dataset_dir, 'train')
remove_dir = os.path.join(train_dir, 'unsup')
unsup_target = os.path.join(dataset_dir, 'unsup')

# Take `train/unsup` out of the training directory so that loading
# 'aclImdb/train' with text_dataset_from_directory does not treat it as a
# class folder. The guards keep this cell re-runnable: an unconditional
# shutil.move raises when the source is already gone or when the
# destination already exists.
if os.path.isdir(remove_dir):
    if os.path.exists(unsup_target):
        # A previous run already relocated the folder; drop the fresh copy.
        shutil.rmtree(remove_dir)
    else:
        shutil.move(remove_dir, dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


'./aclImdb/unsup'

## Load Data

In [4]:
batch_size = 32
seed = 42

# Keep 80% of the labelled training reviews for training; the seed makes the
# split reproducible.
train_split_options = dict(
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', **train_split_options)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [5]:
# Validation subset: same directory, split fraction and seed as the training
# subset, but selecting the held-out 20%.
val_split_options = dict(
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', **val_split_options)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [6]:
# The test set is loaded whole (no validation split), batched like the others.
test_dir_path = 'aclImdb/test'
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir_path, batch_size=batch_size)

Found 25000 files belonging to 2 classes.


## Prepare for training

In [7]:
def custom_standardization(input_data):
    """Normalise raw review text before tokenisation.

    Lowercases the input, replaces every literal '<br />' with a single
    space, and deletes all characters listed in `string.punctuation`.

    Args:
        input_data: String input accepted by the `tf.strings` ops.

    Returns:
        The standardised strings as produced by `tf.strings.regex_replace`.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

In [8]:
max_features = 10000
sequence_length = 250

# Integer-encoding layer: at most `max_features` vocabulary entries, each
# review output as exactly `sequence_length` token ids.
vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length,
    max_tokens=max_features,
    standardize=custom_standardization)

# Build the vocabulary from the training text only, so the labels are dropped.
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

In [10]:
def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    A trailing axis is added to `text` before it is passed through
    `vectorize_layer`; `label` is returned unchanged.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [11]:
# Apply the text vectorisation to each of the three raw datasets.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

## Configure for performance

In [12]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each vectorised dataset and prefetch batches, letting tf.data choose
# the prefetch buffer size.
train_ds, val_ds, test_ds = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

## Create the Model

In [14]:
embedding_dim = 16

# Embedding -> dropout -> average over the sequence axis -> dropout -> one
# output unit. The Dense layer has no activation, so the model emits logits.
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [15]:
# The model outputs raw logits, so the loss uses from_logits=True and the
# accuracy metric thresholds at 0.0 rather than at a probability.
logit_loss = losses.BinaryCrossentropy(from_logits=True)
logit_accuracy = tf.metrics.BinaryAccuracy(threshold=0.0)

model.compile(optimizer='adam', loss=logit_loss, metrics=logit_accuracy)

## Train the model

In [16]:
epochs = 10

# Train on the vectorised training set, validating after every epoch.
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Export the model

In [17]:
# Wrap the trained model so it accepts raw strings and returns probabilities:
# vectorisation in front, sigmoid on the logits behind.
export_model = tf.keras.Sequential()
export_model.add(vectorize_layer)
export_model.add(model)
export_model.add(layers.Activation('sigmoid'))

# The outputs are now probabilities, hence from_logits=False.
export_model.compile(
    optimizer="adam",
    loss=losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

0.8727200031280518


In [18]:
# `loss` and `accuracy` come from export_model.evaluate(raw_test_ds) in the
# export cell; reuse them instead of evaluating the whole test set a second
# time. `score` keeps the [loss, accuracy] layout that evaluate() returns.
score = [loss, accuracy]
print("Loss: ", score[0])
print("Accuracy: ", score[1])

Loss:  0.3102678060531616
Accuracy:  0.8727200031280518


In [19]:
# Three hand-written reviews used as a quick check of the exported model.
examples = [
    "The movie was great!",
    "The movie was okay.",
    "The movie was terrible...",
]

# The export model takes raw strings directly.
export_model.predict(examples)

array([[0.6573541],
       [0.4834986],
       [0.401182 ]], dtype=float32)

In [20]:
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not imported again here. `examples` is a plain list, so the DataFrame
# constructor is used directly (DataFrame.from_dict is meant for dict input);
# the result is the same single-column frame with column label 0.
X_test = pd.DataFrame(examples)
X_test

Unnamed: 0,0
0,The movie was great!
1,The movie was okay.
2,The movie was terrible...


In [21]:
# Score the raw example strings once more and display the result.
example_scores = export_model.predict(examples)
example_scores

array([[0.6573541],
       [0.4834986],
       [0.401182 ]], dtype=float32)

## Connect to MLflow

In [22]:
# Names used by the MLflow cells that follow.
artifact_path = "keras-imdb-model"       # passed to mlflow.keras.log_model
registered_model_name = "imdb_monkey"    # passed to mlflow.keras.log_model
experiment_name = "imdb"                 # passed to mlflow.set_experiment

In [23]:
import mlflow
import mlflow.keras
from mlflow.models.signature import infer_signature

# Set an experiment name, which must be unique and case sensitive.
# Fill in the experiment name; it must match the experiment name shown on the
# MLflow page. The objects produced by training are stored in the storage
# bucket configured for the experiment with that name.
# Example: mnist

mlflow.set_experiment(experiment_name)

# start_run() raises if a run is already active, which happens whenever this
# cell is re-run before mlflow.end_run() has been called. Close any run that
# is still open so the cell can be executed repeatedly.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run()

# Record the test-set metrics held in `score` ([loss, accuracy]).
mlflow.log_metric("cross_entropy_test_loss", score[0])
mlflow.log_metric("test_accuracy", score[1])
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.3102678060531616
Test accuracy: 0.8727200031280518


In [24]:
# Infer the model signature from the example frame and the predictions the
# export model makes for it; the same frame is kept as the input example.
example_outputs = export_model.predict(X_test)
signature = infer_signature(X_test, example_outputs)
input_example = X_test

In [25]:
# Parameter reference
# 1. Trained model: model (this notebook passes `export_model`)
# 2. Relative path of the artifact: artifact_path=<can be changed freely>
# 3. Registered model name, e.g. registered_model_name="MNIST handwriting recognition"
# 4. Input/output description: signature=signature
# 5. Input example: input_example=input_example
# Be sure to fill in the matching value for each parameter!
mlflow.keras.log_model(
    export_model,
    artifact_path=artifact_path,
    registered_model_name=registered_model_name,
    signature=signature,
    input_example=input_example,
)

INFO:tensorflow:Assets written to: /tmp/tmph8stdvl6/model/data/model/assets


Registered model 'imdb_monkey' already exists. Creating a new version of this model...
2021/12/16 17:19:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: imdb_monkey, version 2
Created version '2' of model 'imdb_monkey'.


In [26]:
# Close the MLflow run that was opened earlier with mlflow.start_run().
mlflow.end_run()