In [1]:
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import pandas as pd
import jieba
from keras.utils import to_categorical 

def convert_to_chinese(number):
    chinese_numerals = {
        "0": "零",
        "1": "一",
        "2": "二",
        "3": "三",
        "4": "四",
        "5": "五",
        "6": "六",
        "7": "七",
        "8": "八",
        "9": "九",
    }
    chinese_number = "".join([chinese_numerals[str(digit)] for digit in str(number)])
    return chinese_number

# 定義將文本中的阿拉伯數字轉換為中文數字的函數
def convert_numbers_to_chinese(text):
    # 確保text是字符串，如果不是，轉換為字符串
    text = str(text)
    words = jieba.lcut(text)
    converted_words = ''.join([convert_to_chinese(word) if word.isdigit() else word for word in words])
    return converted_words



# 載入第一個程式產生的檔案
with open("tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)

word2vec_model = Word2Vec.load("word2vec.model")
embedding_dim = word2vec_model.vector_size  # 取得Word2Vec模型的維度
vocab_list = [word for word,i in word2vec_model.wv.key_to_index.items()]  # 取得
word_index = {" ":0}
word_vector = {}


# 將文本轉換成詞向量
embedding_matrix = np.zeros((len(vocab_list) + 1, embedding_dim))
for i in range(len(vocab_list)):
    word = vocab_list[i]
    word_index[word] = i + 1
    word_vector[word] = word2vec_model.wv[word]
    embedding_matrix[i + 1] = word2vec_model.wv[word]

2023-12-26 11:38:00.244674: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-26 11:38:00.264620: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-26 11:38:00.264640: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-26 11:38:00.264654: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-26 11:38:00.269476: I tensorflow/core/platform/cpu_feature_g

In [2]:
data = pd.read_excel('train_data.xlsx')

# 將 "輸入" 欄位中的阿拉伯數字轉換為中文數字
data["輸入"] = data["輸入"].apply(convert_numbers_to_chinese)
# 資料預處理（假設已有資料X和標籤y）
X = []
y = []
for i in range(len(data)):
    X.append(jieba.lcut(data["輸入"].iloc[i]))
    y.append(data["分類"].iloc[i])
y = to_categorical(np.asarray(y))


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.211 seconds.
Prefix dict has been built succesfully.


In [3]:
def tokenizer(text, word_index):
    data = []
    for sentence in text:
        new_text = []
        for word in sentence:
            if word in word_index:
                try:
                    new_text.append(word_index[word])
                except:
                    new_text.append(0)
        data.append(new_text)
    data_lengths = [len(sentence) for sentence in data]
    max_length = max(data_lengths)
    
    texts = pad_sequences(data, maxlen=7)
    return texts
X = tokenizer(X, word_index)

In [4]:
# 切割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train = X
# y_train = y

訓練

In [8]:
from keras.models import Sequential, save_model
from keras.layers import Embedding, Bidirectional, LSTM, BatchNormalization, Dense, Dropout
from keras.optimizers import Adam
from keras.constraints import max_norm
from keras.initializers import he_normal
from keras.callbacks import ReduceLROnPlateau
from keras.regularizers import l2
from datetime import datetime
# 創建模型
model = Sequential()
model.add(Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                    weights=[embedding_matrix], input_length=X.shape[1], 
                    trainable=False, embeddings_initializer=he_normal()))
model.add(Bidirectional(LSTM(64, return_sequences=True, 
                             kernel_constraint=max_norm(5), kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(32, kernel_constraint=max_norm(5), kernel_regularizer=l2(0.01))))
model.add(Dropout(0.3))  # 添加 Dropout 層，可根據需要調整 dropout 比例
model.add(Dense(6, activation='softmax', kernel_regularizer=l2(0.01)))  # 在全連接層上應用 L2 正規化

# 編譯模型，使用學習率調整器
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.005), metrics=['accuracy'])

# 訓練模型，加入 callbacks
model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=[reduce_lr])

# Generate today's date in the format YYYYMMDD
today_date = datetime.today().strftime('%Y%m%d')

# Construct the filename using the specified naming convention
filename = f'LSTMmodel_{today_date}.keras'

# Save the entire model in the .keras format with the constructed filename
save_model(model, filename)
filename

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

'LSTMmodel_20231226.keras'

訓練(early stop)

In [6]:
from keras.callbacks import EarlyStopping

# Create the Sequential model
model = Sequential()

# Add an Embedding layer
model.add(Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                    weights=[embedding_matrix], input_length=X.shape[1], 
                    trainable=False, embeddings_initializer=he_normal()))

# Add a Bidirectional LSTM layer with 64 units
model.add(Bidirectional(LSTM(64, return_sequences=True, 
                             kernel_constraint=max_norm(5), kernel_regularizer=l2(0.01))))

# Add Batch Normalization
model.add(BatchNormalization())

# Add another Bidirectional LSTM layer with 32 units
model.add(Bidirectional(LSTM(32, kernel_constraint=max_norm(5), kernel_regularizer=l2(0.01))))

# Add Dropout layer with a dropout rate of 0.3
model.add(Dropout(0.3))

# Add a Dense layer with 6 units and softmax activation function
model.add(Dense(6, activation='softmax', kernel_regularizer=l2(0.01)))

# Compile the model with categorical crossentropy loss and Adam optimizer with specified learning rate
# Also, include accuracy as a metric
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.0001)

# Add EarlyStopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model using the training data and validate on the test data
# Include the ReduceLROnPlateau and EarlyStopping callbacks
# model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=[reduce_lr, early_stop])


In [7]:
# 假設X是你的文本數據，已經通過tokenizer轉換為數字序列

# 計算每個句子的長度
sentence_lengths = [len(sentence) for sentence in X]

# 計算平均數
average_length = sum(sentence_lengths) / len(sentence_lengths)

# 計算中位數
sorted_lengths = sorted(sentence_lengths)
mid = len(sorted_lengths) // 2
median_length = (sorted_lengths[mid] + sorted_lengths[~mid]) / 2  # 使用~mid取得中位數，即使數量為奇數也可正確計算

# 打印結果
print(f"平均數: {average_length}")
print(f"中位數: {median_length}")


平均數: 7.0
中位數: 7.0
