In [None]:
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import pandas as pd
import jieba
from keras.utils import to_categorical 

def convert_to_chinese(number):
    chinese_numerals = {
        "0": "零",
        "1": "一",
        "2": "二",
        "3": "三",
        "4": "四",
        "5": "五",
        "6": "六",
        "7": "七",
        "8": "八",
        "9": "九",
    }
    chinese_number = "".join([chinese_numerals[str(digit)] for digit in str(number)])
    return chinese_number

# 定義將文本中的阿拉伯數字轉換為中文數字的函數
def convert_numbers_to_chinese(text):
    # 確保text是字符串，如果不是，轉換為字符串
    text = str(text)
    words = jieba.lcut(text)
    converted_words = ''.join([convert_to_chinese(word) if word.isdigit() else word for word in words])
    return converted_words

with open('./jieba/stop_words.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Remove empty strings and add additional stopwords if needed
stopwords = [word for word in stopwords if word.strip()]
stopwords.extend([' ', 'other_stopword'])

# 載入第一個程式產生的檔案
with open("tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)

word2vec_model = Word2Vec.load("word2vec.model")
embedding_dim = word2vec_model.vector_size  # 取得Word2Vec模型的維度
vocab_list = [word for word,i in word2vec_model.wv.key_to_index.items()]  # 取得
word_index = {" ":0}
word_vector = {}


# 將文本轉換成詞向量
embedding_matrix = np.zeros((len(vocab_list) + 1, embedding_dim))
for i in range(len(vocab_list)):
    word = vocab_list[i]
    word_index[word] = i + 1
    word_vector[word] = word2vec_model.wv[word]
    embedding_matrix[i + 1] = word2vec_model.wv[word]

In [None]:
import jieba.analyse
from collections import Counter

data = pd.read_excel('train_data.xlsx')

# 將 "輸入" 欄位中的阿拉伯數字轉換為中文數字
data["輸入"] = data["輸入"].apply(convert_numbers_to_chinese)

# 資料預處理（假設已有資料X和標籤y）
X = []
y = []
z = []
for i in range(len(data)):
    words = jieba.lcut(data["輸入"].iloc[i])
    tmp = []
    for word in words:
        if word not in stopwords:
            tmp.append(word)
    X.append(tmp)
    y.append(data["分類"].iloc[i])
    z.append(jieba.analyse.extract_tags(data['輸入'].iloc[i], topK=4))

# Convert 'y' to categorical
y_cate = to_categorical(np.asarray(y))

# Calculate and display the count of each category
category_counts = Counter(y)
amount = sum(category_counts.values())
total_samples = len(y)

# Calculate and display the percentage of each category
for category, count in category_counts.items():
    percentage = (count / total_samples) * 100
    print(f"Category {category}: {count} occurrences, Percentage: {percentage:.2f}%")

print(f"Amount of categories: {amount}")


In [None]:
def tokenizer(text, word_index):
    data = []
    for sentence in text:
        new_text = []
        for word in sentence:
            if word in word_index:
                try:
                    new_text.append(word_index[word])
                except:
                    new_text.append(0)
        data.append(new_text)
    data_lengths = [len(sentence) for sentence in data]
    max_length = max(data_lengths)
    
    texts = pad_sequences(data, maxlen=7)
    return texts
with open('word_index.pkl', 'wb') as f:
    pickle.dump(word_index, f)


In [None]:
# 切割訓練集和測試集
use_data_version = [X,z]# X為斷詞結果，z為前4個關鍵字
X = tokenizer(use_data_version[0], word_index)
X_train, X_test, y_train, y_test = train_test_split(X, y_cate, test_size=0.2, random_state=42)
# word_index
# X_train = X
# y_train = y

In [None]:
# 假設X是你的文本數據，已經通過tokenizer轉換為數字序列

# 計算每個句子的長度
sentence_lengths = [len(sentence) for sentence in X]

# 計算平均數
average_length = sum(sentence_lengths) / len(sentence_lengths)

# 計算中位數
sorted_lengths = sorted(sentence_lengths)
mid = len(sorted_lengths) // 2
median_length = (sorted_lengths[mid] + sorted_lengths[~mid]) / 2  # 使用~mid取得中位數，即使數量為奇數也可正確計算

# 打印結果
print(f"平均數: {average_length}")
print(f"中位數: {median_length}")


訓練(lstm)

In [None]:
from keras.models import Sequential, save_model
from keras.layers import Embedding, Bidirectional, LSTM, BatchNormalization, Dense, Dropout
from keras.optimizers import Adam
from keras.constraints import max_norm
from keras.initializers import he_normal
from keras.callbacks import ReduceLROnPlateau
from keras.regularizers import l2
from datetime import datetime
# 創建模型
model = Sequential()
model.add(Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                    weights=[embedding_matrix], input_length=X.shape[1], 
                    trainable=False, embeddings_initializer=he_normal()))
model.add(Bidirectional(LSTM(64, return_sequences=True, 
                             kernel_constraint=max_norm(5), kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(32, kernel_constraint=max_norm(5), kernel_regularizer=l2(0.01))))
model.add(Dropout(0.3))  # 添加 Dropout 層，可根據需要調整 dropout 比例
model.add(Dense(6, activation='softmax', kernel_regularizer=l2(0.01)))  # 在全連接層上應用 L2 正規化

# 編譯模型，使用學習率調整器
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.005), metrics=['accuracy'])

# 訓練模型，加入 callbacks
model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=[reduce_lr])

# Generate today's date in the format YYYYMMDD
today_date = datetime.today().strftime('%Y%m%d')

# Construct the filename using the specified naming convention
filename = f'LSTMmodel_{today_date}.keras'

# Save the entire model in the .keras format with the constructed filename
# save_model(model, filename)
filename

訓練(bigru early stop)

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential, save_model
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.utils import class_weight
from tensorflow.keras.optimizers import Adam
from keras.layers import Dropout, BatchNormalization
import numpy as np
from keras.regularizers import l2
from datetime import datetime

# Create the Sequential model
model = Sequential()
model.add(Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                    weights=[embedding_matrix], input_length=X.shape[1], 
                    trainable=False))
model.add(Bidirectional(LSTM(units=50, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(Dense(6, activation='softmax'))

# Compile the model with categorical crossentropy loss and Adam optimizer with specified learning rate
# Also, include accuracy as a metric
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.0001)

# Add EarlyStopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model using the training data and validate on the test data
# Include the ReduceLROnPlateau and EarlyStopping callbacks
# model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=[reduce_lr, early_stop])

# Calculate class weights to handle class imbalance
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(class_weights))

# Train the model with class weights
history = model.fit(X, y_cate, epochs=80, batch_size=64, validation_split=0.2, class_weight=class_weights_dict, callbacks=[reduce_lr,early_stop])

# Generate today's date in the format YYYYMMDD
today_date = datetime.today().strftime('%Y%m%d')

# Construct the filename using the specified naming convention
filename = f'BiLSTMmodel_{today_date}.keras'

# Save the entire model in the .keras format with the constructed filename
save_model(model, filename)


訓練(NN)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, LSTM

from keras.layers import Dropout
Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                                weights=[embedding_matrix], input_length=X.shape[1], 
                                trainable=False)
model = Sequential()
model.add(Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                                weights=[embedding_matrix], input_length=X.shape[1], 
                                trainable=False))
model.add(LSTM(units=50, return_sequences=True))
model.add(LSTM(units=50, return_sequences=True))
model.add(LSTM(units=50, return_sequences=False))  # 新增一層 LSTM
model.add(Dense(6, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# 打印模型摘要
model.summary()
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


In [None]:
from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization, Dropout, MultiHeadAttention, Embedding, Flatten, LSTM
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from datetime import datetime
from keras.layers import concatenate

# 修改 create_model_with_self_attention 函數
def create_model_with_attention_and_lstm(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy']):
    inputs = Input(shape=(X.shape[1],))
    
    # 添加嵌入層，使用預訓練的嵌入矩陣
    embedding_layer = Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                                weights=[embedding_matrix], input_length=X.shape[1], 
                                trainable=False)(inputs)
    
    # 添加自注意力機制（self-attention）
    attention = MultiHeadAttention(num_heads=1, key_dim=50)(embedding_layer, embedding_layer, embedding_layer)
    
    # Flatten 注意力的輸出
    attention_flatten = Flatten()(attention)
    
    # LSTM 層
    lstm_layer = LSTM(64)(embedding_layer)
    
    # 將自注意力的輸出和LSTM的輸出連接
    merged_layer = concatenate([attention_flatten, lstm_layer])
    
    # 全連接層
    dense_layer = Dense(64, activation='relu')(merged_layer)
    batch_norm = BatchNormalization()(dense_layer)
    dropout = Dropout(0.3)(batch_norm)
    
    # 輸出層，使用 softmax 激活函數
    output_layer = Dense(6, activation='softmax')(dropout)
    
    # 創建模型
    model = Model(inputs=inputs, outputs=output_layer)
    
    # 編譯模型
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    
    return model

# 使用最佳學習率創建具有自注意力機制的模型
model_with_self_attention = create_model_with_attention_and_lstm()

# 生成當天日期的字符串
today_date = datetime.today().strftime('%Y%m%d')

# 構建文件名，使用指定的命名慣例
filename = f'NNmodel_{today_date}.tf'

# 建立 ModelCheckpoint 回調以保存最佳模型
checkpoint = ModelCheckpoint(filename, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
model_with_self_attention.summary()
# 模型訓練
model_with_self_attention.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), callbacks=[checkpoint])


訓練(textcnn)

In [None]:
import pandas as pd
import numpy as np
import jieba
import keras
from keras.layers import concatenate, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense, Input
from keras.models import Model, save_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.initializers import he_normal
from keras.callbacks import ModelCheckpoint, EarlyStopping
from datetime import datetime
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# 生成當天日期的字符串
today_date = datetime.today().strftime('%Y%m%d')
#数据预处理
def data_process(path, max_len=50):           #path为句子的存储路径，max_len为句子的固定长度
    dataset = pd.read_excel(path,  names=['輸入', '分類']).astype(str)
    cw = lambda x: list(jieba.cut(x))         # 定义分词函数
    dataset['words'] = dataset['輸入'].apply(cw)  # 将句子进行分词
    tokenizer = Tokenizer()                   # 创建一个Tokenizer对象，将一个词转换为正整数
    tokenizer.fit_on_texts(dataset['words'])  #将词编号，词频越大，编号越小
    vocab = tokenizer.word_index              # 得到每个词的编号
    x_train, x_test, y_train, y_test = train_test_split(dataset['words'], dataset['分類'], test_size=0.1)  #划分数据集
    x_train_word_ids = tokenizer.texts_to_sequences(x_train)     #将测试集列表中每个词转换为数字
    x_test_word_ids = tokenizer.texts_to_sequences(x_test)       #将训练集列表中每个词转换为数字
    x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=max_len)  # 将每个句子设置为等长，每句默认为50
    x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=max_len)    #将超过固定值的部分截掉，不足的在最前面用0填充
    return x_train_padded_seqs,y_train,x_test_padded_seqs,y_test,vocab
# 构建TextCNN模型
def TextCNN_model_1(x_train, y_train, x_test, y_test):
    main_input = Input(shape=(7,), dtype='float64')
    # 嵌入层（使用预训练的词向量）
    embedder = Embedding(input_dim=len(embedding_matrix), output_dim=embedding_dim, 
                    weights=[embedding_matrix], input_length=X.shape[1], 
                    trainable=False, embeddings_initializer=he_normal())
    embed = embedder(main_input)
    # 卷积层和池化层，设置卷积核大小分别为3,4,5
    cnn1 = Conv1D(embedding_dim, 3, padding='same', strides=1, activation='relu')(embed)
    cnn1 = MaxPooling1D(pool_size=6)(cnn1)
    cnn2 = Conv1D(embedding_dim, 4, padding='same', strides=1, activation='relu')(embed)
    cnn2 = MaxPooling1D(pool_size=5)(cnn2)
    cnn3 = Conv1D(embedding_dim, 5, padding='same', strides=1, activation='relu')(embed)
    cnn3 = MaxPooling1D(pool_size=4)(cnn3)
    # 合并三个模型的输出向量
    cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat) #在池化层到全连接层之前可以加上dropout防止过拟合
    main_output = Dense(6, activation='softmax')(drop)
    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    # 構建文件名，使用指定的命名慣例
    filename = f'textCNNmodel_{today_date}.keras'
    # one_hot_labels = keras.utils.to_categorical(y_train, num_classes=3)  # 将标签转换为one-hot编码
    # Add EarlyStopping callback
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    history = model.fit(x_train, y_train, batch_size=64, validation_data=(X_test, y_test), epochs=120, callbacks=[early_stop])
    result = model.predict(x_test)  # 预测样本属于每个类别的概率
    result_labels = np.argmax(result, axis=1)  # 获得最大概率对应的标签
    y_predict = list(map(int, result_labels))
    y=np.argmax(y_test, axis=-1)
    print(y_predict)
    print('准确率', metrics.accuracy_score(y, y_predict))
    # Save the entire model in the .keras format with the constructed filename
    save_model(model, filename)
    return history
# path = 'train_data.xlsx'
# x_train, y_train, x_test, y_test, vocab = data_process(path)
history = TextCNN_model_1(X_train, y_train, X_test, y_test)

訓練 Ernie

In [None]:
import pandas as pd 
import jieba
import jieba.analyse
data = pd.read_excel('train_data.xlsx')
with open('./jieba/stop_words.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Remove empty strings and add additional stopwords if needed
stopwords = [word for word in stopwords if word.strip()]
stopwords.extend([' ', 'other_stopword'])

def convert_to_chinese(number):
    chinese_numerals = {
        "0": "零",
        "1": "一",
        "2": "二",
        "3": "三",
        "4": "四",
        "5": "五",
        "6": "六",
        "7": "七",
        "8": "八",
        "9": "九",
    }
    chinese_number = "".join([chinese_numerals[str(digit)] for digit in str(number)])
    return chinese_number

# 定義將文本中的阿拉伯數字轉換為中文數字的函數
def convert_numbers_to_chinese(text):
    # 確保text是字符串，如果不是，轉換為字符串
    text = str(text)
    converted_words = ''.join([convert_to_chinese(word) if word.isdigit() else word for word in text]).replace('，', '').replace(',', '')
    return converted_words
# 將 "輸入" 欄位中的阿拉伯數字轉換為中文數字
print(data["輸入"])
data["輸入"] = data["輸入"].apply(convert_numbers_to_chinese)
print(data["輸入"])
# 資料預處理（假設已有資料X和標籤y）
X = []
y = []
z = []
for i in range(len(data)):
    words = jieba.lcut(data["輸入"].iloc[i])
    tmp=[]
    for word in words:
        if word not in stopwords:
            tmp.append(word)
    X.append(tmp)
    y.append(data["分類"].iloc[i])
    z.append(jieba.analyse.extract_tags(data['輸入'].iloc[i], topK=4) )
# y_cate = to_categorical(np.asarray(y))
print(X)
print(y)
print(z)
formatted_data = list(zip([' '.join(sentence) for sentence in X], y))
maxlen = max(len(seq) for seq in X)
# df = pd.DataFrame({'Text':X,'Label':y})
maxlen

In [None]:
from ernie import SentenceClassifier, Models
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame(formatted_data)

classifier = SentenceClassifier(
    model_name=(Models.BertBaseUncased),
    max_length=12,
    labels_no=6,
    
)
batchsize = 2
# Load dataset and fine-tune the classifier
classifier.load_dataset(df )
history = classifier.fine_tune(
    epochs=30,
    learning_rate=2e-5,
    training_batch_size=batchsize,
    validation_batch_size=batchsize,
)
# classifier.dump('./ernie3.model')



測試(手動輸入)

In [1]:
import numpy as np
import pickle
from ernie import SentenceClassifier
import jieba

classifier = SentenceClassifier(model_path='./ernie2.model')
cate = {0: '重新輸入', 1: '查件', 2: '營業所', 3: '客訴', 4: '服務時間', 5: '其他相關'}

def chinese_text_segmentation(text):
    seg_list = jieba.cut(text)
    return " ".join(seg_list)

def convert_to_chinese(number):
    chinese_numerals = {
        "0": "零",
        "1": "一",
        "2": "二",
        "3": "三",
        "4": "四",
        "5": "五",
        "6": "六",
        "7": "七",
        "8": "八",
        "9": "九",
    }
    chinese_number = "".join([chinese_numerals[digit] for digit in str(number)])
    return chinese_number

def tokenizer(text, word_index):
    data = []
    for sentence in text:
        new_text = []
        for word in sentence:
            if word in word_index:
                try:
                    new_text.append(word_index[word])
                except:
                    new_text.append(0)
        data.append(new_text)
    return data

# for _ in range(1):
while True:
    # 假設 user_input 是用戶輸入的文本
    user_input = input("請輸入文本：")

    # 將數字轉換為中文
    user_input = "".join(
        [convert_to_chinese(ch) if ch.isdigit() else ch for ch in str(user_input)]
    )
    print(user_input)

    # 讀取 word_index
    with open('word_index.pkl', 'rb') as f:
        word_index = pickle.load(f)

    # 使用 jieba 進行斷詞
    user_input_segmented = chinese_text_segmentation(user_input)

    # 使用 SentenceClassifier 進行預測
    print(user_input_segmented)
    predictions = list(classifier.predict([user_input_segmented]))
    predicted_class = np.array(predictions).argmax(axis=-1)[0]
    predicted_class2 = np.argsort(predictions[0])[-2]
    # 打印預測結果
    print(f"Predicted Class: {str(predicted_class) + cate[predicted_class]}")
    print(f"Second predicted Class: {str(predicted_class2) + cate[predicted_class2]}")
    print(f"Predictions: {predictions}")


  from .autonotebook import tqdm as notebook_tqdm
2024-01-25 10:58:13.094217: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-25 10:58:13.164940: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-25 10:58:13.182303: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-25 10:58:13.457060: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not l



Predicted Class: 0重新輸入
Second predicted Class: 5其他相關
Predictions: [(0.5343225528578964, 0.07113374233913006, 0.1349709062684465, 0.037900927354390365, 0.06373803362929155, 0.15793383755084509)]


Building prefix dict from the default dictionary ...
2024-01-25 10:58:30 [DEBUG] Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2024-01-25 10:58:30 [DEBUG] Loading model from cache /tmp/jieba.cache


貨件到哪裡了


Loading model cost 0.225 seconds.
2024-01-25 10:58:31 [DEBUG] Loading model cost 0.225 seconds.
Prefix dict has been built succesfully.
2024-01-25 10:58:31 [DEBUG] Prefix dict has been built succesfully.


貨件 到 哪裡 了
Predicted Class: 3客訴
Second predicted Class: 2營業所
Predictions: [(0.012552255736375473, 0.20234490148319612, 0.2530043374383476, 0.29987032886306914, 0.01817702587726872, 0.2140511506017428)]
貨件到哪裡了
貨件 到 哪裡 了
Predicted Class: 3客訴
Second predicted Class: 2營業所
Predictions: [(0.012552255736375473, 0.20234490148319612, 0.2530043374383476, 0.29987032886306914, 0.01817702587726872, 0.2140511506017428)]
貨件到哪裡了
貨件 到 哪裡 了
Predicted Class: 3客訴
Second predicted Class: 2營業所
Predictions: [(0.012552255736375473, 0.20234490148319612, 0.2530043374383476, 0.29987032886306914, 0.01817702587726872, 0.2140511506017428)]
貨件到哪裡了
貨件 到 哪裡 了
Predicted Class: 3客訴
Second predicted Class: 2營業所
Predictions: [(0.012552255736375473, 0.20234490148319612, 0.2530043374383476, 0.29987032886306914, 0.01817702587726872, 0.2140511506017428)]
我的貨件到哪裡了
我 的 貨件 到 哪裡 了
Predicted Class: 1查件
Second predicted Class: 2營業所
Predictions: [(0.0011809063291520927, 0.9853689525646834, 0.009408288711518676, 0.0009898197666282384,

KeyboardInterrupt: Interrupted by user

測試(test data)

In [None]:
import pandas as pd 
import jieba
data = pd.read_excel('train_data.xlsx',sheet_name='test')
def convert_to_chinese(number):
    chinese_numerals = {
        "0": "零",
        "1": "一",
        "2": "二",
        "3": "三",
        "4": "四",
        "5": "五",
        "6": "六",
        "7": "七",
        "8": "八",
        "9": "九",
    }
    chinese_number = "".join([chinese_numerals[str(digit)] for digit in str(number)])
    return chinese_number

# 定義將文本中的阿拉伯數字轉換為中文數字的函數
def convert_numbers_to_chinese(text):
    # 確保text是字符串，如果不是，轉換為字符串
    text = str(text)
    words = jieba.lcut(text)
    converted_words = ''.join([convert_to_chinese(word) if word.isdigit() else word for word in words])
    return converted_words
# 將 "輸入" 欄位中的阿拉伯數字轉換為中文數字
data["輸入"] = data["輸入"].apply(convert_numbers_to_chinese)
# 資料預處理（假設已有資料X和標籤y）
X = []
y = []
for i in range(len(data)):
    X.append(" ".join(jieba.lcut(data["輸入"].iloc[i])))
    y.append(data["分類"].iloc[i])
# y_cate = to_categorical(np.asarray(y))
print(X)
print(y)
# formatted_data = list(zip([' '.join(sentence) for sentence in X], y))
df = pd.DataFrame({'Text':X,'Label':y})
# df['Text'].tolist()

In [None]:
import numpy as np
from ernie import SentenceClassifier
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# classifier = SentenceClassifier(model_path='./ernie-autosave/bert/1705565624717/')
classifier = SentenceClassifier(model_path='./ernie-autosave/bert/1705891874320/')
cate={0:'重新輸入',1:'查件',2:'營業所',3:'客訴',4:'服務時間',5:'其他相關'}

# 預測結果
predictions = list(classifier.predict(df['Text'].tolist()))
# 獲取最大概率的類別
predicted_class = np.array(predictions).argmax(axis=-1)
df['predict']=predicted_class
# 打印預測結果
matching_rows = df[df['Label'] == df['predict']]

# 計算百分比
percentage = (len(matching_rows) / len(df)) * 100

print(f"Label等於Predict的百分比為: {percentage}%")


In [None]:
# 找到Label不等於Predict的行
non_matching_rows = df[df['Label'] != df['predict']]

# 顯示不匹配的行
print("不匹配的行:")
print(non_matching_rows)


ernie 2 (transformer)

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, ErnieForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
from tqdm import tqdm
# 載入預先訓練好的 BERT 模型和分詞器
model_name = 'maidalun1020/bce-reranker-base_v1'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)  # 設置 num_labels 為 6
# model = ErnieForSequenceClassification.from_pretrained(model_name, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
# 假設你有一個包含文本和標籤的資料集
# 記得將文本轉換成 BERT 模型可接受的格式（使用分詞器）
data = pd.read_excel('train_data.xlsx', dtype={'輸入': str, '分類': int})
texts = data['輸入'].tolist()  # 文本轉換成列表
labels = data['分類'].tolist()  # 標籤（0 到 5）
print(texts)
print(labels)
# 將文本轉換成 BERT 模型的輸入格式
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# 建立 PyTorch Dataset
dataset = TensorDataset(encoded_texts['input_ids'], encoded_texts['attention_mask'], torch.tensor(labels))
# 將資料集分為訓練集和驗證集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
batch_size = 4
# 使用 DataLoader 加載資料
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 定義優化器和損失函數
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()  # 使用 CrossEntropyLoss 替代 BCEWithLogitsLoss

In [None]:
# 訓練模型
epochs = 5
for epoch in range(epochs):
    # Inside the training loop
    model.train()
    # 使用 tqdm 來顯示進度條
    train_dataloader = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', dynamic_ncols=True)
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        # print(labels)
        labels = labels.long()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    # 在驗證集上評估模型
    # Inside the validation loop
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            _, predicted_labels = torch.max(outputs.logits, 1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_samples += labels.size(0)

    # Calculate average validation loss
    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / total_samples

    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}')

# 儲存模型的參數
model_name = model_name.split('/')[-1]
torch.save(model.state_dict(), f'./models/{model_name}')



測試參數用: 

In [None]:
from tqdm import tqdm

# Assuming model, train_dataset, val_dataset, optimizer initialization elsewhere

# Defining a list of epochs and learning rates to try
epochs_list = [3, 5, 8]
learning_rates = [5e-5, 0.001, 0.01, 0.1]

best_accuracy = 0.0
best_loss = 1.0
best_epochs_loss = 0
best_epochs = 0
best_lr = 0.0

# Iterating through different learning rates
for lr in learning_rates:
    print(f"Testing learning rate: {lr}")
    
    # Initializing optimizer with the current learning rate
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # Iterating through different epochs
    for epochs in epochs_list:
        # Training loop for each epoch
        for epoch in range(epochs):
            model.train()

            train_dataloader = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', dynamic_ncols=True)

            for batch in train_dataloader:
                input_ids, attention_mask, labels = batch
                optimizer.zero_grad()
                labels = labels.long()
                
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

            model.eval()
            val_loss = 0.0
            correct_predictions = 0
            total_samples = 0

            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids, attention_mask, labels = batch
                    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss += outputs.loss.item()
                    _, predicted_labels = torch.max(outputs.logits, 1)
                    correct_predictions += (predicted_labels == labels).sum().item()
                    total_samples += labels.size(0)

            avg_val_loss = val_loss / len(val_dataloader)
            accuracy = correct_predictions / total_samples

            print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}')

        # Checking if the current model is the best based on accuracy or loss
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_epochs = epochs
            best_lr = lr
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_epochs_loss = epochs

# Printing the best parameters
print(f'Best number of epochs: {best_epochs}, Best accuracy: {best_accuracy:.4f}')
print(f'Best number of epochs_loss: {best_epochs_loss}, Best loss: {best_loss:.4f}')
print(f'Best learning rate: {best_lr}')

測試transformer(test_data)

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# 創建 DataFrame
df = pd.DataFrame({'Text': X, 'Label': y})
model_name = 'bert-base-chinese'
# 載入保存的 BERT 模型
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
tokenizer = BertTokenizer.from_pretrained(model_name)

# 讀取保存的模型參數
model.load_state_dict(torch.load(f'./models/{model_name}'))

# 設置模型為評估模式
model.eval()

# 將文本轉換為 BERT 模型的輸入格式
encoded_texts = tokenizer(df['Text'].tolist(), padding=True, truncation=True, return_tensors='pt')

# 進行預測
with torch.no_grad():
    outputs = model(encoded_texts['input_ids'], attention_mask=encoded_texts['attention_mask'])
    _, predicted_labels = torch.max(outputs.logits, 1)

# 將預測結果添加到 DataFrame
df['Predicted_Label'] = predicted_labels.numpy()

# 打印 DataFrame
print(df)
from sklearn.metrics import accuracy_score

# 假設你已經有一個 DataFrame df 包含了 'Label' 和 'Predicted_Label' 列
y_true = df['Label'].tolist()
y_pred = df['Predicted_Label'].tolist()

# 計算精確度
accuracy = accuracy_score(y_true, y_pred)

print(f'Accuracy: {accuracy:.4f}')


In [None]:
df.to_csv('output.csv', index=False)

測試transformer(手動輸入)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
cate={0:'重新輸入',1:'查件',2:'營業所',3:'客訴',4:'服務時間',5:'其他相關'}
# 載入保存的 BERT 模型
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=6)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 讀取保存的模型參數
model.load_state_dict(torch.load('transformer.bert'))

# 設置模型為評估模式
model.eval()
while True:
    # 使用者手動輸入測試文本
    user_input_text = input("請輸入測試文本: ")

    # 將使用者輸入的文本轉換為 BERT 模型的輸入格式
    encoded_text = tokenizer(user_input_text, padding=True, truncation=True, return_tensors='pt')

    # 進行預測
    with torch.no_grad():
        output = model(**encoded_text)
        _, predicted_label = torch.max(output.logits, 1)

    # 打印預測結果
    print(f'預測標籤: {predicted_label.item()} {cate[predicted_label.item()]}')


output figure loss & accuracy

In [None]:
history_df = pd.DataFrame(history.history)
history_df.loc[0:, ['loss', 'val_loss']].plot()
print(("Minimum Training Loss: {:0.4f}").format(history_df['loss'].min()))
print(("Minimum Validation Loss: {:0.4f}").format(history_df['val_loss'].min()))
history_df.loc[0:, ['accuracy', 'val_accuracy']].plot()
print(("Maximum Training Accuracy: {:0.4f}").format(history_df['accuracy'].max()))
print(("Maximum Validation Accuracy: {:0.4f}").format(history_df['val_accuracy'].max()))