In [None]:
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the NLTK stop-word corpus (needed by stopwords.words below) and keep
# the English list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the Keras IMDB reviews, keeping only the 10,000 most frequent words.
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10000)

# word -> id vocabulary shipped with the dataset, and its inverse (id -> word).
word_index = tf.keras.datasets.imdb.get_word_index()
index_word = dict((index, word) for word, index in word_index.items())


def _decode_review(review):
    """Turn one sequence of word ids back into a space-separated string."""
    # Ids are shifted down by 3 before the lookup; ids without a vocabulary
    # entry are rendered as '?'.
    return ' '.join(index_word.get(word_id - 3, '?') for word_id in review)


# Decode every review of both splits back into text.
train_texts = list(map(_decode_review, train_data))
test_texts = list(map(_decode_review, test_data))

# Merge the train and test splits into a single corpus.
texts = train_texts + test_texts
labels = [*train_labels, *test_labels]

# One row per review; 'category' is the label: 0 (negative) or 1 (positive).
df = pd.DataFrame({'text': texts, 'category': labels})

# Preview the first rows.
print(df.head())
### 2.数据清理
import re
import numpy as np


def load_word_vectors(file_path):
    """Load word embeddings from a whitespace-separated text file.

    Every data line is expected to hold a token followed by its vector
    components. Blank lines are skipped, and a leading word2vec-style header
    line (exactly two integers: vocabulary size and dimension) is ignored so
    that it is not stored as a word with a one-element vector.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            parts = line.split()
            if not parts:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            if line_number == 0 and len(parts) == 2 and all(p.isdigit() for p in parts):
                # "<vocab_size> <dim>" header as written by word2vec-format
                # exporters. Only checked on the first line; a genuine entry
                # there would need a numeric token with a 1-D integer vector.
                continue
            word = parts[0]
            vector = np.array([float(value) for value in parts[1:]], dtype=np.float32)
            word_vectors[word] = vector
    return word_vectors


# Text file holding the trained word vectors (the file name suggests
# 50-dimensional vectors).
WORD_VECTORS_PATH = 'txt/word2vec_vectors_50d.txt'
word_vectors = load_word_vectors(WORD_VECTORS_PATH)


def clean_text(text):
    """Normalise one review for the embedding lookup.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is dropped, the remainder is split on whitespace, tokens found
    in the module-level ``stop_words`` set are removed, and the surviving
    tokens are joined with single spaces.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" -- confirm this is the
    # intended behaviour for the stop-word list in use.
    alnum_only = re.sub(r'[^a-z0-9\s]', '', text.lower())

    kept_tokens = (token for token in alnum_only.split() if token not in stop_words)

    # Tokens come from split(), so the join already yields single spaces; the
    # whitespace collapse and strip are retained as a final normalisation.
    return re.sub(r'\s+', ' ', ' '.join(kept_tokens)).strip()


# Run clean_text over every review; the result is kept both as `texts` and as
# the new 'cleaned_text' column of the DataFrame.
df['cleaned_text'] = texts = df['text'].apply(clean_text)

# Preview the first rows to inspect the cleaning result.
print(df.head())
### 3.转为词向量
import torch
import numpy as np

texts = df['cleaned_text']

# Numeric class labels taken from the 'category' column (0 / 1 sentiment).
labels = df['category'].values

# Text-CNN hyper-parameters
embedding_size = 50  # word-vector dimension
sequence_length = 100  # every review is padded / truncated to this many tokens
num_classes = 2  # negative / positive
filter_sizes = [3, 4, 5]
num_filters = 200  # number of convolution filters


def _embed_texts(sentences, vectors, max_len, dim):
    """Embed whitespace-tokenised sentences into one dense float32 array.

    Args:
        sentences: Sized iterable of strings (e.g. a pandas Series).
        vectors: Mapping from word to a 1-D vector of length ``dim``.
        max_len: Number of token slots per sentence; longer sentences are
            truncated, shorter ones are left zero-padded at the end.
        dim: Embedding dimension.

    Returns:
        ``np.float32`` array of shape ``(len(sentences), max_len, dim)``.
        Words missing from ``vectors`` keep an all-zero row.

    Raises:
        ValueError: If a vector's length does not match ``dim`` (raised by the
            NumPy assignment).
    """
    # Allocate the final array once, in float32, instead of collecting
    # per-sentence float64 arrays in a list and stacking them afterwards.
    embedded = np.zeros((len(sentences), max_len, dim), dtype=np.float32)
    for row, sentence in enumerate(sentences):
        for col, word in enumerate(sentence.split()[:max_len]):
            vector = vectors.get(word)
            if vector is not None:
                embedded[row, col] = vector
    return embedded


# Build the model inputs: one (sequence_length, embedding_size) matrix per review.
inputs_array = _embed_texts(texts, word_vectors, sequence_length, embedding_size)

# Wrap the float32 array as a tensor without copying; the tensor shares its
# memory with inputs_array.
inputs_tensor = torch.from_numpy(inputs_array)

# Labels as a LongTensor.
labels_tensor = torch.LongTensor(labels)

# Show the resulting shapes.
print(inputs_tensor.shape)  # (num_samples, sequence_length, embedding_size)
print(labels_tensor.shape)  # (num_samples,)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn import ensemble
from model import net
from transformers import BertTokenizer, BertForSequenceClassification

# Hold out 20% of the samples for evaluation. Row positions are split together
# with the tensors so the held-out rows can be mapped back to their review text
# (needed by the BERT model, which consumes text rather than word vectors).
row_positions = torch.arange(len(labels_tensor))
X_train, X_test, y_train, y_test, rows_train, rows_test = train_test_split(
    inputs_tensor, labels_tensor, row_positions, test_size=0.2, random_state=0
)

# Restore the two already-trained PyTorch models. map_location="cpu" lets
# checkpoints that were saved on a GPU load on a CPU-only machine.
model1 = net.TextCNN()
model2 = net.TransformerModel()
model1.load_state_dict(torch.load("./model/txtcnn_model.pth", map_location="cpu"))
model2.load_state_dict(torch.load("./model/transformer_model.pth", map_location="cpu"))

# Load the fine-tuned BERT classifier and its tokenizer.
model3 = BertForSequenceClassification.from_pretrained("finetuned_bert_model")
tokenizer = BertTokenizer.from_pretrained("finetuned_bert_model")


def _predict_torch(model, features, batch_size=256):
    """Return the predicted class id per sample for a word-vector model.

    NOTE(review): assumes ``model(features)`` returns logits of shape
    (batch, num_classes) -- confirm against the definitions in model/net.py.
    """
    model.eval()
    batches = []
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            logits = model(features[start:start + batch_size])
            batches.append(logits.argmax(dim=1).cpu())
    if not batches:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batches)


def _predict_bert(model, bert_tokenizer, raw_texts, batch_size=32, max_length=512):
    """Return the predicted class id per review for the BERT classifier.

    NOTE(review): max_length should match the value used during fine-tuning --
    TODO confirm.
    """
    model.eval()
    batches = []
    with torch.no_grad():
        for start in range(0, len(raw_texts), batch_size):
            encoded = bert_tokenizer(
                raw_texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            batches.append(model(**encoded).logits.argmax(dim=1).cpu())
    if not batches:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batches)


def _hard_vote(per_model_predictions, n_classes):
    """Majority vote over per-model class-id tensors.

    Ties resolve to the lowest class id, because argmax returns the first
    maximal index.
    """
    stacked = torch.stack(per_model_predictions)  # (n_models, n_samples)
    ballots = torch.nn.functional.one_hot(stacked, n_classes).sum(dim=0)  # (n_samples, n_classes)
    return ballots.argmax(dim=1)


# sklearn's VotingClassifier clones and re-fits scikit-learn estimators, so it
# cannot wrap these pre-trained torch / transformers modules. The hard vote is
# therefore computed directly from each model's predictions.
# NOTE(review): the raw 'text' column is fed to BERT; switch to 'cleaned_text'
# if that is what the model was fine-tuned on -- TODO confirm.
bert_eval_texts = df['text'].iloc[rows_test.tolist()].tolist()

model_votes = [
    _predict_torch(model1, X_test),
    _predict_torch(model2, X_test),
    _predict_bert(model3, tokenizer, bert_eval_texts),
]
ensemble_pred = _hard_vote(model_votes, num_classes)

# Accuracy of the ensemble on the held-out split.
ensemble_accuracy = (ensemble_pred == y_test).float().mean().item()
print(f"整体精准度：{ensemble_accuracy * 100} %")
