1.数据加载和合并

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so that files stored under
# '/content/drive/My Drive/...' can be opened by path from this runtime.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from gensim.models import KeyedVectors
import jieba
import requests

# Load the three single-column word lists and tag each frame with its class id
# in the same step: neutral -> 0, positive -> 1, negative -> 2.
# Only the neutral file is read with on_bad_lines='skip' (malformed rows are dropped).
negative_words = pd.read_csv(
    '/content/drive/My Drive/Work/1.高中生情感分类/negative_words.csv',
    header=None,
    names=['text'],
).assign(label=2)
neutral_words = pd.read_csv(
    '/content/drive/My Drive/Work/1.高中生情感分类/neutral_words.csv',
    header=None,
    names=['text'],
    on_bad_lines='skip',
).assign(label=0)
positive_words = pd.read_csv(
    '/content/drive/My Drive/Work/1.高中生情感分类/positive_words.csv',
    header=None,
    names=['text'],
).assign(label=1)

# Stack the frames into one dataset (positive, then neutral, then negative).
# Each frame keeps its own index here, so index values repeat after the concat.
data = pd.concat([positive_words, neutral_words, negative_words])


In [3]:
# Shuffle the rows so the three label groups are interleaved instead of being
# stacked in concatenation order, then renumber the index from 0.
# The fixed random_state makes the shuffle (and every result derived from the
# row order) reproducible on a fresh Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
import jieba
import requests

stop_words_url = 'https://raw.githubusercontent.com/goto456/stopwords/master/cn_stopwords.txt'

# Download the stop-word list with requests.
# - timeout: without one, a stalled connection blocks the cell indefinitely.
# - raise_for_status(): otherwise an HTTP error body (e.g. a 404 page) would be
#   split into lines and silently used as the "stop words".
response = requests.get(stop_words_url, timeout=30)
response.raise_for_status()
# Decode explicitly instead of relying on the charset guessed from the headers.
# NOTE(review): assumes the list is published as UTF-8 — confirm against the repo.
response.encoding = 'utf-8'
# One stop word per line; trim surrounding whitespace and ignore blank lines.
stop_words = {line.strip() for line in response.text.splitlines() if line.strip()}

def preprocess_chinese_text(text):
    """Segment Chinese text with jieba and remove stop words.

    Args:
        text: Raw text. Missing values (``None`` or a float NaN, which pandas
            uses for empty cells) are treated as empty text. Any other value
            that is not ``str``/``bytes`` is converted with ``str()``.

    Returns:
        str: The tokens that are neither in the module-level ``stop_words`` set
        nor pure whitespace, joined by single spaces. Returns ``''`` for
        missing input.
    """
    # A missing cell must not reach jieba.cut: it would raise and abort the
    # whole DataFrame.apply() call. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, (str, bytes)):
        text = str(text)

    # Tokenize with jieba, then drop stop words and whitespace-only tokens.
    words = jieba.cut(text)
    filtered_words = [word for word in words if word not in stop_words and not word.isspace()]
    # Re-join as a space-separated string.
    return ' '.join(filtered_words)

# Apply the preprocessing function to the text column.
# The untouched text is kept in 'original_text' (it is what gets displayed later).
# The guard makes this cell safe to re-run: without it, a second run would copy
# the already-tokenized 'text' into 'original_text' and lose the raw sentences.
if 'original_text' not in data.columns:
    data['original_text'] = data['text'].copy()
# Always tokenize from the raw text so repeated runs give the same result.
data['text'] = data['original_text'].apply(preprocess_chinese_text)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.789 seconds.
DEBUG:jieba:Loading model cost 0.789 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [13]:
# Separate features and targets, then one-hot encode the integer labels.
X, y = data['text'], data['label']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Turn each space-separated text into a sequence of word ids and pad every
# sequence to the length of the longest one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
max_seq_length = max(map(len, X_seq))
X_pad = pad_sequences(X_seq, maxlen=max_seq_length)

# Load the Tencent pretrained word vectors (word2vec text format) from Drive.
model_path = '/content/drive/My Drive/Work/1.高中生情感分类/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
word_vectors_zh = KeyedVectors.load_word2vec_format(model_path, binary=False)

In [14]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer index is i. The extra row (+1) and every word that is missing
# from the pretrained vocabulary stay all-zero.
# The width comes from the loaded vectors instead of a hard-coded 100, so
# loading a different embedding file cannot cause a shape mismatch on assignment.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, word_vectors_zh.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word_vectors_zh:
        embedding_matrix[i] = word_vectors_zh[word]

In [15]:
# Build the network layer by layer.
model = Sequential()
# Frozen embedding layer initialised from the pretrained matrix.
model.add(Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=max_seq_length,
    trainable=False,
))
model.add(SpatialDropout1D(0.2))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
# One softmax unit per one-hot column of the targets.
model.add(Dense(len(y_categorical[0]), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [16]:
# Train the model.
# Hold out 20% of the padded sequences; the fixed random_state keeps the split
# repeatable. The held-out part is also used as validation data during fitting.
# NOTE(review): the saved output of this cell reports 15 epochs while the code
# requests 10 — the output looks stale; re-run the cell to refresh it.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y_categorical,
    test_size=0.2,
    random_state=42,
)
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7e55a36bef20>

In [17]:
# Evaluate the model on the held-out split and report accuracy as a percentage.
# evaluate() yields the loss plus the single 'accuracy' metric configured in compile().
# NOTE(review): X_test/y_test were also passed as validation_data during
# training, so this is not a score on data the training loop never saw.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy*100:.2f}%')

Test accuracy: 85.71%


In [18]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel; -q keeps the install log out of the notebook.
%pip install -q ipywidgets



In [19]:
from ipywidgets import widgets, Layout
from IPython.display import display
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [20]:
# Create the Output widget that the query callback prints its results into.
output_area = widgets.Output()

In [21]:
# Score every padded sequence in the dataset (training and held-out rows alike),
# then keep the index of the highest-probability class for each row.
predictions = model.predict(X_pad)
predicted_classes = predictions.argmax(axis=1)



In [22]:
# Query button and its click handler.
query_button = widgets.Button(description='查询', button_style='success', tooltip='点击查询')


def on_query_button_clicked(b):
    """Refresh the output area with sample statements and metrics.

    Reads the current dropdown and count values, clears whatever the output
    area showed before, then prints the sampled statements followed by the
    performance figures for the chosen sentiment.

    Args:
        b: The clicked button instance passed in by ipywidgets (unused).
    """
    with output_area:
        # wait=True defers clearing until new output arrives.
        output_area.clear_output(wait=True)
        chosen_emotion = emotion_dropdown.value
        requested_count = num_statements_input.value
        # predicted_classes holds the model's class index for every dataset row.
        display_statements(chosen_emotion, requested_count, predicted_classes, data)
        display_performance(chosen_emotion)


query_button.on_click(on_query_button_clicked)

In [23]:
def display_performance(selected_emotion):
    """Print accuracy, recall and F1 for one sentiment class.

    The previous version restricted the data to rows of the selected class and
    then took macro-averaged recall/F1 with ``zero_division=1``. On such a
    subset every other class has no true samples, so each wrongly predicted
    class was scored as 1.0 and pulled the averages up. The metrics are now
    computed for the selected class only, against predictions for the whole
    dataset, so F1 also reflects rows of other classes that were predicted as
    this class.

    Note that the accuracy printed here is measured on the rows of the selected
    class, which by definition is the same quantity as that class's recall.
    The scores cover all of ``X_pad``, i.e. rows used for training as well.

    Args:
        selected_emotion: One of '积极', '中立', '消极'.

    Raises:
        KeyError: If ``selected_emotion`` is not one of the three known names.
    """
    # Map the displayed sentiment name to its integer label.
    emotion_to_label = {'积极': 1, '中立': 0, '消极': 2}
    label = emotion_to_label[selected_emotion]

    # Rows whose true label is the selected class.
    indices = np.where(y == label)[0]
    if len(indices) == 0:
        # Nothing to score; predicting on / scoring an empty selection would fail.
        print("所选情感类别没有样本，无法计算性能指标")
        return

    # True and predicted class indices for every row of the dataset.
    true_classes = np.argmax(y_categorical, axis=1)
    predicted = np.argmax(model.predict(X_pad), axis=1)

    # Accuracy within the selected class; recall/F1 restricted to that class
    # via labels=[label] (with a single label, 'macro' is just that class's score).
    accuracy = accuracy_score(true_classes[indices], predicted[indices])
    recall = recall_score(true_classes, predicted, labels=[label], average='macro', zero_division=0)
    f1 = f1_score(true_classes, predicted, labels=[label], average='macro', zero_division=0)

    # Show the metrics.
    print(f"准确率: {accuracy:.2f}")
    print(f"召回率: {recall:.2f}")
    print(f"F1得分: {f1:.2f}")

In [24]:
def display_statements(emotion, num_statements, predicted_classes, data):
    """Print a random sample of original texts predicted as the given sentiment.

    Args:
        emotion: One of '积极', '中立', '消极'.
        num_statements: Maximum number of texts to print; values <= 0 print nothing.
        predicted_classes: Array of predicted class indices, positionally
            aligned with the rows of ``data``.
        data: DataFrame with an 'original_text' column holding the raw text.
    """
    # Translate the sentiment name into its integer class id.
    label = {'积极': 1, '中立': 0, '消极': 2}[emotion]

    # Row positions whose prediction matches that class.
    matching_rows = np.where(predicted_classes == label)[0]

    # Never ask for more rows than were found.
    sample_size = min(num_statements, len(matching_rows))
    if sample_size <= 0:
        return

    # Draw distinct row positions at random and print the raw text of each.
    chosen_rows = np.random.choice(matching_rows, sample_size, replace=False)
    originals = data['original_text']
    for row in chosen_rows:
        print(originals.iloc[row])

In [25]:
from ipywidgets import VBox

# Dropdown for choosing the sentiment class (defaults to the positive class).
emotion_dropdown = widgets.Dropdown(description='情感类别:', options=['积极', '中立', '消极'], value='积极')

# Integer box for how many statements to show (defaults to 5).
num_statements_input = widgets.IntText(description='语句数量:', value=5, disabled=False)

# Callback fired when the sentiment class or the statement count changes.
def on_value_change(change):
    """Intentionally do nothing when a widget value changes.

    The handler used to read both widget values into locals that were never
    used, followed by commented-out display calls. It is kept as an explicit
    no-op so that code registering it as an observer keeps working.

    Args:
        change: The change dictionary supplied by the observing widget (unused).

    Returns:
        None.
    """
    return None

# Watch the 'value' trait of the dropdown and of the integer box.
for control in (emotion_dropdown, num_statements_input):
    control.observe(on_value_change, names='value')

# Start from an empty output area.
output_area.clear_output()

# Render the controls, the query button and the output area stacked vertically.
display(VBox([emotion_dropdown, num_statements_input, query_button, output_area]))


VBox(children=(Dropdown(description='情感类别:', options=('积极', '中立', '消极'), value='积极'), IntText(value=5, descrip…