In [None]:
from sklearn.model_selection import train_test_split

# Build one labelled frame per source frame: rows from data_pos get label 0,
# rows from data_neg get label 1. Each 'comment' entry is joined back into a
# single string with no separator.
# NOTE(review): presumably data_pos / data_neg hold positive / negative
# reviews with 'comment' stored as a sequence of string pieces -- confirm
# against the cell that creates them.
data_new_pos = pd.DataFrame({
    'comment': data_pos['comment'].apply(lambda pieces: ''.join(pieces)),
    'label': 0,
}).reset_index(drop=True)

data_new_neg = pd.DataFrame({
    'comment': data_neg['comment'].apply(lambda pieces: ''.join(pieces)),
    'label': 1,
}).reset_index(drop=True)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both halves keep their own 0-based index, so every label below
# min(len(pos), len(neg)) appears twice and label-based lookups
# (e.g. data_new.loc[0]) return two rows.
data_new = pd.concat([data_new_pos, data_new_neg], axis=0, ignore_index=True)
data_new.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser.
# CountVectorizer's default token_pattern only keeps tokens of two or more
# word characters, which would silently discard every single-character word
# produced by jieba (e.g. 好 / 差). The pattern below keeps tokens of length 1+.
vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# Segment each comment with jieba, then re-join the tokens with single spaces
# so that the vectoriser can recover the word boundaries.
data_new['comment'] = data_new['comment'].apply(
    lambda text: ' '.join(jieba.lcut(text))
)

# Dense document-term count matrix (one row per comment, one column per term).
X_comments = vect.fit_transform(data_new['comment']).toarray()

# Mapping from each vocabulary term to its column index in X_comments
# (these are feature indices, not weights).
word_bag = vect.vocabulary_

In [None]:
from sklearn.model_selection import train_test_split

# Split the features and labels into training and test sets.
# X_comments is the dense count matrix already produced by the vectorisation
# cell; re-running vect.fit_transform(...).toarray() here would refit the
# vectoriser on the same text and build an identical dense matrix a second
# time, so it is reused instead.
Y_label = data_new['label']
test_ratio = 0.2  # fraction of samples held out for testing
X_train, X_test, Y_train, Y_test = train_test_split(
    X_comments,
    Y_label,
    test_size=test_ratio,
    random_state=1,  # fixed seed so the split is reproducible
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

# Initialise the classifiers.
# NOTE(review): max_iter=3 stops the MLP after three iterations, and neither
# the MLP nor the random forest is seeded, so scores vary between runs --
# confirm this is intentional.
mlp = MLPClassifier(max_iter = 3)
random_forest = RandomForestClassifier()
logistic = LogisticRegression()
svm = SVC()
bayes = GaussianNB()
knn = KNeighborsClassifier()

# Train every classifier on the same training split.
for classifier in (mlp, random_forest, logistic, svm, bayes, knn):
    classifier.fit(X_train, Y_train)

# Predict on the held-out test split.
Y_pred_mlp = mlp.predict(X_test)
Y_pred_random_forest = random_forest.predict(X_test)
Y_pred_logistic = logistic.predict(X_test)
Y_pred_svm = svm.predict(X_test)
Y_pred_bayes = bayes.predict(X_test)
Y_pred_knn = knn.predict(X_test)

# Accuracy per model. The scores are listed in exactly the same order as
# module_names (MLP first, then random forest, ...) so that every bar in the
# chart is labelled with its own model.
module_names = ['MLP神经网络', '随机森林', '逻辑回归', 'SVM支撑向量机', '朴素贝叶斯', 'KNN近邻']
acc_scores = [accuracy_score(Y_test, Y_pred_mlp),
              accuracy_score(Y_test, Y_pred_random_forest),
              accuracy_score(Y_test, Y_pred_logistic),
              accuracy_score(Y_test, Y_pred_svm),
              accuracy_score(Y_test, Y_pred_bayes),
              accuracy_score(Y_test, Y_pred_knn)]

# Bar chart of accuracy by model.
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font used for the Chinese labels
plt.rcParams["axes.unicode_minus"] = False
fig, ax = plt.subplots()
for model_name, score in zip(module_names, acc_scores):
    # One bar() call per model, then write the accuracy value above the bar.
    bar = ax.bar(model_name, score)[0]
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
            '%.3f' % score, ha='center', va='bottom')
ax.set_title("模型种类与准确率柱状图")
ax.set_xlabel("模型类型")
ax.set_ylabel("准确率")
ax.set_ylim(0.6, 1)  # zoom the y-axis to the 0.6-1.0 range
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# NOTE(review): Y_pred was not assigned by any visible cell, so this cell
# failed with a NameError on a fresh kernel. It is bound to the MLP test-set
# predictions here because the MLP is the model used for the final
# predictions later in the notebook -- confirm this is the model that was
# meant to be evaluated.
Y_pred = Y_pred_mlp

y_pred = Y_pred.astype(np.int_)
y_test = Y_test.astype(np.int_)

# Confusion matrix with rows = true label and columns = predicted label.
# labels=[0, 1] pins the matrix to 2x2 even if one class is missing from the
# test split, so ravel() always yields (tn, fp, fn, tp) with label 1 treated
# as the positive class.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
print('TP = %s, FP = %s, TN = %s, FN = %s' % (tp, fp, tn, fn))

# Plot the confusion matrix.
cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
from sklearn.metrics import accuracy_score
import math

# Evaluation metrics derived from the confusion-matrix counts tp, fp, tn, fn
# computed in the previous cell.
# The counts are converted to Python ints first: the MCC denominator
# multiplies four sums together, which can overflow fixed-width numpy
# integers on large test sets.
n_tp, n_fp, n_tn, n_fn = (int(count) for count in (tp, fp, tn, fn))
n_total = n_tp + n_fp + n_tn + n_fn

# Every ratio falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts label 1) instead of raising or producing NaN.
accuracy = (n_tp + n_tn) / n_total if n_total else 0.0
precision = n_tp / (n_tp + n_fp) if (n_tp + n_fp) else 0.0
recall = n_tp / (n_tp + n_fn) if (n_tp + n_fn) else 0.0
f1_score = (2 * (precision * recall) / (precision + recall)
            if (precision + recall) else 0.0)

# Matthews correlation coefficient.
mcc_denominator = math.sqrt(
    (n_tp + n_fp) * (n_tp + n_fn) * (n_tn + n_fp) * (n_tn + n_fn)
)
mcc = (n_tp * n_tn - n_fp * n_fn) / mcc_denominator if mcc_denominator else 0.0

print('Accuracy = %s, Precision = %s, Recall = %s, F1_score = %s, MCC = %s' %(accuracy, precision, recall, f1_score, mcc))

In [None]:
import pandas as pd
import jieba
import re

# Load the test spreadsheet and keep an untouched copy of the raw comments.
test_path = '../data/test.xlsx'
data_test = pd.read_excel(test_path)
comment_origin = data_test['comment'].copy()

# Matches every run of characters that is NOT in the allowed set written in
# the character class below (a CJK range, listed punctuation, and digits).
_disallowed_chars = re.compile('[^\u4E00-\u9FD5,.?!，。！？、；;:：0-9]+')


def _prepare_comment(raw_text):
    """Clean one raw comment and return it as space-separated jieba tokens."""
    without_prefix = raw_text.replace('text：', '')  # drop the 'text：' marker
    cleaned = _disallowed_chars.sub('', without_prefix)  # data cleaning
    # Segment with jieba and join the tokens with spaces.
    return ' '.join(jieba.lcut(cleaned))


data_test['comment'] = data_test['comment'].apply(_prepare_comment)
data_test.head()

In [None]:
# Vectorise the test comments with the already-fitted vectoriser (transform
# only, no refit) and convert the result to a dense array.
test_term_counts = vect.transform(data_test['comment'])
X_comments_test = test_term_counts.toarray()

# Predict a label for every test comment with the trained MLP.
Y_pred_test = mlp.predict(X_comments_test)

In [None]:
# Put the original, uncleaned comment text back in place of the segmented text.
data_test['comment'] = comment_origin
# Attach the predicted label for each comment.
data_test['target'] = Y_pred_test

# Write the result to an Excel file, omitting the index column.
output_path = '../data/test_out.xlsx'
data_test.to_excel(output_path, index=False)