# 微博评论情感分析

## 1. 中文文本预处理——结巴（jieba）分词

In [4]:
# 利用中文分词工具jieba对文本进行分词，移除标点符号，最后将分词结果写入文件
import jieba
import pandas as pd
import re

# Load the raw Weibo sentiment dataset from the working directory.
df = pd.read_csv(
    './weibo_senti_6k.csv',
    encoding='utf-8',
)

# Matches any character that is neither a word character nor whitespace.
# Compiled once because the function is applied to every row of the dataset.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def remove_punctuation(text):
    """Return ``text`` with punctuation and other symbol characters removed.

    Word characters (letters, digits, CJK characters and, because ``\\w``
    includes it, the underscore) and whitespace are kept.

    Args:
        text: The string to clean. ``None`` and float NaN -- what pandas
            yields for an empty CSV cell -- are treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is missing.
    """
    # ``text != text`` is true only for NaN; guard it so an empty cell does
    # not make ``re`` raise TypeError in the middle of ``Series.apply``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return _PUNCTUATION_RE.sub('', str(text))

# Strip punctuation first, then segment the cleaned text with jieba.
def preprocess_text(text):
    """Return ``text`` as space-separated jieba tokens, punctuation removed."""
    cleaned = remove_punctuation(text)
    return ' '.join(jieba.cut(cleaned))

# Tokenise every review into a new `processed_text` column and discard the
# raw `review` column in the same step.
df = df.assign(
    processed_text=df['review'].apply(preprocess_text)
).drop(columns=['review'])

# Persist the tokenised dataset so later cells can reload it.
df.to_csv(
    'processed_weibo_senti_6k.csv',
    index=False,
    encoding='utf-8',
)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Barry\AppData\Local\Temp\jieba.cache
Loading model cost 0.390 seconds.
Prefix dict has been built successfully.


## 2. TF-IDF

In [5]:
#按4:1 划分训练集和测试集，利用向量空间模型，采用TF-IDF 权重，对预处理后的微博内容进行向量化表示

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Reload the tokenised dataset written by the preprocessing cell.
df = pd.read_csv('processed_weibo_senti_6k.csv', encoding='utf-8')

# A review that became empty after punctuation removal is read back from the
# CSV as NaN, and TfidfVectorizer raises ValueError on NaN documents. Map
# such rows back to empty strings so vectorisation cannot fail on them.
df['processed_text'] = df['processed_text'].fillna('')

# Hold out 20% of the rows for testing (a 4:1 split) with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['label'], test_size=0.2, random_state=1
)

# NOTE(review): with default settings TfidfVectorizer's token_pattern keeps
# only tokens of two or more word characters, so single-character Chinese
# words are dropped -- confirm this is intended.
tfidf_vec = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them to transform the test split.
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)
print(X_train_tfidf.shape, X_test_tfidf.shape)

(4800, 27066) (1200, 27066)


## 3. 逻辑回归 

In [None]:
#运用逻辑回归模型开展情感分析并进行效果评价
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Build a default logistic-regression classifier and fit it on the TF-IDF
# training matrix (fit() returns the estimator itself).
logreg = LogisticRegression().fit(X_train_tfidf, y_train)

# Hard label predictions drive precision, recall and F1.
y_hat_test = logreg.predict(X_test_tfidf)
print(
    "Precision: {:.3f}".format(precision_score(y_test, y_hat_test)),
    "Recall: {:.3f}".format(recall_score(y_test, y_hat_test)),
)
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))

# AUC needs scores rather than labels: use the probability in column 1.
y_hat_test = logreg.predict_proba(X_test_tfidf)
print("AUC score: {:.3f}".format(roc_auc_score(y_test, y_hat_test[:, 1])))

Precision: 0.903 Recall: 0.891
F1 score: 0.897
AUC score: 0.946


## 4. SnowNLP

In [9]:
#使用SnowNLP自带的sentiment 函数直接开展情感分析

from snownlp import SnowNLP

# Reload the raw (unsegmented) dataset for SnowNLP.
df = pd.read_csv(
    './weibo_senti_6k.csv',
    encoding='utf-8',
)

# Turn SnowNLP's sentiment score into a hard 0/1 label.
def sentiment_transfer(text):
    """Return 1 if SnowNLP's sentiment score for ``text`` exceeds 0.5, else 0."""
    score = SnowNLP(text).sentiments
    if score > 0.5:
        return 1
    return 0

# Score every review with SnowNLP exactly once. The label and the
# probability both come from the same `sentiments` value, so running the
# model twice per row only doubled the runtime.
sentiment_prob = df['review'].apply(lambda x: SnowNLP(x).sentiments)

# Hard label uses the same 0.5 threshold as sentiment_transfer; int64 matches
# the dtype that applying sentiment_transfer row by row would produce.
df['sentiment'] = (sentiment_prob > 0.5).astype('int64')
df['sentiment_prob'] = sentiment_prob

# Evaluate against the gold labels: precision, recall and F1 use the hard
# labels, while AUC uses the raw scores.
print("Precision: {:.3f}".format(precision_score(df['label'], df['sentiment'])), "Recall: {:.3f}".format(recall_score(df['label'], df['sentiment'])))
print("F1 score: {:.3f}".format(f1_score(df['label'], df['sentiment'])))
print("AUC score: {:.3f}".format(roc_auc_score(df['label'], df['sentiment_prob'])))

Precision: 0.554 Recall: 0.854
F1 score: 0.672
AUC score: 0.601


## 5. SBERT

In [22]:
#使用SBERT对微博内容进行向量化表示
from sentence_transformers import SentenceTransformer

# Load the raw dataset; the review strings are passed to the encoder as-is.
df = pd.read_csv('./weibo_senti_6k.csv', encoding='utf-8')

# Take the review column as a plain list of strings. This replaces a
# row-by-row iterrows() loop that built the same list.
X = df['review'].tolist()
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Load the SBERT model.
# NOTE(review): 'paraphrase-mpnet-base-v2' looks like an English-trained
# checkpoint; a multilingual one (e.g. 'paraphrase-multilingual-mpnet-base-v2')
# may suit Chinese text better -- verify against the sentence-transformers
# pretrained model list before changing, since it will alter the results.
model = SentenceTransformer('paraphrase-mpnet-base-v2')

# Encode both splits into sentence embeddings.
X_train_sbert = model.encode(X_train)
X_test_sbert = model.encode(X_test)
print(X_train_sbert.shape, X_test_sbert.shape)

# Initialise a default logistic-regression classifier.
logreg = LogisticRegression()

# Train on the sentence embeddings.
logreg.fit(X_train_sbert, y_train)

# Hard label predictions drive precision, recall and F1.
y_hat_test = logreg.predict(X_test_sbert)

print("Precision: {:.3f}".format(precision_score(y_test, y_hat_test)), "Recall: {:.3f}".format(recall_score(y_test, y_hat_test)))
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))

# AUC needs scores rather than labels: use the probability in column 1.
y_hat_test = logreg.predict_proba(X_test_sbert)
print("AUC score: {:.3f}".format(roc_auc_score(y_test, y_hat_test[:,1])))


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(4800, 768) (1200, 768)
Precision: 0.659 Recall: 0.684
F1 score: 0.672
AUC score: 0.727
