# 利用标注好的新浪微博数据 (weibo_senti_6k.csv，字段描述见表1)，开展情感分析
# 1. 数据集介绍
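数据字段及具体含义如下：
- label 1 代表积极情感，0 代表消极情感
- review 微博内容

注：下文代码实际载入的是 IMDB 影评数据（`IMDB Dataset_with_tags.csv`），其字段为 `review`（评论文本）、`sentiment`（positive / negative，载入后映射为 1 / 0）和 `tags`（词语列表）。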

# 2. 实验准备

In [None]:
# Third-party libraries imported for this notebook.
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import warnings
# Install a blanket "ignore" filter so that no warnings are shown from here on.
# NOTE(review): this also hides useful diagnostics (e.g. deprecation or
# solver-convergence warnings) -- consider narrowing it to specific categories.
warnings.simplefilter("ignore")

## 3.1. 载入预处理后的数据

In [None]:
# Load the preprocessed review CSV. The path is written as a raw string so the
# Windows backslashes are never interpreted as (invalid) escape sequences; the
# resulting path value is exactly the same as before.
reviews = pd.read_csv(r"E:\本科\数据挖掘与商务分析\hw/final\IMDB Dataset_with_tags.csv")
print ("# of reviews: ", reviews.shape[0])

# Class balance: number of non-null entries per column for each sentiment label.
reviews_label_count = reviews.groupby("sentiment").count()
print(reviews_label_count)

# Encode the string labels as integers (positive -> 1, negative -> 0).
# The explicit cast guarantees an integer column instead of relying on
# `replace` silently downcasting an object column, and it fails loudly if an
# unexpected label value is present.
reviews['sentiment'] = (
    reviews['sentiment'].replace({'positive': 1, 'negative': 0}).astype("int64")
)
print(reviews.head())

# of reviews:  50000
           review   tags
sentiment               
negative    25000  25000
positive    25000  25000
                                              review  sentiment  \
0  One of the other reviewers has mentioned that ...          1   
1  A wonderful little production. <br /><br />The...          1   
2  I thought this was a wonderful way to spend ti...          1   
3  Basically there's a family where a little boy ...          0   
4  Petter Mattei's "Love in the Time of Money" is...          1   

                                                tags  
0  ['br', 'Oz', 'me', 'violence', 'll', 'was', 's...  
1  ['br', 'well', 'little', 'production', 'very',...  
2  ['was', 'br', 'but', 'thought', 'comedy', 'Whi...  
3  ['br', 'Jake', 'his', 'movie', 'drama', 'there...  
4  ['br', 'Mattei', 'these', 'Mr', 'people', 'dif...  


## 3.2. 划分训练集与测试集

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts, the target is the encoded sentiment label.
# Taking the column directly yields the same list as a row-by-row iterrows()
# loop, without the per-row overhead.
X = reviews['review'].tolist()
y = reviews["sentiment"]

# Hold out 20% of the reviews for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 618)
print(len(X_train), len(X_test))

40000 10000


# 4. 实验设计
## 4.1. 评论情感分析 - 向量空间模型
### 4.1.1. 词频向量 + 逻辑回归

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Term-count features: the vectorizer is fitted on the training texts only and
# then applied unchanged to the test texts, so both matrices share one
# vocabulary (same number of columns).
tf_vec = CountVectorizer()
X_train_tf = tf_vec.fit_transform(raw_documents=X_train)
X_test_tf = tf_vec.transform(raw_documents=X_test)

# Report (n_documents, n_features) for the train and test matrices.
print(*(matrix.shape for matrix in (X_train_tf, X_test_tf)))

(40000, 93007) (10000, 93007)


In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default parameters on the term-count
# features (fit() returns the fitted estimator itself).
logreg = LogisticRegression().fit(X_train_tf, y_train)

# Hard class predictions drive precision, recall and F1.
y_hat_test = logreg.predict(X_test_tf)
precision_text = "Precision: {:.3f}".format(precision_score(y_test, y_hat_test))
recall_text = "Recall: {:.3f}".format(recall_score(y_test, y_hat_test))
print(precision_text, recall_text)
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))

# AUC is computed from predicted probabilities rather than hard labels.
# Column 1 is used as the score; with 0/1 labels this is presumably the
# positive class (columns follow the estimator's class order).
y_hat_test = logreg.predict_proba(X_test_tf)
positive_scores = y_hat_test[:,1]
print("AUC score: {:.3f}".format(roc_auc_score(y_test, positive_scores)))

Precision: 0.887 Recall: 0.896
F1 score: 0.892
AUC score: 0.951


### 4.1.2. TF-IDF向量 + 逻辑回归

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: the vectorizer is fitted on the training texts only and then
# applied unchanged to the test texts, so both matrices share one vocabulary.
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vec.transform(raw_documents=X_test)

# Report (n_documents, n_features) for the train and test matrices.
print(*(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf)))

(40000, 93007) (10000, 93007)


In [None]:
# Train a logistic regression with default parameters on the TF-IDF features
# (fit() returns the fitted estimator itself).
logreg = LogisticRegression().fit(X_train_tfidf, y_train)

# Hard class predictions drive precision, recall and F1.
y_hat_test = logreg.predict(X_test_tfidf)
precision_text = "Precision: {:.3f}".format(precision_score(y_test, y_hat_test))
recall_text = "Recall: {:.3f}".format(recall_score(y_test, y_hat_test))
print(precision_text, recall_text)
print("F1 score: {:.3f}".format(f1_score(y_test, y_hat_test)))

# AUC is computed from predicted probabilities rather than hard labels.
# Column 1 is used as the score; with 0/1 labels this is presumably the
# positive class (columns follow the estimator's class order).
y_hat_test = logreg.predict_proba(X_test_tfidf)
positive_scores = y_hat_test[:,1]
print("AUC score: {:.3f}".format(roc_auc_score(y_test, positive_scores)))

Precision: 0.886 Recall: 0.908
F1 score: 0.897
AUC score: 0.959


### 4.1.3. 使用 TextBlob 自带的 sentiment 函数

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# 1. Load the reviews together with their TextBlob sentiment scores.
df = pd.read_csv('IMDB_Dataset_with_TextBlob_sentiment.csv')

# 2. Encode the ground-truth labels as integers (positive -> 1, negative -> 0).
#    Values that are not keys of the mapping pass through `replace` unchanged;
#    the explicit cast then guarantees an integer column (and fails loudly on
#    anything that is not a valid label) instead of relying on implicit
#    downcasting of an object column.
label_mapping = {'positive': 1, 'negative': 0}
df['sentiment_mapped'] = df['sentiment'].replace(label_mapping).astype("int64")

# 3. Split 80/20 with the same test_size and seed as the text-feature split.
#    Only the test part is needed, because the TextBlob scores involve no
#    training here. The results are stored under cell-local names so that the
#    X_test / y_train / y_test produced by the earlier split are NOT
#    overwritten (previously X_test was replaced by a row index, which broke
#    re-running the vectorizer cells afterwards).
#    NOTE(review): the test rows match the earlier split only if this CSV has
#    the same rows in the same order -- confirm.
_, test_idx, _, y_test_tb = train_test_split(
    df.index,                # split row labels; scores are looked up afterwards
    df['sentiment_mapped'],  # encoded ground-truth labels
    test_size=0.2,
    random_state=618,
)

# 4. Fetch the TextBlob scores of the test rows and binarise them:
#    a score strictly above the threshold counts as a positive review.
y_score_test = df.loc[test_idx, 'textblob_sentiment_score']
threshold = 0
y_pred_test = (y_score_test > threshold).astype(int)

# 5. Evaluate. Precision / recall / F1 use the thresholded predictions, while
#    AUC uses the raw scores because it is threshold-independent.
precision = precision_score(y_test_tb, y_pred_test)
recall = recall_score(y_test_tb, y_pred_test)
f1 = f1_score(y_test_tb, y_pred_test)
auc = roc_auc_score(y_test_tb, y_score_test)

# Print the evaluation results.
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
print(f"AUC score: {auc:.3f}")

Precision: 0.625
Recall: 0.945
F1 score: 0.752
AUC score: 0.833
