In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split

In [4]:
# Read the labelled training data and separate the review text (input) from
# the sentiment label (target).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on another machine after the path is adjusted.
train_df = pd.read_csv('/Users/jujusmacbook/Documents/NLP_Lab/Data/train.csv')
X, y = train_df["review"], train_df["sentiment"]

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class distribution consistent between the two parts, and the fixed seed
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [6]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Using cached click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2025.9.18-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Using cached regex-2025.9.18-cp313-cp313-macosx_11_0_arm64.whl (287 kB)
Using cached click-8.3.0-py3-none-any.whl (107 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [nltk][32m3/4[0m [nltk]
[1A[2KSuccessfully installed click-8.3.0 nltk-3.9.2 regex-2025.9.18 tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK stop-word corpus; it has to be present before
# stopwords.words() is called below.
nltk.download('stopwords')

# TF-IDF features built from unigrams (single words) and bigrams (two-word
# phrases such as "waste time"), with English stop words removed and the
# vocabulary capped at the 5000 most frequent terms.
tfidf = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    max_features=5000,
)

# Learn the vocabulary and IDF weights from the training split only; the
# validation split is merely transformed so that nothing about it leaks into
# the fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jujusmacbook/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Build and train a logistic-regression classifier on the TF-IDF features.
# max_iter is raised to 1000 to give the solver room to converge;
# class_weight='balanced' reweights the classes (kept for robustness even
# though the data set is described as balanced).
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
).fit(X_train_tfidf, y_train)

# Score the held-out validation split.
# NOTE(review): the recorded run reports an accuracy of 1.0000 on 320 rows;
# worth confirming there are no duplicated reviews across the split or other
# label leakage before trusting this number.
y_val_pred = lr_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"验证集准确率：{val_accuracy:.4f}")
print("分类报告：", classification_report(y_val, y_val_pred), sep="\n")

验证集准确率：1.0000
分类报告：
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       160
           1       1.00      1.00      1.00       160

    accuracy                           1.00       320
   macro avg       1.00      1.00      1.00       320
weighted avg       1.00      1.00      1.00       320



In [9]:
# Read the unlabelled test reviews.
# NOTE(review): absolute, machine-specific path; adjust before running
# on another machine.
test_df = pd.read_csv("/Users/jujusmacbook/Documents/NLP_Lab/Data/test.csv")
X_test = test_df["review"]

# Reuse the vectorizer fitted on the training split (transform only), then
# attach the predicted sentiment label to every test row.
X_test_tfidf = tfidf.transform(X_test)
test_df = test_df.assign(sentiment=lr_model.predict(X_test_tfidf))

# Write the submission file (required name: NAME_STUDENT_ID_predictions.csv)
# containing only the id and the predicted label, without the index column.
submission = test_df.loc[:, ["id", "sentiment"]]
submission.to_csv("ZHU_Xinyu_25118165g_predictions.csv", index=False)