In [2]:
import pandas as pd               # 数据处理和DataFrame操作
import numpy as np                # 数值计算
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF特征提取
from sklearn.decomposition import TruncatedSVD  # 降维处理
from tqdm import tqdm             # 进度条显示
import gc                         # 垃圾回收，内存管理
import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
from scipy import optimize
from xgboost import XGBClassifier

In [4]:
train = pd.read_csv('/Users/keep-rational/Desktop/赛题2和3/train_text.csv')
test = pd.read_csv('/Users/keep-rational/Desktop/赛题2和3/test_text.csv')
test.columns = ['新闻ID','文本','标签']
train['标签'] -= 1
data = pd.concat([train,test],ignore_index=True)

#统计特征
data['文本长度'] = data['文本'].apply(lambda x:len(x.split(' ')))

#TF-IDF特征
size_dict = {'文本':128}
TfidfVectorizer_feats = []
for i in tqdm.tqdm(['文本']):
    # TF-IDF算法提取文本特征
    tfidf = TfidfVectorizer(min_df=3,max_df=0.5,analyzer='word',ngram_range=(1,3))
    tf = tfidf.fit_transform(data[i].values)

    #使用SVD降维
    decom = TruncatedSVD(n_components=size_dict[i],random_state=42)
    decom_fea = pd.DataFrame(decom.fit_transform(tf))

    #为新特征添加列名
    decom_fea.columns = [i + f'_tfidf_{j}' for j in range(size_dict[i])]
    TfidfVectorizer_feats += [i + f'_tfidf_{j}' for j in range(size_dict[i])]

    #将降维后的数据添加到原数据集
    data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values

    del decom_fea
    gc.collect()

  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i + f'_tfidf_{j}' for j in range(size_dict[i])]].values
  data[[i + f'_tfidf_{j}' for j in range(size_dict[i])]] = decom_fea[[i +

In [None]:
# 数据准备
LABEL = '标签'
feats = [f for f in data.columns if f not in [LABEL, '新闻ID', '文本']]
df_train = data[~data[LABEL].isna()].copy()
df_test = data[data[LABEL].isna()].copy()

# 5折交叉验证
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# 存储每折的F1分数和预测结果
scores = []
oof = np.zeros((df_train.shape[0], 4))
test_preds = np.zeros((df_test.shape[0], 4))

for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, df_train[LABEL])):
    print(f"\n--- Fold {fold + 1} ---")
    X_train, X_val = df_train[feats].iloc[train_idx], df_train[feats].iloc[val_idx]
    y_train, y_val = df_train[LABEL].iloc[train_idx], df_train[LABEL].iloc[val_idx]
    
    # 初始化模型
    model = LGBMClassifier(
        objective='multiclass',
        num_class=4,
        boosting_type='gbdt',
        n_estimators=2000,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    
    # 模型训练
    model.fit(X_train, y_train)
    
    # 验证集预测
    val_pred = model.predict_proba(X_val)
    oof[val_idx] = val_pred
    
    # 计算F1-score(宏平均)
    score = f1_score(y_val, val_pred.argmax(-1), average='macro')
    scores.append(score)
    print(f"Fold {fold + 1} Macro F1-Score: {score:.5f}")
    
    # 测试集预测
    test_preds += model.predict_proba(df_test[feats]) / n_splits

# 阈值优化
def fun(x):
    tmp = []
    for i in range(4):
        tmp.append(x[i] * oof[:, i].reshape(-1, 1))
    tmp = np.hstack(tmp)
    return -f1_score(df_train[LABEL].values, np.argmax(tmp, axis=1), average='macro')

x0 = np.asarray([1.0] * 4)  # 初始权重设为1
lgb_res = optimize.fmin_powell(fun, x0)

# 优化前后的性能对比
xx_score = f1_score(df_train[LABEL], np.argmax(oof, axis=1), average='macro')
print('阈值优化前:', xx_score)

xx_cv = f1_score(df_train[LABEL], np.argmax(oof * lgb_res, axis=1), average='macro')
print('阈值优化后:', xx_cv)


--- Fold 1 ---
Fold 1 Macro F1-Score: 0.72332

--- Fold 2 ---
Fold 2 Macro F1-Score: 0.74851

--- Fold 3 ---
Fold 3 Macro F1-Score: 0.74818

--- Fold 4 ---
Fold 4 Macro F1-Score: 0.74602

--- Fold 5 ---
Fold 5 Macro F1-Score: 0.73960
Optimization terminated successfully.
         Current function value: -0.754858
         Iterations: 3
         Function evaluations: 310
阈值优化前: 0.7414358124290727
阈值优化后: 0.7548581862229895
提交文件已生成: submission.csv


In [22]:
# 生成最终提交文件
submission = df_test[['新闻ID']].copy()
submission['标签'] = np.argmax(test_preds * lgb_res, axis=1) + 1
submission.to_csv('submission.csv', index=False)
print('提交文件已生成: submission.csv')

提交文件已生成: submission.csv


In [5]:
#XGBoost

# 数据准备
LABEL = '标签'
feats = [f for f in data.columns if f not in [LABEL, '新闻ID', '文本']]
df_train = data[~data[LABEL].isna()].copy()
df_test = data[data[LABEL].isna()].copy()

# 5折交叉验证
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# 存储每折的F1分数和预测结果
scores = []
oof = np.zeros((df_train.shape[0], 4))
test_preds = np.zeros((df_test.shape[0], 4))

for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, df_train[LABEL])):
    print(f"\n--- Fold {fold + 1} ---")
    X_train, X_val = df_train[feats].iloc[train_idx], df_train[feats].iloc[val_idx]
    y_train, y_val = df_train[LABEL].iloc[train_idx], df_train[LABEL].iloc[val_idx]
    
    # 初始化 XGBoost 模型
    model = XGBClassifier(
        objective='multi:softprob',  # 多分类概率输出
        num_class=4,                 # 类别数量
        n_estimators=2000,           # 树的数量
        learning_rate=0.05,          # 学习率
        max_depth=7,                 # 最大深度
        subsample=0.8,               # 子采样比例
        colsample_bytree=0.8,        # 特征采样比例
        random_state=42,
        n_jobs=-1,                   # 并行线程数
        verbosity=0                  # 静默模式
    )
    
    # 模型训练
    model.fit(X_train, y_train)
    
    # 验证集预测
    val_pred = model.predict_proba(X_val)
    oof[val_idx] = val_pred
    
    # 计算 F1-score（宏平均）
    score = f1_score(y_val, val_pred.argmax(-1), average='macro')
    scores.append(score)
    print(f"Fold {fold + 1} Macro F1-Score: {score:.5f}")
    
    # 测试集预测
    test_preds += model.predict_proba(df_test[feats]) / n_splits

# 阈值优化
def fun(x):
    tmp = []
    for i in range(4):
        tmp.append(x[i] * oof[:, i].reshape(-1, 1))
    tmp = np.hstack(tmp)
    return -f1_score(df_train[LABEL].values, np.argmax(tmp, axis=1), average='macro')

x0 = np.asarray([1.0] * 4)  # 初始权重设为1
xgb_res = optimize.fmin_powell(fun, x0)

# 优化前后的性能对比
xx_score = f1_score(df_train[LABEL], np.argmax(oof, axis=1), average='macro')
print('阈值优化前:', xx_score)

xx_cv = f1_score(df_train[LABEL], np.argmax(oof * xgb_res, axis=1), average='macro')
print('阈值优化后:', xx_cv)


--- Fold 1 ---
Fold 1 Macro F1-Score: 0.73445

--- Fold 2 ---
Fold 2 Macro F1-Score: 0.75329

--- Fold 3 ---
Fold 3 Macro F1-Score: 0.74658

--- Fold 4 ---
Fold 4 Macro F1-Score: 0.74410

--- Fold 5 ---
Fold 5 Macro F1-Score: 0.74296
Optimization terminated successfully.
         Current function value: -0.761561
         Iterations: 4
         Function evaluations: 355
阈值优化前: 0.7445221835327056
阈值优化后: 0.7615611230384716


In [8]:
submission1 = df_test[['新闻ID']].copy()
submission1['标签'] = np.argmax(test_preds * xgb_res, axis=1) + 1
submission1.to_csv('submission1.csv', index=False)
print('提交文件已生成: submission1.csv')

提交文件已生成: submission1.csv
