In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from nltk.stem import SnowballStemmer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from scipy import sparse
from sklearn.decomposition import TruncatedSVD


In [2]:
train = pd.read_csv("train.csv")
labeled = train.dropna(subset=["Score"]).copy()
test = train[train["Score"].isna()].copy()

# 合并 summary + reviewText
labeled["text"] = (labeled["summary"].fillna('') + ' ' + labeled["reviewText"].fillna('')).str.lower()
test["text"] = (test["summary"].fillna('') + ' ' + test["reviewText"].fillna('')).str.lower()

# 划分训练 / 验证集
X_tr, X_va, y_tr, y_va = train_test_split(
    labeled, labeled["Score"].astype(int),
    test_size=0.2, random_state=42, stratify=labeled["Score"]
)
X_te = test


In [3]:
for df in [labeled, test]:
    df["helpfulness_ratio"] = np.where(df["TotalVotes"] > 0,
                                       df["VotedHelpful"] / df["TotalVotes"], 0)
    df["helpfulness_log"] = np.log1p(df["helpfulness_ratio"])
    df["review_length"] = df["reviewText"].fillna("").apply(lambda x: len(x.split()))
    df["num_exclaim"] = df["reviewText"].fillna("").apply(lambda x: x.count("!"))

scaler = StandardScaler(with_mean=False)
num_cols = ["helpfulness_ratio", "helpfulness_log", "review_length", "num_exclaim"]

Xtr_num = scaler.fit_transform(labeled.loc[X_tr.index, num_cols])
Xva_num = scaler.transform(labeled.loc[X_va.index, num_cols])
Xte_num = scaler.transform(test[num_cols])

print("Num shape:", Xtr_num.shape)


Num shape: (294066, 4)


In [36]:

def get_tokens(genre):
    """提取单个 genre 字符串中的词"""
    return re.findall(r"[a-zA-Z]+", str(genre).lower())

def genre_to_tokens(genres):
    """将每个样本的所有 genre 合并为 token 集"""
    tokens = []
    if isinstance(genres, list):
        for g in genres:
            tokens.extend(get_tokens(g))
    elif isinstance(genres, str):
        tokens.extend(get_tokens(genres))
    return list(set(tokens))  # 去重

# ---- Step 1: 预处理所有数据 ----
for df in [labeled, test]:
    df["genres_list"] = df["genres"].fillna("").apply(
        lambda x: [g.strip() for g in x.split(",") if g.strip()]
    )
    df["genres_tokens"] = df["genres_list"].apply(genre_to_tokens)
    df["genres_text"] = df["genres_tokens"].apply(lambda x: " ".join(x))  # 转为字符串

# ---- Step 2: TF-IDF 提取（1-2gram）----
vec_genre = TfidfVectorizer(
    max_features=2000,        # 你可以根据稀疏程度改为1000~3000
    ngram_range=(1, 2),
    min_df=2,
    sublinear_tf=True
)

Xtr_genre = vec_genre.fit_transform(labeled.loc[X_tr.index, "genres_text"])
Xva_genre = vec_genre.transform(labeled.loc[X_va.index, "genres_text"])
Xte_genre = vec_genre.transform(test["genres_text"])

print(f"Genre TF-IDF Shape: {Xtr_genre.shape}")


Genre TF-IDF Shape: (294066, 2000)


In [5]:

stemmer = SnowballStemmer("english")
def stem_text(x):
    return " ".join(stemmer.stem(w) for w in x.split())

X_tr_stem = X_tr["text"].apply(stem_text)
X_va_stem = X_va["text"].apply(stem_text)
X_te_stem = X_te["text"].apply(stem_text)

vec_word = TfidfVectorizer(max_features=50000, ngram_range=(1,4),
                           min_df=2, max_df=0.99, sublinear_tf=True, stop_words='english')
vec_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,6),
                           max_features=20000, min_df=3, binary=True)
vec_stem = TfidfVectorizer(max_features=20000, ngram_range=(1,2),
                           min_df=2, sublinear_tf=True)

Xtr_tfidf = sparse.hstack([
    vec_word.fit_transform(X_tr["text"]),
    vec_char.fit_transform(X_tr["text"]),
    vec_stem.fit_transform(X_tr_stem)
])
Xva_tfidf = sparse.hstack([
    vec_word.transform(X_va["text"]),
    vec_char.transform(X_va["text"]),
    vec_stem.transform(X_va_stem)
])
Xte_tfidf = sparse.hstack([
    vec_word.transform(X_te["text"]),
    vec_char.transform(X_te["text"]),
    vec_stem.transform(X_te_stem)
])

print("TF-IDF shape:", Xtr_tfidf.shape)

TF-IDF shape: (294066, 90000)


In [None]:
from scipy import sparse

sel = SelectKBest(chi2, k=15000)  
Xtr_txt_sel = sel.fit_transform(Xtr_tfidf, y_tr)
Xva_txt_sel = sel.transform(Xva_tfidf)
Xte_txt_sel = sel.transform(Xte_tfidf)

In [7]:
print(f"txt Shape: {Xtr_txt_sel.shape}")

txt Shape: (294066, 15000)


In [8]:
#加入dense特征


svd = TruncatedSVD(n_components=2000, random_state=42)
Xtr_svd = svd.fit_transform(Xtr_tfidf)
Xva_svd = svd.transform(Xva_tfidf)
Xte_svd = svd.transform(Xte_tfidf)

scaler = StandardScaler(with_mean=False)  # 注意 with_mean=False 适配稀疏拼接
Xtr_num_scaled = scaler.fit_transform(Xtr_num)
Xva_num_scaled = scaler.transform(Xva_num)
Xte_num_scaled = scaler.transform(Xte_num)





In [37]:
Xtr_f = sparse.hstack([Xtr_txt_sel, Xtr_genre, sparse.csr_matrix(Xtr_num_scaled), sparse.csr_matrix(Xtr_svd)], format='csr')
Xva_f = sparse.hstack([Xva_txt_sel, Xva_genre, sparse.csr_matrix(Xva_num_scaled), sparse.csr_matrix(Xva_svd)], format='csr')
Xte_f = sparse.hstack([Xte_txt_sel, Xte_genre, sparse.csr_matrix(Xte_num_scaled),sparse.csr_matrix(Xte_svd)], format='csr')

In [38]:
print("Final train feature shape:", Xtr_f.shape)

Final train feature shape: (294066, 19004)


In [39]:
lr = LogisticRegression(
    max_iter=4000,
    solver='lbfgs',
    C=0.2,
    multi_class='multinomial',
    class_weight='balanced',
    n_jobs=-1
)

lr.fit(Xtr_f, y_tr)
pred = lr.predict(Xva_f)

print(f"\nMacro F1: {f1_score(y_va, pred, average='macro'):.4f}")
print(classification_report(y_va, pred, digits=4))



Macro F1: 0.4928
              precision    recall  f1-score   support

           1     0.6529    0.6947    0.6732      7870
           2     0.2178    0.3642    0.2726      4695
           3     0.2884    0.3070    0.2974      8987
           4     0.4289    0.4236    0.4262     16142
           5     0.8451    0.7498    0.7946     35823

    accuracy                         0.5935     73517
   macro avg     0.4866    0.5079    0.4928     73517
weighted avg     0.6250    0.5935    0.6066     73517



In [40]:
from sklearn.naive_bayes import ComplementNB
cnb = ComplementNB()
cnb.fit(Xtr_tfidf, y_tr)
probs_cnb_tr = cnb.predict_proba(Xtr_tfidf)
probs_cnb_va = cnb.predict_proba(Xva_tfidf)
# 将 probs_cnb_* 拼进 Stacking


In [41]:
probs_tr = lr.predict_proba(Xtr_f)
probs_va = lr.predict_proba(Xva_f)



scaler = StandardScaler(with_mean=False)
probs_tr_scaled = scaler.fit_transform(probs_tr)
probs_va_scaled = scaler.transform(probs_va)


Xtr_stack = sparse.hstack([Xtr_f, probs_tr_scaled,probs_cnb_tr])
Xva_stack = sparse.hstack([Xva_f, probs_va_scaled,probs_cnb_va])

meta = LogisticRegression(C=0.2, solver='lbfgs', multi_class='multinomial', class_weight='balanced', max_iter=2000)
meta.fit(Xtr_stack, y_tr)
pred_meta = meta.predict(Xva_stack)
print(f"Stacked F1: {f1_score(y_va, pred_meta, average='macro'):.4f}")


Stacked F1: 0.4961


In [42]:
from joblib import Parallel, delayed


classes = meta.classes_
print("Class order:", classes)

# meta 是你训练好的模型
proba = meta.predict_proba(Xva_stack) + 1e-12
logp = np.log(proba)

# 可以细化 bias 网格，例如 13 个点
bias_grid = np.linspace(-0.3, 0.3, 13)
print(f"Grid size per dim: {len(bias_grid)}")
print(f"Total combinations: {len(bias_grid)**5:,}")

# 定义评估函数
def eval_bias(B):
    p = np.exp(logp + B)
    p /= p.sum(axis=1, keepdims=True)
    pred = p.argmax(axis=1) + classes.min()
    f1 = f1_score(y_va, pred, average='macro')
    return f1, B

# 生成所有 bias 组合
bias_list = [
    np.array([b1, b2, b3, b4, b5])
    for b1 in bias_grid
    for b2 in bias_grid
    for b3 in bias_grid
    for b4 in bias_grid
    for b5 in bias_grid
]

# ⚡ 并行运行，充分利用 32 核
results = Parallel(n_jobs=32, backend='loky', verbose=10)(
    delayed(eval_bias)(B) for B in bias_list
)

# 取最优
best_f1, best_B = max(results, key=lambda x: x[0])
print(f"\n✅ Best Macro-F1: {best_f1:.6f}")
print("Best bias:", best_B)



Class order: [1 2 3 4 5]
Grid size per dim: 13
Total combinations: 371,293


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   1 out of   1 | elapsed:    9.0s finished
[Parallel(n_jobs=32)]: Done   9 tasks      | elapsed:    9.4s
[Parallel(n_jobs=32)]: Done  22 tasks      | elapsed:    9.9s
[Parallel(n_jobs=32)]: Done  35 tasks      | elapsed:   10.2s
[Parallel(n_jobs=32)]: Done  50 tasks      | elapsed:   10.3s
[Parallel(n_jobs=32)]: Done  65 tasks      | elapsed:   10.4s
[Parallel(n_jobs=32)]: Done  82 tasks      | elapsed:   10.4s
[Parallel(n_jobs=32)]: Done  99 tasks      | elapsed:   10.6s
[Parallel(n_jobs=32)]: Done 118 tasks      | elapsed:   10.6s
[Parallel(n_jobs=32)]: Done 137 tasks      | elapsed:   10.7s
[Parallel(n_jobs=32)]: Done 158 tasks      | elapsed:   10.8s
[Parallel(n_jobs=32)]: Done 179 tasks      | elapsed:   10.9s
[Parallel(n_jobs=32)]: Done 202 tasks      | elapsed:   11.1s
[Parallel(n_jobs=32)]: Done 225 tasks      | elapsed:   11.2s
[Parallel(n_jobs=32)]: Done 250 tasks      | e

[Parallel(n_jobs=32)]: Done 17276 tasks      | elapsed:   47.3s
[Parallel(n_jobs=32)]: Done 17554 tasks      | elapsed:   47.8s
[Parallel(n_jobs=32)]: Done 17836 tasks      | elapsed:   48.4s
[Parallel(n_jobs=32)]: Done 18118 tasks      | elapsed:   48.9s
[Parallel(n_jobs=32)]: Done 18404 tasks      | elapsed:   49.5s
[Parallel(n_jobs=32)]: Done 18690 tasks      | elapsed:   50.1s
[Parallel(n_jobs=32)]: Done 18980 tasks      | elapsed:   50.7s
[Parallel(n_jobs=32)]: Done 19270 tasks      | elapsed:   51.4s
[Parallel(n_jobs=32)]: Done 19564 tasks      | elapsed:   52.0s
[Parallel(n_jobs=32)]: Done 19858 tasks      | elapsed:   52.5s
[Parallel(n_jobs=32)]: Done 20156 tasks      | elapsed:   53.1s
[Parallel(n_jobs=32)]: Done 20454 tasks      | elapsed:   53.7s
[Parallel(n_jobs=32)]: Done 20756 tasks      | elapsed:   54.2s
[Parallel(n_jobs=32)]: Done 21058 tasks      | elapsed:   54.8s
[Parallel(n_jobs=32)]: Done 21364 tasks      | elapsed:   55.4s
[Parallel(n_jobs=32)]: Done 21670 tasks 

[Parallel(n_jobs=32)]: Done 69778 tasks      | elapsed:  2.7min
[Parallel(n_jobs=32)]: Done 70316 tasks      | elapsed:  2.7min
[Parallel(n_jobs=32)]: Done 70854 tasks      | elapsed:  2.7min
[Parallel(n_jobs=32)]: Done 71396 tasks      | elapsed:  2.7min
[Parallel(n_jobs=32)]: Done 71938 tasks      | elapsed:  2.8min
[Parallel(n_jobs=32)]: Done 72484 tasks      | elapsed:  2.8min
[Parallel(n_jobs=32)]: Done 73030 tasks      | elapsed:  2.8min
[Parallel(n_jobs=32)]: Done 73580 tasks      | elapsed:  2.8min
[Parallel(n_jobs=32)]: Done 74130 tasks      | elapsed:  2.8min
[Parallel(n_jobs=32)]: Done 74684 tasks      | elapsed:  2.9min
[Parallel(n_jobs=32)]: Done 75238 tasks      | elapsed:  2.9min
[Parallel(n_jobs=32)]: Done 75796 tasks      | elapsed:  2.9min
[Parallel(n_jobs=32)]: Done 76354 tasks      | elapsed:  2.9min
[Parallel(n_jobs=32)]: Done 76916 tasks      | elapsed:  2.9min
[Parallel(n_jobs=32)]: Done 77478 tasks      | elapsed:  3.0min
[Parallel(n_jobs=32)]: Done 78044 tasks 

[Parallel(n_jobs=32)]: Done 150835 tasks      | elapsed:  5.8min
[Parallel(n_jobs=32)]: Done 151230 tasks      | elapsed:  5.9min
[Parallel(n_jobs=32)]: Done 151625 tasks      | elapsed:  5.9min
[Parallel(n_jobs=32)]: Done 152022 tasks      | elapsed:  5.9min
[Parallel(n_jobs=32)]: Done 152419 tasks      | elapsed:  5.9min
[Parallel(n_jobs=32)]: Done 152818 tasks      | elapsed:  6.0min
[Parallel(n_jobs=32)]: Done 153217 tasks      | elapsed:  6.0min
[Parallel(n_jobs=32)]: Done 153618 tasks      | elapsed:  6.0min
[Parallel(n_jobs=32)]: Done 154019 tasks      | elapsed:  6.1min
[Parallel(n_jobs=32)]: Done 154422 tasks      | elapsed:  6.1min
[Parallel(n_jobs=32)]: Done 154825 tasks      | elapsed:  6.1min
[Parallel(n_jobs=32)]: Done 155230 tasks      | elapsed:  6.1min
[Parallel(n_jobs=32)]: Done 155635 tasks      | elapsed:  6.2min
[Parallel(n_jobs=32)]: Done 156042 tasks      | elapsed:  6.2min
[Parallel(n_jobs=32)]: Done 156449 tasks      | elapsed:  6.2min
[Parallel(n_jobs=32)]: Do

[Parallel(n_jobs=32)]: Done 208938 tasks      | elapsed:  9.8min
[Parallel(n_jobs=32)]: Done 209459 tasks      | elapsed:  9.8min
[Parallel(n_jobs=32)]: Done 209982 tasks      | elapsed:  9.8min
[Parallel(n_jobs=32)]: Done 210505 tasks      | elapsed:  9.9min
[Parallel(n_jobs=32)]: Done 211030 tasks      | elapsed:  9.9min
[Parallel(n_jobs=32)]: Done 211555 tasks      | elapsed:  9.9min
[Parallel(n_jobs=32)]: Done 212082 tasks      | elapsed: 10.0min
[Parallel(n_jobs=32)]: Done 212609 tasks      | elapsed: 10.0min
[Parallel(n_jobs=32)]: Done 213138 tasks      | elapsed: 10.1min
[Parallel(n_jobs=32)]: Done 213667 tasks      | elapsed: 10.1min
[Parallel(n_jobs=32)]: Done 214198 tasks      | elapsed: 10.1min
[Parallel(n_jobs=32)]: Done 214729 tasks      | elapsed: 10.2min
[Parallel(n_jobs=32)]: Done 215262 tasks      | elapsed: 10.2min
[Parallel(n_jobs=32)]: Done 215795 tasks      | elapsed: 10.2min
[Parallel(n_jobs=32)]: Done 216330 tasks      | elapsed: 10.3min
[Parallel(n_jobs=32)]: Do

[Parallel(n_jobs=32)]: Done 283169 tasks      | elapsed: 14.8min
[Parallel(n_jobs=32)]: Done 283818 tasks      | elapsed: 14.8min
[Parallel(n_jobs=32)]: Done 284467 tasks      | elapsed: 14.9min
[Parallel(n_jobs=32)]: Done 285118 tasks      | elapsed: 14.9min
[Parallel(n_jobs=32)]: Done 285769 tasks      | elapsed: 15.0min
[Parallel(n_jobs=32)]: Done 286422 tasks      | elapsed: 15.0min
[Parallel(n_jobs=32)]: Done 287075 tasks      | elapsed: 15.0min
[Parallel(n_jobs=32)]: Done 287730 tasks      | elapsed: 15.1min
[Parallel(n_jobs=32)]: Done 288385 tasks      | elapsed: 15.1min
[Parallel(n_jobs=32)]: Done 289042 tasks      | elapsed: 15.2min
[Parallel(n_jobs=32)]: Done 289699 tasks      | elapsed: 15.2min
[Parallel(n_jobs=32)]: Done 290358 tasks      | elapsed: 15.3min
[Parallel(n_jobs=32)]: Done 291017 tasks      | elapsed: 15.3min
[Parallel(n_jobs=32)]: Done 291678 tasks      | elapsed: 15.4min
[Parallel(n_jobs=32)]: Done 292339 tasks      | elapsed: 15.4min
[Parallel(n_jobs=32)]: Do


✅ Best Macro-F1: 0.501159
Best bias: [-0.3  -0.25  0.3   0.3   0.3 ]


In [43]:
probs_te = lr.predict_proba(Xte_f)
probs_te_scaled = scaler.transform(probs_te)
probs_cnb_te = cnb.predict_proba(Xte_tfidf)
Xte_stack = sparse.hstack([Xte_f, probs_te_scaled,probs_cnb_te])

#test_pred = meta.predict(Xte_stack)
#submission = pd.DataFrame({'id': test['id'], 'Score': test_pred})
#submission.to_csv('submission_1.csv', index=False)

In [44]:
# ---------- 预测 ----------

probs_te = meta.predict_proba(Xte_stack) + 1e-12

# ---------- 应用 log-bias ----------
best_bias = np.array(best_B)  # 来自你上面最优结果
logp = np.log(probs_te)
p_adj = np.exp(logp + best_bias)
p_adj = p_adj / p_adj.sum(axis=1, keepdims=True)  # 归一化为概率

# ---------- 得出最终预测 ----------
test_pred = p_adj.argmax(axis=1) + 1  # 类别从1开始

# ---------- 保存结果 ----------
submission = pd.DataFrame({
    'id': test['id'],
    'Score': test_pred
})
submission.to_csv('submission_f.csv', index=False)
