In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from scipy import sparse
from nltk.stem import SnowballStemmer
from collections import Counter
import re



In [None]:

def _merge_review_text(frame):
    """Join `summary` and `reviewText` (NaN -> '') with a single space and lowercase the result."""
    headline = frame["summary"].fillna('')
    body = frame["reviewText"].fillna('')
    return (headline + ' ' + body).str.lower()


# Rows that carry a Score are the labeled set; rows whose Score is NaN form the test set.
train = pd.read_csv("train.csv")
has_score = train["Score"].notna()
labeled = train[has_score].copy()
test = train[~has_score].copy()

# Merge summary + reviewText into one lowercase text column.
labeled["text"] = _merge_review_text(labeled)
test["text"] = _merge_review_text(test)

# Stratified 80/20 train / validation split of the labeled rows.
scores = labeled["Score"]
X_tr, X_va, y_tr, y_va = train_test_split(
    labeled,
    scores.astype(int),
    stratify=scores,
    test_size=0.2,
    random_state=42,
)
# X_te is the same object as `test`, not a copy.
X_te = test


In [None]:

def _add_numeric_features(frame):
    """Add the helpfulness, review-length and exclamation-count columns to `frame` in place."""
    votes = frame["TotalVotes"]
    # Share of helpful votes; rows where TotalVotes > 0 is false get 0.
    frame["helpfulness_ratio"] = np.where(votes > 0, frame["VotedHelpful"] / votes, 0)
    frame["helpfulness_log"] = np.log1p(frame["helpfulness_ratio"])
    review_body = frame["reviewText"].fillna("")
    # Whitespace-separated word count and number of '!' characters per review.
    frame["review_length"] = review_body.map(lambda text: len(text.split()))
    frame["num_exclaim"] = review_body.map(lambda text: text.count("!"))


for frame in (labeled, test):
    _add_numeric_features(frame)

num_cols = ["helpfulness_ratio", "helpfulness_log", "review_length", "num_exclaim"]
scaler = StandardScaler(with_mean=False)

# Fit the scaler on the training rows only, then reuse it for validation and test.
# The columns are read from `labeled` via the split indices because they were added
# to `labeled` after X_tr / X_va were created.
Xtr_num = scaler.fit_transform(labeled.loc[X_tr.index, num_cols])
Xva_num = scaler.transform(labeled.loc[X_va.index, num_cols])
Xte_num = scaler.transform(test[num_cols])

print("Num shape:", Xtr_num.shape)


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
import re
from collections import Counter

def get_tokens(genre):
    """Return the lowercase ASCII-letter runs found in a single genre value.

    Args:
        genre: Any object; it is converted with ``str()`` before matching.

    Returns:
        list[str]: Matches of ``[a-zA-Z]+`` in the lowercased text, in order of appearance.
    """
    return re.findall(r"[a-zA-Z]+", str(genre).lower())


def genre_to_tokens(genres):
    """Collect the distinct tokens of every genre attached to one sample.

    Args:
        genres: A list of genre values, or a single genre string. Any other
            type yields an empty list.

    Returns:
        list[str]: The unique tokens, in alphabetical order.
    """
    if isinstance(genres, str):
        genres = [genres]
    elif not isinstance(genres, list):
        return []
    unique_tokens = {token for g in genres for token in get_tokens(g)}
    # Sorted instead of list(set(...)): iteration order of a set of strings changes with
    # Python's per-process hash randomisation. The tokens are later joined into text for a
    # TF-IDF with ngram_range=(1, 2), so an unstable order would yield different bigrams
    # (and different features) from one kernel restart to the next.
    return sorted(unique_tokens)

def _split_genres(raw):
    """Split a comma-separated genre string into stripped, non-empty entries."""
    stripped = (piece.strip() for piece in raw.split(","))
    return [piece for piece in stripped if piece]


# ---- Step 1: derive the genre columns on both frames ----
for frame in (labeled, test):
    frame["genres_list"] = frame["genres"].fillna("").map(_split_genres)
    frame["genres_tokens"] = frame["genres_list"].map(genre_to_tokens)
    # Space-join the tokens so the vectorizer below can consume them as text.
    frame["genres_text"] = frame["genres_tokens"].map(lambda toks: " ".join(toks))

# ---- Step 2: TF-IDF extraction (1- and 2-grams) ----
genre_tfidf_params = dict(
    max_features=2000,   # can be tuned (e.g. 1000~3000) depending on sparsity
    ngram_range=(1, 2),
    min_df=2,
    sublinear_tf=True,
)
vec_genre = TfidfVectorizer(**genre_tfidf_params)

# Vocabulary and IDF weights are learned from the training rows only.
train_genre_text = labeled.loc[X_tr.index, "genres_text"]
valid_genre_text = labeled.loc[X_va.index, "genres_text"]
Xtr_genre = vec_genre.fit_transform(train_genre_text)
Xva_genre = vec_genre.transform(valid_genre_text)
Xte_genre = vec_genre.transform(test["genres_text"])

print(f"Genre TF-IDF Shape: {Xtr_genre.shape}")


In [None]:
stemmer = SnowballStemmer("english")


def stem_text(x):
    """Stem every whitespace-separated word of `x` and re-join the stems with single spaces."""
    stems = [stemmer.stem(word) for word in x.split()]
    return " ".join(stems)

# Stemmed copy of each split's text; consumed only by vec_stem below.
X_tr_stem, X_va_stem, X_te_stem = (
    part["text"].apply(stem_text) for part in (X_tr, X_va, X_te)
)

# Three views of the text: word n-grams, character n-grams within word boundaries,
# and n-grams over the stemmed text.
vec_word = TfidfVectorizer(
    max_features=50000, ngram_range=(1, 4),
    min_df=2, max_df=0.99, sublinear_tf=True, stop_words='english',
)
vec_char = TfidfVectorizer(
    analyzer='char_wb', ngram_range=(3, 6),
    max_features=20000, min_df=3, binary=True,
)
vec_stem = TfidfVectorizer(
    max_features=20000, ngram_range=(1, 2),
    min_df=2, sublinear_tf=True,
)


def _tfidf_blocks(raw_text, stemmed_text, fit=False):
    """Column-stack the word, char and stem TF-IDF matrices; fit the vectorizers first when `fit` is True."""
    pairs = ((vec_word, raw_text), (vec_char, raw_text), (vec_stem, stemmed_text))
    if fit:
        return sparse.hstack([vec.fit_transform(docs) for vec, docs in pairs])
    return sparse.hstack([vec.transform(docs) for vec, docs in pairs])


# Only the training split fits the vectorizers; validation and test are transformed.
Xtr_tfidf = _tfidf_blocks(X_tr["text"], X_tr_stem, fit=True)
Xva_tfidf = _tfidf_blocks(X_va["text"], X_va_stem)
Xte_tfidf = _tfidf_blocks(X_te["text"], X_te_stem)

print("TF-IDF shape:", Xtr_tfidf.shape)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from scipy import sparse

# Keep the text columns with the highest chi-squared score against the training labels.
# k is capped at the number of available columns: on a small vocabulary the stacked TF-IDF
# matrix can have fewer than N_TEXT_FEATURES columns, and SelectKBest rejects (or, depending
# on the scikit-learn release, warns about) a k larger than the feature count.
N_TEXT_FEATURES = 15000
sel = SelectKBest(chi2, k=min(N_TEXT_FEATURES, Xtr_tfidf.shape[1]))
Xtr_txt_sel = sel.fit_transform(Xtr_tfidf, y_tr)
Xva_txt_sel = sel.transform(Xva_tfidf)
Xte_txt_sel = sel.transform(Xte_tfidf)

print(f"txt Shape: {Xtr_txt_sel.shape}")

In [None]:
#加入dense特征
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

# Dense low-rank projection of the full (unselected) TF-IDF matrix, fitted on train only.
svd = TruncatedSVD(n_components=2000, random_state=42)
Xtr_svd = svd.fit_transform(Xtr_tfidf)
Xva_svd, Xte_svd = (svd.transform(block) for block in (Xva_tfidf, Xte_tfidf))

# with_mean=False keeps the output suitable for concatenation with sparse blocks.
# NOTE(review): Xtr_num / Xva_num / Xte_num already come out of a StandardScaler with the
# same settings, so this second pass looks redundant; it also rebinds the name `scaler`.
scaler = StandardScaler(with_mean=False)
Xtr_num_scaled = scaler.fit_transform(Xtr_num)
Xva_num_scaled, Xte_num_scaled = (scaler.transform(block) for block in (Xva_num, Xte_num))





In [None]:
def _assemble_features(selected_text, genre, numeric, svd_components):
    """Column-stack the four feature groups into one CSR matrix.

    `numeric` and `svd_components` are wrapped with ``sparse.csr_matrix`` before stacking.
    """
    parts = [
        selected_text,
        genre,
        sparse.csr_matrix(numeric),
        sparse.csr_matrix(svd_components),
    ]
    return sparse.hstack(parts, format='csr')


Xtr_f = _assemble_features(Xtr_txt_sel, Xtr_genre, Xtr_num_scaled, Xtr_svd)
Xva_f = _assemble_features(Xva_txt_sel, Xva_genre, Xva_num_scaled, Xva_svd)
Xte_f = _assemble_features(Xte_txt_sel, Xte_genre, Xte_num_scaled, Xte_svd)
print("Final train feature shape:", Xtr_f.shape)

In [None]:
# Base classifier trained on the assembled feature matrix.
# `multi_class` is deliberately not passed: the argument is deprecated since scikit-learn 1.5
# (it triggers a FutureWarning), and with the lbfgs solver a multinomial model is already
# what gets fitted when the target has three or more classes.
# NOTE(review): this relies on Score having 3+ classes -- the bias search later in the
# notebook uses five per-class offsets, which suggests it does; confirm against the data.
lr = LogisticRegression(
    solver='lbfgs',
    C=0.2,
    class_weight='balanced',
    max_iter=4000,
    n_jobs=-1,
)

lr.fit(Xtr_f, y_tr)
pred = lr.predict(Xva_f)

# Validation metrics for the base model.
print(f"\nMacro F1: {f1_score(y_va, pred, average='macro'):.4f}")
print(classification_report(y_va, pred, digits=4))


In [None]:
from sklearn.naive_bayes import ComplementNB
# Complement Naive Bayes on the full stacked TF-IDF matrix; its class probabilities are
# computed for the training rows (the same rows it was fitted on) and the validation rows.
cnb = ComplementNB().fit(Xtr_tfidf, y_tr)
probs_cnb_tr, probs_cnb_va = (
    cnb.predict_proba(block) for block in (Xtr_tfidf, Xva_tfidf)
)


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from scipy import sparse

# ---- Level-1 inputs for the stacked model ----
# The meta-learner has to be trained on base-model probabilities for rows the base model did
# NOT see while fitting. In-sample probabilities are over-confident, so a meta-learner fed
# with them learns to trust the base models more than it should at prediction time.
# Training rows therefore get out-of-fold probabilities (cross_val_predict fits clones and
# leaves `lr` / `cnb` untouched); validation rows keep using the models fitted on the whole
# training split. This costs N_STACK_FOLDS extra fits per base model.
N_STACK_FOLDS = 5
stack_cv = StratifiedKFold(n_splits=N_STACK_FOLDS, shuffle=True, random_state=42)

probs_tr = cross_val_predict(lr, Xtr_f, y_tr, cv=stack_cv, method='predict_proba')
probs_va = lr.predict_proba(Xva_f)

# tocsr(): the TF-IDF matrix is built with sparse.hstack and needs row indexing for the folds.
probs_cnb_tr_oof = cross_val_predict(
    cnb, Xtr_tfidf.tocsr(), y_tr, cv=stack_cv, method='predict_proba'
)

# Scale the LR probabilities; the scaler is fitted on the training-row probabilities only.
# `scaler` keeps this binding so later cells can transform test-set probabilities with it.
scaler = StandardScaler(with_mean=False)
probs_tr_scaled = scaler.fit_transform(probs_tr)
probs_va_scaled = scaler.transform(probs_va)

# Stacked matrix = base features + scaled LR probabilities + ComplementNB probabilities.
Xtr_stack = sparse.hstack([Xtr_f, probs_tr_scaled, probs_cnb_tr_oof])
Xva_stack = sparse.hstack([Xva_f, probs_va_scaled, probs_cnb_va])

# `multi_class` is not passed: it is deprecated since scikit-learn 1.5 and lbfgs already fits
# a multinomial model for three or more classes.
meta = LogisticRegression(C=0.2, solver='lbfgs', class_weight='balanced', max_iter=2000)
meta.fit(Xtr_stack, y_tr)
pred_meta = meta.predict(Xva_stack)
print(f"Stacked F1: {f1_score(y_va, pred_meta, average='macro'):.4f}")


In [None]:
from itertools import product

from joblib import Parallel, delayed
import numpy as np
from sklearn.metrics import f1_score

# ---- Grid search for per-class additive log-probability biases (validation macro-F1) ----
# NOTE(review): the biases are tuned on the same validation split that is used to report the
# score, so the printed best macro-F1 is an optimistic estimate.
classes = meta.classes_
n_classes = len(classes)
print("Class order:", classes)

# The small epsilon keeps log() finite when a probability is exactly 0.
proba = meta.predict_proba(Xva_stack) + 1e-12
logp = np.log(proba)

bias_grid = np.linspace(-0.3, 0.3, 13)
print(f"Grid size per dim: {len(bias_grid)}")
print(f"Total combinations: {len(bias_grid)**n_classes:,}")


def eval_bias(B):
    """Score one bias vector on the validation set.

    Args:
        B: Array with one additive log-probability offset per column of `logp`.

    Returns:
        tuple: ``(macro_f1, B)`` where ``macro_f1`` is computed against ``y_va``.
    """
    # The argmax of the biased log-probabilities is the argmax of the renormalised
    # probabilities, so exp() and the row normalisation are not needed to pick a label.
    winner = (logp + B).argmax(axis=1)
    # Map column positions to labels through `classes`; adding classes.min() is only right
    # when the labels are consecutive integers.
    pred = classes[winner]
    f1 = f1_score(y_va, pred, average='macro')
    return f1, B


# One grid axis per class. product() enumerates combinations in the same order as nested
# loops with the last class varying fastest.
bias_list = [np.array(combo) for combo in product(bias_grid, repeat=n_classes)]

# n_jobs=-1 uses the CPUs that are actually available instead of a fixed worker count.
results = Parallel(n_jobs=-1, backend='loky', verbose=10)(
    delayed(eval_bias)(B) for B in bias_list
)

# max() keeps the first of several equally good bias vectors.
best_f1, best_B = max(results, key=lambda x: x[0])
print(f"\n✅ Best Macro-F1: {best_f1:.6f}")
print("Best bias:", best_B)



In [None]:
# Build the stacked test matrix from the same three groups used for train / validation:
# base features, scaled LR probabilities and ComplementNB probabilities.
# `scaler` is whatever that name was last bound to; run top to bottom, that is the scaler
# fitted on the LR probabilities.
probs_cnb_te = cnb.predict_proba(Xte_tfidf)
probs_te = lr.predict_proba(Xte_f)
probs_te_scaled = scaler.transform(probs_te)
test_blocks = [Xte_f, probs_te_scaled, probs_cnb_te]
Xte_stack = sparse.hstack(test_blocks)

# First submission: plain meta-model predictions.
test_pred = meta.predict(Xte_stack)
submission = pd.DataFrame({'id': test['id'], 'Score': test_pred})
submission.to_csv('submission_1.csv', index=False)

In [None]:
# ---------- Predict ----------
# The small epsilon keeps log() finite when a probability is exactly 0.
probs_te = meta.predict_proba(Xte_stack) + 1e-12

# ---------- Apply the log-bias ----------
best_bias = np.array(best_B)  # best offsets found by the validation grid search
logp = np.log(probs_te)
p_adj = np.exp(logp + best_bias)
p_adj = p_adj / p_adj.sum(axis=1, keepdims=True)  # renormalise rows to probabilities

# ---------- Final prediction ----------
# Columns of predict_proba follow meta.classes_, so the winning column is mapped through it.
# A hard-coded "+ 1" is only right when the labels are exactly 1..K, and it could disagree
# with the label mapping used while the biases were tuned.
test_pred = meta.classes_[p_adj.argmax(axis=1)]

# ---------- Save the result ----------
submission = pd.DataFrame({
    'id': test['id'],
    'Score': test_pred
})
submission.to_csv('submission_f.csv', index=False)
