In [None]:
import os
import sys

# Detect the runtime environment: Kaggle kernels expose KAGGLE_KERNEL_RUN_TYPE.
IS_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
competition_name = 'competitions/titanic'

print("Running on Kaggle" if IS_KAGGLE else "Running locally")

# Input / output locations for the detected environment.
# (Original note for the Kaggle branch: extra packages may need to be installed there.)
DATA_PATH, OUTPUT_PATH = (
    ('/kaggle/input/' + competition_name, '/kaggle/working')
    if IS_KAGGLE
    else ('./data', './output')
)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler # 根据需要选用
import seaborn as sns
from scipy import stats

# KLIEP 重加权适配器（用于 pipeline）
from skada import KLIEPReweightAdapter



# 随机森林模型
from sklearn.ensemble import RandomForestClassifier  # 分类问题
# from sklearn.ensemble import RandomForestRegressor # 回归问题

# 模型评估
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
## Font setup so that pyplot figures can render Chinese labels.
import matplotlib

# Apply the settings before any figure is created.
# 'font.sans-serif' is a priority-ordered candidate list (original note: Chinese
# fonts available on macOS); 'axes.unicode_minus' = False works around the
# minus-sign rendering problem.
matplotlib.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial Unicode MS', 'PingFang SC', 'STHeiti', 'Microsoft YaHei', 'WenQuanYi Micro Hei'],
    'axes.unicode_minus': False,
})

# Optional: echo the font settings that are now configured.
for caption, rc_key in (("当前使用的字体列表:", 'font.sans-serif'), ("当前字体家族:", 'font.family')):
    print(caption, matplotlib.rcParams[rc_key])

In [None]:
# Add src directory to Python path and import model
if IS_KAGGLE:
    !git clone https://github.com/catathome0410/Titanic_practice.git
    import sys
    sys.path.append('/kaggle/working/Titanic_practice/src')
else:
    sys.path.append('./src')

from model import log_R_solver, NN_solver

In [None]:
# Load the raw competition files and tag every row with the split it came from.
train = pd.read_csv(DATA_PATH + '/train.csv')
test = pd.read_csv(DATA_PATH + '/test.csv')
train['dataset'] = 'train'
test['dataset'] = 'test'

# Keep the label aside; the combined frame holds features only.
Y_train = np.array(train['Survived'])
train = train.drop(['Survived'], axis=1)

# ignore_index=True gives the combined frame unique row labels. Without it the
# labels of `test` repeat those of `train`, which makes any later label-based
# selection or alignment on `train_test` ambiguous.
train_test = pd.concat([train, test], ignore_index=True)

# Columns dropped up front:
#   Embarked    - checked: does not help the model and shows a large covariate shift
#   Ticket      - close to random strings, not usable as-is
#   PassengerId - not used as a feature
train_test = train_test.drop(columns=['Embarked', 'Ticket', 'PassengerId'])

In [None]:
## Count the missing values in each column (absolute counts, not ratios)
train_test.isna().sum(axis = 0)

In [None]:
# Impute missing fares with the mean fare of the *training* rows only, so that no
# statistic derived from the test rows enters the features (same train-only policy
# that is applied to the Age / Fare percentile features further down).
train_fare_mean = train_test.loc[train_test['dataset'] == 'train', 'Fare'].mean()
train_test['Fare'] = train_test['Fare'].fillna(train_fare_mean)

下面处理 Age 的缺失值：先从 Name 中提取称谓生成 name_title 列，再用相同 (name_title, Pclass) 分组的 Age 中位数填充缺失值。

In [None]:
def find_title(row):
    """Extract the honorific title from a passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``; the text
    between the first ``", "`` and the next ``". "`` is returned.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Name'`` (e.g. a DataFrame row) holding the
        full name as a string.

    Returns
    -------
    str
        The title (e.g. ``'Mr'``), or ``''`` when no ``". "`` terminator
        follows the surname.
    """
    full_name = row['Name']
    comma_at = full_name.find(', ')
    # The title starts right after "Surname, "; without a comma, scan from the start.
    title_start = comma_at + 2 if comma_at >= 0 else 0
    # Search for the terminating ". " only *after* the surname, so a period inside
    # the surname (e.g. "St. Clair, Mrs. Anna") is not mistaken for the end of the title.
    period_at = full_name.find('. ', title_start)
    if period_at < 0:
        return ''
    return full_name[title_start:period_at]

# Remember which ages were originally missing before any imputation happens.
train_test['Age_missing'] = train_test['Age'].isna()

# Raw title per passenger, with the rarer spellings folded into the common ones.
title_aliases = {'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'}
raw_titles = train_test.apply(find_title, axis=1).replace(title_aliases)

# Every title outside the four common ones is bucketed as 'Rare'.
common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
train_test['name_title'] = np.where(raw_titles.isin(common_titles), raw_titles, 'Rare')


In [None]:
import duckdb

# Per (Pclass, name_title) group: row count, mean / median age and number of
# missing ages, largest groups first. The query reads the `train_test` frame
# that is in scope.
age_summary_sql = """
    SELECT 
    Pclass, name_title, count(*) n_count, round(avg(Age),2) avg_age, median(Age) med_age, sum(Age_missing) Age_miss
    FROM train_test 
    group by 1,2
    order by 3 desc
    
"""
result = duckdb.query(age_summary_sql).df()
print(result)

In [None]:
# Fill missing ages with the median age of the passenger's (name_title, Pclass) group.
# If a group has no known age at all its median is NaN, so fall back to the median of
# the title alone and finally to the overall median; otherwise Age would stay missing.
age_known = train_test['Age']
median_by_title_class = train_test.groupby(['name_title', 'Pclass'])['Age'].transform('median')
median_by_title = train_test.groupby('name_title')['Age'].transform('median')

# Positional (array-based) combination, like the original np.where.
age_fill = np.where(median_by_title_class.isna(), median_by_title, median_by_title_class)
age_fill = np.where(pd.isna(age_fill), age_known.median(), age_fill)

train_test['Age'] = np.where(age_known.isna(), age_fill, age_known)

In [None]:
from scipy import stats

def has_alias(row):
    """Return True when the passenger name contains a '(' or a double quote.

    ``row`` is anything indexable by ``'Name'`` that yields the name string.
    """
    full_name = row['Name']
    return any(marker in full_name for marker in ('(', '"'))

def double_family_name(row):
    """Return True when the part of the name before the first ', ' contains a space.

    A name without any ', ' separator yields False.
    """
    surname, separator, _ = row['Name'].partition(', ')
    return bool(separator) and ' ' in surname

def cabin_class(row):
    """Map a cabin entry to a single-letter class code.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Cabin'``.

    Returns
    -------
    str
        ``'S'`` when the entry contains a space (several cabins listed),
        otherwise the first character of the entry. A missing or empty entry
        (NaN, None, ``''``) gives ``'N'``, the same code the ``'None'``
        placeholder string produces.
    """
    cabin = row['Cabin']
    # Guard against unfilled values: NaN / None are not strings and '' has no
    # first character, both of which used to raise.
    if not isinstance(cabin, str) or not cabin:
        return 'N'
    if ' ' in cabin:
        return 'S'
    return cabin[0]


# Row-wise name features, added in a fixed column order.
for column, extractor in (
    ('double_family_name', double_family_name),
    ('has_alias', has_alias),
):
    train_test[column] = train_test.apply(extractor, axis=1)

# Family size = siblings/spouses + parents/children + the passenger.
train_test['FamilySize'] = train_test['SibSp'].add(train_test['Parch']).add(1)

## The Age / Fare percentile values must be computed against the training set only;
## computing them on the combined train+test frame would leak test information.
def create_percentile_transformer(train_series):
    """Build a function that maps a value to its percentile within ``train_series``.

    Parameters
    ----------
    train_series : pandas.Series
        Reference (training) values. NaNs are dropped once, when the
        transformer is created, so the reference is a snapshot taken at
        creation time.

    Returns
    -------
    callable
        ``transform(x)`` returns ``scipy.stats.percentileofscore`` of ``x``
        against the reference values divided by 100, or NaN when ``x`` is
        missing.
    """
    # Hoisted out of `transform`: the NaN-free reference does not change between
    # calls, so there is no reason to rebuild it for every single value.
    reference = np.asarray(train_series.dropna())

    def transform(x):
        if pd.isna(x):
            return np.nan
        # Percentile of x within the training reference, rescaled from 0-100 to 0-1.
        return stats.percentileofscore(reference, x) / 100

    return transform

# Row selectors for the two splits.
is_train_row = train_test['dataset'] == 'train'
is_test_row = train_test['dataset'] == 'test'

# One transformer per feature, each built from the training rows only.
age_transformer = create_percentile_transformer(train_test.loc[is_train_row, 'Age'])
fare_transformer = create_percentile_transformer(train_test.loc[is_train_row, 'Fare'])

# Map both splits onto the training percentiles: Age first, then Fare.
for source_column, to_percentile in (('Age', age_transformer), ('Fare', fare_transformer)):
    pct_column = source_column + '_rank_pct'
    for split_rows in (is_train_row, is_test_row):
        train_test.loc[split_rows, pct_column] = train_test.loc[split_rows, source_column].apply(to_percentile)


# Record cabin availability before the missing entries are replaced by a placeholder.
train_test['Cabin_present'] = train_test['Cabin'].notna()

train_test['Cabin'] = train_test['Cabin'].fillna('None')
train_test['Cabin_class'] = train_test.apply(cabin_class, axis=1)
train_test['Sex_male'] = train_test['Sex'].eq('male')

# The raw columns have been replaced by the derived features above.
train_test = train_test.drop(columns=['Sex', 'Name', 'Cabin', 'Age', 'Fare'])

# Store the boolean flags as 0/1 integers.
flag_columns = ['double_family_name', 'has_alias', 'Age_missing', 'Cabin_present', 'Sex_male']
train_test = train_test.astype(dict.fromkeys(flag_columns, 'int32'))

train_test.dtypes

In [None]:
# Compare the train vs. test distribution of each numeric feature.
numerical_features = ['SibSp', 'Parch', 'Age_rank_pct', 'Fare_rank_pct']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for panel, feature in zip(axes.ravel(), numerical_features):
    # Non-null values of the feature, keyed by the split they belong to.
    values_by_split = {
        split: train_test.loc[train_test['dataset'] == split, feature].dropna()
        for split in ('train', 'test')
    }

    # One KDE curve per split.
    for split, values in values_by_split.items():
        sns.kdeplot(data=values, label=split, ax=panel)

    # Two-sample Kolmogorov-Smirnov test between the splits.
    ks_stat, p_value = stats.ks_2samp(values_by_split['train'], values_by_split['test'])

    panel.set_title(f'{feature}\nKS检验 p值: {p_value:.4f}')
    panel.legend()

plt.tight_layout()
plt.show()



下面进行对抗验证（adversarial validation）：训练一个分类器来区分样本来自 train 还是 test，用于检查两者的分布差异。

In [None]:
# Design matrix for adversarial validation: every feature except the split marker.
df_train_test = train_test.drop(columns=['dataset'])

# One-hot encode the two categorical columns; all other columns pass through
# unchanged. Output is a pandas DataFrame with un-prefixed feature names.
categorical_columns = ['name_title', 'Cabin_class']
oneHotTsfm = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')
dfX_train_test = oneHotTsfm.fit_transform(df_train_test)

# Adversarial target: 1 for rows coming from the test file, 0 for training rows.
dfY_train_test = train_test['dataset'].eq('test').astype('int64').rename(None)
dfX_train_test

In [None]:
# Random-forest settings for the adversarial classifier.
rf_params = {
    'n_estimators': 100,      # number of trees in the forest
    'max_depth': None,        # no depth limit: trees are grown until the leaves are pure
    'min_samples_split': 2,   # minimum samples required to split an internal node
    'min_samples_leaf': 1,    # minimum samples required in a leaf
    'random_state': 42,       # fixed seed for reproducible results
    'n_jobs': -1,             # use every available CPU core
}
rf_model = RandomForestClassifier(**rf_params)

In [None]:
# Fit the adversarial classifier on the combined matrix and report its accuracy on
# those same rows (original note: do not read much into it, it may be overfit).
train_accuracy = rf_model.fit(dfX_train_test, dfY_train_test).score(dfX_train_test, dfY_train_test)
print(f"训练集准确率: {train_accuracy:.4f}")

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import matplotlib.pyplot as plt

# Adversarial-validation AUC.
# The score has to come from rows the classifier has NOT been fitted on: a forest
# with unlimited depth separates its own training rows almost perfectly, so an
# in-sample AUC is close to 1 no matter how similar train and test really are.
# cross_val_predict fits clones of `rf_model`, so the already fitted `rf_model`
# itself is left untouched.
adv_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 1. Out-of-fold probability of the positive class (row comes from the test file).
y_pred_proba = cross_val_predict(
    rf_model, dfX_train_test, dfY_train_test, cv=adv_cv, method='predict_proba'
)[:, 1]

# 2. AUC of the train-vs-test classifier (about 0.5 means the splits are indistinguishable).
auc_score = roc_auc_score(dfY_train_test, y_pred_proba)
print(f"AUC分数: {auc_score:.4f}")

# 3. (Optional) ROC curve.
fpr, tpr, thresholds = roc_curve(dfY_train_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, 'b-', label=f'ROC曲线 (AUC = {auc_score:.4f})')
plt.plot([0, 1], [0, 1], 'r--', label='随机猜测')
plt.xlabel('假正率 (False Positive Rate)')
plt.ylabel('真正率 (True Positive Rate)')
plt.title('ROC曲线')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
importances = rf_model.feature_importances_

# Importance of each encoded feature in the adversarial classifier, highest first.
feature_importance_df = (
    pd.DataFrame({'feature': dfX_train_test.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print("特征重要性排序:")
print(feature_importance_df)

In [None]:
# Build a categorical view of the combined data with a 'From_Test' string flag.
# NOTE(review): `prssr1` and `Y_train_test` are not defined in any cell visible in
# this notebook, so this cell fails on a fresh kernel unless they come from hidden
# state. Presumably `Y_train_test` is the 0/1 train-vs-test target (compare
# `dfY_train_test`) and `prssr1` a fitted preprocessor - TODO confirm and define
# them explicitly.
dfX_train_test_A = prssr1.clean_transform(train_test)
dfX_train_test_A['From_Test'] = Y_train_test
# Recode the flag: 1 becomes the string 'True', anything else the string 'False'.
dfX_train_test_A['From_Test'] = dfX_train_test_A.apply(lambda row: 'True' if row['From_Test'] == 1 else 'False', axis = 1)
dfX_train_test_A

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between each categorical feature and the
# train/test origin flag; the p-values are collected for ranking.
# Earlier variant that also included 'Embarked':
# col_names = ['Pclass', 'Embarked', 'name_title', 'double_family_name', 'has_alias', 'Age_present', 'Cabin_present', 'Age_class', 'Fare_class', 'Cabin_class', 'Sex_male']
col_names = ['Pclass', 'name_title', 'double_family_name', 'has_alias', 'Age_present', 'Cabin_present', 'Age_class', 'Fare_class', 'Cabin_class', 'Sex_male']
df_chisq_test = pd.DataFrame(columns=['column_name', 'p_value'])

origin_flag = dfX_train_test_A['From_Test']
for i, col_name in enumerate(col_names):
    # Contingency table (feature value x origin) fed straight into the test.
    chi2, p_value, dof, expected = chi2_contingency(
        pd.crosstab(dfX_train_test_A[col_name], origin_flag)
    )

    df_chisq_test.loc[i] = [col_name, p_value]

    print(f"\n卡方检验结果: {col_name}")
    print(f"p值: {p_value:.4f}")
    print()

print(df_chisq_test.sort_values(by='p_value'))

In [None]:
# Show the chi-square results again, smallest p-value first.
print(df_chisq_test.sort_values(by='p_value'))

In [None]:
# Fit the preprocessor on the training frame and transform both splits.
# NOTE(review): `Preprocessor_forT` is not imported or defined in any cell visible
# in this notebook (only `log_R_solver` and `NN_solver` are imported from `model`),
# so this cell raises NameError on a fresh kernel - TODO add the missing import.
# NOTE(review): at this point `train` / `test` carry the extra 'dataset' column and
# `train` no longer has 'Survived' - confirm the preprocessor expects that layout.
prssr = Preprocessor_forT()
prssr.fit(train)
X_train = prssr.transform(train)
X_test = prssr.transform(test)


In [None]:
# How many full multiples of 16 fit into the number of training rows.
n_train_rows = X_train.shape[0]
n_pick_0 = np.floor(n_train_rows / 16)
print(n_pick_0)

# Subset size used below: 54 multiples of 16.
n_pick = 16 * 54

In [None]:
from sklearn.model_selection import KFold

# Shuffled 9-fold splitter with a fixed seed.
kf = KFold(9, shuffle=True, random_state=42)

In [None]:
# n_pick_arr = []
# avg_train_acc = []
# avg_dev_acc = []
# std_train_acc = []
# std_dev_acc = []

# for i in range(6):
#     n_pick = (i+1) * 9 * 16
#     # kf = KFold(n_splits=9, shuffle=True, random_state=42)
#     Xs = X_train[:n_pick, :]
#     Ys = Y_train[:n_pick]

#     kf = KFold(n_splits=9, shuffle=True)
#     model_accuracies_dev = []
#     model_accuracies_train = []

#     for train_idx, test_idx in kf.split(Xs):
#         Xss_train, Xss_test = Xs[train_idx], Xs[test_idx]
#         yss_train, yss_test = Ys[train_idx], Ys[test_idx]
        
#         model = log_R_solver(Xss_train, yss_train, alpha = 0.03, lambda2=0.01)
#         model.fit_L2()

#         y_train_pred = model.transfrom(Xss_train)
#         accuracy = np.sum(yss_train == y_train_pred) / yss_train.shape[0]
#         model_accuracies_train.append(accuracy)
        
#         y_test_pred = model.transfrom(Xss_test)
#         accuracy = np.sum(yss_test == y_test_pred) / yss_test.shape[0]
#         model_accuracies_dev.append(accuracy)

#     print(f"train 平均 accuracy: {np.mean(model_accuracies_train):.4f}")
#     print(f"train accuracy 标准差: {np.std(model_accuracies_train):.4f}")
#     print(f"dev 平均 accuracy: {np.mean(model_accuracies_dev):.4f}")
#     print(f"dev accuracy 标准差: {np.std(model_accuracies_dev):.4f}")

#     n_pick_arr.append(n_pick)
#     avg_train_acc.append(np.mean(model_accuracies_train))
#     avg_dev_acc.append(np.mean(model_accuracies_dev))
#     std_train_acc.append(np.std(model_accuracies_train))
#     std_dev_acc.append(np.std(model_accuracies_dev))


In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# # First subplot: y1 and y2
# ax1.plot(n_pick_arr, avg_train_acc, 'r-', linewidth=2, label='avg train acc')
# ax1.plot(n_pick_arr, avg_dev_acc, 'b--', linewidth=2, label='avg dev acc')
# ax1.set_xlabel('X axis', fontsize=12)
# ax1.set_ylabel('Y axis', fontsize=12)
# ax1.set_title('Plot 1: avg accuracy', fontsize=14)
# ax1.legend(fontsize=11)
# ax1.grid(True, alpha=0.3)

# # Second subplot: y3 and y4
# ax2.plot(n_pick_arr, std_train_acc, 'r-', linewidth=2, label='std train acc')
# ax2.plot(n_pick_arr, std_dev_acc, 'b--', linewidth=2, label='std dev acc')
# ax2.set_xlabel('X axis', fontsize=12)
# ax2.set_ylabel('Y axis', fontsize=12)
# ax2.set_title('Plot 2: std accuracy', fontsize=14)
# ax2.legend(fontsize=11)
# ax2.grid(True, alpha=0.3)

# fig.suptitle('alpha = 0.03, lambda2=0.01')
# plt.tight_layout()
# plt.show()

In [None]:
# model_accuracies

In [None]:
# LgR_model = log_R_solver(X_train, Y_train, alpha = 0.02, lambda2=0.01)
# LgR_model.fit_L2()

In [None]:
# y_res_logr = LgR_model.transfrom(X_test)
# df_test_for_o = test.copy()
# df_test_for_o['Survived'] = y_res_logr
# df_res = df_test_for_o[['PassengerId', 'Survived']]
# df_res.to_csv(OUTPUT_PATH + 'v12_after_bv.csv', index=False, encoding='utf-8')


In [None]:
# When running locally, make sure the output directory exists. Uses OUTPUT_PATH
# rather than a hard-coded folder name so it always matches where results are written.
if not IS_KAGGLE:
    os.makedirs(OUTPUT_PATH, exist_ok=True)


In [None]:
# df_res = df_test[['PassengerId', 'Survived']]
# df_res.to_csv(OUTPUT_PATH + '/v4_l2_reg.csv', index=False, encoding='utf-8')

In [None]:
# n_pick_0 = np.floor(X_train.shape[0] / 16)
# print(n_pick_0)
# n_pick = 54 * 16
# Xs = X_train[:n_pick, :]
# Ys = Y_train[:n_pick]

In [None]:
# Learning-curve experiment for the NN solver: for growing prefixes of the training
# data, run a 9-fold cross-validation and record the mean / standard deviation of
# the train and dev accuracy.
n_pick_arr = []
avg_train_acc = []
avg_dev_acc = []
std_train_acc = []
std_dev_acc = []

for step in range(1, 7):
    # Prefix sizes 144, 288, ..., 864 (multiples of 9 * 16).
    n_pick = step * 9 * 16
    Xs, Ys = X_train[:n_pick, :], Y_train[:n_pick]

    # NOTE(review): no random_state here (a seeded variant was commented out in the
    # original), so the fold assignment differs between runs.
    kf = KFold(n_splits=9, shuffle=True)
    model_accuracies_dev, model_accuracies_train = [], []

    for fit_idx, dev_idx in kf.split(Xs):
        X_fit, y_fit = Xs[fit_idx], Ys[fit_idx]
        X_dev, y_dev = Xs[dev_idx], Ys[dev_idx]

        # Stage 1: shallow fit over many seeds.
        model = NN_solver(X_fit, y_fit, alpha=0.02, lambda2=0.03, n_seed=200, output_gap=100)
        model.fit_shallow_parallel(shallow_iter_limit=2000, target_loss=320)

        # Stage 2: deep fit started from the parameters of the seed with the lowest
        # loss. The keyword `lr_raito` is spelled exactly as in the original call.
        best_seed = model.loss_arr.argmin()
        start_params = {
            f'{param}_i': getattr(model, param)[:, :, best_seed]
            for param in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3')
        }
        model.fit_deep(10000, lr_raito=0.1, use_input=True, **start_params)

        # Dev accuracy first, then train accuracy (same call order as before).
        model_accuracies_dev.append(model.check_performance(X_dev, y_dev))
        model_accuracies_train.append(model.check_performance(X_fit, y_fit))

    n_pick_arr.append(n_pick)
    avg_train_acc.append(np.mean(model_accuracies_train))
    avg_dev_acc.append(np.mean(model_accuracies_dev))
    std_train_acc.append(np.std(model_accuracies_train))
    std_dev_acc.append(np.std(model_accuracies_dev))



In [None]:
# Collect the learning-curve statistics into one table, one row per subset size.
df = pd.DataFrame({
    'n_data': n_pick_arr,
    'avg_train_accuracy': avg_train_acc,
    'avg_dev_accuracy': avg_dev_acc,
    'std_train_accuracy': std_train_acc,
    'std_dev_accuracy': std_dev_acc
})

# Write the table under OUTPUT_PATH (created if missing) instead of the current
# working directory, so it sits with the other outputs, where the plotting cells
# further down look for their curve files.
os.makedirs(OUTPUT_PATH, exist_ok=True)
df.to_csv(os.path.join(OUTPUT_PATH, 'nn_acc_curve_a002_l003.csv'), index=False, encoding='utf-8')


In [None]:
# y_res_logr = nslr.predict(X_test)
# df_test_for_o = test.copy()
# df_test_for_o['Survived'] = y_res_logr
# df_res = df_test_for_o[['PassengerId', 'Survived']]
# df_res.to_csv(OUTPUT_PATH + 'v14_nn_new_feature.csv', index=False, encoding='utf-8')

In [None]:
# 就 NN 的效果来说已经很好了，接下来要提升 Titanic 上的效果，主要是 ML hyperparameter tuning 和 feature engineering 方面的工作
    
    ## K-fold training, (check)
    ## Roc curve, precision / recall 
## 把Titanic 剩下的几个feature 挖出点东西来. Name / Cabin / 
## Bias / Variance curve
## NN variance tuning, 剪枝，regularization, early stopping 这些

The cells below plot the bias/variance (learning) curves.

In [None]:
# df_bv_curve = pd.read_csv(OUTPUT_PATH + '/nn_acc_curve_a003.csv')
# df_bv_curve

In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# # First subplot: y1 and y2
# ax1.plot(df_bv_curve[['n_data']], df_bv_curve[['avg_train_accuracy']], 'r-', linewidth=2, label='avg train acc')
# ax1.plot(df_bv_curve[['n_data']], df_bv_curve[['avg_dev_accuracy']], 'b--', linewidth=2, label='avg dev acc')
# ax1.set_xlabel('X axis', fontsize=12)
# ax1.set_ylabel('Y axis', fontsize=12)
# ax1.set_title('Plot 1: avg accuracy', fontsize=14)
# ax1.legend(fontsize=11)
# ax1.grid(True, alpha=0.3)

# # Second subplot: y3 and y4
# ax2.plot(df_bv_curve[['n_data']], df_bv_curve[['std_train_accuracy']], 'r-', linewidth=2, label='std train acc')
# ax2.plot(df_bv_curve[['n_data']], df_bv_curve[['std_dev_accuracy']], 'b--', linewidth=2, label='std dev acc')
# ax2.set_xlabel('X axis', fontsize=12)
# ax2.set_ylabel('Y axis', fontsize=12)
# ax2.set_title('Plot 2: std accuracy', fontsize=14)
# ax2.legend(fontsize=11)
# ax2.grid(True, alpha=0.3)

# fig.suptitle('alpha = 0.03')
# plt.tight_layout()
# plt.show()