In [60]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [61]:
## Load the data
# Read both splits the same way: the first CSV column becomes the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

# Encode the target as an integer: 'yes' -> 1, 'no' -> 0.
# Any other value is turned into NaN by Series.map.
label_to_int = {'yes': 1, 'no': 0}
train_data['subscribe'] = train_data['subscribe'].map(label_to_int)

In [62]:
# Split the columns into categorical (object dtype) and numeric groups.
# This must happen before the encoding below, which turns object columns into ints.
category_feature = [x for x in train_data.columns if train_data[x].dtype == 'object' and x != 'subscribe']
numeric_feature = [x for x in train_data.columns if train_data[x].dtype != 'object' and x != 'id']

# Label-encode the categorical features.
# The encoder is fitted ONCE per column on the categories of both splits and then
# applied to each split. Fitting a separate encoder on the test split (as a plain
# fit_transform on test would do) can assign different integer codes to the same
# category whenever the two splits do not contain exactly the same set of values.
for col in category_feature:
    le = LabelEncoder()
    le.fit(pd.concat([train_data[col], test_data[col]], ignore_index=True))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

# Clip outliers in the numeric features to the 1.5 * IQR fences.
# The fences are computed from the training split only and reused for the test
# split, so both splits are clipped to the same range.
for col in numeric_feature:
    if col == 'subscribe':
        # The target is numeric after the yes/no mapping but must not be clipped.
        continue
    q1 = train_data[col].quantile(0.25)
    q3 = train_data[col].quantile(0.75)
    delta = (q3 - q1) * 1.5
    lower, upper = q1 - delta, q3 + delta
    train_data[col] = train_data[col].clip(lower, upper)
    test_data[col] = test_data[col].clip(lower, upper)

# Separate the features from the target.
X_train = train_data.drop(['subscribe'], axis=1)
Y_train = train_data['subscribe']




In [63]:
# Step 3: rebuild clean, all-float64 DataFrames from the processed columns.
def _as_float_frame(frame):
    """Return a new DataFrame holding every column of `frame` as a float64 array.

    The result is built from plain NumPy arrays, so it gets a fresh default
    index instead of the index of `frame`.
    """
    return pd.DataFrame(
        {name: frame[name].to_numpy(dtype=np.float64) for name in frame}
    )


X_train_final = _as_float_frame(X_train)
X_test_final = _as_float_frame(test_data)



In [64]:
# Display the float64 test feature frame for a quick visual check.
X_test_final

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_index,cons_conf_index,lending_rate3m,nr_employed
0,35.0,9.0,2.0,5.0,0.0,2.0,2.0,0.0,1.0,1.0,3295.0,1.0,476.0,0.0,2.0,1.4,95.37,-33.04,3.63,5204.54
1,26.0,0.0,2.0,3.0,0.0,2.0,0.0,0.0,6.0,2.0,2872.0,1.0,166.0,2.0,2.0,-1.8,91.75,-44.42,3.16,4924.78
2,44.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,1.0,268.0,3.0,968.0,0.0,1.0,1.1,89.67,-36.90,5.04,4947.02
3,36.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,7.0,2.0,30.0,1.0,432.0,5.0,2.0,-0.1,89.87,-41.66,3.27,5203.33
4,41.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,6.0,2.0,1670.0,1.0,944.0,3.0,2.0,1.1,97.64,-36.32,3.95,4992.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,49.0,0.0,3.0,6.0,1.0,2.0,2.0,1.0,0.0,0.0,3937.0,6.0,302.0,1.0,0.0,-1.8,95.77,-40.50,3.86,5058.64
7496,34.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,584.0,6.0,440.0,3.0,0.0,1.4,90.59,-47.29,1.77,5156.70
7497,50.0,5.0,2.0,0.0,0.0,2.0,0.0,0.0,4.0,1.0,102.0,3.0,997.0,0.0,1.0,-2.9,97.42,-39.69,1.29,5116.80
7498,31.0,9.0,1.0,5.0,0.0,0.0,0.0,0.0,1.0,2.0,179.0,3.0,1028.0,0.0,1.0,1.4,96.90,-37.68,5.18,5144.45


In [65]:
# --- Prepare the target and handle NaN values in the features ---
# The target is cast to int; astype(int) raises if it contains NaN, so no
# separate dropna() is needed afterwards (dropping rows from the target alone
# would also leave it misaligned with the feature rows).
Y_train = Y_train.astype(int)

# Replace NaN with 0 in the training features (another strategy such as the
# mean / median could be used instead). The same rule is applied to the test
# features so that training and inference treat missing values identically.
X_train_final = X_train_final.fillna(0)
X_test_final = X_test_final.fillna(0)

# Split into a training part and a held-out validation part.
# random_state is fixed so the split is reproducible.
xx_train, xx_test, yy_train, yy_test = train_test_split(X_train_final, Y_train, random_state=0)

In [66]:
# Sanity-check the shapes of the training / validation features and targets.
xx_train.shape, xx_test.shape, yy_train.shape, yy_test.shape

((16875, 20), (5625, 20), (16875,), (5625,))

In [78]:

# Hyper-parameter tuning with an exhaustive grid search.
# Every combination of the values below is evaluated with 3-fold cross-validation.
param_grid = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9, 0.93, 0.95, 0.98, 1],
    'max_depth': list(range(2, 10)),
    'n_estimators': [50, 60, 70, 100, 120, 150],
}

# n_estimators given here is only the starting value; the grid overrides it per candidate.
base_model = XGBClassifier(n_estimators=70, base_score=0.5)
clf = GridSearchCV(
    base_model,
    param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
clf.fit(xx_train, yy_train)
print("最佳参数:" , clf.best_params_, "最佳得分：", clf.best_score_)



Fitting 3 folds for each of 4032 candidates, totalling 12096 fits
最佳参数: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 70, 'subsample': 0.9} 最佳得分： 0.8867555555555556


In [79]:
# Score the fitted search object on the held-out validation split.
validation_score = clf.score(xx_test, yy_test)
print("在验证集上的得分 : ", validation_score)

在验证集上的得分 :  0.8817777777777778


In [80]:
# Produce the final predictions and assemble the result frame.

# Predict on the model features only. X_test_final is overwritten at the end of
# this cell with extra 'id' and 'subscribe' columns, so those are dropped here
# (when present) to keep the cell re-runnable: without this, a second run would
# pass non-feature columns to the model.
test_features = X_test_final.drop(columns=['id', 'subscribe'], errors='ignore')
result = clf.predict(test_features)
out = pd.DataFrame(result)
print(out.value_counts())

# Re-attach the id column, which lives in the index of test_data.
# Both frames carry a default 0..n-1 index here, so concat aligns them row by row.
test_data_with_id = test_data.reset_index()
X_test_final = pd.concat([test_data_with_id['id'], test_features], axis=1)

# Store the predictions and convert them back to the original yes/no labels.
X_test_final['subscribe'] = result
X_test_final['subscribe'] = X_test_final['subscribe'].map({1:'yes', 0:'no'})
X_test_final



0
0    7035
1     465
Name: count, dtype: int64


Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_index,cons_conf_index,lending_rate3m,nr_employed,subscribe
0,22501,35.0,9.0,2.0,5.0,0.0,2.0,2.0,0.0,1.0,...,1.0,476.0,0.0,2.0,1.4,95.37,-33.04,3.63,5204.54,no
1,22502,26.0,0.0,2.0,3.0,0.0,2.0,0.0,0.0,6.0,...,1.0,166.0,2.0,2.0,-1.8,91.75,-44.42,3.16,4924.78,no
2,22503,44.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,6.0,...,3.0,968.0,0.0,1.0,1.1,89.67,-36.90,5.04,4947.02,no
3,22504,36.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,7.0,...,1.0,432.0,5.0,2.0,-0.1,89.87,-41.66,3.27,5203.33,no
4,22505,41.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,6.0,...,1.0,944.0,3.0,2.0,1.1,97.64,-36.32,3.95,4992.02,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,29996,49.0,0.0,3.0,6.0,1.0,2.0,2.0,1.0,0.0,...,6.0,302.0,1.0,0.0,-1.8,95.77,-40.50,3.86,5058.64,no
7496,29997,34.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,...,6.0,440.0,3.0,0.0,1.4,90.59,-47.29,1.77,5156.70,no
7497,29998,50.0,5.0,2.0,0.0,0.0,2.0,0.0,0.0,4.0,...,3.0,997.0,0.0,1.0,-2.9,97.42,-39.69,1.29,5116.80,no
7498,29999,31.0,9.0,1.0,5.0,0.0,0.0,0.0,0.0,1.0,...,3.0,1028.0,0.0,1.0,1.4,96.90,-37.68,5.18,5144.45,no


In [81]:
# Keep only the id and the predicted label, then write them out without the row index.
submission_columns = ['id', 'subscribe']
selected = X_test_final[submission_columns]
selected.to_csv("./data/submission.csv", index=False)

In [None]:
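# Open questions:
# 1. When exploring the data, which plots are useful? Common line plots seem to
#    show the relationship between only two features at a time.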


